1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
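      // TIMES_OOP scales an oop-array index by the heap oop size: shift by 2
      // for 4-byte compressed oops or by 3 for uncompressed 8-byte oops.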
  69 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  70 
  71 #ifdef PRODUCT
  72 #define BLOCK_COMMENT(str) /* nothing */
  73 #else
  74 #define BLOCK_COMMENT(str) __ block_comment(str)
  75 #endif
  76 
  77 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  78 
  79 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  80 void fill_continuation_entry(MacroAssembler* masm);
  81 void continuation_enter_cleanup(MacroAssembler* masm);
  82 
  83 // Stub Code definitions
  84 
  85 class StubGenerator: public StubCodeGenerator {
  86  private:
  87 
  88 #ifdef PRODUCT
  89 #define inc_counter_np(counter) ((void)0)
  90 #else
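        // Bump a 32-bit counter in memory. The load/add/store sequence below is
        // not atomic, which is acceptable for these debug-only statistics.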
  91   void inc_counter_np_(int& counter) {
  92     __ lea(rscratch2, ExternalAddress((address)&counter));
  93     __ ldrw(rscratch1, Address(rscratch2));
  94     __ addw(rscratch1, rscratch1, 1);
  95     __ strw(rscratch1, Address(rscratch2));
  96   }
  97 #define inc_counter_np(counter) \
  98   BLOCK_COMMENT("inc_counter " #counter); \
  99   inc_counter_np_(counter);
 100 #endif
 101 
 102   // Call stubs are used to call Java from C
 103   //
 104   // Arguments:
 105   //    c_rarg0:   call wrapper address                   address
 106   //    c_rarg1:   result                                 address
 107   //    c_rarg2:   result type                            BasicType
 108   //    c_rarg3:   method                                 Method*
 109   //    c_rarg4:   (interpreter) entry point              address
 110   //    c_rarg5:   parameters                             intptr_t*
 111   //    c_rarg6:   parameter size (in words)              int
 112   //    c_rarg7:   thread                                 Thread*
 113   //
 114   // There is no return from the stub itself as any Java result
 115   // is written to result
 116   //
 117   // we save r30 (lr) as the return PC at the base of the frame and
  118   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 119   // into fp.
 120   //
 121   // we save r0-r7, which accounts for all the c arguments.
 122   //
 123   // TODO: strictly do we need to save them all? they are treated as
 124   // volatile by C so could we omit saving the ones we are going to
 125   // place in global registers (thread? method?) or those we only use
 126   // during setup of the Java call?
 127   //
 128   // we don't need to save r8 which C uses as an indirect result location
 129   // return register.
 130   //
 131   // we don't need to save r9-r15 which both C and Java treat as
 132   // volatile
 133   //
 134   // we don't need to save r16-18 because Java does not use them
 135   //
 136   // we save r19-r28 which Java uses as scratch registers and C
 137   // expects to be callee-save
 138   //
 139   // we save the bottom 64 bits of each value stored in v8-v15; it is
 140   // the responsibility of the caller to preserve larger values.
 141   //
 142   // so the stub frame looks like this when we enter Java code
 143   //
 144   //     [ return_from_Java     ] <--- sp
 145   //     [ argument word n      ]
 146   //      ...
 147   // -27 [ argument word 1      ]
 148   // -26 [ saved v15            ] <--- sp_after_call
 149   // -25 [ saved v14            ]
 150   // -24 [ saved v13            ]
 151   // -23 [ saved v12            ]
 152   // -22 [ saved v11            ]
 153   // -21 [ saved v10            ]
 154   // -20 [ saved v9             ]
 155   // -19 [ saved v8             ]
 156   // -18 [ saved r28            ]
 157   // -17 [ saved r27            ]
 158   // -16 [ saved r26            ]
 159   // -15 [ saved r25            ]
 160   // -14 [ saved r24            ]
 161   // -13 [ saved r23            ]
 162   // -12 [ saved r22            ]
 163   // -11 [ saved r21            ]
 164   // -10 [ saved r20            ]
 165   //  -9 [ saved r19            ]
 166   //  -8 [ call wrapper    (r0) ]
 167   //  -7 [ result          (r1) ]
 168   //  -6 [ result type     (r2) ]
 169   //  -5 [ method          (r3) ]
 170   //  -4 [ entry point     (r4) ]
 171   //  -3 [ parameters      (r5) ]
 172   //  -2 [ parameter size  (r6) ]
 173   //  -1 [ thread (r7)          ]
 174   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 175   //   1 [ saved lr       (r30) ]
 176 
 177   // Call stub stack layout word offsets from fp
 178   enum call_stub_layout {
 179     sp_after_call_off = -26,
 180 
 181     d15_off            = -26,
 182     d13_off            = -24,
 183     d11_off            = -22,
 184     d9_off             = -20,
 185 
 186     r28_off            = -18,
 187     r26_off            = -16,
 188     r24_off            = -14,
 189     r22_off            = -12,
 190     r20_off            = -10,
 191     call_wrapper_off   =  -8,
 192     result_off         =  -7,
 193     result_type_off    =  -6,
 194     method_off         =  -5,
 195     entry_point_off    =  -4,
 196     parameter_size_off =  -2,
 197     thread_off         =  -1,
 198     fp_f               =   0,
 199     retaddr_off        =   1,
 200   };
 201 
 202   address generate_call_stub(address& return_address) {
 203     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 204            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 205            "adjust this code");
 206 
 207     StubCodeMark mark(this, "StubRoutines", "call_stub");
 208     address start = __ pc();
 209 
 210     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 211 
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
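          // sp_after_call_off is negative, so this places sp 26 words below fp,
          // covering the register save area laid out in the comment above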
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     // install Java thread in global register now we have saved
 261     // whatever value it held
 262     __ mov(rthread, c_rarg7);
 263     // And method
 264     __ mov(rmethod, c_rarg3);
 265 
 266     // set up the heapbase register
 267     __ reinit_heapbase();
 268 
 269 #ifdef ASSERT
 270     // make sure we have no pending exceptions
 271     {
 272       Label L;
 273       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 274       __ cmp(rscratch1, (u1)NULL_WORD);
 275       __ br(Assembler::EQ, L);
 276       __ stop("StubRoutines::call_stub: entered with pending exception");
 277       __ BIND(L);
 278     }
 279 #endif
 280     // pass parameters if any
 281     __ mov(esp, sp);
 282     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 283     __ andr(sp, rscratch1, -2 * wordSize);
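          // the andr rounds sp down to a 16-byte boundary, keeping the stack
          // pointer aligned as AArch64 requires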
 284 
 285     BLOCK_COMMENT("pass parameters if any");
 286     Label parameters_done;
 287     // parameter count is still in c_rarg6
 288     // and parameter pointer identifying param 1 is in c_rarg5
 289     __ cbzw(c_rarg6, parameters_done);
 290 
 291     address loop = __ pc();
 292     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 293     __ subsw(c_rarg6, c_rarg6, 1);
 294     __ push(rscratch1);
 295     __ br(Assembler::GT, loop);
 296 
 297     __ BIND(parameters_done);
 298 
  299     // call Java entry -- passing the Method* and current sp
 300     //      rmethod: Method*
 301     //      r19_sender_sp: sender sp
 302     BLOCK_COMMENT("call Java function");
 303     __ mov(r19_sender_sp, sp);
 304     __ blr(c_rarg4);
 305 
 306     // we do this here because the notify will already have been done
 307     // if we get to the next instruction via an exception
 308     //
 309     // n.b. adding this instruction here affects the calculation of
 310     // whether or not a routine returns to the call stub (used when
 311     // doing stack walks) since the normal test is to check the return
 312     // pc against the address saved below. so we may need to allow for
 313     // this extra instruction in the check.
 314 
 315     // save current address for use by exception handling code
 316 
 317     return_address = __ pc();
 318 
 319     // store result depending on type (everything that is not
 320     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 321     // n.b. this assumes Java returns an integral result in r0
 322     // and a floating result in j_farg0
 323     __ ldr(j_rarg2, result);
 324     Label is_long, is_float, is_double, exit;
 325     __ ldr(j_rarg1, result_type);
 326     __ cmp(j_rarg1, (u1)T_OBJECT);
 327     __ br(Assembler::EQ, is_long);
 328     __ cmp(j_rarg1, (u1)T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, (u1)T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(j_rarg1, (u1)T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(j_rarg2));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     __ pop_cont_fastpath(rthread);
 360 
 361     // restore callee-save registers
 362     __ ldpd(v15, v14,  d15_save);
 363     __ ldpd(v13, v12,  d13_save);
 364     __ ldpd(v11, v10,  d11_save);
 365     __ ldpd(v9,  v8,   d9_save);
 366 
 367     __ ldp(r28, r27,   r28_save);
 368     __ ldp(r26, r25,   r26_save);
 369     __ ldp(r24, r23,   r24_save);
 370     __ ldp(r22, r21,   r22_save);
 371     __ ldp(r20, r19,   r20_save);
 372 
 373     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 374     __ ldrw(c_rarg2, result_type);
 375     __ ldr(c_rarg3,  method);
 376     __ ldp(c_rarg4, c_rarg5,  entry_point);
 377     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 378 
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
 407   // not the case if the callee is compiled code => need to setup the
  408   // sp.
 409   //
 410   // r0: exception oop
 411 
 412   address generate_catch_exception() {
 413     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 414     address start = __ pc();
 415 
 416     // same as in generate_call_stub():
 417     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 418     const Address thread        (rfp, thread_off         * wordSize);
 419 
 420 #ifdef ASSERT
 421     // verify that threads correspond
 422     {
 423       Label L, S;
 424       __ ldr(rscratch1, thread);
 425       __ cmp(rthread, rscratch1);
 426       __ br(Assembler::NE, S);
 427       __ get_thread(rscratch1);
 428       __ cmp(rthread, rscratch1);
 429       __ br(Assembler::EQ, L);
 430       __ bind(S);
 431       __ stop("StubRoutines::catch_exception: threads must correspond");
 432       __ bind(L);
 433     }
 434 #endif
 435 
 436     // set pending exception
 437     __ verify_oop(r0);
 438 
 439     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 440     __ mov(rscratch1, (address)__FILE__);
 441     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 442     __ movw(rscratch1, (int)__LINE__);
 443     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 444 
 445     // complete return to VM
 446     assert(StubRoutines::_call_stub_return_address != NULL,
 447            "_call_stub_return_address must have been generated before");
 448     __ b(StubRoutines::_call_stub_return_address);
 449 
 450     return start;
 451   }
 452 
 453   // Continuation point for runtime calls returning with a pending
 454   // exception.  The pending exception check happened in the runtime
 455   // or native call stub.  The pending exception in Thread is
 456   // converted into a Java-level exception.
 457   //
 458   // Contract with Java-level exception handlers:
 459   // r0: exception
 460   // r3: throwing pc
 461   //
 462   // NOTE: At entry of this stub, exception-pc must be in LR !!
 463 
 464   // NOTE: this is always used as a jump target within generated code
  465   // so it just needs to be generated code with no prolog
 466 
 467   address generate_forward_exception() {
 468     StubCodeMark mark(this, "StubRoutines", "forward exception");
 469     address start = __ pc();
 470 
 471     // Upon entry, LR points to the return address returning into
 472     // Java (interpreted or compiled) code; i.e., the return address
 473     // becomes the throwing pc.
 474     //
 475     // Arguments pushed before the runtime call are still on the stack
 476     // but the exception handler will reset the stack pointer ->
 477     // ignore them.  A potential result in registers can be ignored as
 478     // well.
 479 
 480 #ifdef ASSERT
 481     // make sure this code is only executed if there is a pending exception
 482     {
 483       Label L;
 484       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 485       __ cbnz(rscratch1, L);
 486       __ stop("StubRoutines::forward exception: no pending exception (1)");
 487       __ bind(L);
 488     }
 489 #endif
 490 
 491     // compute exception handler into r19
 492 
 493     // call the VM to find the handler address associated with the
 494     // caller address. pass thread in r0 and caller pc (ret address)
 495     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 496     // the stack.
 497     __ mov(c_rarg1, lr);
 498     // lr will be trashed by the VM call so we move it to R19
 499     // (callee-saved) because we also need to pass it to the handler
 500     // returned by this call.
 501     __ mov(r19, lr);
 502     BLOCK_COMMENT("call exception_handler_for_return_address");
 503     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 504                          SharedRuntime::exception_handler_for_return_address),
 505                     rthread, c_rarg1);
 506     // Reinitialize the ptrue predicate register, in case the external runtime
 507     // call clobbers ptrue reg, as we may return to SVE compiled code.
 508     __ reinitialize_ptrue();
 509 
 510     // we should not really care that lr is no longer the callee
 511     // address. we saved the value the handler needs in r19 so we can
 512     // just copy it to r3. however, the C2 handler will push its own
 513     // frame and then calls into the VM and the VM code asserts that
 514     // the PC for the frame above the handler belongs to a compiled
 515     // Java method. So, we restore lr here to satisfy that assert.
 516     __ mov(lr, r19);
 517     // setup r0 & r3 & clear pending exception
 518     __ mov(r3, r19);
 519     __ mov(r19, r0);
 520     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 521     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 522 
 523 #ifdef ASSERT
 524     // make sure exception is set
 525     {
 526       Label L;
 527       __ cbnz(r0, L);
 528       __ stop("StubRoutines::forward exception: no pending exception (2)");
 529       __ bind(L);
 530     }
 531 #endif
 532 
 533     // continue at exception handler
 534     // r0: exception
 535     // r3: throwing pc
 536     // r19: exception handler
 537     __ verify_oop(r0);
 538     __ br(r19);
 539 
 540     return start;
 541   }
 542 
 543   // Non-destructive plausibility checks for oops
 544   //
 545   // Arguments:
 546   //    r0: oop to verify
 547   //    rscratch1: error message
 548   //
 549   // Stack after saving c_rarg3:
 550   //    [tos + 0]: saved c_rarg3
 551   //    [tos + 1]: saved c_rarg2
 552   //    [tos + 2]: saved lr
 553   //    [tos + 3]: saved rscratch2
 554   //    [tos + 4]: saved r0
 555   //    [tos + 5]: saved rscratch1
 556   address generate_verify_oop() {
 557 
 558     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 559     address start = __ pc();
 560 
 561     Label exit, error;
 562 
 563     // save c_rarg2 and c_rarg3
 564     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 565 
 566     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 567     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 568     __ ldr(c_rarg3, Address(c_rarg2));
 569     __ add(c_rarg3, c_rarg3, 1);
 570     __ str(c_rarg3, Address(c_rarg2));
 571 
 572     // object is in r0
 573     // make sure object is 'reasonable'
 574     __ cbz(r0, exit); // if obj is NULL it is OK
 575 
 576 #if INCLUDE_ZGC
 577     if (UseZGC) {
 578       // Check if mask is good.
 579       // verifies that ZAddressBadMask & r0 == 0
 580       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 581       __ andr(c_rarg2, r0, c_rarg3);
 582       __ cbnz(c_rarg2, error);
 583     }
 584 #endif
 585 
 586     // Check if the oop is in the right area of memory
 587     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 588     __ andr(c_rarg2, r0, c_rarg3);
 589     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 590 
 591     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 592     // instruction here because the flags register is live.
 593     __ eor(c_rarg2, c_rarg2, c_rarg3);
 594     __ cbnz(c_rarg2, error);
 595 
  596     // make sure klass is 'reasonable', i.e. non-zero.
 597     // NOTE: We used to load the Klass* here, and compare that to zero.
 598     // However, with current Lilliput implementation, that would require
 599     // checking the locking bits and calling into the runtime, which
 600     // clobbers the condition flags, which may be live around this call.
 601     // OTOH, this is a simple NULL-check, and we can simply load the upper
 602     // 32bit of the header as narrowKlass, and compare that to 0. The
 603     // worst that can happen (rarely) is that the object is locked and
 604     // we have lock pointer bits in the upper 32bits. We can't get a false
 605     // negative.
 606     assert(oopDesc::klass_offset_in_bytes() % 4 == 0, "must be 4 byte aligned");
 607     __ ldrw(r0, Address(r0, oopDesc::klass_offset_in_bytes()));  // get klass
 608     __ cbzw(r0, error);      // if klass is NULL it is broken
 609 
 610     // return if everything seems ok
 611     __ bind(exit);
 612 
 613     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 614     __ ret(lr);
 615 
 616     // handle errors
 617     __ bind(error);
 618     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 619 
 620     __ push(RegSet::range(r0, r29), sp);
 621     // debug(char* msg, int64_t pc, int64_t regs[])
 622     __ mov(c_rarg0, rscratch1);      // pass address of error message
 623     __ mov(c_rarg1, lr);             // pass return address
 624     __ mov(c_rarg2, sp);             // pass address of regs on stack
 625 #ifndef PRODUCT
 626     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 627 #endif
 628     BLOCK_COMMENT("call MacroAssembler::debug");
 629     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 630     __ blr(rscratch1);
 631     __ hlt(0);
 632 
 633     return start;
 634   }
 635 
 636   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 637 
 638   // Generate indices for iota vector.
 639   address generate_iota_indices(const char *stub_name) {
 640     __ align(CodeEntryAlignment);
 641     StubCodeMark mark(this, "StubRoutines", stub_name);
 642     address start = __ pc();
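          // byte lane indices 0x00..0x0f, emitted as two little-endian 64-bit words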
 643     __ emit_data64(0x0706050403020100, relocInfo::none);
 644     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 645     return start;
 646   }
 647 
 648   // The inner part of zero_words().  This is the bulk operation,
 649   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 650   // caller is responsible for zeroing the last few words.
 651   //
 652   // Inputs:
 653   // r10: the HeapWord-aligned base address of an array to zero.
 654   // r11: the count in HeapWords, r11 > 0.
 655   //
 656   // Returns r10 and r11, adjusted for the caller to clear.
 657   // r10: the base address of the tail of words left to clear.
 658   // r11: the number of words in the tail.
 659   //      r11 < MacroAssembler::zero_words_block_size.
 660 
 661   address generate_zero_blocks() {
 662     Label done;
 663     Label base_aligned;
 664 
 665     Register base = r10, cnt = r11;
 666 
 667     __ align(CodeEntryAlignment);
 668     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 669     address start = __ pc();
 670 
 671     if (UseBlockZeroing) {
 672       int zva_length = VM_Version::zva_length();
 673 
 674       // Ensure ZVA length can be divided by 16. This is required by
 675       // the subsequent operations.
 676       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 677 
 678       __ tbz(base, 3, base_aligned);
 679       __ str(zr, Address(__ post(base, 8)));
 680       __ sub(cnt, cnt, 1);
 681       __ bind(base_aligned);
 682 
 683       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 684       // alignment.
 685       Label small;
 686       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
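            // low_limit is in bytes while cnt is in (8-byte) words, hence the shift by 3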
 687       __ subs(rscratch1, cnt, low_limit >> 3);
 688       __ br(Assembler::LT, small);
 689       __ zero_dcache_blocks(base, cnt);
 690       __ bind(small);
 691     }
 692 
 693     {
 694       // Number of stp instructions we'll unroll
 695       const int unroll =
 696         MacroAssembler::zero_words_block_size / 2;
 697       // Clear the remaining blocks.
 698       Label loop;
 699       __ subs(cnt, cnt, unroll * 2);
 700       __ br(Assembler::LT, done);
 701       __ bind(loop);
 702       for (int i = 0; i < unroll; i++)
 703         __ stp(zr, zr, __ post(base, 16));
 704       __ subs(cnt, cnt, unroll * 2);
 705       __ br(Assembler::GE, loop);
 706       __ bind(done);
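            // undo the bias applied by the subs above so cnt again holds the
            // number of words left for the caller to clear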
 707       __ add(cnt, cnt, unroll * 2);
 708     }
 709 
 710     __ ret(lr);
 711 
 712     return start;
 713   }
 714 
 715 
 716   typedef enum {
 717     copy_forwards = 1,
 718     copy_backwards = -1
 719   } copy_direction;
 720 
 721   // Bulk copy of blocks of 8 words.
 722   //
 723   // count is a count of words.
 724   //
 725   // Precondition: count >= 8
 726   //
 727   // Postconditions:
 728   //
 729   // The least significant bit of count contains the remaining count
 730   // of words to copy.  The rest of count is trash.
 731   //
 732   // s and d are adjusted to point to the remaining words to copy
 733   //
 734   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 735                            copy_direction direction) {
 736     int unit = wordSize * direction;
 737     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
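          // unit is the signed per-word step in bytes; bias pre-adjusts s and d
          // in the forwards case so the positive and pre-indexed offsets used
          // below address the intended words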
 738 
 739     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 740       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 741     const Register stride = r13;
 742 
 743     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 744     assert_different_registers(s, d, count, rscratch1);
 745 
 746     Label again, drain;
 747     const char *stub_name;
 748     if (direction == copy_forwards)
 749       stub_name = "forward_copy_longs";
 750     else
 751       stub_name = "backward_copy_longs";
 752 
 753     __ align(CodeEntryAlignment);
 754 
 755     StubCodeMark mark(this, "StubRoutines", stub_name);
 756 
 757     __ bind(start);
 758 
 759     Label unaligned_copy_long;
 760     if (AvoidUnalignedAccesses) {
 761       __ tbnz(d, 3, unaligned_copy_long);
 762     }
 763 
 764     if (direction == copy_forwards) {
 765       __ sub(s, s, bias);
 766       __ sub(d, d, bias);
 767     }
 768 
 769 #ifdef ASSERT
 770     // Make sure we are never given < 8 words
 771     {
 772       Label L;
 773       __ cmp(count, (u1)8);
 774       __ br(Assembler::GE, L);
  775       __ stop("generate_copy_longs called with < 8 words");
 776       __ bind(L);
 777     }
 778 #endif
 779 
 780     // Fill 8 registers
 781     if (UseSIMDForMemoryOps) {
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 784     } else {
 785       __ ldp(t0, t1, Address(s, 2 * unit));
 786       __ ldp(t2, t3, Address(s, 4 * unit));
 787       __ ldp(t4, t5, Address(s, 6 * unit));
 788       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 789     }
 790 
 791     __ subs(count, count, 16);
 792     __ br(Assembler::LO, drain);
 793 
 794     int prefetch = PrefetchCopyIntervalInBytes;
 795     bool use_stride = false;
 796     if (direction == copy_backwards) {
 797        use_stride = prefetch > 256;
 798        prefetch = -prefetch;
 799        if (use_stride) __ mov(stride, prefetch);
 800     }
 801 
 802     __ bind(again);
 803 
 804     if (PrefetchCopyIntervalInBytes > 0)
 805       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 806 
 807     if (UseSIMDForMemoryOps) {
 808       __ stpq(v0, v1, Address(d, 4 * unit));
 809       __ ldpq(v0, v1, Address(s, 4 * unit));
 810       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 811       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 812     } else {
 813       __ stp(t0, t1, Address(d, 2 * unit));
 814       __ ldp(t0, t1, Address(s, 2 * unit));
 815       __ stp(t2, t3, Address(d, 4 * unit));
 816       __ ldp(t2, t3, Address(s, 4 * unit));
 817       __ stp(t4, t5, Address(d, 6 * unit));
 818       __ ldp(t4, t5, Address(s, 6 * unit));
 819       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 820       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 821     }
 822 
 823     __ subs(count, count, 8);
 824     __ br(Assembler::HS, again);
 825 
 826     // Drain
 827     __ bind(drain);
 828     if (UseSIMDForMemoryOps) {
 829       __ stpq(v0, v1, Address(d, 4 * unit));
 830       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 831     } else {
 832       __ stp(t0, t1, Address(d, 2 * unit));
 833       __ stp(t2, t3, Address(d, 4 * unit));
 834       __ stp(t4, t5, Address(d, 6 * unit));
 835       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 836     }
 837 
 838     {
 839       Label L1, L2;
 840       __ tbz(count, exact_log2(4), L1);
 841       if (UseSIMDForMemoryOps) {
 842         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 843         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 844       } else {
 845         __ ldp(t0, t1, Address(s, 2 * unit));
 846         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 847         __ stp(t0, t1, Address(d, 2 * unit));
 848         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 849       }
 850       __ bind(L1);
 851 
 852       if (direction == copy_forwards) {
 853         __ add(s, s, bias);
 854         __ add(d, d, bias);
 855       }
 856 
 857       __ tbz(count, 1, L2);
 858       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 859       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 860       __ bind(L2);
 861     }
 862 
 863     __ ret(lr);
 864 
 865     if (AvoidUnalignedAccesses) {
 866       Label drain, again;
 867       // Register order for storing. Order is different for backward copy.
 868 
 869       __ bind(unaligned_copy_long);
 870 
 871       // source address is even aligned, target odd aligned
 872       //
 873       // when forward copying word pairs we read long pairs at offsets
 874       // {0, 2, 4, 6} (in long words). when backwards copying we read
 875       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 876       // address by -2 in the forwards case so we can compute the
 877       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 878       // or -1.
 879       //
 880       // when forward copying we need to store 1 word, 3 pairs and
 881       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  882       // zero offset, we adjust the destination by -1 which means we
 883       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 884       //
  885       // When backwards copying we need to store 1 word, 3 pairs and
 886       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 887       // offsets {1, 3, 5, 7, 8} * unit.
 888 
 889       if (direction == copy_forwards) {
 890         __ sub(s, s, 16);
 891         __ sub(d, d, 8);
 892       }
 893 
 894       // Fill 8 registers
 895       //
 896       // for forwards copy s was offset by -16 from the original input
 897       // value of s so the register contents are at these offsets
 898       // relative to the 64 bit block addressed by that original input
 899       // and so on for each successive 64 byte block when s is updated
 900       //
 901       // t0 at offset 0,  t1 at offset 8
 902       // t2 at offset 16, t3 at offset 24
 903       // t4 at offset 32, t5 at offset 40
 904       // t6 at offset 48, t7 at offset 56
 905 
 906       // for backwards copy s was not offset so the register contents
 907       // are at these offsets into the preceding 64 byte block
 908       // relative to that original input and so on for each successive
 909       // preceding 64 byte block when s is updated. this explains the
 910       // slightly counter-intuitive looking pattern of register usage
 911       // in the stp instructions for backwards copy.
 912       //
 913       // t0 at offset -16, t1 at offset -8
 914       // t2 at offset -32, t3 at offset -24
 915       // t4 at offset -48, t5 at offset -40
 916       // t6 at offset -64, t7 at offset -56
 917 
 918       __ ldp(t0, t1, Address(s, 2 * unit));
 919       __ ldp(t2, t3, Address(s, 4 * unit));
 920       __ ldp(t4, t5, Address(s, 6 * unit));
 921       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 922 
 923       __ subs(count, count, 16);
 924       __ br(Assembler::LO, drain);
 925 
 926       int prefetch = PrefetchCopyIntervalInBytes;
 927       bool use_stride = false;
 928       if (direction == copy_backwards) {
 929          use_stride = prefetch > 256;
 930          prefetch = -prefetch;
 931          if (use_stride) __ mov(stride, prefetch);
 932       }
 933 
 934       __ bind(again);
 935 
 936       if (PrefetchCopyIntervalInBytes > 0)
 937         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 938 
 939       if (direction == copy_forwards) {
 940        // allowing for the offset of -8 the store instructions place
 941        // registers into the target 64 bit block at the following
 942        // offsets
 943        //
 944        // t0 at offset 0
 945        // t1 at offset 8,  t2 at offset 16
 946        // t3 at offset 24, t4 at offset 32
 947        // t5 at offset 40, t6 at offset 48
 948        // t7 at offset 56
 949 
 950         __ str(t0, Address(d, 1 * unit));
 951         __ stp(t1, t2, Address(d, 2 * unit));
 952         __ ldp(t0, t1, Address(s, 2 * unit));
 953         __ stp(t3, t4, Address(d, 4 * unit));
 954         __ ldp(t2, t3, Address(s, 4 * unit));
 955         __ stp(t5, t6, Address(d, 6 * unit));
 956         __ ldp(t4, t5, Address(s, 6 * unit));
 957         __ str(t7, Address(__ pre(d, 8 * unit)));
 958         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 959       } else {
 960        // d was not offset when we started so the registers are
 961        // written into the 64 bit block preceding d with the following
 962        // offsets
 963        //
 964        // t1 at offset -8
 965        // t3 at offset -24, t0 at offset -16
  966        // t5 at offset -40, t2 at offset -32
 967        // t7 at offset -56, t4 at offset -48
 968        //                   t6 at offset -64
 969        //
 970        // note that this matches the offsets previously noted for the
 971        // loads
 972 
 973         __ str(t1, Address(d, 1 * unit));
 974         __ stp(t3, t0, Address(d, 3 * unit));
 975         __ ldp(t0, t1, Address(s, 2 * unit));
 976         __ stp(t5, t2, Address(d, 5 * unit));
 977         __ ldp(t2, t3, Address(s, 4 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ ldp(t4, t5, Address(s, 6 * unit));
 980         __ str(t6, Address(__ pre(d, 8 * unit)));
 981         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 982       }
 983 
 984       __ subs(count, count, 8);
 985       __ br(Assembler::HS, again);
 986 
 987       // Drain
 988       //
 989       // this uses the same pattern of offsets and register arguments
 990       // as above
 991       __ bind(drain);
 992       if (direction == copy_forwards) {
 993         __ str(t0, Address(d, 1 * unit));
 994         __ stp(t1, t2, Address(d, 2 * unit));
 995         __ stp(t3, t4, Address(d, 4 * unit));
 996         __ stp(t5, t6, Address(d, 6 * unit));
 997         __ str(t7, Address(__ pre(d, 8 * unit)));
 998       } else {
 999         __ str(t1, Address(d, 1 * unit));
1000         __ stp(t3, t0, Address(d, 3 * unit));
1001         __ stp(t5, t2, Address(d, 5 * unit));
1002         __ stp(t7, t4, Address(d, 7 * unit));
1003         __ str(t6, Address(__ pre(d, 8 * unit)));
1004       }
1005       // now we need to copy any remaining part block which may
 1006       // include a 4 word subblock and/or a 2 word subblock.
1007       // bits 2 and 1 in the count are the tell-tale for whether we
1008       // have each such subblock
1009       {
1010         Label L1, L2;
1011         __ tbz(count, exact_log2(4), L1);
1012        // this is the same as above but copying only 4 longs hence
1013        // with only one intervening stp between the str instructions
1014        // but note that the offsets and registers still follow the
1015        // same pattern
1016         __ ldp(t0, t1, Address(s, 2 * unit));
1017         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1018         if (direction == copy_forwards) {
1019           __ str(t0, Address(d, 1 * unit));
1020           __ stp(t1, t2, Address(d, 2 * unit));
1021           __ str(t3, Address(__ pre(d, 4 * unit)));
1022         } else {
1023           __ str(t1, Address(d, 1 * unit));
1024           __ stp(t3, t0, Address(d, 3 * unit));
1025           __ str(t2, Address(__ pre(d, 4 * unit)));
1026         }
1027         __ bind(L1);
1028 
1029         __ tbz(count, 1, L2);
1030        // this is the same as above but copying only 2 longs hence
1031        // there is no intervening stp between the str instructions
1032        // but note that the offset and register patterns are still
1033        // the same
1034         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1035         if (direction == copy_forwards) {
1036           __ str(t0, Address(d, 1 * unit));
1037           __ str(t1, Address(__ pre(d, 2 * unit)));
1038         } else {
1039           __ str(t1, Address(d, 1 * unit));
1040           __ str(t0, Address(__ pre(d, 2 * unit)));
1041         }
1042         __ bind(L2);
1043 
1044        // for forwards copy we need to re-adjust the offsets we
 1045        // applied so that s and d follow the last words written
1046 
1047        if (direction == copy_forwards) {
1048          __ add(s, s, 16);
1049          __ add(d, d, 8);
1050        }
1051 
1052       }
1053 
1054       __ ret(lr);
 1055     }
1056   }
1057 
1058   // Small copy: less than 16 bytes.
1059   //
1060   // NB: Ignores all of the bits of count which represent more than 15
1061   // bytes, so a caller doesn't have to mask them.
1062 
1063   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1064     bool is_backwards = step < 0;
1065     size_t granularity = uabs(step);
1066     int direction = is_backwards ? -1 : 1;
1067     int unit = wordSize * direction;
1068 
1069     Label Lword, Lint, Lshort, Lbyte;
1070 
1071     assert(granularity
1072            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1073 
1074     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1075 
1076     // ??? I don't know if this bit-test-and-branch is the right thing
1077     // to do.  It does a lot of jumping, resulting in several
1078     // mispredicted branches.  It might make more sense to do this
1079     // with something like Duff's device with a single computed branch.
1080 
1081     __ tbz(count, 3 - exact_log2(granularity), Lword);
1082     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1083     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1084     __ bind(Lword);
1085 
1086     if (granularity <= sizeof (jint)) {
1087       __ tbz(count, 2 - exact_log2(granularity), Lint);
1088       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1089       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1090       __ bind(Lint);
1091     }
1092 
1093     if (granularity <= sizeof (jshort)) {
1094       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1095       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1096       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1097       __ bind(Lshort);
1098     }
1099 
1100     if (granularity <= sizeof (jbyte)) {
1101       __ tbz(count, 0, Lbyte);
1102       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1103       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1104       __ bind(Lbyte);
1105     }
1106   }
1107 
1108   Label copy_f, copy_b;
1109 
1110   // All-singing all-dancing memory copy.
1111   //
1112   // Copy count units of memory from s to d.  The size of a unit is
1113   // step, which can be positive or negative depending on the direction
1114   // of copy.  If is_aligned is false, we align the source address.
1115   //
1116 
1117   void copy_memory(bool is_aligned, Register s, Register d,
1118                    Register count, Register tmp, int step) {
1119     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1120     bool is_backwards = step < 0;
1121     unsigned int granularity = uabs(step);
1122     const Register t0 = r3, t1 = r4;
1123 
 1124     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
1125     // load all the data before writing anything
1126     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1127     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1128     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1129     const Register send = r17, dend = r16;
1130 
1131     if (PrefetchCopyIntervalInBytes > 0)
1132       __ prfm(Address(s, 0), PLDL1KEEP);
1133     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1134     __ br(Assembler::HI, copy_big);
1135 
1136     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1137     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1138 
1139     __ cmp(count, u1(16/granularity));
1140     __ br(Assembler::LS, copy16);
1141 
1142     __ cmp(count, u1(64/granularity));
1143     __ br(Assembler::HI, copy80);
1144 
1145     __ cmp(count, u1(32/granularity));
1146     __ br(Assembler::LS, copy32);
1147 
1148     // 33..64 bytes
1149     if (UseSIMDForMemoryOps) {
1150       __ ldpq(v0, v1, Address(s, 0));
1151       __ ldpq(v2, v3, Address(send, -32));
1152       __ stpq(v0, v1, Address(d, 0));
1153       __ stpq(v2, v3, Address(dend, -32));
1154     } else {
1155       __ ldp(t0, t1, Address(s, 0));
1156       __ ldp(t2, t3, Address(s, 16));
1157       __ ldp(t4, t5, Address(send, -32));
1158       __ ldp(t6, t7, Address(send, -16));
1159 
1160       __ stp(t0, t1, Address(d, 0));
1161       __ stp(t2, t3, Address(d, 16));
1162       __ stp(t4, t5, Address(dend, -32));
1163       __ stp(t6, t7, Address(dend, -16));
1164     }
1165     __ b(finish);
1166 
1167     // 17..32 bytes
1168     __ bind(copy32);
1169     __ ldp(t0, t1, Address(s, 0));
1170     __ ldp(t2, t3, Address(send, -16));
1171     __ stp(t0, t1, Address(d, 0));
1172     __ stp(t2, t3, Address(dend, -16));
1173     __ b(finish);
1174 
1175     // 65..80/96 bytes
 1176     // (96 bytes if SIMD because we do 32 bytes per instruction)
1177     __ bind(copy80);
1178     if (UseSIMDForMemoryOps) {
1179       __ ldpq(v0, v1, Address(s, 0));
1180       __ ldpq(v2, v3, Address(s, 32));
1181       // Unaligned pointers can be an issue for copying.
 1182       // The issue is more likely to occur when the granularity of the data is
 1183       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1184       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1185       // The largest performance drop has been seen for the range 65-80 bytes.
1186       // For such cases using the pair of ldp/stp instead of the third pair of
1187       // ldpq/stpq fixes the performance issue.
1188       if (granularity < sizeof (jint)) {
1189         Label copy96;
1190         __ cmp(count, u1(80/granularity));
1191         __ br(Assembler::HI, copy96);
1192         __ ldp(t0, t1, Address(send, -16));
1193 
1194         __ stpq(v0, v1, Address(d, 0));
1195         __ stpq(v2, v3, Address(d, 32));
1196         __ stp(t0, t1, Address(dend, -16));
1197         __ b(finish);
1198 
1199         __ bind(copy96);
1200       }
1201       __ ldpq(v4, v5, Address(send, -32));
1202 
1203       __ stpq(v0, v1, Address(d, 0));
1204       __ stpq(v2, v3, Address(d, 32));
1205       __ stpq(v4, v5, Address(dend, -32));
1206     } else {
1207       __ ldp(t0, t1, Address(s, 0));
1208       __ ldp(t2, t3, Address(s, 16));
1209       __ ldp(t4, t5, Address(s, 32));
1210       __ ldp(t6, t7, Address(s, 48));
1211       __ ldp(t8, t9, Address(send, -16));
1212 
1213       __ stp(t0, t1, Address(d, 0));
1214       __ stp(t2, t3, Address(d, 16));
1215       __ stp(t4, t5, Address(d, 32));
1216       __ stp(t6, t7, Address(d, 48));
1217       __ stp(t8, t9, Address(dend, -16));
1218     }
1219     __ b(finish);
1220 
1221     // 0..16 bytes
1222     __ bind(copy16);
1223     __ cmp(count, u1(8/granularity));
1224     __ br(Assembler::LO, copy8);
1225 
1226     // 8..16 bytes
1227     __ ldr(t0, Address(s, 0));
1228     __ ldr(t1, Address(send, -8));
1229     __ str(t0, Address(d, 0));
1230     __ str(t1, Address(dend, -8));
1231     __ b(finish);
1232 
1233     if (granularity < 8) {
1234       // 4..7 bytes
1235       __ bind(copy8);
1236       __ tbz(count, 2 - exact_log2(granularity), copy4);
1237       __ ldrw(t0, Address(s, 0));
1238       __ ldrw(t1, Address(send, -4));
1239       __ strw(t0, Address(d, 0));
1240       __ strw(t1, Address(dend, -4));
1241       __ b(finish);
1242       if (granularity < 4) {
1243         // 0..3 bytes
1244         __ bind(copy4);
1245         __ cbz(count, finish); // get rid of 0 case
1246         if (granularity == 2) {
1247           __ ldrh(t0, Address(s, 0));
1248           __ strh(t0, Address(d, 0));
1249         } else { // granularity == 1
1250           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1251           // the first and last byte.
1252           // Handle the 3 byte case by loading and storing base + count/2
1253           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1254           // This does mean that in the 1 byte case we load/store the same
1255           // byte 3 times.
1256           __ lsr(count, count, 1);
1257           __ ldrb(t0, Address(s, 0));
1258           __ ldrb(t1, Address(send, -1));
1259           __ ldrb(t2, Address(s, count));
1260           __ strb(t0, Address(d, 0));
1261           __ strb(t1, Address(dend, -1));
1262           __ strb(t2, Address(d, count));
1263         }
1264         __ b(finish);
1265       }
1266     }
1267 
1268     __ bind(copy_big);
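          // for a backwards copy, advance s and d to just past the last element
          // so the copy can walk downwards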
1269     if (is_backwards) {
1270       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1271       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1272     }
1273 
 1274     // Now that we've got the small case out of the way, we can align the
1275     // source address on a 2-word boundary.
1276 
1277     Label aligned;
1278 
1279     if (is_aligned) {
1280       // We may have to adjust by 1 word to get s 2-word-aligned.
1281       __ tbz(s, exact_log2(wordSize), aligned);
1282       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1283       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1284       __ sub(count, count, wordSize/granularity);
1285     } else {
1286       if (is_backwards) {
1287         __ andr(rscratch2, s, 2 * wordSize - 1);
1288       } else {
1289         __ neg(rscratch2, s);
1290         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1291       }
1292       // rscratch2 is the byte adjustment needed to align s.
1293       __ cbz(rscratch2, aligned);
1294       int shift = exact_log2(granularity);
1295       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1296       __ sub(count, count, rscratch2);
1297 
1298 #if 0
1299       // ?? This code is only correct for a disjoint copy.  It may or
1300       // may not make sense to use it in that case.
1301 
1302       // Copy the first pair; s and d may not be aligned.
1303       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1304       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1305 
1306       // Align s and d, adjust count
1307       if (is_backwards) {
1308         __ sub(s, s, rscratch2);
1309         __ sub(d, d, rscratch2);
1310       } else {
1311         __ add(s, s, rscratch2);
1312         __ add(d, d, rscratch2);
1313       }
1314 #else
1315       copy_memory_small(s, d, rscratch2, rscratch1, step);
1316 #endif
1317     }
1318 
1319     __ bind(aligned);
1320 
1321     // s is now 2-word-aligned.
1322 
1323     // We have a count of units and some trailing bytes.  Adjust the
1324     // count and do a bulk copy of words.
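          // rscratch2 = count converted from elements to 8-byte words for the
          // bulk routine; copy_memory_small below handles the remaining tail
          // (< 16 bytes) using the low bits of count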
1325     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1326     if (direction == copy_forwards)
1327       __ bl(copy_f);
1328     else
1329       __ bl(copy_b);
1330 
1331     // And the tail.
1332     copy_memory_small(s, d, count, tmp, step);
1333 
1334     if (granularity >= 8) __ bind(copy8);
1335     if (granularity >= 4) __ bind(copy4);
1336     __ bind(finish);
1337   }
1338 
1339 
1340   void clobber_registers() {
1341 #ifdef ASSERT
1342     RegSet clobbered
1343       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
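          // build 0xdeadbeefdeadbeef in rscratch1 and write it into every
          // call-clobbered general register except rscratch1 itself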
1344     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1345     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1346     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1347       __ mov(*it, rscratch1);
1348     }
1349 #endif
1350 
1351   }
1352 
1353   // Scan over array at a for count oops, verifying each one.
1354   // Preserves a and count, clobbers rscratch1 and rscratch2.
1355   void verify_oop_array (int size, Register a, Register count, Register temp) {
1356     Label loop, end;
1357     __ mov(rscratch1, a);
1358     __ mov(rscratch2, zr);
1359     __ bind(loop);
1360     __ cmp(rscratch2, count);
1361     __ br(Assembler::HS, end);
1362     if (size == wordSize) {
1363       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1364       __ verify_oop(temp);
1365     } else {
1366       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1367       __ decode_heap_oop(temp); // calls verify_oop
1368     }
1369     __ add(rscratch2, rscratch2, 1);
1370     __ b(loop);
1371     __ bind(end);
1372   }
1373 
1374   // Arguments:
1375   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1376   //             ignored
1377   //   is_oop  - true => oop array, so generate store check code
1378   //   name    - stub name string
1379   //
1380   // Inputs:
1381   //   c_rarg0   - source array address
1382   //   c_rarg1   - destination array address
1383   //   c_rarg2   - element count, treated as ssize_t, can be zero
1384   //
1385   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1386   // the hardware handle it.  The two dwords within qwords that span
1387   // cache line boundaries will still be loaded and stored atomically.
1388   //
1389   // Side Effects:
1390   //   disjoint_int_copy_entry is set to the no-overlap entry point
1391   //   used by generate_conjoint_int_oop_copy().
1392   //
1393   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1394                                   const char *name, bool dest_uninitialized = false) {
1395     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1396     RegSet saved_reg = RegSet::of(s, d, count);
1397     __ align(CodeEntryAlignment);
1398     StubCodeMark mark(this, "StubRoutines", name);
1399     address start = __ pc();
1400     __ enter();
1401 
1402     if (entry != NULL) {
1403       *entry = __ pc();
1404       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1405       BLOCK_COMMENT("Entry:");
1406     }
1407 
1408     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1409     if (dest_uninitialized) {
1410       decorators |= IS_DEST_UNINITIALIZED;
1411     }
1412     if (aligned) {
1413       decorators |= ARRAYCOPY_ALIGNED;
1414     }
1415 
1416     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1417     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1418 
1419     if (is_oop) {
1420       // save regs before copy_memory
1421       __ push(RegSet::of(d, count), sp);
1422     }
1423     {
1424       // UnsafeCopyMemory page error: continue after ucm
1425       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1426       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1427       copy_memory(aligned, s, d, count, rscratch1, size);
1428     }
1429 
1430     if (is_oop) {
1431       __ pop(RegSet::of(d, count), sp);
1432       if (VerifyOops)
1433         verify_oop_array(size, d, count, r16);
1434     }
1435 
1436     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1437 
1438     __ leave();
1439     __ mov(r0, zr); // return 0
1440     __ ret(lr);
1441     return start;
1442   }
1443 
1444   // Arguments:
1445   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1446   //             ignored
1447   //   is_oop  - true => oop array, so generate store check code
1448   //   name    - stub name string
1449   //
1450   // Inputs:
1451   //   c_rarg0   - source array address
1452   //   c_rarg1   - destination array address
1453   //   c_rarg2   - element count, treated as ssize_t, can be zero
1454   //
1455   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1456   // the hardware handle it.  The two dwords within qwords that span
1457   // cache line boundaries will still be loaded and stored atomically.
1458   //
1459   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1460                                  address *entry, const char *name,
1461                                  bool dest_uninitialized = false) {
1462     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1463     RegSet saved_regs = RegSet::of(s, d, count);
1464     StubCodeMark mark(this, "StubRoutines", name);
1465     address start = __ pc();
1466     __ enter();
1467 
1468     if (entry != NULL) {
1469       *entry = __ pc();
1470       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1471       BLOCK_COMMENT("Entry:");
1472     }
1473 
1474     // use fwd copy when (d-s) above_equal (count*size)
1475     __ sub(rscratch1, d, s);
1476     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1477     __ br(Assembler::HS, nooverlap_target);
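         // In C terms (assumed sketch):
         //   if ((size_t)(d - s) >= ((size_t)count << exact_log2(size)))
         //     goto nooverlap_target;   // regions cannot overlap destructively; copy forwards
         // otherwise fall through and copy backwards (copy_memory with -size).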
1478 
1479     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1480     if (dest_uninitialized) {
1481       decorators |= IS_DEST_UNINITIALIZED;
1482     }
1483     if (aligned) {
1484       decorators |= ARRAYCOPY_ALIGNED;
1485     }
1486 
1487     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1488     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1489 
1490     if (is_oop) {
1491       // save regs before copy_memory
1492       __ push(RegSet::of(d, count), sp);
1493     }
1494     {
1495       // UnsafeCopyMemory page error: continue after ucm
1496       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1497       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1498       copy_memory(aligned, s, d, count, rscratch1, -size);
1499     }
1500     if (is_oop) {
1501       __ pop(RegSet::of(d, count), sp);
1502       if (VerifyOops)
1503         verify_oop_array(size, d, count, r16);
1504     }
1505     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1506     __ leave();
1507     __ mov(r0, zr); // return 0
1508     __ ret(lr);
1509     return start;
1510   }
1511 
1512   // Arguments:
1513   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1514   //             ignored
1515   //   name    - stub name string
1516   //
1517   // Inputs:
1518   //   c_rarg0   - source array address
1519   //   c_rarg1   - destination array address
1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
1521   //
1522   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1523   // we let the hardware handle it.  The one to eight bytes within words,
1524   // dwords or qwords that span cache line boundaries will still be loaded
1525   // and stored atomically.
1526   //
1527   // Side Effects:
1535   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1536   //   used by generate_conjoint_byte_copy().
1537   //
1538   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1539     const bool not_oop = false;
1540     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1541   }
1542 
1543   // Arguments:
1544   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1545   //             ignored
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1554   // we let the hardware handle it.  The one to eight bytes within words,
1555   // dwords or qwords that span cache line boundaries will still be loaded
1556   // and stored atomically.
1557   //
1558   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1559                                       address* entry, const char *name) {
1560     const bool not_oop = false;
1561     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1562   }
1563 
1564   // Arguments:
1565   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1566   //             ignored
1567   //   name    - stub name string
1568   //
1569   // Inputs:
1570   //   c_rarg0   - source array address
1571   //   c_rarg1   - destination array address
1572   //   c_rarg2   - element count, treated as ssize_t, can be zero
1573   //
1574   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1575   // let the hardware handle it.  The two or four words within dwords
1576   // or qwords that span cache line boundaries will still be loaded
1577   // and stored atomically.
1578   //
1579   // Side Effects:
1580   //   disjoint_short_copy_entry is set to the no-overlap entry point
1581   //   used by generate_conjoint_short_copy().
1582   //
1583   address generate_disjoint_short_copy(bool aligned,
1584                                        address* entry, const char *name) {
1585     const bool not_oop = false;
1586     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1587   }
1588 
1589   // Arguments:
1590   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1591   //             ignored
1592   //   name    - stub name string
1593   //
1594   // Inputs:
1595   //   c_rarg0   - source array address
1596   //   c_rarg1   - destination array address
1597   //   c_rarg2   - element count, treated as ssize_t, can be zero
1598   //
1599   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1600   // let the hardware handle it.  The two or four words within dwords
1601   // or qwords that span cache line boundaries will still be loaded
1602   // and stored atomically.
1603   //
1604   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1605                                        address *entry, const char *name) {
1606     const bool not_oop = false;
1607     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1608   }
1609 
1610   // Arguments:
1611   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1612   //             ignored
1613   //   name    - stub name string
1614   //
1615   // Inputs:
1616   //   c_rarg0   - source array address
1617   //   c_rarg1   - destination array address
1618   //   c_rarg2   - element count, treated as ssize_t, can be zero
1619   //
1620   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1621   // the hardware handle it.  The two dwords within qwords that span
1622   // cache line boundaries will still be loaded and stored atomically.
1623   //
1624   // Side Effects:
1625   //   disjoint_int_copy_entry is set to the no-overlap entry point
1626   //   used by generate_conjoint_int_oop_copy().
1627   //
1628   address generate_disjoint_int_copy(bool aligned, address *entry,
1629                                          const char *name, bool dest_uninitialized = false) {
1630     const bool not_oop = false;
1631     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1632   }
1633 
1634   // Arguments:
1635   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1636   //             ignored
1637   //   name    - stub name string
1638   //
1639   // Inputs:
1640   //   c_rarg0   - source array address
1641   //   c_rarg1   - destination array address
1642   //   c_rarg2   - element count, treated as ssize_t, can be zero
1643   //
1644   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1645   // the hardware handle it.  The two dwords within qwords that span
1646   // cache line boundaries will still be loaded and stored atomically.
1647   //
1648   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1649                                      address *entry, const char *name,
1650                                      bool dest_uninitialized = false) {
1651     const bool not_oop = false;
1652     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1653   }
1654 
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   // Side Effects:
1667   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1668   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1669   //
1670   address generate_disjoint_long_copy(bool aligned, address *entry,
1671                                           const char *name, bool dest_uninitialized = false) {
1672     const bool not_oop = false;
1673     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1674   }
1675 
1676   // Arguments:
1677   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1678   //             ignored
1679   //   name    - stub name string
1680   //
1681   // Inputs:
1682   //   c_rarg0   - source array address
1683   //   c_rarg1   - destination array address
1684   //   c_rarg2   - element count, treated as size_t, can be zero
1685   //
1686   address generate_conjoint_long_copy(bool aligned,
1687                                       address nooverlap_target, address *entry,
1688                                       const char *name, bool dest_uninitialized = false) {
1689     const bool not_oop = false;
1690     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1691   }
1692 
1693   // Arguments:
1694   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1695   //             ignored
1696   //   name    - stub name string
1697   //
1698   // Inputs:
1699   //   c_rarg0   - source array address
1700   //   c_rarg1   - destination array address
1701   //   c_rarg2   - element count, treated as size_t, can be zero
1702   //
1703   // Side Effects:
1704   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1705   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1706   //
1707   address generate_disjoint_oop_copy(bool aligned, address *entry,
1708                                      const char *name, bool dest_uninitialized) {
1709     const bool is_oop = true;
1710     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1711     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1712   }
1713 
1714   // Arguments:
1715   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1716   //             ignored
1717   //   name    - stub name string
1718   //
1719   // Inputs:
1720   //   c_rarg0   - source array address
1721   //   c_rarg1   - destination array address
1722   //   c_rarg2   - element count, treated as size_t, can be zero
1723   //
1724   address generate_conjoint_oop_copy(bool aligned,
1725                                      address nooverlap_target, address *entry,
1726                                      const char *name, bool dest_uninitialized) {
1727     const bool is_oop = true;
1728     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1729     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1730                                   name, dest_uninitialized);
1731   }
1732 
1733 
1734   // Helper for generating a dynamic type check.
1735   // Smashes rscratch1, rscratch2.
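       // (Assumed behavioural note.) Branches to L_success if sub_klass is a
       // subtype of super_klass and falls through to L_miss otherwise: the fast
       // path probes the cached super_check_offset slot, the slow path walks the
       // secondary-supers list.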
1736   void generate_type_check(Register sub_klass,
1737                            Register super_check_offset,
1738                            Register super_klass,
1739                            Label& L_success) {
1740     assert_different_registers(sub_klass, super_check_offset, super_klass);
1741 
1742     BLOCK_COMMENT("type_check:");
1743 
1744     Label L_miss;
1745 
1746     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1747                                      super_check_offset);
1748     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1749 
1750     // Fall through on failure!
1751     __ BIND(L_miss);
1752   }
1753 
1754   //
1755   //  Generate checkcasting array copy stub
1756   //
1757   //  Input:
1758   //    c_rarg0   - source array address
1759   //    c_rarg1   - destination array address
1760   //    c_rarg2   - element count, treated as ssize_t, can be zero
1761   //    c_rarg3   - size_t ckoff (super_check_offset)
1762   //    c_rarg4   - oop ckval (super_klass)
1763   //
1764   //  Output:
1765   //    r0 ==  0  -  success
1766   //    r0 == -1^K - failure, where K is partial transfer count
1767   //
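       //  Loop sketch (an assumed C-level illustration; GC barriers, register
       //  saves and card marks elided):
       //
       //    size_t i;
       //    for (i = 0; i < count; i++) {
       //      oop o = load_heap_oop(from + i);
       //      if (o != NULL && !is_subtype_of(o->klass(), ckval)) break;  // type failure
       //      store_heap_oop(to + i, o);
       //    }
       //    return (i == count) ? 0 : ~i;   // ~K == -1^K, K = elements copied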
1768   address generate_checkcast_copy(const char *name, address *entry,
1769                                   bool dest_uninitialized = false) {
1770 
1771     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1772 
1773     // Input registers (after setup_arg_regs)
1774     const Register from        = c_rarg0;   // source array address
1775     const Register to          = c_rarg1;   // destination array address
1776     const Register count       = c_rarg2;   // elements count
1777     const Register ckoff       = c_rarg3;   // super_check_offset
1778     const Register ckval       = c_rarg4;   // super_klass
1779 
1780     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1781     RegSet wb_post_saved_regs = RegSet::of(count);
1782 
1783     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1784     const Register copied_oop  = r22;       // actual oop copied
1785     const Register count_save  = r21;       // orig elements count
1786     const Register start_to    = r20;       // destination array start address
1787     const Register r19_klass   = r19;       // oop._klass
1788 
1789     //---------------------------------------------------------------
1790     // Assembler stub will be used for this call to arraycopy
1791     // if the two arrays are subtypes of Object[] but the
1792     // destination array type is not equal to or a supertype
1793     // of the source type.  Each element must be separately
1794     // checked.
1795 
1796     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1797                                copied_oop, r19_klass, count_save);
1798 
1799     __ align(CodeEntryAlignment);
1800     StubCodeMark mark(this, "StubRoutines", name);
1801     address start = __ pc();
1802 
1803     __ enter(); // required for proper stackwalking of RuntimeStub frame
1804 
1805 #ifdef ASSERT
1806     // caller guarantees that the arrays really are different
1807     // otherwise, we would have to make conjoint checks
1808     { Label L;
1809       array_overlap_test(L, TIMES_OOP);
1810       __ stop("checkcast_copy within a single array");
1811       __ bind(L);
1812     }
1813 #endif //ASSERT
1814 
1815     // Caller of this entry point must set up the argument registers.
1816     if (entry != NULL) {
1817       *entry = __ pc();
1818       BLOCK_COMMENT("Entry:");
1819     }
1820 
1821     // Empty array: Nothing to do.
1822     __ cbz(count, L_done);
1823     __ push(RegSet::of(r19, r20, r21, r22), sp);
1824 
1825 #ifdef ASSERT
1826     BLOCK_COMMENT("assert consistent ckoff/ckval");
1827     // The ckoff and ckval must be mutually consistent,
1828     // even though caller generates both.
1829     { Label L;
1830       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1831       __ ldrw(start_to, Address(ckval, sco_offset));
1832       __ cmpw(ckoff, start_to);
1833       __ br(Assembler::EQ, L);
1834       __ stop("super_check_offset inconsistent");
1835       __ bind(L);
1836     }
1837 #endif //ASSERT
1838 
1839     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1840     bool is_oop = true;
1841     if (dest_uninitialized) {
1842       decorators |= IS_DEST_UNINITIALIZED;
1843     }
1844 
1845     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1846     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1847 
1848     // save the original count
1849     __ mov(count_save, count);
1850 
1851     // Copy from low to high addresses
1852     __ mov(start_to, to);              // Save destination array start address
1853     __ b(L_load_element);
1854 
1855     // ======== begin loop ========
1856     // (Loop is rotated; its entry is L_load_element.)
1857     // Loop control:
1858     //   for (; count != 0; count--) {
1859     //     copied_oop = load_heap_oop(from++);
1860     //     ... generate_type_check ...;
1861     //     store_heap_oop(to++, copied_oop);
1862     //   }
1863     __ align(OptoLoopAlignment);
1864 
1865     __ BIND(L_store_element);
1866     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1867     __ sub(count, count, 1);
1868     __ cbz(count, L_do_card_marks);
1869 
1870     // ======== loop entry is here ========
1871     __ BIND(L_load_element);
1872     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1873     __ cbz(copied_oop, L_store_element);
1874 
1875     __ load_klass(r19_klass, copied_oop);// query the object klass
1876     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1877     // ======== end loop ========
1878 
1879     // It was a real error; we must depend on the caller to finish the job.
1880     // Register count = remaining oops, count_orig = total oops.
1881     // Emit GC store barriers for the oops we have copied and report
1882     // their number to the caller.
1883 
1884     __ subs(count, count_save, count);     // K = partially copied oop count
1885     __ eon(count, count, zr);                   // report (-1^K) to caller
1886     __ br(Assembler::EQ, L_done_pop);
1887 
1888     __ BIND(L_do_card_marks);
1889     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1890 
1891     __ bind(L_done_pop);
1892     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1893     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1894 
1895     __ bind(L_done);
1896     __ mov(r0, count);
1897     __ leave();
1898     __ ret(lr);
1899 
1900     return start;
1901   }
1902 
1903   // Perform range checks on the proposed arraycopy.
1904   // Kills temp and rscratch1, but nothing else.
1905   // Also, clean the sign bits of src_pos and dst_pos.
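       // Roughly (assumed C sketch):
       //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length())  goto L_failed;
       //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length())  goto L_failed;
       //   src_pos = (uint32_t)src_pos;  dst_pos = (uint32_t)dst_pos;   // zero-extend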
1906   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1907                               Register src_pos, // source position (c_rarg1)
1908                               Register dst,     // destination array oop (c_rarg2)
1909                               Register dst_pos, // destination position (c_rarg3)
1910                               Register length,
1911                               Register temp,
1912                               Label& L_failed) {
1913     BLOCK_COMMENT("arraycopy_range_checks:");
1914 
1915     assert_different_registers(rscratch1, temp);
1916 
1917     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1918     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1919     __ addw(temp, length, src_pos);
1920     __ cmpw(temp, rscratch1);
1921     __ br(Assembler::HI, L_failed);
1922 
1923     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1924     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1925     __ addw(temp, length, dst_pos);
1926     __ cmpw(temp, rscratch1);
1927     __ br(Assembler::HI, L_failed);
1928 
1929     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1930     __ movw(src_pos, src_pos);
1931     __ movw(dst_pos, dst_pos);
1932 
1933     BLOCK_COMMENT("arraycopy_range_checks done");
1934   }
1935 
1936   // These stubs get called from some dumb test routine.
1937   // I'll write them properly when they're called from
1938   // something that's actually doing something.
1939   static void fake_arraycopy_stub(address src, address dst, int count) {
1940     assert(count == 0, "huh?");
1941   }
1942 
1943 
1944   //
1945   //  Generate 'unsafe' array copy stub
1946   //  Though just as safe as the other stubs, it takes an unscaled
1947   //  size_t argument instead of an element count.
1948   //
1949   //  Input:
1950   //    c_rarg0   - source array address
1951   //    c_rarg1   - destination array address
1952   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1953   //
1954   // Examines the alignment of the operands and dispatches
1955   // to a long, int, short, or byte copy loop.
1956   //
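       // Dispatch sketch (assumed): with m = s | d | count,
       //   (m & 7) == 0  -> long copy   (count >>= 3)
       //   (m & 3) == 0  -> int copy    (count >>= 2)
       //   (m & 1) == 0  -> short copy  (count >>= 1)
       //   otherwise     -> byte copy   (count is already a byte count)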
1957   address generate_unsafe_copy(const char *name,
1958                                address byte_copy_entry,
1959                                address short_copy_entry,
1960                                address int_copy_entry,
1961                                address long_copy_entry) {
1962     Label L_long_aligned, L_int_aligned, L_short_aligned;
1963     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1964 
1965     __ align(CodeEntryAlignment);
1966     StubCodeMark mark(this, "StubRoutines", name);
1967     address start = __ pc();
1968     __ enter(); // required for proper stackwalking of RuntimeStub frame
1969 
1970     // bump this on entry, not on exit:
1971     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1972 
1973     __ orr(rscratch1, s, d);
1974     __ orr(rscratch1, rscratch1, count);
1975 
1976     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1977     __ cbz(rscratch1, L_long_aligned);
1978     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1979     __ cbz(rscratch1, L_int_aligned);
1980     __ tbz(rscratch1, 0, L_short_aligned);
1981     __ b(RuntimeAddress(byte_copy_entry));
1982 
1983     __ BIND(L_short_aligned);
1984     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1985     __ b(RuntimeAddress(short_copy_entry));
1986     __ BIND(L_int_aligned);
1987     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1988     __ b(RuntimeAddress(int_copy_entry));
1989     __ BIND(L_long_aligned);
1990     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1991     __ b(RuntimeAddress(long_copy_entry));
1992 
1993     return start;
1994   }
1995 
1996   //
1997   //  Generate generic array copy stubs
1998   //
1999   //  Input:
2000   //    c_rarg0    -  src oop
2001   //    c_rarg1    -  src_pos (32-bits)
2002   //    c_rarg2    -  dst oop
2003   //    c_rarg3    -  dst_pos (32-bits)
2004   //    c_rarg4    -  element count (32-bits)
2005   //
2006   //  Output:
2007   //    r0 ==  0  -  success
2008   //    r0 == -1^K - failure, where K is partial transfer count
2009   //
2010   address generate_generic_copy(const char *name,
2011                                 address byte_copy_entry, address short_copy_entry,
2012                                 address int_copy_entry, address oop_copy_entry,
2013                                 address long_copy_entry, address checkcast_copy_entry) {
2014 
2015     Label L_failed, L_objArray;
2016     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2017 
2018     // Input registers
2019     const Register src        = c_rarg0;  // source array oop
2020     const Register src_pos    = c_rarg1;  // source position
2021     const Register dst        = c_rarg2;  // destination array oop
2022     const Register dst_pos    = c_rarg3;  // destination position
2023     const Register length     = c_rarg4;
2024 
2025 
2026     // Registers used as temps
2027     const Register dst_klass  = c_rarg5;
2028 
2029     __ align(CodeEntryAlignment);
2030 
2031     StubCodeMark mark(this, "StubRoutines", name);
2032 
2033     address start = __ pc();
2034 
2035     __ enter(); // required for proper stackwalking of RuntimeStub frame
2036 
2037     // bump this on entry, not on exit:
2038     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2039 
2040     //-----------------------------------------------------------------------
2041     // Assembler stub will be used for this call to arraycopy
2042     // if the following conditions are met:
2043     //
2044     // (1) src and dst must not be null.
2045     // (2) src_pos must not be negative.
2046     // (3) dst_pos must not be negative.
2047     // (4) length  must not be negative.
2048     // (5) src klass and dst klass should be the same and not NULL.
2049     // (6) src and dst should be arrays.
2050     // (7) src_pos + length must not exceed length of src.
2051     // (8) dst_pos + length must not exceed length of dst.
2052     //
2053 
2054     //  if (src == NULL) return -1;
2055     __ cbz(src, L_failed);
2056 
2057     //  if (src_pos < 0) return -1;
2058     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2059 
2060     //  if (dst == NULL) return -1;
2061     __ cbz(dst, L_failed);
2062 
2063     //  if (dst_pos < 0) return -1;
2064     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2065 
2066     // registers used as temp
2067     const Register scratch_length    = r16; // elements count to copy
2068     const Register scratch_src_klass = r17; // array klass
2069     const Register lh                = r15; // layout helper
2070 
2071     //  if (length < 0) return -1;
2072     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2073     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2074 
2075     __ load_klass(scratch_src_klass, src);
2076 #ifdef ASSERT
2077     //  assert(src->klass() != NULL);
2078     {
2079       BLOCK_COMMENT("assert klasses not null {");
2080       Label L1, L2;
2081       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2082       __ bind(L1);
2083       __ stop("broken null klass");
2084       __ bind(L2);
2085       __ load_klass(rscratch1, dst);
2086       __ cbz(rscratch1, L1);     // this would be broken also
2087       BLOCK_COMMENT("} assert klasses not null done");
2088     }
2089 #endif
2090 
2091     // Load layout helper (32-bits)
2092     //
2093     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2094     // 32        30    24            16              8     2                 0
2095     //
2096     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2097     //
2098 
2099     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2100 
2101     // Handle objArrays completely differently...
2102     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2103     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2104     __ movw(rscratch1, objArray_lh);
2105     __ eorw(rscratch2, lh, rscratch1);
2106     __ cbzw(rscratch2, L_objArray);
2107 
2108     //  if (src->klass() != dst->klass()) return -1;
2109     __ load_klass(rscratch2, dst);
2110     __ eor(rscratch2, rscratch2, scratch_src_klass);
2111     __ cbnz(rscratch2, L_failed);
2112 
2113     //  if (!src->is_Array()) return -1;
2114     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2115 
2116     // At this point, it is known to be a typeArray (array_tag 0x3).
2117 #ifdef ASSERT
2118     {
2119       BLOCK_COMMENT("assert primitive array {");
2120       Label L;
2121       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2122       __ cmpw(lh, rscratch2);
2123       __ br(Assembler::GE, L);
2124       __ stop("must be a primitive array");
2125       __ bind(L);
2126       BLOCK_COMMENT("} assert primitive array done");
2127     }
2128 #endif
2129 
2130     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2131                            rscratch2, L_failed);
2132 
2133     // TypeArrayKlass
2134     //
2135     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2136     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2137     //
2138 
2139     const Register rscratch1_offset = rscratch1;    // array offset
2140     const Register r15_elsize = lh; // element size
2141 
2142     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2143            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2144     __ add(src, src, rscratch1_offset);           // src array offset
2145     __ add(dst, dst, rscratch1_offset);           // dst array offset
2146     BLOCK_COMMENT("choose copy loop based on element size");
2147 
2148     // next registers should be set before the jump to corresponding stub
2149     const Register from     = c_rarg0;  // source array address
2150     const Register to       = c_rarg1;  // destination array address
2151     const Register count    = c_rarg2;  // elements count
2152 
2153     // 'from', 'to', 'count' registers should be set in such order
2154     // since they are the same as 'src', 'src_pos', 'dst'.
2155 
2156     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2157 
2158     // The possible values of elsize are 0-3, i.e. exact_log2(element
2159     // size in bytes).  We do a simple bitwise binary search.
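         // (Assumed decode.)  elsize == 0 -> byte, 1 -> short, 2 -> int, 3 -> long;
         // bit 1 selects int/long vs. byte/short, bit 0 selects the wider of each pair.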
2160   __ BIND(L_copy_bytes);
2161     __ tbnz(r15_elsize, 1, L_copy_ints);
2162     __ tbnz(r15_elsize, 0, L_copy_shorts);
2163     __ lea(from, Address(src, src_pos));// src_addr
2164     __ lea(to,   Address(dst, dst_pos));// dst_addr
2165     __ movw(count, scratch_length); // length
2166     __ b(RuntimeAddress(byte_copy_entry));
2167 
2168   __ BIND(L_copy_shorts);
2169     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2170     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2171     __ movw(count, scratch_length); // length
2172     __ b(RuntimeAddress(short_copy_entry));
2173 
2174   __ BIND(L_copy_ints);
2175     __ tbnz(r15_elsize, 0, L_copy_longs);
2176     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2177     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2178     __ movw(count, scratch_length); // length
2179     __ b(RuntimeAddress(int_copy_entry));
2180 
2181   __ BIND(L_copy_longs);
2182 #ifdef ASSERT
2183     {
2184       BLOCK_COMMENT("assert long copy {");
2185       Label L;
2186       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2187       __ cmpw(r15_elsize, LogBytesPerLong);
2188       __ br(Assembler::EQ, L);
2189       __ stop("must be long copy, but elsize is wrong");
2190       __ bind(L);
2191       BLOCK_COMMENT("} assert long copy done");
2192     }
2193 #endif
2194     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2195     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2196     __ movw(count, scratch_length); // length
2197     __ b(RuntimeAddress(long_copy_entry));
2198 
2199     // ObjArrayKlass
2200   __ BIND(L_objArray);
2201     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2202 
2203     Label L_plain_copy, L_checkcast_copy;
2204     //  test array classes for subtyping
2205     __ load_klass(r15, dst);
2206     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2207     __ br(Assembler::NE, L_checkcast_copy);
2208 
2209     // Identically typed arrays can be copied without element-wise checks.
2210     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2211                            rscratch2, L_failed);
2212 
2213     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2214     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2215     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2216     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2217     __ movw(count, scratch_length); // length
2218   __ BIND(L_plain_copy);
2219     __ b(RuntimeAddress(oop_copy_entry));
2220 
2221   __ BIND(L_checkcast_copy);
2222     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2223     {
2224       // Before looking at dst.length, make sure dst is also an objArray.
2225       __ ldrw(rscratch1, Address(r15, lh_offset));
2226       __ movw(rscratch2, objArray_lh);
2227       __ eorw(rscratch1, rscratch1, rscratch2);
2228       __ cbnzw(rscratch1, L_failed);
2229 
2230       // It is safe to examine both src.length and dst.length.
2231       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2232                              r15, L_failed);
2233 
2234       __ load_klass(dst_klass, dst); // reload
2235 
2236       // Marshal the base address arguments now, freeing registers.
2237       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2238       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2239       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2240       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2241       __ movw(count, length);           // length (reloaded)
2242       Register sco_temp = c_rarg3;      // this register is free now
2243       assert_different_registers(from, to, count, sco_temp,
2244                                  dst_klass, scratch_src_klass);
2245       // assert_clean_int(count, sco_temp);
2246 
2247       // Generate the type check.
2248       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2249       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2250 
2251       // Smashes rscratch1, rscratch2
2252       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2253 
2254       // Fetch destination element klass from the ObjArrayKlass header.
2255       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2256       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2257       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2258 
2259       // the checkcast_copy loop needs two extra arguments:
2260       assert(c_rarg3 == sco_temp, "#3 already in place");
2261       // Set up arguments for checkcast_copy_entry.
2262       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2263       __ b(RuntimeAddress(checkcast_copy_entry));
2264     }
2265 
2266   __ BIND(L_failed);
2267     __ mov(r0, -1);
2268     __ leave();   // required for proper stackwalking of RuntimeStub frame
2269     __ ret(lr);
2270 
2271     return start;
2272   }
2273 
2274   //
2275   // Generate stub for array fill. If "aligned" is true, the
2276   // "to" address is assumed to be heapword aligned.
2277   //
2278   // Arguments for generated stub:
2279   //   to:    c_rarg0
2280   //   value: c_rarg1
2281   //   count: c_rarg2 treated as signed
2282   //
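       // Fill strategy (a brief assumed sketch of the code below): widen 'value'
       // to a repeating 64-bit pattern, align 'to' to 8 bytes by storing 1/2/4
       // bytes as needed, fill whole words (or use zero_words when the value is
       // zero and UseBlockZeroing is enabled), then finish any sub-word tail.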
2283   address generate_fill(BasicType t, bool aligned, const char *name) {
2284     __ align(CodeEntryAlignment);
2285     StubCodeMark mark(this, "StubRoutines", name);
2286     address start = __ pc();
2287 
2288     BLOCK_COMMENT("Entry:");
2289 
2290     const Register to        = c_rarg0;  // destination array address
2291     const Register value     = c_rarg1;  // value
2292     const Register count     = c_rarg2;  // elements count
2293 
2294     const Register bz_base = r10;        // base for block_zero routine
2295     const Register cnt_words = r11;      // temp register
2296 
2297     __ enter();
2298 
2299     Label L_fill_elements, L_exit1;
2300 
2301     int shift = -1;
2302     switch (t) {
2303       case T_BYTE:
2304         shift = 0;
2305         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2306         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2307         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2308         __ br(Assembler::LO, L_fill_elements);
2309         break;
2310       case T_SHORT:
2311         shift = 1;
2312         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2313         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2314         __ br(Assembler::LO, L_fill_elements);
2315         break;
2316       case T_INT:
2317         shift = 2;
2318         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2319         __ br(Assembler::LO, L_fill_elements);
2320         break;
2321       default: ShouldNotReachHere();
2322     }
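         // At this point the low 32 bits of 'value' hold the fill byte/short/int
         // replicated to a 32-bit pattern (e.g. byte 0xAB -> 0xABABABAB); it is
         // widened to 64 bits just before the bulk word fill below.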
2323 
2324     // Align source address at 8 bytes address boundary.
2325     Label L_skip_align1, L_skip_align2, L_skip_align4;
2326     if (!aligned) {
2327       switch (t) {
2328         case T_BYTE:
2329           // One byte misalignment happens only for byte arrays.
2330           __ tbz(to, 0, L_skip_align1);
2331           __ strb(value, Address(__ post(to, 1)));
2332           __ subw(count, count, 1);
2333           __ bind(L_skip_align1);
2334           // Fallthrough
2335         case T_SHORT:
2336           // Two bytes misalignment happens only for byte and short (char) arrays.
2337           __ tbz(to, 1, L_skip_align2);
2338           __ strh(value, Address(__ post(to, 2)));
2339           __ subw(count, count, 2 >> shift);
2340           __ bind(L_skip_align2);
2341           // Fallthrough
2342         case T_INT:
2343           // Align to 8 bytes, we know we are 4 byte aligned to start.
2344           __ tbz(to, 2, L_skip_align4);
2345           __ strw(value, Address(__ post(to, 4)));
2346           __ subw(count, count, 4 >> shift);
2347           __ bind(L_skip_align4);
2348           break;
2349         default: ShouldNotReachHere();
2350       }
2351     }
2352 
2353     //
2354     //  Fill large chunks
2355     //
2356     __ lsrw(cnt_words, count, 3 - shift); // number of words
2357     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2358     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2359     if (UseBlockZeroing) {
2360       Label non_block_zeroing, rest;
2361       // If the fill value is zero we can use the fast zero_words().
2362       __ cbnz(value, non_block_zeroing);
2363       __ mov(bz_base, to);
2364       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2365       __ zero_words(bz_base, cnt_words);
2366       __ b(rest);
2367       __ bind(non_block_zeroing);
2368       __ fill_words(to, cnt_words, value);
2369       __ bind(rest);
2370     } else {
2371       __ fill_words(to, cnt_words, value);
2372     }
2373 
2374     // Remaining count is less than 8 bytes. Fill it by a single store.
2375     // Note that the total length is no less than 8 bytes.
2376     if (t == T_BYTE || t == T_SHORT) {
2377       Label L_exit1;
2378       __ cbzw(count, L_exit1);
2379       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2380       __ str(value, Address(to, -8));    // overwrite some elements
2381       __ bind(L_exit1);
2382       __ leave();
2383       __ ret(lr);
2384     }
2385 
2386     // Handle copies less than 8 bytes.
2387     Label L_fill_2, L_fill_4, L_exit2;
2388     __ bind(L_fill_elements);
2389     switch (t) {
2390       case T_BYTE:
2391         __ tbz(count, 0, L_fill_2);
2392         __ strb(value, Address(__ post(to, 1)));
2393         __ bind(L_fill_2);
2394         __ tbz(count, 1, L_fill_4);
2395         __ strh(value, Address(__ post(to, 2)));
2396         __ bind(L_fill_4);
2397         __ tbz(count, 2, L_exit2);
2398         __ strw(value, Address(to));
2399         break;
2400       case T_SHORT:
2401         __ tbz(count, 0, L_fill_4);
2402         __ strh(value, Address(__ post(to, 2)));
2403         __ bind(L_fill_4);
2404         __ tbz(count, 1, L_exit2);
2405         __ strw(value, Address(to));
2406         break;
2407       case T_INT:
2408         __ cbzw(count, L_exit2);
2409         __ strw(value, Address(to));
2410         break;
2411       default: ShouldNotReachHere();
2412     }
2413     __ bind(L_exit2);
2414     __ leave();
2415     __ ret(lr);
2416     return start;
2417   }
2418 
2419   address generate_data_cache_writeback() {
2420     const Register line        = c_rarg0;  // address of line to write back
2421 
2422     __ align(CodeEntryAlignment);
2423 
2424     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2425 
2426     address start = __ pc();
2427     __ enter();
2428     __ cache_wb(Address(line, 0));
2429     __ leave();
2430     __ ret(lr);
2431 
2432     return start;
2433   }
2434 
2435   address generate_data_cache_writeback_sync() {
2436     const Register is_pre     = c_rarg0;  // pre or post sync
2437 
2438     __ align(CodeEntryAlignment);
2439 
2440     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2441 
2442     // pre wbsync is a no-op
2443     // post wbsync emits a memory barrier (the AArch64 counterpart of x86's sfence)
2444 
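         // Equivalent C-level sketch (assumed):
         //   void wbsync(intptr_t is_pre) { if (!is_pre) issue_post_sync_barrier(); }
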
2445     Label skip;
2446     address start = __ pc();
2447     __ enter();
2448     __ cbnz(is_pre, skip);
2449     __ cache_wbsync(false);
2450     __ bind(skip);
2451     __ leave();
2452     __ ret(lr);
2453 
2454     return start;
2455   }
2456 
2457   void generate_arraycopy_stubs() {
2458     address entry;
2459     address entry_jbyte_arraycopy;
2460     address entry_jshort_arraycopy;
2461     address entry_jint_arraycopy;
2462     address entry_oop_arraycopy;
2463     address entry_jlong_arraycopy;
2464     address entry_checkcast_arraycopy;
2465 
2466     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2467     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2468 
2469     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2470 
2471     //*** jbyte
2472     // Always need aligned and unaligned versions
2473     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2474                                                                                   "jbyte_disjoint_arraycopy");
2475     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2476                                                                                   &entry_jbyte_arraycopy,
2477                                                                                   "jbyte_arraycopy");
2478     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2479                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2480     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2481                                                                                   "arrayof_jbyte_arraycopy");
2482 
2483     //*** jshort
2484     // Always need aligned and unaligned versions
2485     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2486                                                                                     "jshort_disjoint_arraycopy");
2487     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2488                                                                                     &entry_jshort_arraycopy,
2489                                                                                     "jshort_arraycopy");
2490     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2491                                                                                     "arrayof_jshort_disjoint_arraycopy");
2492     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2493                                                                                     "arrayof_jshort_arraycopy");
2494 
2495     //*** jint
2496     // Aligned versions
2497     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2498                                                                                 "arrayof_jint_disjoint_arraycopy");
2499     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2500                                                                                 "arrayof_jint_arraycopy");
2501     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2502     // entry_jint_arraycopy always points to the unaligned version
2503     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2504                                                                                 "jint_disjoint_arraycopy");
2505     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2506                                                                                 &entry_jint_arraycopy,
2507                                                                                 "jint_arraycopy");
2508 
2509     //*** jlong
2510     // It is always aligned
2511     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2512                                                                                   "arrayof_jlong_disjoint_arraycopy");
2513     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2514                                                                                   "arrayof_jlong_arraycopy");
2515     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2516     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2517 
2518     //*** oops
2519     {
2520       // With compressed oops we need unaligned versions; notice that
2521       // we overwrite entry_oop_arraycopy.
2522       bool aligned = !UseCompressedOops;
2523 
2524       StubRoutines::_arrayof_oop_disjoint_arraycopy
2525         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2526                                      /*dest_uninitialized*/false);
2527       StubRoutines::_arrayof_oop_arraycopy
2528         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2529                                      /*dest_uninitialized*/false);
2530       // Aligned versions without pre-barriers
2531       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2532         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2533                                      /*dest_uninitialized*/true);
2534       StubRoutines::_arrayof_oop_arraycopy_uninit
2535         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2536                                      /*dest_uninitialized*/true);
2537     }
2538 
2539     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2540     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2541     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2542     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2543 
2544     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2545     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2546                                                                         /*dest_uninitialized*/true);
2547 
2548     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2549                                                               entry_jbyte_arraycopy,
2550                                                               entry_jshort_arraycopy,
2551                                                               entry_jint_arraycopy,
2552                                                               entry_jlong_arraycopy);
2553 
2554     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2555                                                                entry_jbyte_arraycopy,
2556                                                                entry_jshort_arraycopy,
2557                                                                entry_jint_arraycopy,
2558                                                                entry_oop_arraycopy,
2559                                                                entry_jlong_arraycopy,
2560                                                                entry_checkcast_arraycopy);
2561 
2562     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2563     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2564     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2565     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2566     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2567     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2568   }
2569 
2570   void generate_math_stubs() { Unimplemented(); }
2571 
2572   // Arguments:
2573   //
2574   // Inputs:
2575   //   c_rarg0   - source byte array address
2576   //   c_rarg1   - destination byte array address
2577   //   c_rarg2   - K (key) in little endian int array
2578   //
2579   address generate_aescrypt_encryptBlock() {
2580     __ align(CodeEntryAlignment);
2581     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2582 
2583     const Register from        = c_rarg0;  // source array address
2584     const Register to          = c_rarg1;  // destination array address
2585     const Register key         = c_rarg2;  // key array address
2586     const Register keylen      = rscratch1;
2587 
2588     address start = __ pc();
2589     __ enter();
2590 
2591     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
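         // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
         // AES-128, AES-192 or AES-256 respectively.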
2592 
2593     __ aesenc_loadkeys(key, keylen);
2594     __ aesecb_encrypt(from, to, keylen);
2595 
2596     __ mov(r0, 0);
2597 
2598     __ leave();
2599     __ ret(lr);
2600 
2601     return start;
2602   }
2603 
2604   // Arguments:
2605   //
2606   // Inputs:
2607   //   c_rarg0   - source byte array address
2608   //   c_rarg1   - destination byte array address
2609   //   c_rarg2   - K (key) in little endian int array
2610   //
2611   address generate_aescrypt_decryptBlock() {
2612     assert(UseAES, "need AES cryptographic extension support");
2613     __ align(CodeEntryAlignment);
2614     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2615     Label L_doLast;
2616 
2617     const Register from        = c_rarg0;  // source array address
2618     const Register to          = c_rarg1;  // destination array address
2619     const Register key         = c_rarg2;  // key array address
2620     const Register keylen      = rscratch1;
2621 
2622     address start = __ pc();
2623     __ enter(); // required for proper stackwalking of RuntimeStub frame
2624 
2625     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2626 
2627     __ aesecb_decrypt(from, to, key, keylen);
2628 
2629     __ mov(r0, 0);
2630 
2631     __ leave();
2632     __ ret(lr);
2633 
2634     return start;
2635   }
2636 
2637   // Arguments:
2638   //
2639   // Inputs:
2640   //   c_rarg0   - source byte array address
2641   //   c_rarg1   - destination byte array address
2642   //   c_rarg2   - K (key) in little endian int array
2643   //   c_rarg3   - r vector byte array address
2644   //   c_rarg4   - input length
2645   //
2646   // Output:
2647   //   x0        - input length
2648   //
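       // CBC-encrypt sketch (assumed, per 16-byte block):
       //   C = AES_encrypt(P ^ rvec, key);  store C;  rvec = C;
       // On exit rvec holds the last ciphertext block and r0 the input length.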
2649   address generate_cipherBlockChaining_encryptAESCrypt() {
2650     assert(UseAES, "need AES cryptographic extension support");
2651     __ align(CodeEntryAlignment);
2652     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2653 
2654     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2655 
2656     const Register from        = c_rarg0;  // source array address
2657     const Register to          = c_rarg1;  // destination array address
2658     const Register key         = c_rarg2;  // key array address
2659     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address
2660                                            // and left holding the last ciphertext block on exit
2661     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2662     const Register keylen      = rscratch1;
2663 
2664     address start = __ pc();
2665 
2666       __ enter();
2667 
2668       __ movw(rscratch2, len_reg);
2669 
2670       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2671 
2672       __ ld1(v0, __ T16B, rvec);
2673 
2674       __ cmpw(keylen, 52);
2675       __ br(Assembler::CC, L_loadkeys_44);
2676       __ br(Assembler::EQ, L_loadkeys_52);
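           // keylen < 52  -> AES-128 (44-word schedule): skip loading v17..v20
           // keylen == 52 -> AES-192 (52-word schedule): skip loading v17/v18
           // keylen > 52  -> AES-256 (60-word schedule): load all round keys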
2677 
2678       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2679       __ rev32(v17, __ T16B, v17);
2680       __ rev32(v18, __ T16B, v18);
2681     __ BIND(L_loadkeys_52);
2682       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2683       __ rev32(v19, __ T16B, v19);
2684       __ rev32(v20, __ T16B, v20);
2685     __ BIND(L_loadkeys_44);
2686       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2687       __ rev32(v21, __ T16B, v21);
2688       __ rev32(v22, __ T16B, v22);
2689       __ rev32(v23, __ T16B, v23);
2690       __ rev32(v24, __ T16B, v24);
2691       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2692       __ rev32(v25, __ T16B, v25);
2693       __ rev32(v26, __ T16B, v26);
2694       __ rev32(v27, __ T16B, v27);
2695       __ rev32(v28, __ T16B, v28);
2696       __ ld1(v29, v30, v31, __ T16B, key);
2697       __ rev32(v29, __ T16B, v29);
2698       __ rev32(v30, __ T16B, v30);
2699       __ rev32(v31, __ T16B, v31);
2700 
2701     __ BIND(L_aes_loop);
2702       __ ld1(v1, __ T16B, __ post(from, 16));
2703       __ eor(v0, __ T16B, v0, v1);
2704 
2705       __ br(Assembler::CC, L_rounds_44);
2706       __ br(Assembler::EQ, L_rounds_52);
2707 
2708       __ aese(v0, v17); __ aesmc(v0, v0);
2709       __ aese(v0, v18); __ aesmc(v0, v0);
2710     __ BIND(L_rounds_52);
2711       __ aese(v0, v19); __ aesmc(v0, v0);
2712       __ aese(v0, v20); __ aesmc(v0, v0);
2713     __ BIND(L_rounds_44);
2714       __ aese(v0, v21); __ aesmc(v0, v0);
2715       __ aese(v0, v22); __ aesmc(v0, v0);
2716       __ aese(v0, v23); __ aesmc(v0, v0);
2717       __ aese(v0, v24); __ aesmc(v0, v0);
2718       __ aese(v0, v25); __ aesmc(v0, v0);
2719       __ aese(v0, v26); __ aesmc(v0, v0);
2720       __ aese(v0, v27); __ aesmc(v0, v0);
2721       __ aese(v0, v28); __ aesmc(v0, v0);
2722       __ aese(v0, v29); __ aesmc(v0, v0);
2723       __ aese(v0, v30);
2724       __ eor(v0, __ T16B, v0, v31);
2725 
2726       __ st1(v0, __ T16B, __ post(to, 16));
2727 
2728       __ subw(len_reg, len_reg, 16);
2729       __ cbnzw(len_reg, L_aes_loop);
2730 
2731       __ st1(v0, __ T16B, rvec);
2732 
2733       __ mov(r0, rscratch2);
2734 
2735       __ leave();
2736       __ ret(lr);
2737 
2738       return start;
2739   }
2740 
2741   // Arguments:
2742   //
2743   // Inputs:
2744   //   c_rarg0   - source byte array address
2745   //   c_rarg1   - destination byte array address
2746   //   c_rarg2   - K (key) in little endian int array
2747   //   c_rarg3   - r vector byte array address
2748   //   c_rarg4   - input length
2749   //
2750   // Output:
2751   //   r0        - input length
2752   //
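       // CBC-decrypt sketch (assumed, per 16-byte block):
       //   P = AES_decrypt(C, key) ^ rvec;  store P;  rvec = C;
       // On exit rvec holds the last input ciphertext block and r0 the input length.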
2753   address generate_cipherBlockChaining_decryptAESCrypt() {
2754     assert(UseAES, "need AES cryptographic extension support");
2755     __ align(CodeEntryAlignment);
2756     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2757 
2758     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2759 
2760     const Register from        = c_rarg0;  // source array address
2761     const Register to          = c_rarg1;  // destination array address
2762     const Register key         = c_rarg2;  // key array address
2763     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector array address
2764                                            // and left with the last ciphertext block (the new chaining value)
2765     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2766     const Register keylen      = rscratch1;
2767 
2768     address start = __ pc();
2769 
2770       __ enter();
2771 
2772       __ movw(rscratch2, len_reg);
2773 
2774       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2775 
2776       __ ld1(v2, __ T16B, rvec);
2777 
2778       __ ld1(v31, __ T16B, __ post(key, 16));
2779       __ rev32(v31, __ T16B, v31);
2780 
2781       __ cmpw(keylen, 52);
2782       __ br(Assembler::CC, L_loadkeys_44);
2783       __ br(Assembler::EQ, L_loadkeys_52);
2784 
2785       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2786       __ rev32(v17, __ T16B, v17);
2787       __ rev32(v18, __ T16B, v18);
2788     __ BIND(L_loadkeys_52);
2789       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2790       __ rev32(v19, __ T16B, v19);
2791       __ rev32(v20, __ T16B, v20);
2792     __ BIND(L_loadkeys_44);
2793       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2794       __ rev32(v21, __ T16B, v21);
2795       __ rev32(v22, __ T16B, v22);
2796       __ rev32(v23, __ T16B, v23);
2797       __ rev32(v24, __ T16B, v24);
2798       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2799       __ rev32(v25, __ T16B, v25);
2800       __ rev32(v26, __ T16B, v26);
2801       __ rev32(v27, __ T16B, v27);
2802       __ rev32(v28, __ T16B, v28);
2803       __ ld1(v29, v30, __ T16B, key);
2804       __ rev32(v29, __ T16B, v29);
2805       __ rev32(v30, __ T16B, v30);
2806 
2807     __ BIND(L_aes_loop);
2808       __ ld1(v0, __ T16B, __ post(from, 16));
2809       __ orr(v1, __ T16B, v0, v0);
2810 
2811       __ br(Assembler::CC, L_rounds_44);
2812       __ br(Assembler::EQ, L_rounds_52);
2813 
2814       __ aesd(v0, v17); __ aesimc(v0, v0);
2815       __ aesd(v0, v18); __ aesimc(v0, v0);
2816     __ BIND(L_rounds_52);
2817       __ aesd(v0, v19); __ aesimc(v0, v0);
2818       __ aesd(v0, v20); __ aesimc(v0, v0);
2819     __ BIND(L_rounds_44);
2820       __ aesd(v0, v21); __ aesimc(v0, v0);
2821       __ aesd(v0, v22); __ aesimc(v0, v0);
2822       __ aesd(v0, v23); __ aesimc(v0, v0);
2823       __ aesd(v0, v24); __ aesimc(v0, v0);
2824       __ aesd(v0, v25); __ aesimc(v0, v0);
2825       __ aesd(v0, v26); __ aesimc(v0, v0);
2826       __ aesd(v0, v27); __ aesimc(v0, v0);
2827       __ aesd(v0, v28); __ aesimc(v0, v0);
2828       __ aesd(v0, v29); __ aesimc(v0, v0);
2829       __ aesd(v0, v30);
2830       __ eor(v0, __ T16B, v0, v31);
2831       __ eor(v0, __ T16B, v0, v2);
2832 
2833       __ st1(v0, __ T16B, __ post(to, 16));
2834       __ orr(v2, __ T16B, v1, v1);
2835 
2836       __ subw(len_reg, len_reg, 16);
2837       __ cbnzw(len_reg, L_aes_loop);
2838 
2839       __ st1(v2, __ T16B, rvec);
2840 
2841       __ mov(r0, rscratch2);
2842 
2843       __ leave();
2844       __ ret(lr);
2845 
2846     return start;
2847   }
2848 
2849   // CTR AES crypt.
2850   // Arguments:
2851   //
2852   // Inputs:
2853   //   c_rarg0   - source byte array address
2854   //   c_rarg1   - destination byte array address
2855   //   c_rarg2   - K (key) in little endian int array
2856   //   c_rarg3   - counter vector byte array address
2857   //   c_rarg4   - input length
2858   //   c_rarg5   - saved encryptedCounter start
2859   //   c_rarg6   - saved used length
2860   //
2861   // Output:
2862   //   r0       - input length
2863   //
2864   address generate_counterMode_AESCrypt() {
2865     const Register in = c_rarg0;
2866     const Register out = c_rarg1;
2867     const Register key = c_rarg2;
2868     const Register counter = c_rarg3;
2869     const Register saved_len = c_rarg4, len = r10;
2870     const Register saved_encrypted_ctr = c_rarg5;
2871     const Register used_ptr = c_rarg6, used = r12;
2872 
2873     const Register offset = r7;
2874     const Register keylen = r11;
2875 
2876     const unsigned char block_size = 16;
2877     const int bulk_width = 4;
2878     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2879     // performance with larger data sizes, but it also means that the
2880     // fast path isn't used until there are at least 8 blocks, so up
2881     // to 127 bytes of data will be processed by the slow path. For
2882     // that reason, and also so as not to blow away too much icache, 4
2883     // blocks seems like a sensible compromise.
2884 
2885     // Algorithm:
2886     //
2887     //    if (len == 0) {
2888     //        goto DONE;
2889     //    }
2890     //    int result = len;
2891     //    do {
2892     //        if (used >= blockSize) {
2893     //            if (len >= bulk_width * blockSize) {
2894     //                CTR_large_block();
2895     //                if (len == 0)
2896     //                    goto DONE;
2897     //            }
2898     //            for (;;) {
2899     //                16ByteVector v0 = counter;
2900     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2901     //                used = 0;
2902     //                if (len < blockSize)
2903     //                    break;    /* goto NEXT */
2904     //                16ByteVector v1 = load16Bytes(in, offset);
2905     //                v1 = v1 ^ encryptedCounter;
2906     //                store16Bytes(v1, out, offset);
2907     //                used = blockSize;
2908     //                offset += blockSize;
2909     //                len -= blockSize;
2910     //                if (len == 0)
2911     //                    goto DONE;
2912     //            }
2913     //        }
2914     //      NEXT:
2915     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2916     //        len--;
2917     //    } while (len != 0);
2918     //  DONE:
2919     //    return result;
2920     //
2921     // CTR_large_block()
2922     //    Wide bulk encryption of whole blocks.
2923 
2924     __ align(CodeEntryAlignment);
2925     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2926     const address start = __ pc();
2927     __ enter();
2928 
2929     Label DONE, CTR_large_block, large_block_return;
2930     __ ldrw(used, Address(used_ptr));
2931     __ cbzw(saved_len, DONE);
2932 
2933     __ mov(len, saved_len);
2934     __ mov(offset, 0);
2935 
2936     // Compute #rounds for AES based on the length of the key array
2937     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2938 
2939     __ aesenc_loadkeys(key, keylen);
2940 
2941     {
2942       Label L_CTR_loop, NEXT;
2943 
2944       __ bind(L_CTR_loop);
2945 
2946       __ cmp(used, block_size);
2947       __ br(__ LO, NEXT);
2948 
2949       // Maybe we have a lot of data
2950       __ subsw(rscratch1, len, bulk_width * block_size);
2951       __ br(__ HS, CTR_large_block);
2952       __ BIND(large_block_return);
2953       __ cbzw(len, DONE);
2954 
2955       // Setup the counter
2956       __ movi(v4, __ T4S, 0);
2957       __ movi(v5, __ T4S, 1);
2958       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2959 
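           // The counter block is kept big-endian in memory; rev32 brings each
           // 32-bit word into native lane order so that adding v4 increments the
           // last word, and rev32 restores the byte order before the store.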
2960       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2961       __ rev32(v16, __ T16B, v0);
2962       __ addv(v16, __ T4S, v16, v4);
2963       __ rev32(v16, __ T16B, v16);
2964       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2965 
2966       {
2967         // We have fewer than bulk_width blocks of data left. Encrypt
2968         // them one by one until there is less than a full block
2969         // remaining, being careful to save both the encrypted counter
2970         // and the counter.
2971 
2972         Label inner_loop;
2973         __ bind(inner_loop);
2974         // Counter to encrypt is in v0
2975         __ aesecb_encrypt(noreg, noreg, keylen);
2976         __ st1(v0, __ T16B, saved_encrypted_ctr);
2977 
2978         // Do we have a remaining full block?
2979 
2980         __ mov(used, 0);
2981         __ cmp(len, block_size);
2982         __ br(__ LO, NEXT);
2983 
2984         // Yes, we have a full block
2985         __ ldrq(v1, Address(in, offset));
2986         __ eor(v1, __ T16B, v1, v0);
2987         __ strq(v1, Address(out, offset));
2988         __ mov(used, block_size);
2989         __ add(offset, offset, block_size);
2990 
2991         __ subw(len, len, block_size);
2992         __ cbzw(len, DONE);
2993 
2994         // Increment the counter, store it back
2995         __ orr(v0, __ T16B, v16, v16);
2996         __ rev32(v16, __ T16B, v16);
2997         __ addv(v16, __ T4S, v16, v4);
2998         __ rev32(v16, __ T16B, v16);
2999         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3000 
3001         __ b(inner_loop);
3002       }
3003 
3004       __ BIND(NEXT);
3005 
3006       // Encrypt a single byte, and loop.
3007       // We expect this to be a rare event.
3008       __ ldrb(rscratch1, Address(in, offset));
3009       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3010       __ eor(rscratch1, rscratch1, rscratch2);
3011       __ strb(rscratch1, Address(out, offset));
3012       __ add(offset, offset, 1);
3013       __ add(used, used, 1);
3014       __ subw(len, len, 1);
3015       __ cbnzw(len, L_CTR_loop);
3016     }
3017 
3018     __ bind(DONE);
3019     __ strw(used, Address(used_ptr));
3020     __ mov(r0, saved_len);
3021 
3022     __ leave(); // required for proper stackwalking of RuntimeStub frame
3023     __ ret(lr);
3024 
3025     // Bulk encryption
3026 
3027     __ BIND(CTR_large_block);
3028     assert(bulk_width == 4 || bulk_width == 8, "must be");
3029 
3030     if (bulk_width == 8) {
3031       __ sub(sp, sp, 4 * 16);
3032       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3033     }
3034     __ sub(sp, sp, 4 * 16);
3035     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3036     RegSet saved_regs = (RegSet::of(in, out, offset)
3037                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3038     __ push(saved_regs, sp);
3039     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3040     __ add(in, in, offset);
3041     __ add(out, out, offset);
3042 
3043     // Keys should already be loaded into the correct registers
3044 
3045     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3046     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3047 
3048     // AES/CTR loop
3049     {
3050       Label L_CTR_loop;
3051       __ BIND(L_CTR_loop);
3052 
3053       // Setup the counters
3054       __ movi(v8, __ T4S, 0);
3055       __ movi(v9, __ T4S, 1);
3056       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3057 
3058       for (int i = 0; i < bulk_width; i++) {
3059         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3060         __ rev32(v0_ofs, __ T16B, v16);
3061         __ addv(v16, __ T4S, v16, v8);
3062       }
3063 
3064       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3065 
3066       // Encrypt the counters
3067       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3068 
3069       if (bulk_width == 8) {
3070         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3071       }
3072 
3073       // XOR the encrypted counters with the inputs
3074       for (int i = 0; i < bulk_width; i++) {
3075         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3076         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3077         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3078       }
3079 
3080       // Write the encrypted data
3081       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3082       if (bulk_width == 8) {
3083         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3084       }
3085 
3086       __ subw(len, len, 16 * bulk_width);
3087       __ cbnzw(len, L_CTR_loop);
3088     }
3089 
3090     // Save the counter back where it goes
3091     __ rev32(v16, __ T16B, v16);
3092     __ st1(v16, __ T16B, counter);
3093 
3094     __ pop(saved_regs, sp);
3095 
3096     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3097     if (bulk_width == 8) {
3098       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3099     }
3100 
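         // len was saved and restored by the push/pop above, so it still holds the
         // pre-bulk value; subtract the portion processed in bulk and advance offset
         // so that the remaining tail is handled back at large_block_return.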
3101     __ andr(rscratch1, len, -16 * bulk_width);
3102     __ sub(len, len, rscratch1);
3103     __ add(offset, offset, rscratch1);
3104     __ mov(used, 16);
3105     __ strw(used, Address(used_ptr));
3106     __ b(large_block_return);
3107 
3108     return start;
3109   }
3110 
3111   // Vector AES Galois Counter Mode implementation. Parameters:
3112   //
3113   // in = c_rarg0
3114   // len = c_rarg1
3115   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3116   // out = c_rarg3
3117   // key = c_rarg4
3118   // state = c_rarg5 - GHASH.state
3119   // subkeyHtbl = c_rarg6 - powers of H
3120   // counter = c_rarg7 - 16 bytes of CTR
3121   // return - number of processed bytes
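       //
       // The stub processes the input in 128-byte strides: eight counter blocks are
       // encrypted with AES (aesecb_encrypt, unrolled 8x) and XORed with the input,
       // then GHASH is applied to the resulting ciphertext four blocks at a time
       // (ghash_processBlocks_wide). Any tail shorter than 128 bytes is left to the
       // caller, which is why len is rounded down to a multiple of 16 * 8 below.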
3122   address generate_galoisCounterMode_AESCrypt() {
3123     address ghash_polynomial = __ pc();
3124     __ emit_int64(0x87);  // The low-order bits of the field
3125                           // polynomial (i.e. p = z^7+z^2+z+1)
3126                           // repeated in the low and high parts of a
3127                           // 128-bit vector
3128     __ emit_int64(0x87);
3129 
3130     __ align(CodeEntryAlignment);
3131     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3132     address start = __ pc();
3133     __ enter();
3134 
3135     const Register in = c_rarg0;
3136     const Register len = c_rarg1;
3137     const Register ct = c_rarg2;
3138     const Register out = c_rarg3;
3140 
3141     const Register key = c_rarg4;
3142     const Register state = c_rarg5;
3143 
3144     const Register subkeyHtbl = c_rarg6;
3145 
3146     const Register counter = c_rarg7;  // 16 bytes of CTR; updated with the incremented counter in the end
3147 
3148     const Register keylen = r10;
3149     // Save callee-saved SIMD registers (v8..v15) before entering the routine
3150     __ sub(sp, sp, 4 * 16);
3151     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3152     __ sub(sp, sp, 4 * 16);
3153     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3154 
3156     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3157     __ str(len, __ pre(sp, -2 * wordSize));
3158 
3159     Label DONE;
3160     __ cbz(len, DONE);
3161 
3162     // Compute #rounds for AES based on the length of the key array
3163     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3164 
3165     __ aesenc_loadkeys(key, keylen);
3166     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3167     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3168 
3169     // AES/CTR loop
3170     {
3171       Label L_CTR_loop;
3172       __ BIND(L_CTR_loop);
3173 
3174       // Setup the counters
3175       __ movi(v8, __ T4S, 0);
3176       __ movi(v9, __ T4S, 1);
3177       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3178 
3179       assert(v0->encoding() < v8->encoding(), "");
3180       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3181         FloatRegister f = as_FloatRegister(i);
3182         __ rev32(f, __ T16B, v16);
3183         __ addv(v16, __ T4S, v16, v8);
3184       }
3185 
3186       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3187 
3188       // Encrypt the counters
3189       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3190 
3191       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3192 
3193       // XOR the encrypted counters with the inputs
3194       for (int i = 0; i < 8; i++) {
3195         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3196         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3197         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3198       }
3199       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3200       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3201 
3202       __ subw(len, len, 16 * 8);
3203       __ cbnzw(len, L_CTR_loop);
3204     }
3205 
3206     __ rev32(v16, __ T16B, v16);
3207     __ st1(v16, __ T16B, counter);
3208 
3209     __ ldr(len, Address(sp));
3210     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3211 
3212     // GHASH/CTR loop
3213     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3214                                 len, /*unrolls*/4);
3215 
3216 #ifdef ASSERT
3217     { Label L;
3218       __ cmp(len, (unsigned char)0);
3219       __ br(Assembler::EQ, L);
3220       __ stop("stubGenerator: abort");
3221       __ bind(L);
3222     }
3223 #endif
3224 
3225     __ bind(DONE);
3226     // Return the number of bytes processed
3227     __ ldr(r0, __ post(sp, 2 * wordSize));
3228 
3229     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3230     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3231 
3232     __ leave(); // required for proper stackwalking of RuntimeStub frame
3233     __ ret(lr);
3234     return start;
3235   }
3236 
3237   // Utility routines for md5.
3238   // Clobbers r10 and r11.
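       // For reference, each helper computes one MD5 step of the form
       //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + buf[k] + t, s)
       // where f is the corresponding round function from RFC 1321:
       //   FF: F(x, y, z) = (x & y) | (~x & z)
       //   GG: G(x, y, z) = (x & z) | (y & ~z)
       //   HH: H(x, y, z) = x ^ y ^ z
       //   II: I(x, y, z) = y ^ (x | ~z)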
3239   void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3240               int k, int s, int t) {
3241     Register rscratch3 = r10;
3242     Register rscratch4 = r11;
3243 
3244     __ eorw(rscratch3, r3, r4);
3245     __ movw(rscratch2, t);
3246     __ andw(rscratch3, rscratch3, r2);
3247     __ addw(rscratch4, r1, rscratch2);
3248     __ ldrw(rscratch1, Address(buf, k*4));
3249     __ eorw(rscratch3, rscratch3, r4);
3250     __ addw(rscratch3, rscratch3, rscratch1);
3251     __ addw(rscratch3, rscratch3, rscratch4);
3252     __ rorw(rscratch2, rscratch3, 32 - s);
3253     __ addw(r1, rscratch2, r2);
3254   }
3255 
3256   void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3257               int k, int s, int t) {
3258     Register rscratch3 = r10;
3259     Register rscratch4 = r11;
3260 
3261     __ eorw(rscratch2, r2, r3);
3262     __ ldrw(rscratch1, Address(buf, k*4));
3263     __ andw(rscratch3, rscratch2, r4);
3264     __ movw(rscratch2, t);
3265     __ eorw(rscratch3, rscratch3, r3);
3266     __ addw(rscratch4, r1, rscratch2);
3267     __ addw(rscratch3, rscratch3, rscratch1);
3268     __ addw(rscratch3, rscratch3, rscratch4);
3269     __ rorw(rscratch2, rscratch3, 32 - s);
3270     __ addw(r1, rscratch2, r2);
3271   }
3272 
3273   void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3274               int k, int s, int t) {
3275     Register rscratch3 = r10;
3276     Register rscratch4 = r11;
3277 
3278     __ eorw(rscratch3, r3, r4);
3279     __ movw(rscratch2, t);
3280     __ addw(rscratch4, r1, rscratch2);
3281     __ ldrw(rscratch1, Address(buf, k*4));
3282     __ eorw(rscratch3, rscratch3, r2);
3283     __ addw(rscratch3, rscratch3, rscratch1);
3284     __ addw(rscratch3, rscratch3, rscratch4);
3285     __ rorw(rscratch2, rscratch3, 32 - s);
3286     __ addw(r1, rscratch2, r2);
3287   }
3288 
3289   void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3290               int k, int s, int t) {
3291     Register rscratch3 = r10;
3292     Register rscratch4 = r11;
3293 
3294     __ movw(rscratch3, t);
3295     __ ornw(rscratch2, r2, r4);
3296     __ addw(rscratch4, r1, rscratch3);
3297     __ ldrw(rscratch1, Address(buf, k*4));
3298     __ eorw(rscratch3, rscratch2, r3);
3299     __ addw(rscratch3, rscratch3, rscratch1);
3300     __ addw(rscratch3, rscratch3, rscratch4);
3301     __ rorw(rscratch2, rscratch3, 32 - s);
3302     __ addw(r1, rscratch2, r2);
3303   }
3304 
3305   // Arguments:
3306   //
3307   // Inputs:
3308   //   c_rarg0   - byte[]  source+offset
3309   //   c_rarg1   - int[]   MD5.state
3310   //   c_rarg2   - int     offset
3311   //   c_rarg3   - int     limit
3312   //
3313   address generate_md5_implCompress(bool multi_block, const char *name) {
3314     __ align(CodeEntryAlignment);
3315     StubCodeMark mark(this, "StubRoutines", name);
3316     address start = __ pc();
3317 
3318     Register buf       = c_rarg0;
3319     Register state     = c_rarg1;
3320     Register ofs       = c_rarg2;
3321     Register limit     = c_rarg3;
3322     Register a         = r4;
3323     Register b         = r5;
3324     Register c         = r6;
3325     Register d         = r7;
3326     Register rscratch3 = r10;
3327     Register rscratch4 = r11;
3328 
3329     Label md5_loop;
3330     __ BIND(md5_loop);
3331 
3332     // Load the current hash state into the working registers a..d
3333     __ ldrw(a, Address(state,  0));
3334     __ ldrw(b, Address(state,  4));
3335     __ ldrw(c, Address(state,  8));
3336     __ ldrw(d, Address(state, 12));
3337 
3338     // Round 1
3339     md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
3340     md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
3341     md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
3342     md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
3343     md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
3344     md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
3345     md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
3346     md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
3347     md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
3348     md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
3349     md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3350     md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3351     md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
3352     md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3353     md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3354     md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3355 
3356     // Round 2
3357     md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
3358     md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
3359     md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3360     md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
3361     md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
3362     md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
3363     md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3364     md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
3365     md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
3366     md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
3367     md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
3368     md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
3369     md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
3370     md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
3371     md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
3372     md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3373 
3374     // Round 3
3375     md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
3376     md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
3377     md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3378     md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3379     md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
3380     md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
3381     md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
3382     md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3383     md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
3384     md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
3385     md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
3386     md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
3387     md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
3388     md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3389     md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3390     md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);
3391 
3392     // Round 4
3393     md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
3394     md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
3395     md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3396     md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
3397     md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
3398     md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
3399     md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3400     md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
3401     md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
3402     md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3403     md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
3404     md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3405     md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
3406     md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3407     md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
3408     md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);
3409 
3410     // write hash values back in the correct order
3411     __ ldrw(rscratch1, Address(state,  0));
3412     __ addw(rscratch1, rscratch1, a);
3413     __ strw(rscratch1, Address(state,  0));
3414 
3415     __ ldrw(rscratch2, Address(state,  4));
3416     __ addw(rscratch2, rscratch2, b);
3417     __ strw(rscratch2, Address(state,  4));
3418 
3419     __ ldrw(rscratch3, Address(state,  8));
3420     __ addw(rscratch3, rscratch3, c);
3421     __ strw(rscratch3, Address(state,  8));
3422 
3423     __ ldrw(rscratch4, Address(state, 12));
3424     __ addw(rscratch4, rscratch4, d);
3425     __ strw(rscratch4, Address(state, 12));
3426 
3427     if (multi_block) {
3428       __ add(buf, buf, 64);
3429       __ add(ofs, ofs, 64);
3430       __ cmp(ofs, limit);
3431       __ br(Assembler::LE, md5_loop);
3432       __ mov(c_rarg0, ofs); // return ofs
3433     }
3434 
3435     __ ret(lr);
3436 
3437     return start;
3438   }
3439 
3440   // Arguments:
3441   //
3442   // Inputs:
3443   //   c_rarg0   - byte[]  source+offset
3444   //   c_rarg1   - int[]   SHA.state
3445   //   c_rarg2   - int     offset
3446   //   c_rarg3   - int     limit
3447   //
3448   address generate_sha1_implCompress(bool multi_block, const char *name) {
3449     __ align(CodeEntryAlignment);
3450     StubCodeMark mark(this, "StubRoutines", name);
3451     address start = __ pc();
3452 
3453     Register buf   = c_rarg0;
3454     Register state = c_rarg1;
3455     Register ofs   = c_rarg2;
3456     Register limit = c_rarg3;
3457 
3458     Label keys;
3459     Label sha1_loop;
3460 
3461     // load the keys into v0..v3
3462     __ adr(rscratch1, keys);
3463     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3464     // load the 5-word state: a..d into v6, e into v7
3465     __ ldrq(v6, Address(state, 0));
3466     __ ldrs(v7, Address(state, 16));
3467 
3468 
3469     __ BIND(sha1_loop);
3470     // load 64 bytes of data into v16..v19
3471     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3472     __ rev32(v16, __ T16B, v16);
3473     __ rev32(v17, __ T16B, v17);
3474     __ rev32(v18, __ T16B, v18);
3475     __ rev32(v19, __ T16B, v19);
3476 
3477     // do the sha1
3478     __ addv(v4, __ T4S, v16, v0);
3479     __ orr(v20, __ T16B, v6, v6);
3480 
3481     FloatRegister d0 = v16;
3482     FloatRegister d1 = v17;
3483     FloatRegister d2 = v18;
3484     FloatRegister d3 = v19;
3485 
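         // Each iteration of this loop processes four SHA-1 rounds (20 x 4 = 80).
         // sha1c, sha1p and sha1m supply the Ch, Parity and Maj round functions for
         // rounds 0..19, 20..39/60..79 and 40..59 respectively, while
         // sha1su0/sha1su1 extend the message schedule in place.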
3486     for (int round = 0; round < 20; round++) {
3487       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3488       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3489       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3490       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3491       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3492 
3493       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3494       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3495       __ sha1h(tmp2, __ T4S, v20);
3496       if (round < 5)
3497         __ sha1c(v20, __ T4S, tmp3, tmp4);
3498       else if (round < 10 || round >= 15)
3499         __ sha1p(v20, __ T4S, tmp3, tmp4);
3500       else
3501         __ sha1m(v20, __ T4S, tmp3, tmp4);
3502       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3503 
3504       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3505     }
3506 
3507     __ addv(v7, __ T2S, v7, v21);
3508     __ addv(v6, __ T4S, v6, v20);
3509 
3510     if (multi_block) {
3511       __ add(ofs, ofs, 64);
3512       __ cmp(ofs, limit);
3513       __ br(Assembler::LE, sha1_loop);
3514       __ mov(c_rarg0, ofs); // return ofs
3515     }
3516 
3517     __ strq(v6, Address(state, 0));
3518     __ strs(v7, Address(state, 16));
3519 
3520     __ ret(lr);
3521 
3522     __ bind(keys);
3523     __ emit_int32(0x5a827999);
3524     __ emit_int32(0x6ed9eba1);
3525     __ emit_int32(0x8f1bbcdc);
3526     __ emit_int32(0xca62c1d6);
3527 
3528     return start;
3529   }
3530 
3531 
3532   // Arguments:
3533   //
3534   // Inputs:
3535   //   c_rarg0   - byte[]  source+offset
3536   //   c_rarg1   - int[]   SHA.state
3537   //   c_rarg2   - int     offset
3538   //   c_rarg3   - int     limit
3539   //
3540   address generate_sha256_implCompress(bool multi_block, const char *name) {
3541     static const uint32_t round_consts[64] = {
3542       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3543       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3544       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3545       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3546       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3547       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3548       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3549       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3550       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3551       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3552       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3553       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3554       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3555       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3556       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3557       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3558     };
3559     __ align(CodeEntryAlignment);
3560     StubCodeMark mark(this, "StubRoutines", name);
3561     address start = __ pc();
3562 
3563     Register buf   = c_rarg0;
3564     Register state = c_rarg1;
3565     Register ofs   = c_rarg2;
3566     Register limit = c_rarg3;
3567 
3568     Label sha256_loop;
3569 
3570     __ stpd(v8, v9, __ pre(sp, -32));
3571     __ stpd(v10, v11, Address(sp, 16));
3572 
3573     // dga == v0
3574     // dgb == v1
3575     // dg0 == v2
3576     // dg1 == v3
3577     // dg2 == v4
3578     // t0 == v6
3579     // t1 == v7
3580 
3581     // load the 64 round constants into v16..v31 (four per vector)
3582     __ lea(rscratch1, ExternalAddress((address)round_consts));
3583     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3584     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3585     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3586     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3587 
3588     // load 8 words (256 bits) state
3589     __ ldpq(v0, v1, state);
3590 
3591     __ BIND(sha256_loop);
3592     // load 64 bytes of data into v8..v11
3593     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3594     __ rev32(v8, __ T16B, v8);
3595     __ rev32(v9, __ T16B, v9);
3596     __ rev32(v10, __ T16B, v10);
3597     __ rev32(v11, __ T16B, v11);
3598 
3599     __ addv(v6, __ T4S, v8, v16);
3600     __ orr(v2, __ T16B, v0, v0);
3601     __ orr(v3, __ T16B, v1, v1);
3602 
3603     FloatRegister d0 = v8;
3604     FloatRegister d1 = v9;
3605     FloatRegister d2 = v10;
3606     FloatRegister d3 = v11;
3607 
3608 
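         // Each iteration processes four SHA-256 rounds (16 x 4 = 64). The round
         // constants were preloaded into v16..v31 (four per vector); sha256h and
         // sha256h2 update the two halves of the working state while
         // sha256su0/sha256su1 extend the message schedule.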
3609     for (int round = 0; round < 16; round++) {
3610       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3611       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3612       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3613       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3614 
3615       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3616       __ orr(v4, __ T16B, v2, v2);
3617       if (round < 15)
3618         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3619       __ sha256h(v2, __ T4S, v3, tmp2);
3620       __ sha256h2(v3, __ T4S, v4, tmp2);
3621       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3622 
3623       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3624     }
3625 
3626     __ addv(v0, __ T4S, v0, v2);
3627     __ addv(v1, __ T4S, v1, v3);
3628 
3629     if (multi_block) {
3630       __ add(ofs, ofs, 64);
3631       __ cmp(ofs, limit);
3632       __ br(Assembler::LE, sha256_loop);
3633       __ mov(c_rarg0, ofs); // return ofs
3634     }
3635 
3636     __ ldpd(v10, v11, Address(sp, 16));
3637     __ ldpd(v8, v9, __ post(sp, 32));
3638 
3639     __ stpq(v0, v1, state);
3640 
3641     __ ret(lr);
3642 
3643     return start;
3644   }
3645 
3646   // Double rounds for sha512.
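       // Each call performs two SHA-512 rounds using sha512h/sha512h2 and, for
       // dr < 32, also extends the message schedule with sha512su0/sha512su1.
       // vrc0 holds the current pair of round constants; for dr < 36 the next
       // pair is streamed into vrc1 from the table pointed to by rscratch2.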
3647   void sha512_dround(int dr,
3648                      FloatRegister vi0, FloatRegister vi1,
3649                      FloatRegister vi2, FloatRegister vi3,
3650                      FloatRegister vi4, FloatRegister vrc0,
3651                      FloatRegister vrc1, FloatRegister vin0,
3652                      FloatRegister vin1, FloatRegister vin2,
3653                      FloatRegister vin3, FloatRegister vin4) {
3654       if (dr < 36) {
3655         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3656       }
3657       __ addv(v5, __ T2D, vrc0, vin0);
3658       __ ext(v6, __ T16B, vi2, vi3, 8);
3659       __ ext(v5, __ T16B, v5, v5, 8);
3660       __ ext(v7, __ T16B, vi1, vi2, 8);
3661       __ addv(vi3, __ T2D, vi3, v5);
3662       if (dr < 32) {
3663         __ ext(v5, __ T16B, vin3, vin4, 8);
3664         __ sha512su0(vin0, __ T2D, vin1);
3665       }
3666       __ sha512h(vi3, __ T2D, v6, v7);
3667       if (dr < 32) {
3668         __ sha512su1(vin0, __ T2D, vin2, v5);
3669       }
3670       __ addv(vi4, __ T2D, vi1, vi3);
3671       __ sha512h2(vi3, __ T2D, vi1, vi0);
3672   }
3673 
3674   // Arguments:
3675   //
3676   // Inputs:
3677   //   c_rarg0   - byte[]  source+offset
3678   //   c_rarg1   - int[]   SHA.state
3679   //   c_rarg2   - int     offset
3680   //   c_rarg3   - int     limit
3681   //
3682   address generate_sha512_implCompress(bool multi_block, const char *name) {
3683     static const uint64_t round_consts[80] = {
3684       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3685       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3686       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3687       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3688       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3689       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3690       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3691       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3692       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3693       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3694       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3695       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3696       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3697       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3698       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3699       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3700       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3701       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3702       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3703       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3704       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3705       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3706       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3707       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3708       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3709       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3710       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3711     };
3712 
3713     __ align(CodeEntryAlignment);
3714     StubCodeMark mark(this, "StubRoutines", name);
3715     address start = __ pc();
3716 
3717     Register buf   = c_rarg0;
3718     Register state = c_rarg1;
3719     Register ofs   = c_rarg2;
3720     Register limit = c_rarg3;
3721 
3722     __ stpd(v8, v9, __ pre(sp, -64));
3723     __ stpd(v10, v11, Address(sp, 16));
3724     __ stpd(v12, v13, Address(sp, 32));
3725     __ stpd(v14, v15, Address(sp, 48));
3726 
3727     Label sha512_loop;
3728 
3729     // load state
3730     __ ld1(v8, v9, v10, v11, __ T2D, state);
3731 
3732     // load the first eight round constants into v20..v23 (two per vector)
3733     __ lea(rscratch1, ExternalAddress((address)round_consts));
3734     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3735 
3736     __ BIND(sha512_loop);
3737     // load 128B of data into v12..v19
3738     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3739     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3740     __ rev64(v12, __ T16B, v12);
3741     __ rev64(v13, __ T16B, v13);
3742     __ rev64(v14, __ T16B, v14);
3743     __ rev64(v15, __ T16B, v15);
3744     __ rev64(v16, __ T16B, v16);
3745     __ rev64(v17, __ T16B, v17);
3746     __ rev64(v18, __ T16B, v18);
3747     __ rev64(v19, __ T16B, v19);
3748 
3749     __ mov(rscratch2, rscratch1);
3750 
3751     __ mov(v0, __ T16B, v8);
3752     __ mov(v1, __ T16B, v9);
3753     __ mov(v2, __ T16B, v10);
3754     __ mov(v3, __ T16B, v11);
3755 
3756     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3757     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3758     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3759     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3760     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3761     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3762     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3763     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3764     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3765     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3766     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3767     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3768     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3769     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3770     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3771     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3772     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3773     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3774     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3775     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3776     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3777     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3778     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3779     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3780     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3781     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3782     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3783     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3784     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3785     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3786     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3787     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3788     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3789     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3790     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3791     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3792     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3793     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3794     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3795     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3796 
3797     __ addv(v8, __ T2D, v8, v0);
3798     __ addv(v9, __ T2D, v9, v1);
3799     __ addv(v10, __ T2D, v10, v2);
3800     __ addv(v11, __ T2D, v11, v3);
3801 
3802     if (multi_block) {
3803       __ add(ofs, ofs, 128);
3804       __ cmp(ofs, limit);
3805       __ br(Assembler::LE, sha512_loop);
3806       __ mov(c_rarg0, ofs); // return ofs
3807     }
3808 
3809     __ st1(v8, v9, v10, v11, __ T2D, state);
3810 
3811     __ ldpd(v14, v15, Address(sp, 48));
3812     __ ldpd(v12, v13, Address(sp, 32));
3813     __ ldpd(v10, v11, Address(sp, 16));
3814     __ ldpd(v8, v9, __ post(sp, 64));
3815 
3816     __ ret(lr);
3817 
3818     return start;
3819   }
3820 
3821   // Arguments:
3822   //
3823   // Inputs:
3824   //   c_rarg0   - byte[]  source+offset
3825   //   c_rarg1   - byte[]   SHA3.state
3826   //   c_rarg2   - int     digest_length
3827   //   c_rarg3   - int     offset
3828   //   c_rarg4   - int     limit
3829   //
3830   address generate_sha3_implCompress(bool multi_block, const char *name) {
3831     static const uint64_t round_consts[24] = {
3832       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3833       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3834       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3835       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3836       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3837       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3838       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3839       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3840     };
3841 
3842     __ align(CodeEntryAlignment);
3843     StubCodeMark mark(this, "StubRoutines", name);
3844     address start = __ pc();
3845 
3846     Register buf           = c_rarg0;
3847     Register state         = c_rarg1;
3848     Register digest_length = c_rarg2;
3849     Register ofs           = c_rarg3;
3850     Register limit         = c_rarg4;
3851 
3852     Label sha3_loop, rounds24_loop;
3853     Label sha3_512, sha3_384_or_224, sha3_256;
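         // Keccak absorbs r = 200 - 2 * digest_length bytes per block:
         // 72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256) or 144 (SHA3-224).
         // The branches below XOR exactly that many input bytes into the state
         // before the 24 rounds are run.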
3854 
3855     __ stpd(v8, v9, __ pre(sp, -64));
3856     __ stpd(v10, v11, Address(sp, 16));
3857     __ stpd(v12, v13, Address(sp, 32));
3858     __ stpd(v14, v15, Address(sp, 48));
3859 
3860     // load state
3861     __ add(rscratch1, state, 32);
3862     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3863     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3864     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3865     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3866     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3867     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3868     __ ld1(v24, __ T1D, rscratch1);
3869 
3870     __ BIND(sha3_loop);
3871 
3872     // 24 keccak rounds
3873     __ movw(rscratch2, 24);
3874 
3875     // load round_constants base
3876     __ lea(rscratch1, ExternalAddress((address) round_consts));
3877 
3878     // load input
3879     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3880     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3881     __ eor(v0, __ T8B, v0, v25);
3882     __ eor(v1, __ T8B, v1, v26);
3883     __ eor(v2, __ T8B, v2, v27);
3884     __ eor(v3, __ T8B, v3, v28);
3885     __ eor(v4, __ T8B, v4, v29);
3886     __ eor(v5, __ T8B, v5, v30);
3887     __ eor(v6, __ T8B, v6, v31);
3888 
3889     // digest_length == 64, SHA3-512
3890     __ tbnz(digest_length, 6, sha3_512);
3891 
3892     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3893     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3894     __ eor(v7, __ T8B, v7, v25);
3895     __ eor(v8, __ T8B, v8, v26);
3896     __ eor(v9, __ T8B, v9, v27);
3897     __ eor(v10, __ T8B, v10, v28);
3898     __ eor(v11, __ T8B, v11, v29);
3899     __ eor(v12, __ T8B, v12, v30);
3900 
3901     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3902     __ tbnz(digest_length, 4, sha3_384_or_224);
3903 
3904     // SHA3-256
3905     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3906     __ eor(v13, __ T8B, v13, v25);
3907     __ eor(v14, __ T8B, v14, v26);
3908     __ eor(v15, __ T8B, v15, v27);
3909     __ eor(v16, __ T8B, v16, v28);
3910     __ b(rounds24_loop);
3911 
3912     __ BIND(sha3_384_or_224);
3913     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3914 
3915     // SHA3-224
3916     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3917     __ ld1(v29, __ T8B, __ post(buf, 8));
3918     __ eor(v13, __ T8B, v13, v25);
3919     __ eor(v14, __ T8B, v14, v26);
3920     __ eor(v15, __ T8B, v15, v27);
3921     __ eor(v16, __ T8B, v16, v28);
3922     __ eor(v17, __ T8B, v17, v29);
3923     __ b(rounds24_loop);
3924 
3925     __ BIND(sha3_512);
3926     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3927     __ eor(v7, __ T8B, v7, v25);
3928     __ eor(v8, __ T8B, v8, v26);
3929 
3930     __ BIND(rounds24_loop);
3931     __ subw(rscratch2, rscratch2, 1);
3932 
3933     __ eor3(v29, __ T16B, v4, v9, v14);
3934     __ eor3(v26, __ T16B, v1, v6, v11);
3935     __ eor3(v28, __ T16B, v3, v8, v13);
3936     __ eor3(v25, __ T16B, v0, v5, v10);
3937     __ eor3(v27, __ T16B, v2, v7, v12);
3938     __ eor3(v29, __ T16B, v29, v19, v24);
3939     __ eor3(v26, __ T16B, v26, v16, v21);
3940     __ eor3(v28, __ T16B, v28, v18, v23);
3941     __ eor3(v25, __ T16B, v25, v15, v20);
3942     __ eor3(v27, __ T16B, v27, v17, v22);
3943 
3944     __ rax1(v30, __ T2D, v29, v26);
3945     __ rax1(v26, __ T2D, v26, v28);
3946     __ rax1(v28, __ T2D, v28, v25);
3947     __ rax1(v25, __ T2D, v25, v27);
3948     __ rax1(v27, __ T2D, v27, v29);
3949 
3950     __ eor(v0, __ T16B, v0, v30);
3951     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3952     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3953     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3954     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3955     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3956     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3957     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3958     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3959     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3960     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3961     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3962     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3963     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3964     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3965     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3966     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3967     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3968     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3969     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3970     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3971     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3972     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3973     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3974     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3975 
3976     __ bcax(v20, __ T16B, v31, v22, v8);
3977     __ bcax(v21, __ T16B, v8,  v23, v22);
3978     __ bcax(v22, __ T16B, v22, v24, v23);
3979     __ bcax(v23, __ T16B, v23, v31, v24);
3980     __ bcax(v24, __ T16B, v24, v8,  v31);
3981 
3982     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3983 
3984     __ bcax(v17, __ T16B, v25, v19, v3);
3985     __ bcax(v18, __ T16B, v3,  v15, v19);
3986     __ bcax(v19, __ T16B, v19, v16, v15);
3987     __ bcax(v15, __ T16B, v15, v25, v16);
3988     __ bcax(v16, __ T16B, v16, v3,  v25);
3989 
3990     __ bcax(v10, __ T16B, v29, v12, v26);
3991     __ bcax(v11, __ T16B, v26, v13, v12);
3992     __ bcax(v12, __ T16B, v12, v14, v13);
3993     __ bcax(v13, __ T16B, v13, v29, v14);
3994     __ bcax(v14, __ T16B, v14, v26, v29);
3995 
3996     __ bcax(v7, __ T16B, v30, v9,  v4);
3997     __ bcax(v8, __ T16B, v4,  v5,  v9);
3998     __ bcax(v9, __ T16B, v9,  v6,  v5);
3999     __ bcax(v5, __ T16B, v5,  v30, v6);
4000     __ bcax(v6, __ T16B, v6,  v4,  v30);
4001 
4002     __ bcax(v3, __ T16B, v27, v0,  v28);
4003     __ bcax(v4, __ T16B, v28, v1,  v0);
4004     __ bcax(v0, __ T16B, v0,  v2,  v1);
4005     __ bcax(v1, __ T16B, v1,  v27, v2);
4006     __ bcax(v2, __ T16B, v2,  v28, v27);
4007 
4008     __ eor(v0, __ T16B, v0, v31);
4009 
4010     __ cbnzw(rscratch2, rounds24_loop);
4011 
4012     if (multi_block) {
4013       // block_size =  200 - 2 * digest_length, ofs += block_size
4014       __ add(ofs, ofs, 200);
4015       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
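           // e.g. for SHA3-256, digest_length == 32, so ofs advances by 136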
4016 
4017       __ cmp(ofs, limit);
4018       __ br(Assembler::LE, sha3_loop);
4019       __ mov(c_rarg0, ofs); // return ofs
4020     }
4021 
4022     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4023     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4024     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4025     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4026     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4027     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4028     __ st1(v24, __ T1D, state);
4029 
4030     __ ldpd(v14, v15, Address(sp, 48));
4031     __ ldpd(v12, v13, Address(sp, 32));
4032     __ ldpd(v10, v11, Address(sp, 16));
4033     __ ldpd(v8, v9, __ post(sp, 64));
4034 
4035     __ ret(lr);
4036 
4037     return start;
4038   }
4039 
4040   /**
4041    *  Arguments:
4042    *
4043    * Inputs:
4044    *   c_rarg0   - int crc
4045    *   c_rarg1   - byte* buf
4046    *   c_rarg2   - int length
4047    *
4048    * Output:
4049  *       r0   - int crc result
4050    */
4051   address generate_updateBytesCRC32() {
4052     assert(UseCRC32Intrinsics, "what are we doing here?");
4053 
4054     __ align(CodeEntryAlignment);
4055     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4056 
4057     address start = __ pc();
4058 
4059     const Register crc   = c_rarg0;  // crc
4060     const Register buf   = c_rarg1;  // source java byte array address
4061     const Register len   = c_rarg2;  // length
4062     const Register table0 = c_rarg3; // crc_table address
4063     const Register table1 = c_rarg4;
4064     const Register table2 = c_rarg5;
4065     const Register table3 = c_rarg6;
4066     const Register tmp3 = c_rarg7;
4067 
4068     BLOCK_COMMENT("Entry:");
4069     __ enter(); // required for proper stackwalking of RuntimeStub frame
4070 
4071     __ kernel_crc32(crc, buf, len,
4072               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4073 
4074     __ leave(); // required for proper stackwalking of RuntimeStub frame
4075     __ ret(lr);
4076 
4077     return start;
4078   }
4079 
4080   /**
4081    *  Arguments:
4082    *
4083    * Inputs:
4084    *   c_rarg0   - int crc
4085    *   c_rarg1   - byte* buf
4086    *   c_rarg2   - int length
4087    *   c_rarg3   - int* table
4088    *
4089    * Output:
4090    *       r0   - int crc result
4091    */
4092   address generate_updateBytesCRC32C() {
4093     assert(UseCRC32CIntrinsics, "what are we doing here?");
4094 
4095     __ align(CodeEntryAlignment);
4096     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4097 
4098     address start = __ pc();
4099 
4100     const Register crc   = c_rarg0;  // crc
4101     const Register buf   = c_rarg1;  // source java byte array address
4102     const Register len   = c_rarg2;  // length
4103     const Register table0 = c_rarg3; // crc_table address
4104     const Register table1 = c_rarg4;
4105     const Register table2 = c_rarg5;
4106     const Register table3 = c_rarg6;
4107     const Register tmp3 = c_rarg7;
4108 
4109     BLOCK_COMMENT("Entry:");
4110     __ enter(); // required for proper stackwalking of RuntimeStub frame
4111 
4112     __ kernel_crc32c(crc, buf, len,
4113               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4114 
4115     __ leave(); // required for proper stackwalking of RuntimeStub frame
4116     __ ret(lr);
4117 
4118     return start;
4119   }
4120 
4121   /**
4122    *  Arguments:
4123    *
4124    *  Inputs:
4125    *   c_rarg0   - int   adler
4126    *   c_rarg1   - byte* buff
4127    *   c_rarg2   - int   len
4128    *
4129    * Output:
4130    *   c_rarg0   - int adler result
4131    */
4132   address generate_updateBytesAdler32() {
4133     __ align(CodeEntryAlignment);
4134     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4135     address start = __ pc();
4136 
4137     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4138 
4139     // Aliases
4140     Register adler  = c_rarg0;
4141     Register s1     = c_rarg0;
4142     Register s2     = c_rarg3;
4143     Register buff   = c_rarg1;
4144     Register len    = c_rarg2;
4145     Register nmax  = r4;
4146     Register base  = r5;
4147     Register count = r6;
4148     Register temp0 = rscratch1;
4149     Register temp1 = rscratch2;
4150     FloatRegister vbytes = v0;
4151     FloatRegister vs1acc = v1;
4152     FloatRegister vs2acc = v2;
4153     FloatRegister vtable = v3;
4154 
4155     // Max number of bytes we can process before having to take the mod
4156     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4157     uint64_t BASE = 0xfff1;
4158     uint64_t NMAX = 0x15B0;
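         // Worked check: for n = 5552, 255*n*(n+1)/2 + (n+1)*(BASE-1)
         //   = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 2^32-1,
         // whereas n = 5553 would already overflow a 32-bit accumulator.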
4159 
4160     __ mov(base, BASE);
4161     __ mov(nmax, NMAX);
4162 
4163     // Load accumulation coefficients for the upper 16 bits
4164     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4165     __ ld1(vtable, __ T16B, Address(temp0));
4166 
4167     // s1 is initialized to the lower 16 bits of adler
4168     // s2 is initialized to the upper 16 bits of adler
4169     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4170     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4171 
4172     // The pipelined loop needs at least 16 elements for one iteration.
4173     // It checks this itself, but it is cheaper to skip straight to the cleanup loop.
4174     __ cmp(len, (u1)16);
4175     __ br(Assembler::HS, L_nmax);
4176     __ cbz(len, L_combine);
4177 
4178     __ bind(L_simple_by1_loop);
4179     __ ldrb(temp0, Address(__ post(buff, 1)));
4180     __ add(s1, s1, temp0);
4181     __ add(s2, s2, s1);
4182     __ subs(len, len, 1);
4183     __ br(Assembler::HI, L_simple_by1_loop);
4184 
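         // The modular reductions below use 2^16 == 15 (mod BASE):
         //   x -> (x >> 16) * 15 + (x & 0xffff)
         // preserves x mod BASE while shrinking x (the *15 is computed as *16 - *1).
         // The fold is applied once or twice depending on how large the accumulator
         // can get, and each reduction finishes with a conditional subtract of BASE.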
4185     // s1 = s1 % BASE
4186     __ subs(temp0, s1, base);
4187     __ csel(s1, temp0, s1, Assembler::HS);
4188 
4189     // s2 = s2 % BASE
4190     __ lsr(temp0, s2, 16);
4191     __ lsl(temp1, temp0, 4);
4192     __ sub(temp1, temp1, temp0);
4193     __ add(s2, temp1, s2, ext::uxth);
4194 
4195     __ subs(temp0, s2, base);
4196     __ csel(s2, temp0, s2, Assembler::HS);
4197 
4198     __ b(L_combine);
4199 
4200     __ bind(L_nmax);
4201     __ subs(len, len, nmax);
4202     __ sub(count, nmax, 16);
4203     __ br(Assembler::LO, L_by16);
4204 
4205     __ bind(L_nmax_loop);
4206 
4207     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4208                                       vbytes, vs1acc, vs2acc, vtable);
4209 
4210     __ subs(count, count, 16);
4211     __ br(Assembler::HS, L_nmax_loop);
4212 
4213     // s1 = s1 % BASE
4214     __ lsr(temp0, s1, 16);
4215     __ lsl(temp1, temp0, 4);
4216     __ sub(temp1, temp1, temp0);
4217     __ add(temp1, temp1, s1, ext::uxth);
4218 
4219     __ lsr(temp0, temp1, 16);
4220     __ lsl(s1, temp0, 4);
4221     __ sub(s1, s1, temp0);
4222     __ add(s1, s1, temp1, ext::uxth);
4223 
4224     __ subs(temp0, s1, base);
4225     __ csel(s1, temp0, s1, Assembler::HS);
4226 
4227     // s2 = s2 % BASE
4228     __ lsr(temp0, s2, 16);
4229     __ lsl(temp1, temp0, 4);
4230     __ sub(temp1, temp1, temp0);
4231     __ add(temp1, temp1, s2, ext::uxth);
4232 
4233     __ lsr(temp0, temp1, 16);
4234     __ lsl(s2, temp0, 4);
4235     __ sub(s2, s2, temp0);
4236     __ add(s2, s2, temp1, ext::uxth);
4237 
4238     __ subs(temp0, s2, base);
4239     __ csel(s2, temp0, s2, Assembler::HS);
4240 
4241     __ subs(len, len, nmax);
4242     __ sub(count, nmax, 16);
4243     __ br(Assembler::HS, L_nmax_loop);
4244 
4245     __ bind(L_by16);
4246     __ adds(len, len, count);
4247     __ br(Assembler::LO, L_by1);
4248 
4249     __ bind(L_by16_loop);
4250 
4251     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4252                                       vbytes, vs1acc, vs2acc, vtable);
4253 
4254     __ subs(len, len, 16);
4255     __ br(Assembler::HS, L_by16_loop);
4256 
4257     __ bind(L_by1);
4258     __ adds(len, len, 15);
4259     __ br(Assembler::LO, L_do_mod);
4260 
4261     __ bind(L_by1_loop);
4262     __ ldrb(temp0, Address(__ post(buff, 1)));
4263     __ add(s1, temp0, s1);
4264     __ add(s2, s2, s1);
4265     __ subs(len, len, 1);
4266     __ br(Assembler::HS, L_by1_loop);
4267 
4268     __ bind(L_do_mod);
4269     // s1 = s1 % BASE
4270     __ lsr(temp0, s1, 16);
4271     __ lsl(temp1, temp0, 4);
4272     __ sub(temp1, temp1, temp0);
4273     __ add(temp1, temp1, s1, ext::uxth);
4274 
4275     __ lsr(temp0, temp1, 16);
4276     __ lsl(s1, temp0, 4);
4277     __ sub(s1, s1, temp0);
4278     __ add(s1, s1, temp1, ext::uxth);
4279 
4280     __ subs(temp0, s1, base);
4281     __ csel(s1, temp0, s1, Assembler::HS);
4282 
4283     // s2 = s2 % BASE
4284     __ lsr(temp0, s2, 16);
4285     __ lsl(temp1, temp0, 4);
4286     __ sub(temp1, temp1, temp0);
4287     __ add(temp1, temp1, s2, ext::uxth);
4288 
4289     __ lsr(temp0, temp1, 16);
4290     __ lsl(s2, temp0, 4);
4291     __ sub(s2, s2, temp0);
4292     __ add(s2, s2, temp1, ext::uxth);
4293 
4294     __ subs(temp0, s2, base);
4295     __ csel(s2, temp0, s2, Assembler::HS);
4296 
4297     // Combine lower bits and higher bits
4298     __ bind(L_combine);
4299     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4300 
4301     __ ret(lr);
4302 
4303     return start;
4304   }
4305 
4306   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4307           Register temp0, Register temp1, FloatRegister vbytes,
4308           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4309     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4310     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4311     // In non-vectorized code, we update s1 and s2 as:
4312     //   s1 <- s1 + b1
4313     //   s2 <- s2 + s1
4314     //   s1 <- s1 + b2
4315     //   s2 <- s2 + s1
4316     //   ...
4317     //   s1 <- s1 + b16
4318     //   s2 <- s2 + s1
4319     // Putting above assignments together, we have:
4320     //   s1_new = s1 + b1 + b2 + ... + b16
4321     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4322     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4323     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
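         // As an illustrative C sketch of one 16-byte block (the vector code below
         // computes the same two sums with umull/umlal/uaddlv and the vtable weights):
         //
         //   uint32_t sum = 0, dot = 0;
         //   for (int i = 0; i < 16; i++) {
         //     sum += b[i];
         //     dot += b[i] * (16 - i);   // weights 16, 15, ..., 1
         //   }
         //   s2 += 16 * s1 + dot;        // uses the old s1
         //   s1 += sum;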
4324     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4325 
4326     // s2 = s2 + s1 * 16
4327     __ add(s2, s2, s1, Assembler::LSL, 4);
4328 
4329     // vs1acc = b1 + b2 + b3 + ... + b16
4330     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4331     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4332     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4333     __ uaddlv(vs1acc, __ T16B, vbytes);
4334     __ uaddlv(vs2acc, __ T8H, vs2acc);
4335 
4336     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4337     __ fmovd(temp0, vs1acc);
4338     __ fmovd(temp1, vs2acc);
4339     __ add(s1, s1, temp0);
4340     __ add(s2, s2, temp1);
4341   }
4342 
4343   /**
4344    *  Arguments:
4345    *
4346    *  Input:
4347    *    c_rarg0   - x address
4348    *    c_rarg1   - x length
4349    *    c_rarg2   - y address
4350    *    c_rarg3   - y length
4351    *    c_rarg4   - z address
4352    *    c_rarg5   - z length
4353    */
4354   address generate_multiplyToLen() {
4355     __ align(CodeEntryAlignment);
4356     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4357 
4358     address start = __ pc();
4359     const Register x     = r0;
4360     const Register xlen  = r1;
4361     const Register y     = r2;
4362     const Register ylen  = r3;
4363     const Register z     = r4;
4364     const Register zlen  = r5;
4365 
4366     const Register tmp1  = r10;
4367     const Register tmp2  = r11;
4368     const Register tmp3  = r12;
4369     const Register tmp4  = r13;
4370     const Register tmp5  = r14;
4371     const Register tmp6  = r15;
4372     const Register tmp7  = r16;
4373 
4374     BLOCK_COMMENT("Entry:");
4375     __ enter(); // required for proper stackwalking of RuntimeStub frame
4376     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4377     __ leave(); // required for proper stackwalking of RuntimeStub frame
4378     __ ret(lr);
4379 
4380     return start;
4381   }
4382 
4383   address generate_squareToLen() {
4384     // The squareToLen algorithm for sizes 1..127 described in the Java code is
4385     // faster than multiply_to_len on some CPUs and slower on others, but
4386     // multiply_to_len gives slightly better results overall
4387     __ align(CodeEntryAlignment);
4388     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4389     address start = __ pc();
4390 
4391     const Register x     = r0;
4392     const Register xlen  = r1;
4393     const Register z     = r2;
4394     const Register zlen  = r3;
4395     const Register y     = r4; // == x
4396     const Register ylen  = r5; // == xlen
4397 
4398     const Register tmp1  = r10;
4399     const Register tmp2  = r11;
4400     const Register tmp3  = r12;
4401     const Register tmp4  = r13;
4402     const Register tmp5  = r14;
4403     const Register tmp6  = r15;
4404     const Register tmp7  = r16;
4405 
4406     RegSet spilled_regs = RegSet::of(y, ylen);
4407     BLOCK_COMMENT("Entry:");
4408     __ enter();
4409     __ push(spilled_regs, sp);
4410     __ mov(y, x);
4411     __ mov(ylen, xlen);
4412     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4413     __ pop(spilled_regs, sp);
4414     __ leave();
4415     __ ret(lr);
4416     return start;
4417   }
4418 
4419   address generate_mulAdd() {
4420     __ align(CodeEntryAlignment);
4421     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4422 
4423     address start = __ pc();
4424 
4425     const Register out     = r0;
4426     const Register in      = r1;
4427     const Register offset  = r2;
4428     const Register len     = r3;
4429     const Register k       = r4;
4430 
4431     BLOCK_COMMENT("Entry:");
4432     __ enter();
4433     __ mul_add(out, in, offset, len, k);
4434     __ leave();
4435     __ ret(lr);
4436 
4437     return start;
4438   }
4439 
4440   // Arguments:
4441   //
4442   // Input:
4443   //   c_rarg0   - newArr address
4444   //   c_rarg1   - oldArr address
4445   //   c_rarg2   - newIdx
4446   //   c_rarg3   - shiftCount
4447   //   c_rarg4   - numIter
4448   //
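       // Judging from the scalar tail below (and assuming 0 < shiftCount < 32), each
       // output word is formed roughly as in this illustrative sketch, with the words
       // treated as uint32_t so that >> is a logical shift:
       //
       //   for (int i = 0; i < numIter; i++)
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount));
       //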
4449   address generate_bigIntegerRightShift() {
4450     __ align(CodeEntryAlignment);
4451     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4452     address start = __ pc();
4453 
4454     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4455 
4456     Register newArr        = c_rarg0;
4457     Register oldArr        = c_rarg1;
4458     Register newIdx        = c_rarg2;
4459     Register shiftCount    = c_rarg3;
4460     Register numIter       = c_rarg4;
4461     Register idx           = numIter;
4462 
4463     Register newArrCur     = rscratch1;
4464     Register shiftRevCount = rscratch2;
4465     Register oldArrCur     = r13;
4466     Register oldArrNext    = r14;
4467 
4468     FloatRegister oldElem0        = v0;
4469     FloatRegister oldElem1        = v1;
4470     FloatRegister newElem         = v2;
4471     FloatRegister shiftVCount     = v3;
4472     FloatRegister shiftVRevCount  = v4;
4473 
4474     __ cbz(idx, Exit);
4475 
4476     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4477 
4478     // left shift count
4479     __ movw(shiftRevCount, 32);
4480     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4481 
4482     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4483     __ cmp(numIter, (u1)4);
4484     __ br(Assembler::LT, ShiftThree);
4485 
4486     __ dup(shiftVCount,    __ T4S, shiftCount);
4487     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4488     __ negr(shiftVCount,   __ T4S, shiftVCount);
4489 
4490     __ BIND(ShiftSIMDLoop);
4491 
4492     // Calculate the load addresses
4493     __ sub(idx, idx, 4);
4494     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4495     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4496     __ add(oldArrCur,  oldArrNext, 4);
4497 
4498     // Load 4 words and process
4499     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4500     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4501     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4502     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4503     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4504     __ st1(newElem,   __ T4S,  Address(newArrCur));
4505 
4506     __ cmp(idx, (u1)4);
4507     __ br(Assembler::LT, ShiftTwoLoop);
4508     __ b(ShiftSIMDLoop);
4509 
4510     __ BIND(ShiftTwoLoop);
4511     __ cbz(idx, Exit);
4512     __ cmp(idx, (u1)1);
4513     __ br(Assembler::EQ, ShiftOne);
4514 
4515     // Calculate the load addresses
4516     __ sub(idx, idx, 2);
4517     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4518     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4519     __ add(oldArrCur,  oldArrNext, 4);
4520 
4521     // Load 2 words and process
4522     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4523     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4524     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4525     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4526     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4527     __ st1(newElem,   __ T2S, Address(newArrCur));
4528     __ b(ShiftTwoLoop);
4529 
4530     __ BIND(ShiftThree);
4531     __ tbz(idx, 1, ShiftOne);
4532     __ tbz(idx, 0, ShiftTwo);
4533     __ ldrw(r10,  Address(oldArr, 12));
4534     __ ldrw(r11,  Address(oldArr, 8));
4535     __ lsrvw(r10, r10, shiftCount);
4536     __ lslvw(r11, r11, shiftRevCount);
4537     __ orrw(r12,  r10, r11);
4538     __ strw(r12,  Address(newArr, 8));
4539 
4540     __ BIND(ShiftTwo);
4541     __ ldrw(r10,  Address(oldArr, 8));
4542     __ ldrw(r11,  Address(oldArr, 4));
4543     __ lsrvw(r10, r10, shiftCount);
4544     __ lslvw(r11, r11, shiftRevCount);
4545     __ orrw(r12,  r10, r11);
4546     __ strw(r12,  Address(newArr, 4));
4547 
4548     __ BIND(ShiftOne);
4549     __ ldrw(r10,  Address(oldArr, 4));
4550     __ ldrw(r11,  Address(oldArr));
4551     __ lsrvw(r10, r10, shiftCount);
4552     __ lslvw(r11, r11, shiftRevCount);
4553     __ orrw(r12,  r10, r11);
4554     __ strw(r12,  Address(newArr));
4555 
4556     __ BIND(Exit);
4557     __ ret(lr);
4558 
4559     return start;
4560   }
4561 
4562   // Arguments:
4563   //
4564   // Input:
4565   //   c_rarg0   - newArr address
4566   //   c_rarg1   - oldArr address
4567   //   c_rarg2   - newIdx
4568   //   c_rarg3   - shiftCount
4569   //   c_rarg4   - numIter
4570   //
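       // Judging from the scalar tail below (and assuming 0 < shiftCount < 32), each
       // output word is formed roughly as in this illustrative sketch, with the words
       // treated as uint32_t so that >> is a logical shift:
       //
       //   for (int i = 0; i < numIter; i++)
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
       //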
4571   address generate_bigIntegerLeftShift() {
4572     __ align(CodeEntryAlignment);
4573     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4574     address start = __ pc();
4575 
4576     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4577 
4578     Register newArr        = c_rarg0;
4579     Register oldArr        = c_rarg1;
4580     Register newIdx        = c_rarg2;
4581     Register shiftCount    = c_rarg3;
4582     Register numIter       = c_rarg4;
4583 
4584     Register shiftRevCount = rscratch1;
4585     Register oldArrNext    = rscratch2;
4586 
4587     FloatRegister oldElem0        = v0;
4588     FloatRegister oldElem1        = v1;
4589     FloatRegister newElem         = v2;
4590     FloatRegister shiftVCount     = v3;
4591     FloatRegister shiftVRevCount  = v4;
4592 
4593     __ cbz(numIter, Exit);
4594 
4595     __ add(oldArrNext, oldArr, 4);
4596     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4597 
4598     // right shift count
4599     __ movw(shiftRevCount, 32);
4600     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4601 
4602     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4603     __ cmp(numIter, (u1)4);
4604     __ br(Assembler::LT, ShiftThree);
4605 
4606     __ dup(shiftVCount,     __ T4S, shiftCount);
4607     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4608     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4609 
4610     __ BIND(ShiftSIMDLoop);
4611 
4612     // load 4 words and process
4613     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4614     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4615     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4616     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4617     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4618     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4619     __ sub(numIter,   numIter, 4);
4620 
4621     __ cmp(numIter, (u1)4);
4622     __ br(Assembler::LT, ShiftTwoLoop);
4623     __ b(ShiftSIMDLoop);
4624 
4625     __ BIND(ShiftTwoLoop);
4626     __ cbz(numIter, Exit);
4627     __ cmp(numIter, (u1)1);
4628     __ br(Assembler::EQ, ShiftOne);
4629 
4630     // load 2 words and process
4631     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4632     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4633     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4634     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4635     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4636     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4637     __ sub(numIter,   numIter, 2);
4638     __ b(ShiftTwoLoop);
4639 
4640     __ BIND(ShiftThree);
4641     __ ldrw(r10,  __ post(oldArr, 4));
4642     __ ldrw(r11,  __ post(oldArrNext, 4));
4643     __ lslvw(r10, r10, shiftCount);
4644     __ lsrvw(r11, r11, shiftRevCount);
4645     __ orrw(r12,  r10, r11);
4646     __ strw(r12,  __ post(newArr, 4));
4647     __ tbz(numIter, 1, Exit);
4648     __ tbz(numIter, 0, ShiftOne);
4649 
4650     __ BIND(ShiftTwo);
4651     __ ldrw(r10,  __ post(oldArr, 4));
4652     __ ldrw(r11,  __ post(oldArrNext, 4));
4653     __ lslvw(r10, r10, shiftCount);
4654     __ lsrvw(r11, r11, shiftRevCount);
4655     __ orrw(r12,  r10, r11);
4656     __ strw(r12,  __ post(newArr, 4));
4657 
4658     __ BIND(ShiftOne);
4659     __ ldrw(r10,  Address(oldArr));
4660     __ ldrw(r11,  Address(oldArrNext));
4661     __ lslvw(r10, r10, shiftCount);
4662     __ lsrvw(r11, r11, shiftRevCount);
4663     __ orrw(r12,  r10, r11);
4664     __ strw(r12,  Address(newArr));
4665 
4666     __ BIND(Exit);
4667     __ ret(lr);
4668 
4669     return start;
4670   }
4671 
4672   address generate_count_positives(address &count_positives_long) {
4673     const u1 large_loop_size = 64;
4674     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4675     int dcache_line = VM_Version::dcache_line_size();
4676 
4677     Register ary1 = r1, len = r2, result = r0;
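         // The recurring test in this stub is a branch-free check for a byte with its
         // sign bit set; as an illustrative scalar C sketch:
         //
         //   static inline bool has_negative_byte(uint64_t w) {
         //     return (w & 0x8080808080808080ULL) != 0;   // UPPER_BIT_MASK
         //   }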
4678 
4679     __ align(CodeEntryAlignment);
4680 
4681     StubCodeMark mark(this, "StubRoutines", "count_positives");
4682 
4683     address entry = __ pc();
4684 
4685     __ enter();
4686     // precondition: a copy of len is already in result
4687     // __ mov(result, len);
4688 
4689   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4690         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4691 
4692   __ cmp(len, (u1)15);
4693   __ br(Assembler::GT, LEN_OVER_15);
4694   // The only case in which execution falls into this code is when the pointer is
4695   // near the end of a memory page and we have to avoid reading the next page
4696   __ add(ary1, ary1, len);
4697   __ subs(len, len, 8);
4698   __ br(Assembler::GT, LEN_OVER_8);
4699   __ ldr(rscratch2, Address(ary1, -8));
4700   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4701   __ lsrv(rscratch2, rscratch2, rscratch1);
4702   __ tst(rscratch2, UPPER_BIT_MASK);
4703   __ csel(result, zr, result, Assembler::NE);
4704   __ leave();
4705   __ ret(lr);
4706   __ bind(LEN_OVER_8);
4707   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4708   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4709   __ tst(rscratch2, UPPER_BIT_MASK);
4710   __ br(Assembler::NE, RET_NO_POP);
4711   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4712   __ lsrv(rscratch1, rscratch1, rscratch2);
4713   __ tst(rscratch1, UPPER_BIT_MASK);
4714   __ bind(RET_NO_POP);
4715   __ csel(result, zr, result, Assembler::NE);
4716   __ leave();
4717   __ ret(lr);
4718 
4719   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4720   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4721 
4722   count_positives_long = __ pc(); // 2nd entry point
4723 
4724   __ enter();
4725 
4726   __ bind(LEN_OVER_15);
4727     __ push(spilled_regs, sp);
4728     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4729     __ cbz(rscratch2, ALIGNED);
4730     __ ldp(tmp6, tmp1, Address(ary1));
4731     __ mov(tmp5, 16);
4732     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
4733     __ add(ary1, ary1, rscratch1);
4734     __ orr(tmp6, tmp6, tmp1);
4735     __ tst(tmp6, UPPER_BIT_MASK);
4736     __ br(Assembler::NE, RET_ADJUST);
4737     __ sub(len, len, rscratch1);
4738 
4739   __ bind(ALIGNED);
4740     __ cmp(len, large_loop_size);
4741     __ br(Assembler::LT, CHECK_16);
4742     // Perform a 16-byte load as an early return in the pre-loop to handle the case
4743     // where an initially aligned large array has negative values in its first bytes,
4744     // since LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case),
4745     // which is slower. Cases with negative bytes further ahead are barely affected;
4746     // in fact they get faster due to the early loads, fewer instructions and
4747     // fewer branches in LARGE_LOOP.
4748     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4749     __ sub(len, len, 16);
4750     __ orr(tmp6, tmp6, tmp1);
4751     __ tst(tmp6, UPPER_BIT_MASK);
4752     __ br(Assembler::NE, RET_ADJUST_16);
4753     __ cmp(len, large_loop_size);
4754     __ br(Assembler::LT, CHECK_16);
4755 
4756     if (SoftwarePrefetchHintDistance >= 0
4757         && SoftwarePrefetchHintDistance >= dcache_line) {
4758       // initial prefetch
4759       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4760     }
4761   __ bind(LARGE_LOOP);
4762     if (SoftwarePrefetchHintDistance >= 0) {
4763       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4764     }
4765     // Issue the load instructions first, since this can save a few CPU/memory cycles.
4766     // Also, instead of 4 "orr(...); andr(...); cbnz(...)" triples (one per ldp), it is
4767     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4768     // instructions and has fewer branches. This approach disables the early return,
4769     // though, so all 64 bytes are loaded and checked every time.
4770     __ ldp(tmp2, tmp3, Address(ary1));
4771     __ ldp(tmp4, tmp5, Address(ary1, 16));
4772     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4773     __ ldp(tmp6, tmp1, Address(ary1, 48));
4774     __ add(ary1, ary1, large_loop_size);
4775     __ sub(len, len, large_loop_size);
4776     __ orr(tmp2, tmp2, tmp3);
4777     __ orr(tmp4, tmp4, tmp5);
4778     __ orr(rscratch1, rscratch1, rscratch2);
4779     __ orr(tmp6, tmp6, tmp1);
4780     __ orr(tmp2, tmp2, tmp4);
4781     __ orr(rscratch1, rscratch1, tmp6);
4782     __ orr(tmp2, tmp2, rscratch1);
4783     __ tst(tmp2, UPPER_BIT_MASK);
4784     __ br(Assembler::NE, RET_ADJUST_LONG);
4785     __ cmp(len, large_loop_size);
4786     __ br(Assembler::GE, LARGE_LOOP);
4787 
4788   __ bind(CHECK_16); // small 16-byte load pre-loop
4789     __ cmp(len, (u1)16);
4790     __ br(Assembler::LT, POST_LOOP16);
4791 
4792   __ bind(LOOP16); // small 16-byte load loop
4793     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4794     __ sub(len, len, 16);
4795     __ orr(tmp2, tmp2, tmp3);
4796     __ tst(tmp2, UPPER_BIT_MASK);
4797     __ br(Assembler::NE, RET_ADJUST_16);
4798     __ cmp(len, (u1)16);
4799     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4800 
4801   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4802     __ cmp(len, (u1)8);
4803     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4804     __ ldr(tmp3, Address(__ post(ary1, 8)));
4805     __ tst(tmp3, UPPER_BIT_MASK);
4806     __ br(Assembler::NE, RET_ADJUST);
4807     __ sub(len, len, 8);
4808 
4809   __ bind(POST_LOOP16_LOAD_TAIL);
4810     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4811     __ ldr(tmp1, Address(ary1));
4812     __ mov(tmp2, 64);
4813     __ sub(tmp4, tmp2, len, __ LSL, 3);
4814     __ lslv(tmp1, tmp1, tmp4);
4815     __ tst(tmp1, UPPER_BIT_MASK);
4816     __ br(Assembler::NE, RET_ADJUST);
4817     // Fallthrough
4818 
4819   __ bind(RET_LEN);
4820     __ pop(spilled_regs, sp);
4821     __ leave();
4822     __ ret(lr);
4823 
4824     // The difference result - len is the count of bytes that are guaranteed
4825     // to be positive
4826 
4827   __ bind(RET_ADJUST_LONG);
4828     __ add(len, len, (u1)(large_loop_size - 16));
4829   __ bind(RET_ADJUST_16);
4830     __ add(len, len, 16);
4831   __ bind(RET_ADJUST);
4832     __ pop(spilled_regs, sp);
4833     __ leave();
4834     __ sub(result, result, len);
4835     __ ret(lr);
4836 
4837     return entry;
4838   }
4839 
4840   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4841         bool usePrefetch, Label &NOT_EQUAL) {
4842     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4843         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4844         tmp7 = r12, tmp8 = r13;
4845     Label LOOP;
4846 
4847     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4848     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4849     __ bind(LOOP);
4850     if (usePrefetch) {
4851       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4852       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4853     }
4854     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4855     __ eor(tmp1, tmp1, tmp2);
4856     __ eor(tmp3, tmp3, tmp4);
4857     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4858     __ orr(tmp1, tmp1, tmp3);
4859     __ cbnz(tmp1, NOT_EQUAL);
4860     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4861     __ eor(tmp5, tmp5, tmp6);
4862     __ eor(tmp7, tmp7, tmp8);
4863     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4864     __ orr(tmp5, tmp5, tmp7);
4865     __ cbnz(tmp5, NOT_EQUAL);
4866     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4867     __ eor(tmp1, tmp1, tmp2);
4868     __ eor(tmp3, tmp3, tmp4);
4869     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4870     __ orr(tmp1, tmp1, tmp3);
4871     __ cbnz(tmp1, NOT_EQUAL);
4872     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4873     __ eor(tmp5, tmp5, tmp6);
4874     __ sub(cnt1, cnt1, 8 * wordSize);
4875     __ eor(tmp7, tmp7, tmp8);
4876     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4877     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4878     // cmp) because subs allows an unlimited range of immediate operands.
4879     __ subs(tmp6, cnt1, loopThreshold);
4880     __ orr(tmp5, tmp5, tmp7);
4881     __ cbnz(tmp5, NOT_EQUAL);
4882     __ br(__ GE, LOOP);
4883     // post-loop
4884     __ eor(tmp1, tmp1, tmp2);
4885     __ eor(tmp3, tmp3, tmp4);
4886     __ orr(tmp1, tmp1, tmp3);
4887     __ sub(cnt1, cnt1, 2 * wordSize);
4888     __ cbnz(tmp1, NOT_EQUAL);
4889   }
4890 
4891   void generate_large_array_equals_loop_simd(int loopThreshold,
4892         bool usePrefetch, Label &NOT_EQUAL) {
4893     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4894         tmp2 = rscratch2;
4895     Label LOOP;
4896 
4897     __ bind(LOOP);
4898     if (usePrefetch) {
4899       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4900       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4901     }
4902     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4903     __ sub(cnt1, cnt1, 8 * wordSize);
4904     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4905     __ subs(tmp1, cnt1, loopThreshold);
4906     __ eor(v0, __ T16B, v0, v4);
4907     __ eor(v1, __ T16B, v1, v5);
4908     __ eor(v2, __ T16B, v2, v6);
4909     __ eor(v3, __ T16B, v3, v7);
4910     __ orr(v0, __ T16B, v0, v1);
4911     __ orr(v1, __ T16B, v2, v3);
4912     __ orr(v0, __ T16B, v0, v1);
4913     __ umov(tmp1, v0, __ D, 0);
4914     __ umov(tmp2, v0, __ D, 1);
4915     __ orr(tmp1, tmp1, tmp2);
4916     __ cbnz(tmp1, NOT_EQUAL);
4917     __ br(__ GE, LOOP);
4918   }
4919 
4920   // a1 = r1 - array1 address
4921   // a2 = r2 - array2 address
4922   // result = r0 - return value. Already contains "false"
4923   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4924   // r3-r5 are reserved temporary registers
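       // In scalar terms the stub accumulates XOR differences over whole 64-bit words
       // and only branches per batch; an illustrative C sketch of one batch (the actual
       // batch size differs between the SIMD and non-SIMD loops):
       //
       //   uint64_t diff = 0;
       //   for (int i = 0; i < 4; i++)
       //     diff |= w1[i] ^ w2[i];
       //   if (diff != 0) return false;   // any differing bit => arrays not equal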
4925   address generate_large_array_equals() {
4926     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4927         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4928         tmp7 = r12, tmp8 = r13;
4929     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4930         SMALL_LOOP, POST_LOOP;
4931     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4932     // prefetching pays off only if at least 32 of the prefetched bytes are used
4933     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4934     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4935     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4936     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4937         tmp5, tmp6, tmp7, tmp8);
4938 
4939     __ align(CodeEntryAlignment);
4940 
4941     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4942 
4943     address entry = __ pc();
4944     __ enter();
4945     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4946     // also advance pointers to use post-increment instead of pre-increment
4947     __ add(a1, a1, wordSize);
4948     __ add(a2, a2, wordSize);
4949     if (AvoidUnalignedAccesses) {
4950       // Both implementations (SIMD and non-SIMD) use relatively large load
4951       // instructions (ld1/ldp), which incur a huge penalty (up to 2x execution time)
4952       // on some CPUs when the address is not at least 16-byte aligned.
4953       // Arrays are currently 8-byte aligned, so if needed we make an additional
4954       // 8-byte load, at least for the 1st address, to make it 16-byte aligned.
4955       Label ALIGNED16;
4956       __ tbz(a1, 3, ALIGNED16);
4957       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4958       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4959       __ sub(cnt1, cnt1, wordSize);
4960       __ eor(tmp1, tmp1, tmp2);
4961       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4962       __ bind(ALIGNED16);
4963     }
4964     if (UseSIMDForArrayEquals) {
4965       if (SoftwarePrefetchHintDistance >= 0) {
4966         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4967         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4968         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4969             /* prfm = */ true, NOT_EQUAL);
4970         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4971         __ br(__ LT, TAIL);
4972       }
4973       __ bind(NO_PREFETCH_LARGE_LOOP);
4974       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4975           /* prfm = */ false, NOT_EQUAL);
4976     } else {
4977       __ push(spilled_regs, sp);
4978       if (SoftwarePrefetchHintDistance >= 0) {
4979         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4980         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4981         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4982             /* prfm = */ true, NOT_EQUAL);
4983         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4984         __ br(__ LT, TAIL);
4985       }
4986       __ bind(NO_PREFETCH_LARGE_LOOP);
4987       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4988           /* prfm = */ false, NOT_EQUAL);
4989     }
4990     __ bind(TAIL);
4991       __ cbz(cnt1, EQUAL);
4992       __ subs(cnt1, cnt1, wordSize);
4993       __ br(__ LE, POST_LOOP);
4994     __ bind(SMALL_LOOP);
4995       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4996       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4997       __ subs(cnt1, cnt1, wordSize);
4998       __ eor(tmp1, tmp1, tmp2);
4999       __ cbnz(tmp1, NOT_EQUAL);
5000       __ br(__ GT, SMALL_LOOP);
5001     __ bind(POST_LOOP);
5002       __ ldr(tmp1, Address(a1, cnt1));
5003       __ ldr(tmp2, Address(a2, cnt1));
5004       __ eor(tmp1, tmp1, tmp2);
5005       __ cbnz(tmp1, NOT_EQUAL);
5006     __ bind(EQUAL);
5007       __ mov(result, true);
5008     __ bind(NOT_EQUAL);
5009       if (!UseSIMDForArrayEquals) {
5010         __ pop(spilled_regs, sp);
5011       }
5012     __ bind(NOT_EQUAL_NO_POP);
5013     __ leave();
5014     __ ret(lr);
5015     return entry;
5016   }
5017 
5018   address generate_dsin_dcos(bool isCos) {
5019     __ align(CodeEntryAlignment);
5020     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5021     address start = __ pc();
5022     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5023         (address)StubRoutines::aarch64::_two_over_pi,
5024         (address)StubRoutines::aarch64::_pio2,
5025         (address)StubRoutines::aarch64::_dsin_coef,
5026         (address)StubRoutines::aarch64::_dcos_coef);
5027     return start;
5028   }
5029 
5030   address generate_dlog() {
5031     __ align(CodeEntryAlignment);
5032     StubCodeMark mark(this, "StubRoutines", "dlog");
5033     address entry = __ pc();
5034     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5035         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5036     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5037     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5038         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5039     return entry;
5040   }
5041 
5042 
5043   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
5044   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5045       Label &DIFF2) {
5046     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5047     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5048 
5049     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5050     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5051     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5052     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5053 
5054     __ fmovd(tmpL, vtmp3);
5055     __ eor(rscratch2, tmp3, tmpL);
5056     __ cbnz(rscratch2, DIFF2);
5057 
5058     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5059     __ umov(tmpL, vtmp3, __ D, 1);
5060     __ eor(rscratch2, tmpU, tmpL);
5061     __ cbnz(rscratch2, DIFF1);
5062 
5063     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5064     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5065     __ fmovd(tmpL, vtmp);
5066     __ eor(rscratch2, tmp3, tmpL);
5067     __ cbnz(rscratch2, DIFF2);
5068 
5069     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5070     __ umov(tmpL, vtmp, __ D, 1);
5071     __ eor(rscratch2, tmpU, tmpL);
5072     __ cbnz(rscratch2, DIFF1);
5073   }
5074 
5075   // r0  = result
5076   // r1  = str1
5077   // r2  = cnt1
5078   // r3  = str2
5079   // r4  = cnt2
5080   // r10 = tmp1
5081   // r11 = tmp2
5082   address generate_compare_long_string_different_encoding(bool isLU) {
5083     __ align(CodeEntryAlignment);
5084     StubCodeMark mark(this, "StubRoutines", isLU
5085         ? "compare_long_string_different_encoding LU"
5086         : "compare_long_string_different_encoding UL");
5087     address entry = __ pc();
5088     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5089         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5090         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5091     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5092         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5093     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5094     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5095 
5096     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5097 
5098     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5099     // cnt2 == number of characters left to compare
5100     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5101     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5102     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5103     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5104     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5105     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
5106     __ eor(rscratch2, tmp1, tmp2);
5107     __ mov(rscratch1, tmp2);
5108     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5109     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5110              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5111     __ push(spilled_regs, sp);
5112     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5113     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5114 
5115     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5116 
5117     if (SoftwarePrefetchHintDistance >= 0) {
5118       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5119       __ br(__ LT, NO_PREFETCH);
5120       __ bind(LARGE_LOOP_PREFETCH);
5121         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5122         __ mov(tmp4, 2);
5123         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5124         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5125           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5126           __ subs(tmp4, tmp4, 1);
5127           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5128           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5129           __ mov(tmp4, 2);
5130         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5131           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5132           __ subs(tmp4, tmp4, 1);
5133           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5134           __ sub(cnt2, cnt2, 64);
5135           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5136           __ br(__ GE, LARGE_LOOP_PREFETCH);
5137     }
5138     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5139     __ bind(NO_PREFETCH);
5140     __ subs(cnt2, cnt2, 16);
5141     __ br(__ LT, TAIL);
5142     __ align(OptoLoopAlignment);
5143     __ bind(SMALL_LOOP); // smaller loop
5144       __ subs(cnt2, cnt2, 16);
5145       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5146       __ br(__ GE, SMALL_LOOP);
5147       __ cmn(cnt2, (u1)16);
5148       __ br(__ EQ, LOAD_LAST);
5149     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5150       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5151       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5152       __ ldr(tmp3, Address(cnt1, -8));
5153       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5154       __ b(LOAD_LAST);
5155     __ bind(DIFF2);
5156       __ mov(tmpU, tmp3);
5157     __ bind(DIFF1);
5158       __ pop(spilled_regs, sp);
5159       __ b(CALCULATE_DIFFERENCE);
5160     __ bind(LOAD_LAST);
5161       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5162       // No need to load them again
5163       __ mov(tmpU, tmp3);
5164       __ pop(spilled_regs, sp);
5165 
5166       // tmp2 points to the address of the last 4 Latin1 characters right now
5167       __ ldrs(vtmp, Address(tmp2));
5168       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5169       __ fmovd(tmpL, vtmp);
5170 
5171       __ eor(rscratch2, tmpU, tmpL);
5172       __ cbz(rscratch2, DONE);
5173 
5174     // Find the first different characters in the longwords and
5175     // compute their difference.
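         // rscratch2 holds the XOR of the two 8-byte chunks: rev + clz give the bit offset
         // of the first differing byte in memory order, rounding it down to a multiple of
         // 16 isolates the first differing 16-bit character, and lsrv + uxthw extract that
         // character from each operand before the subtraction.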
5176     __ bind(CALCULATE_DIFFERENCE);
5177       __ rev(rscratch2, rscratch2);
5178       __ clz(rscratch2, rscratch2);
5179       __ andr(rscratch2, rscratch2, -16);
5180       __ lsrv(tmp1, tmp1, rscratch2);
5181       __ uxthw(tmp1, tmp1);
5182       __ lsrv(rscratch1, rscratch1, rscratch2);
5183       __ uxthw(rscratch1, rscratch1);
5184       __ subw(result, tmp1, rscratch1);
5185     __ bind(DONE);
5186       __ ret(lr);
5187     return entry;
5188   }
5189 
5190   address generate_method_entry_barrier() {
5191     __ align(CodeEntryAlignment);
5192     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5193 
5194     Label deoptimize_label;
5195 
5196     address start = __ pc();
5197 
5198     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5199 
5200     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5201       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5202       // We can get here despite the nmethod being good, if we have not
5203       // yet applied our cross modification fence (or data fence).
5204       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
5205       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5206       __ ldrw(rscratch2, rscratch2);
5207       __ strw(rscratch2, thread_epoch_addr);
5208       __ isb();
5209       __ membar(__ LoadLoad);
5210     }
5211 
5212     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5213 
5214     __ enter();
5215     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5216 
5217     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5218 
5219     __ push_call_clobbered_registers();
5220 
5221     __ mov(c_rarg0, rscratch2);
5222     __ call_VM_leaf
5223          (CAST_FROM_FN_PTR
5224           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5225 
5226     __ reset_last_Java_frame(true);
5227 
5228     __ mov(rscratch1, r0);
5229 
5230     __ pop_call_clobbered_registers();
5231 
5232     __ cbnz(rscratch1, deoptimize_label);
5233 
5234     __ leave();
5235     __ ret(lr);
5236 
5237     __ BIND(deoptimize_label);
5238 
5239     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5240     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5241 
5242     __ mov(sp, rscratch1);
5243     __ br(rscratch2);
5244 
5245     return start;
5246   }
5247 
5248   // r0  = result
5249   // r1  = str1
5250   // r2  = cnt1
5251   // r3  = str2
5252   // r4  = cnt2
5253   // r10 = tmp1
5254   // r11 = tmp2
5255   address generate_compare_long_string_same_encoding(bool isLL) {
5256     __ align(CodeEntryAlignment);
5257     StubCodeMark mark(this, "StubRoutines", isLL
5258         ? "compare_long_string_same_encoding LL"
5259         : "compare_long_string_same_encoding UU");
5260     address entry = __ pc();
5261     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5262         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5263 
5264     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5265 
5266     // Exit the large loop when fewer than 64 bytes are left to read or we are about
5267     // to prefetch memory beyond the array boundary
5268     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5269 
5270     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly
5271     __ eor(rscratch2, tmp1, tmp2);
5272     __ cbnz(rscratch2, CAL_DIFFERENCE);
5273 
5274     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5275     // update pointers, because of previous read
5276     __ add(str1, str1, wordSize);
5277     __ add(str2, str2, wordSize);
5278     if (SoftwarePrefetchHintDistance >= 0) {
5279       __ align(OptoLoopAlignment);
5280       __ bind(LARGE_LOOP_PREFETCH);
5281         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5282         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5283 
5284         for (int i = 0; i < 4; i++) {
5285           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5286           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5287           __ cmp(tmp1, tmp2);
5288           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5289           __ br(Assembler::NE, DIFF);
5290         }
5291         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5292         __ add(str1, str1, 64);
5293         __ add(str2, str2, 64);
5294         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5295         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5296         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5297     }
5298 
5299     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5300     __ br(Assembler::LE, LESS16);
5301     __ align(OptoLoopAlignment);
5302     __ bind(LOOP_COMPARE16);
5303       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5304       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5305       __ cmp(tmp1, tmp2);
5306       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5307       __ br(Assembler::NE, DIFF);
5308       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5309       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5310       __ br(Assembler::LT, LESS16);
5311 
5312       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5313       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5314       __ cmp(tmp1, tmp2);
5315       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5316       __ br(Assembler::NE, DIFF);
5317       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5318       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5319       __ br(Assembler::GE, LOOP_COMPARE16);
5320       __ cbz(cnt2, LENGTH_DIFF);
5321 
5322     __ bind(LESS16);
5323       // compare 8 bytes at a time
5324       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5325       __ br(Assembler::LE, LESS8);
5326       __ ldr(tmp1, Address(__ post(str1, 8)));
5327       __ ldr(tmp2, Address(__ post(str2, 8)));
5328       __ eor(rscratch2, tmp1, tmp2);
5329       __ cbnz(rscratch2, CAL_DIFFERENCE);
5330       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5331 
5332     __ bind(LESS8); // directly load last 8 bytes
5333       if (!isLL) {
5334         __ add(cnt2, cnt2, cnt2);
5335       }
5336       __ ldr(tmp1, Address(str1, cnt2));
5337       __ ldr(tmp2, Address(str2, cnt2));
5338       __ eor(rscratch2, tmp1, tmp2);
5339       __ cbz(rscratch2, LENGTH_DIFF);
5340       __ b(CAL_DIFFERENCE);
5341 
5342     __ bind(DIFF);
5343       __ cmp(tmp1, tmp2);
5344       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5345       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5346       // reuse rscratch2 register for the result of eor instruction
5347       __ eor(rscratch2, tmp1, tmp2);
5348 
5349     __ bind(CAL_DIFFERENCE);
5350       __ rev(rscratch2, rscratch2);
5351       __ clz(rscratch2, rscratch2);
5352       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5353       __ lsrv(tmp1, tmp1, rscratch2);
5354       __ lsrv(tmp2, tmp2, rscratch2);
5355       if (isLL) {
5356         __ uxtbw(tmp1, tmp1);
5357         __ uxtbw(tmp2, tmp2);
5358       } else {
5359         __ uxthw(tmp1, tmp1);
5360         __ uxthw(tmp2, tmp2);
5361       }
5362       __ subw(result, tmp1, tmp2);
5363 
5364     __ bind(LENGTH_DIFF);
5365       __ ret(lr);
5366     return entry;
5367   }
5368 
5369   enum string_compare_mode {
5370     LL,
5371     LU,
5372     UL,
5373     UU,
5374   };
5375 
5376   // The following registers are declared in aarch64.ad
5377   // r0  = result
5378   // r1  = str1
5379   // r2  = cnt1
5380   // r3  = str2
5381   // r4  = cnt2
5382   // r10 = tmp1
5383   // r11 = tmp2
5384   // z0  = ztmp1
5385   // z1  = ztmp2
5386   // p0  = pgtmp1
5387   // p1  = pgtmp2
5388   address generate_compare_long_string_sve(string_compare_mode mode) {
5389     __ align(CodeEntryAlignment);
5390     address entry = __ pc();
5391     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5392              tmp1 = r10, tmp2 = r11;
5393 
5394     Label LOOP, DONE, MISMATCH;
5395     Register vec_len = tmp1;
5396     Register idx = tmp2;
5397     // The minimum of the string lengths has been stored in cnt2.
5398     Register cnt = cnt2;
5399     FloatRegister ztmp1 = z0, ztmp2 = z1;
5400     PRegister pgtmp1 = p0, pgtmp2 = p1;
5401 
5402 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5403     switch (mode) {                                                            \
5404       case LL:                                                                 \
5405         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5406         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5407         break;                                                                 \
5408       case LU:                                                                 \
5409         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5410         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5411         break;                                                                 \
5412       case UL:                                                                 \
5413         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5414         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5415         break;                                                                 \
5416       case UU:                                                                 \
5417         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5418         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5419         break;                                                                 \
5420       default:                                                                 \
5421         ShouldNotReachHere();                                                  \
5422     }
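         // LOAD_PAIR loads one vector's worth of characters from each string under the
         // governing predicate; when the encodings differ, the Latin1 side is loaded with
         // sve_ld1b into halfword lanes so that both sides are compared as 16-bit chars.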
5423 
5424     const char* stubname;
5425     switch (mode) {
5426       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5427       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5428       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5429       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5430       default: ShouldNotReachHere();
5431     }
5432 
5433     StubCodeMark mark(this, "StubRoutines", stubname);
5434 
5435     __ mov(idx, 0);
5436     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5437 
5438     if (mode == LL) {
5439       __ sve_cntb(vec_len);
5440     } else {
5441       __ sve_cnth(vec_len);
5442     }
5443 
5444     __ sub(rscratch1, cnt, vec_len);
5445 
5446     __ bind(LOOP);
5447 
5448       // main loop
5449       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5450       __ add(idx, idx, vec_len);
5451       // Compare strings.
5452       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5453       __ br(__ NE, MISMATCH);
5454       __ cmp(idx, rscratch1);
5455       __ br(__ LT, LOOP);
5456 
5457     // post loop, last iteration
5458     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5459 
5460     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5461     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5462     __ br(__ EQ, DONE);
5463 
5464     __ bind(MISMATCH);
5465 
5466     // Narrow the predicate to the lanes before the first mismatch.
5467     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5468     // Extract the first different characters of each string.
5469     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5470     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5471 
5472     // Compute the difference of the first different characters.
5473     __ sub(result, rscratch1, rscratch2);
5474 
5475     __ bind(DONE);
5476     __ ret(lr);
5477 #undef LOAD_PAIR
5478     return entry;
5479   }
5480 
5481   void generate_compare_long_strings() {
5482     if (UseSVE == 0) {
5483       StubRoutines::aarch64::_compare_long_string_LL
5484           = generate_compare_long_string_same_encoding(true);
5485       StubRoutines::aarch64::_compare_long_string_UU
5486           = generate_compare_long_string_same_encoding(false);
5487       StubRoutines::aarch64::_compare_long_string_LU
5488           = generate_compare_long_string_different_encoding(true);
5489       StubRoutines::aarch64::_compare_long_string_UL
5490           = generate_compare_long_string_different_encoding(false);
5491     } else {
5492       StubRoutines::aarch64::_compare_long_string_LL
5493           = generate_compare_long_string_sve(LL);
5494       StubRoutines::aarch64::_compare_long_string_UU
5495           = generate_compare_long_string_sve(UU);
5496       StubRoutines::aarch64::_compare_long_string_LU
5497           = generate_compare_long_string_sve(LU);
5498       StubRoutines::aarch64::_compare_long_string_UL
5499           = generate_compare_long_string_sve(UL);
5500     }
5501   }
5502 
5503   // R0 = result
5504   // R1 = str2
5505   // R2 = cnt1
5506   // R3 = str1
5507   // R4 = cnt2
5508   // This generic linear code uses a few additional ideas that make it faster:
5509   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5510   // in order to skip the initial load (helps on systems with 1 load pipeline)
5511   // 2) we can use the "fast" algorithm for finding a single character to search for
5512   // the first symbol with fewer branches (1 branch per loaded register instead of a
5513   // branch per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
5514   // 0x7f7f...7f, 0x7fff7fff...7fff come from (see the scalar sketch below)
5515   // 3) after loading and analyzing the 1st register of the source string, it can be
5516   // reused to search for every occurrence of the 1st character, saving a few loads
5517   // compared with a simpler-but-slower implementation
5518   // 4) in order to avoid lots of push/pop operations, the code below heavily
5519   // re-uses, re-initializes and compresses register values, which makes the code
5520   // larger and a bit less readable; however, most of the extra operations are
5521   // issued during loads or branches, so the penalty is minimal
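       // As an illustrative scalar C sketch of the "fast single-character search" in 2)
       // above (this is what the sub/orr/bics sequence in the loops below computes for
       // the byte-sized case; 'first' is the search character replicated to every byte):
       //
       //   uint64_t x = chunk ^ first;                       // zero byte where chunk == ch
       //   uint64_t m = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
       //   // m != 0 iff some byte of chunk equals the search character;
       //   // rbit + clz then locate the first such byte.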
5522   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5523     const char* stubName = str1_isL
5524         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5525         : "indexof_linear_uu";
5526     __ align(CodeEntryAlignment);
5527     StubCodeMark mark(this, "StubRoutines", stubName);
5528     address entry = __ pc();
5529 
5530     int str1_chr_size = str1_isL ? 1 : 2;
5531     int str2_chr_size = str2_isL ? 1 : 2;
5532     int str1_chr_shift = str1_isL ? 0 : 1;
5533     int str2_chr_shift = str2_isL ? 0 : 1;
5534     bool isL = str1_isL && str2_isL;
5535    // parameters
5536     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5537     // temporary registers
5538     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5539     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5540     // redefinitions
5541     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5542 
5543     __ push(spilled_regs, sp);
5544     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5545         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5546         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5547         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5548         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5549         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5550     // Read whole register from str1. It is safe, because length >=8 here
5551     __ ldr(ch1, Address(str1));
5552     // Read whole register from str2. It is safe, because length >=8 here
5553     __ ldr(ch2, Address(str2));
5554     __ sub(cnt2, cnt2, cnt1);
5555     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5556     if (str1_isL != str2_isL) {
5557       __ eor(v0, __ T16B, v0, v0);
5558     }
5559     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5560     __ mul(first, first, tmp1);
5561     // check if we have less than one register's worth of characters left to check
5562     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5563     if (str1_isL != str2_isL) {
5564       __ fmovd(v1, ch1);
5565     }
5566     __ br(__ LE, L_SMALL);
5567     __ eor(ch2, first, ch2);
5568     if (str1_isL != str2_isL) {
5569       __ zip1(v1, __ T16B, v1, v0);
5570     }
5571     __ sub(tmp2, ch2, tmp1);
5572     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5573     __ bics(tmp2, tmp2, ch2);
5574     if (str1_isL != str2_isL) {
5575       __ fmovd(ch1, v1);
5576     }
5577     __ br(__ NE, L_HAS_ZERO);
5578     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5579     __ add(result, result, wordSize/str2_chr_size);
5580     __ add(str2, str2, wordSize);
5581     __ br(__ LT, L_POST_LOOP);
5582     __ BIND(L_LOOP);
5583       __ ldr(ch2, Address(str2));
5584       __ eor(ch2, first, ch2);
5585       __ sub(tmp2, ch2, tmp1);
5586       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5587       __ bics(tmp2, tmp2, ch2);
5588       __ br(__ NE, L_HAS_ZERO);
5589     __ BIND(L_LOOP_PROCEED);
5590       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5591       __ add(str2, str2, wordSize);
5592       __ add(result, result, wordSize/str2_chr_size);
5593       __ br(__ GE, L_LOOP);
5594     __ BIND(L_POST_LOOP);
5595       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5596       __ br(__ LE, NOMATCH);
5597       __ ldr(ch2, Address(str2));
5598       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5599       __ eor(ch2, first, ch2);
5600       __ sub(tmp2, ch2, tmp1);
5601       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5602       __ mov(tmp4, -1); // all bits set
5603       __ b(L_SMALL_PROCEED);
5604     __ align(OptoLoopAlignment);
5605     __ BIND(L_SMALL);
5606       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5607       __ eor(ch2, first, ch2);
5608       if (str1_isL != str2_isL) {
5609         __ zip1(v1, __ T16B, v1, v0);
5610       }
5611       __ sub(tmp2, ch2, tmp1);
5612       __ mov(tmp4, -1); // all bits set
5613       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5614       if (str1_isL != str2_isL) {
5615         __ fmovd(ch1, v1); // move converted 4 symbols
5616       }
5617     __ BIND(L_SMALL_PROCEED);
5618       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5619       __ bic(tmp2, tmp2, ch2);
5620       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5621       __ rbit(tmp2, tmp2);
5622       __ br(__ EQ, NOMATCH);
5623     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5624       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5625       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5626       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5627       if (str2_isL) { // LL
5628         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5629         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5630         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5631         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5632         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5633       } else {
5634         __ mov(ch2, 0xE); // all bits in byte set except last one
5635         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5636         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5637         __ lslv(tmp2, tmp2, tmp4);
5638         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5639         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5640         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5641         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5642       }
5643       __ cmp(ch1, ch2);
5644       __ mov(tmp4, wordSize/str2_chr_size);
5645       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5646     __ BIND(L_SMALL_CMP_LOOP);
5647       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5648                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5649       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5650                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5651       __ add(tmp4, tmp4, 1);
5652       __ cmp(tmp4, cnt1);
5653       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5654       __ cmp(first, ch2);
5655       __ br(__ EQ, L_SMALL_CMP_LOOP);
5656     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5657       __ cbz(tmp2, NOMATCH); // no more matches. exit
5658       __ clz(tmp4, tmp2);
5659       __ add(result, result, 1); // advance index
5660       __ add(str2, str2, str2_chr_size); // advance pointer
5661       __ b(L_SMALL_HAS_ZERO_LOOP);
5662     __ align(OptoLoopAlignment);
5663     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5664       __ cmp(first, ch2);
5665       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5666       __ b(DONE);
5667     __ align(OptoLoopAlignment);
5668     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5669       if (str2_isL) { // LL
5670         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5671         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5672         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5673         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5674         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5675       } else {
5676         __ mov(ch2, 0xE); // all bits in byte set except last one
5677         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5678         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5679         __ lslv(tmp2, tmp2, tmp4);
5680         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5681         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5682         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5683         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5684       }
5685       __ cmp(ch1, ch2);
5686       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5687       __ b(DONE);
5688     __ align(OptoLoopAlignment);
5689     __ BIND(L_HAS_ZERO);
5690       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in the loop.
5695       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5696       __ sub(result, result, 1);
5697     __ BIND(L_HAS_ZERO_LOOP);
5698       __ mov(cnt1, wordSize/str2_chr_size);
5699       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5700       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5701       if (str2_isL) {
5702         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5703         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5704         __ lslv(tmp2, tmp2, tmp4);
5705         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5706         __ add(tmp4, tmp4, 1);
5707         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5708         __ lsl(tmp2, tmp2, 1);
5709         __ mov(tmp4, wordSize/str2_chr_size);
5710       } else {
5711         __ mov(ch2, 0xE);
5712         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5713         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5714         __ lslv(tmp2, tmp2, tmp4);
5715         __ add(tmp4, tmp4, 1);
5716         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5717         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5718         __ lsl(tmp2, tmp2, 1);
5719         __ mov(tmp4, wordSize/str2_chr_size);
5720         __ sub(str2, str2, str2_chr_size);
5721       }
5722       __ cmp(ch1, ch2);
5723       __ mov(tmp4, wordSize/str2_chr_size);
5724       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5725     __ BIND(L_CMP_LOOP);
5726       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5727                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5728       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5729                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5730       __ add(tmp4, tmp4, 1);
5731       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5732       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5733       __ cmp(cnt1, ch2);
5734       __ br(__ EQ, L_CMP_LOOP);
5735     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current position
5737       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5738       __ clz(tmp4, tmp2);
5739       __ add(str2, str2, str2_chr_size); // advance pointer
5740       __ b(L_HAS_ZERO_LOOP);
5741     __ align(OptoLoopAlignment);
5742     __ BIND(L_CMP_LOOP_LAST_CMP);
5743       __ cmp(cnt1, ch2);
5744       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5745       __ b(DONE);
5746     __ align(OptoLoopAlignment);
5747     __ BIND(L_CMP_LOOP_LAST_CMP2);
5748       if (str2_isL) {
5749         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5750         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5751         __ lslv(tmp2, tmp2, tmp4);
5752         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5753         __ add(tmp4, tmp4, 1);
5754         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5755         __ lsl(tmp2, tmp2, 1);
5756       } else {
5757         __ mov(ch2, 0xE);
5758         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5759         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5760         __ lslv(tmp2, tmp2, tmp4);
5761         __ add(tmp4, tmp4, 1);
5762         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5763         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5764         __ lsl(tmp2, tmp2, 1);
5765         __ sub(str2, str2, str2_chr_size);
5766       }
5767       __ cmp(ch1, ch2);
5768       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5769       __ b(DONE);
5770     __ align(OptoLoopAlignment);
5771     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the corresponding higher bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or result & 3) is
      // the index of the last analyzed substring inside the current octet, so
      // str2 currently points at the respective start address and must be
      // advanced to the next octet.
5782       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5783       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5784       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5785       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5786       __ movw(cnt2, cnt2);
5787       __ b(L_LOOP_PROCEED);
5788     __ align(OptoLoopAlignment);
5789     __ BIND(NOMATCH);
5790       __ mov(result, -1);
5791     __ BIND(DONE);
5792       __ pop(spilled_regs, sp);
5793       __ ret(lr);
5794     return entry;
5795   }
5796 
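  // Generate the three linear-scan indexof variants: both strings Latin-1
  // (ll), both UTF-16 (uu), and the mixed case (ul), matching the
  // (str1_isL, str2_isL) flags passed below.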
5797   void generate_string_indexof_stubs() {
5798     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5799     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5800     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5801   }
5802 
5803   void inflate_and_store_2_fp_registers(bool generatePrfm,
5804       FloatRegister src1, FloatRegister src2) {
5805     Register dst = r1;
5806     __ zip1(v1, __ T16B, src1, v0);
5807     __ zip2(v2, __ T16B, src1, v0);
5808     if (generatePrfm) {
5809       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5810     }
5811     __ zip1(v3, __ T16B, src2, v0);
5812     __ zip2(v4, __ T16B, src2, v0);
5813     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5814   }
5815 
5816   // R0 = src
5817   // R1 = dst
5818   // R2 = len
5819   // R3 = len >> 3
5820   // V0 = 0
5821   // v1 = loaded 8 bytes
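  //
  // Hedged scalar sketch of the inflation each zip1/zip2-with-zero pair below
  // performs (Latin-1 bytes widened to UTF-16 chars; illustrative only):
  //
  //   for (size_t i = 0; i < len; i++)
  //     dst[i] = (jchar)(src[i] & 0xff);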
5822   address generate_large_byte_array_inflate() {
5823     __ align(CodeEntryAlignment);
5824     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5825     address entry = __ pc();
5826     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5827     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5828     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5829 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
5832     __ ldrd(v2, __ post(src, 8));
5833     __ sub(octetCounter, octetCounter, 2);
5834     __ zip1(v1, __ T16B, v1, v0);
5835     __ zip1(v2, __ T16B, v2, v0);
5836     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5837     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5838     __ subs(rscratch1, octetCounter, large_loop_threshold);
5839     __ br(__ LE, LOOP_START);
5840     __ b(LOOP_PRFM_START);
5841     __ bind(LOOP_PRFM);
5842       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5843     __ bind(LOOP_PRFM_START);
5844       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5845       __ sub(octetCounter, octetCounter, 8);
5846       __ subs(rscratch1, octetCounter, large_loop_threshold);
5847       inflate_and_store_2_fp_registers(true, v3, v4);
5848       inflate_and_store_2_fp_registers(true, v5, v6);
5849       __ br(__ GT, LOOP_PRFM);
5850       __ cmp(octetCounter, (u1)8);
5851       __ br(__ LT, DONE);
5852     __ bind(LOOP);
5853       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5854       __ bind(LOOP_START);
5855       __ sub(octetCounter, octetCounter, 8);
5856       __ cmp(octetCounter, (u1)8);
5857       inflate_and_store_2_fp_registers(false, v3, v4);
5858       inflate_and_store_2_fp_registers(false, v5, v6);
5859       __ br(__ GE, LOOP);
5860     __ bind(DONE);
5861       __ ret(lr);
5862     return entry;
5863   }
5864 
5865   /**
5866    *  Arguments:
5867    *
5868    *  Input:
5869    *  c_rarg0   - current state address
5870    *  c_rarg1   - H key address
5871    *  c_rarg2   - data address
5872    *  c_rarg3   - number of blocks
5873    *
5874    *  Output:
5875    *  Updated state at c_rarg0
5876    */
5877   address generate_ghash_processBlocks() {
5878     // Bafflingly, GCM uses little-endian for the byte order, but
5879     // big-endian for the bit order.  For example, the polynomial 1 is
5880     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5881     //
5882     // So, we must either reverse the bytes in each word and do
5883     // everything big-endian or reverse the bits in each byte and do
5884     // it little-endian.  On AArch64 it's more idiomatic to reverse
5885     // the bits in each byte (we have an instruction, RBIT, to do
5886     // that) and keep the data in little-endian bit order through the
5887     // calculation, bit-reversing the inputs and outputs.
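    //
    // A hedged illustration of the rev64 + rbit combination used below
    // (scalar sketch, not generated code): for each 64-bit lane of a 128-bit
    // vector,
    //
    //   for (int i = 0; i < 8; i++)
    //     out[i] = bit_reverse_byte(in[7 - i]); // the whole lane ends up bit-reversed
    //
    // where bit_reverse_byte() is a hypothetical helper that reverses the bits
    // of a single byte.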
5888 
5889     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5890     __ align(wordSize * 2);
5891     address p = __ pc();
5892     __ emit_int64(0x87);  // The low-order bits of the field
5893                           // polynomial (i.e. p = z^7+z^2+z+1)
5894                           // repeated in the low and high parts of a
5895                           // 128-bit vector
5896     __ emit_int64(0x87);
5897 
5898     __ align(CodeEntryAlignment);
5899     address start = __ pc();
5900 
5901     Register state   = c_rarg0;
5902     Register subkeyH = c_rarg1;
5903     Register data    = c_rarg2;
5904     Register blocks  = c_rarg3;
5905 
5906     FloatRegister vzr = v30;
5907     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5908 
5909     __ ldrq(v24, p);    // The field polynomial
5910 
5911     __ ldrq(v0, Address(state));
5912     __ ldrq(v1, Address(subkeyH));
5913 
5914     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5915     __ rbit(v0, __ T16B, v0);
5916     __ rev64(v1, __ T16B, v1);
5917     __ rbit(v1, __ T16B, v1);
5918 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
5920     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5921 
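    // Hedged outline of the per-block update computed by the loop below:
    // in GF(2^128), state = (state ^ block) * H, reduced modulo
    // x^128 + x^7 + x^2 + x + 1 (the 0x87 constant emitted above).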
5922     {
5923       Label L_ghash_loop;
5924       __ bind(L_ghash_loop);
5925 
5926       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5927                                                  // reversing each byte
5928       __ rbit(v2, __ T16B, v2);
5929       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5930 
5931       // Multiply state in v2 by subkey in v1
5932       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5933                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5934                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5935       // Reduce v7:v5 by the field polynomial
5936       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5937 
5938       __ sub(blocks, blocks, 1);
5939       __ cbnz(blocks, L_ghash_loop);
5940     }
5941 
5942     // The bit-reversed result is at this point in v0
5943     __ rev64(v0, __ T16B, v0);
5944     __ rbit(v0, __ T16B, v0);
5945 
5946     __ st1(v0, __ T16B, state);
5947     __ ret(lr);
5948 
5949     return start;
5950   }
5951 
5952   address generate_ghash_processBlocks_wide() {
5953     address small = generate_ghash_processBlocks();
5954 
5955     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5956     __ align(wordSize * 2);
5957     address p = __ pc();
5958     __ emit_int64(0x87);  // The low-order bits of the field
5959                           // polynomial (i.e. p = z^7+z^2+z+1)
5960                           // repeated in the low and high parts of a
5961                           // 128-bit vector
5962     __ emit_int64(0x87);
5963 
5964     __ align(CodeEntryAlignment);
5965     address start = __ pc();
5966 
5967     Register state   = c_rarg0;
5968     Register subkeyH = c_rarg1;
5969     Register data    = c_rarg2;
5970     Register blocks  = c_rarg3;
5971 
5972     const int unroll = 4;
5973 
5974     __ cmp(blocks, (unsigned char)(unroll * 2));
5975     __ br(__ LT, small);
5976 
5977     if (unroll > 1) {
      // Save state before entering routine
5979       __ sub(sp, sp, 4 * 16);
5980       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5981       __ sub(sp, sp, 4 * 16);
5982       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5983     }
5984 
5985     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5986 
5987     if (unroll > 1) {
5988       // And restore state
5989       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5990       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5991     }
5992 
5993     __ cmp(blocks, (unsigned char)0);
5994     __ br(__ GT, small);
5995 
5996     __ ret(lr);
5997 
5998     return start;
5999   }
6000 
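  // Hedged scalar sketch of what one encode round below computes for each
  // 3-byte group b0,b1,b2 (illustrative only; the SIMD code handles 8 or 16
  // groups at a time via ld3/st4):
  //
  //   idx0 =  b0 >> 2;
  //   idx1 = (b0 & 0x03) << 4 | b1 >> 4;
  //   idx2 = (b1 & 0x0f) << 2 | b2 >> 6;
  //   idx3 =  b2 & 0x3f;
  //   emit codec[idx0], codec[idx1], codec[idx2], codec[idx3];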
6001   void generate_base64_encode_simdround(Register src, Register dst,
6002         FloatRegister codec, u8 size) {
6003 
6004     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6005     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6006     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6007 
6008     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6009 
6010     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6011 
6012     __ ushr(ind0, arrangement, in0,  2);
6013 
6014     __ ushr(ind1, arrangement, in1,  2);
6015     __ shl(in0,   arrangement, in0,  6);
6016     __ orr(ind1,  arrangement, ind1, in0);
6017     __ ushr(ind1, arrangement, ind1, 2);
6018 
6019     __ ushr(ind2, arrangement, in2,  4);
6020     __ shl(in1,   arrangement, in1,  4);
6021     __ orr(ind2,  arrangement, in1,  ind2);
6022     __ ushr(ind2, arrangement, ind2, 2);
6023 
6024     __ shl(ind3,  arrangement, in2,  2);
6025     __ ushr(ind3, arrangement, ind3, 2);
6026 
6027     __ tbl(out0,  arrangement, codec,  4, ind0);
6028     __ tbl(out1,  arrangement, codec,  4, ind1);
6029     __ tbl(out2,  arrangement, codec,  4, ind2);
6030     __ tbl(out3,  arrangement, codec,  4, ind3);
6031 
6032     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6033   }
6034 
  /**
6036    *  Arguments:
6037    *
6038    *  Input:
6039    *  c_rarg0   - src_start
6040    *  c_rarg1   - src_offset
6041    *  c_rarg2   - src_length
6042    *  c_rarg3   - dest_start
6043    *  c_rarg4   - dest_offset
6044    *  c_rarg5   - isURL
6045    *
6046    */
6047   address generate_base64_encodeBlock() {
6048 
6049     static const char toBase64[64] = {
6050       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6051       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6052       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6053       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6054       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6055     };
6056 
6057     static const char toBase64URL[64] = {
6058       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6059       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6060       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6061       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6062       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6063     };
6064 
6065     __ align(CodeEntryAlignment);
6066     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6067     address start = __ pc();
6068 
6069     Register src   = c_rarg0;  // source array
6070     Register soff  = c_rarg1;  // source start offset
6071     Register send  = c_rarg2;  // source end offset
6072     Register dst   = c_rarg3;  // dest array
6073     Register doff  = c_rarg4;  // position for writing to dest array
6074     Register isURL = c_rarg5;  // Base64 or URL character set
6075 
6076     // c_rarg6 and c_rarg7 are free to use as temps
6077     Register codec  = c_rarg6;
6078     Register length = c_rarg7;
6079 
6080     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6081 
6082     __ add(src, src, soff);
6083     __ add(dst, dst, doff);
6084     __ sub(length, send, soff);
6085 
6086     // load the codec base address
6087     __ lea(codec, ExternalAddress((address) toBase64));
6088     __ cbz(isURL, ProcessData);
6089     __ lea(codec, ExternalAddress((address) toBase64URL));
6090 
6091     __ BIND(ProcessData);
6092 
    // too short to set up a SIMD loop; fall back to the scalar 3-byte loop
6094     __ cmp(length, (u1)24);
6095     __ br(Assembler::LT, Process3B);
6096 
6097     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6098 
6099     __ BIND(Process48B);
6100     __ cmp(length, (u1)48);
6101     __ br(Assembler::LT, Process24B);
6102     generate_base64_encode_simdround(src, dst, v0, 16);
6103     __ sub(length, length, 48);
6104     __ b(Process48B);
6105 
6106     __ BIND(Process24B);
6107     __ cmp(length, (u1)24);
6108     __ br(Assembler::LT, SIMDExit);
6109     generate_base64_encode_simdround(src, dst, v0, 8);
6110     __ sub(length, length, 24);
6111 
6112     __ BIND(SIMDExit);
6113     __ cbz(length, Exit);
6114 
6115     __ BIND(Process3B);
6116     //  3 src bytes, 24 bits
6117     __ ldrb(r10, __ post(src, 1));
6118     __ ldrb(r11, __ post(src, 1));
6119     __ ldrb(r12, __ post(src, 1));
6120     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6121     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6122     // codec index
6123     __ ubfmw(r15, r12, 18, 23);
6124     __ ubfmw(r14, r12, 12, 17);
6125     __ ubfmw(r13, r12, 6,  11);
6126     __ andw(r12,  r12, 63);
6127     // get the code based on the codec
6128     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6129     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6130     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6131     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6132     __ strb(r15, __ post(dst, 1));
6133     __ strb(r14, __ post(dst, 1));
6134     __ strb(r13, __ post(dst, 1));
6135     __ strb(r12, __ post(dst, 1));
6136     __ sub(length, length, 3);
6137     __ cbnz(length, Process3B);
6138 
6139     __ BIND(Exit);
6140     __ ret(lr);
6141 
6142     return start;
6143   }
6144 
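  // Hedged scalar sketch of the bit packing each decode round below performs
  // once the four 6-bit values c0..c3 have been recovered (illustrative only):
  //
  //   out0 = c0 << 2 | c1 >> 4;
  //   out1 = c1 << 4 | c2 >> 2;
  //   out2 = c2 << 6 | c3;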
6145   void generate_base64_decode_simdround(Register src, Register dst,
6146         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6147 
6148     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6149     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6150 
6151     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6152     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6153 
6154     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6155 
6156     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6157 
6158     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6159 
    // we need an unsigned saturating subtract to make sure all input values
    // in the range [0, 63] map to index 0 in the higher-half lookup
6162     __ uqsubv(decH0, __ T16B, in0, v27);
6163     __ uqsubv(decH1, __ T16B, in1, v27);
6164     __ uqsubv(decH2, __ T16B, in2, v27);
6165     __ uqsubv(decH3, __ T16B, in3, v27);
6166 
6167     // lower half lookup
6168     __ tbl(decL0, arrangement, codecL, 4, in0);
6169     __ tbl(decL1, arrangement, codecL, 4, in1);
6170     __ tbl(decL2, arrangement, codecL, 4, in2);
6171     __ tbl(decL3, arrangement, codecL, 4, in3);
6172 
6173     // higher half lookup
6174     __ tbx(decH0, arrangement, codecH, 4, decH0);
6175     __ tbx(decH1, arrangement, codecH, 4, decH1);
6176     __ tbx(decH2, arrangement, codecH, 4, decH2);
6177     __ tbx(decH3, arrangement, codecH, 4, decH3);
6178 
6179     // combine lower and higher
6180     __ orr(decL0, arrangement, decL0, decH0);
6181     __ orr(decL1, arrangement, decL1, decH1);
6182     __ orr(decL2, arrangement, decL2, decH2);
6183     __ orr(decL3, arrangement, decL3, decH3);
6184 
    // check for illegal inputs: values larger than 63 (the maximum of 6 bits)
6186     __ cmhi(decH0, arrangement, decL0, v27);
6187     __ cmhi(decH1, arrangement, decL1, v27);
6188     __ cmhi(decH2, arrangement, decL2, v27);
6189     __ cmhi(decH3, arrangement, decL3, v27);
6190     __ orr(in0, arrangement, decH0, decH1);
6191     __ orr(in1, arrangement, decH2, decH3);
6192     __ orr(in2, arrangement, in0,   in1);
6193     __ umaxv(in3, arrangement, in2);
6194     __ umov(rscratch2, in3, __ B, 0);
6195 
6196     // get the data to output
6197     __ shl(out0,  arrangement, decL0, 2);
6198     __ ushr(out1, arrangement, decL1, 4);
6199     __ orr(out0,  arrangement, out0,  out1);
6200     __ shl(out1,  arrangement, decL1, 4);
6201     __ ushr(out2, arrangement, decL2, 2);
6202     __ orr(out1,  arrangement, out1,  out2);
6203     __ shl(out2,  arrangement, decL2, 6);
6204     __ orr(out2,  arrangement, out2,  decL3);
6205 
6206     __ cbz(rscratch2, NoIllegalData);
6207 
6208     // handle illegal input
6209     __ umov(r10, in2, __ D, 0);
6210     if (size == 16) {
6211       __ cbnz(r10, ErrorInLowerHalf);
6212 
6213       // illegal input is in higher half, store the lower half now.
6214       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6215 
6216       __ umov(r10, in2,  __ D, 1);
6217       __ umov(r11, out0, __ D, 1);
6218       __ umov(r12, out1, __ D, 1);
6219       __ umov(r13, out2, __ D, 1);
6220       __ b(StoreLegalData);
6221 
6222       __ BIND(ErrorInLowerHalf);
6223     }
6224     __ umov(r11, out0, __ D, 0);
6225     __ umov(r12, out1, __ D, 0);
6226     __ umov(r13, out2, __ D, 0);
6227 
6228     __ BIND(StoreLegalData);
6229     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6230     __ strb(r11, __ post(dst, 1));
6231     __ strb(r12, __ post(dst, 1));
6232     __ strb(r13, __ post(dst, 1));
6233     __ lsr(r10, r10, 8);
6234     __ lsr(r11, r11, 8);
6235     __ lsr(r12, r12, 8);
6236     __ lsr(r13, r13, 8);
6237     __ b(StoreLegalData);
6238 
6239     __ BIND(NoIllegalData);
6240     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6241   }
6242 
6243 
  /**
6245    *  Arguments:
6246    *
6247    *  Input:
6248    *  c_rarg0   - src_start
6249    *  c_rarg1   - src_offset
6250    *  c_rarg2   - src_length
6251    *  c_rarg3   - dest_start
6252    *  c_rarg4   - dest_offset
6253    *  c_rarg5   - isURL
6254    *  c_rarg6   - isMIME
6255    *
6256    */
6257   address generate_base64_decodeBlock() {
6258 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".
6262 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] == -2,
    // while fromBase(URL)64ForNoSIMD['='] == 255 here.
6266     static const uint8_t fromBase64ForNoSIMD[256] = {
6267       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6268       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6269       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6270        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6271       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6272        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6273       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6274        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6275       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6276       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6277       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6278       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6279       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6280       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6281       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6282       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6283     };
6284 
6285     static const uint8_t fromBase64URLForNoSIMD[256] = {
6286       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6287       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6288       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6289        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6290       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6291        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6292       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6293        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6294       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6295       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6296       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6297       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6298       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6299       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6300       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6301       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6302     };
6303 
    // A legal Base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine the results to get the decoded data. The 1st
    // table vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices
    // are left unchanged in the destination. Input values [64..126] map to
    // indices [65..127] in the second lookup. The value at index 64 is set to
    // 0, so that we know the decoded data was already obtained by the 1st lookup.
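    //
    // Hedged scalar sketch of the two-table combine performed with tbl/tbx/orr
    // in generate_base64_decode_simdround (illustrative only; 'table' is the
    // 128-entry array below, viewed as a low half and a high half):
    //
    //   uint8_t lo  = (in < 64) ? table[in] : 0;          // tbl: out of range -> 0
    //   uint8_t idx = (in > 63) ? in - 63 : 0;            // uqsub #63, saturating
    //   uint8_t hi  = (idx < 64) ? table[64 + idx] : idx; // tbx: out of range -> unchanged
    //   uint8_t dec = lo | hi;                            // any value > 63 flags illegal input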
6311     static const uint8_t fromBase64ForSIMD[128] = {
6312       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6313       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6314       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6315        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6316         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6317        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6318       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6319        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6320     };
6321 
6322     static const uint8_t fromBase64URLForSIMD[128] = {
6323       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6324       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6325       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6326        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6327         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6328        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6329        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6330        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6331     };
6332 
6333     __ align(CodeEntryAlignment);
6334     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6335     address start = __ pc();
6336 
6337     Register src    = c_rarg0;  // source array
6338     Register soff   = c_rarg1;  // source start offset
6339     Register send   = c_rarg2;  // source end offset
6340     Register dst    = c_rarg3;  // dest array
6341     Register doff   = c_rarg4;  // position for writing to dest array
6342     Register isURL  = c_rarg5;  // Base64 or URL character set
6343     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6344 
6345     Register length = send;    // reuse send as length of source data to process
6346 
6347     Register simd_codec   = c_rarg6;
6348     Register nosimd_codec = c_rarg7;
6349 
6350     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6351 
6352     __ enter();
6353 
6354     __ add(src, src, soff);
6355     __ add(dst, dst, doff);
6356 
6357     __ mov(doff, dst);
6358 
6359     __ sub(length, send, soff);
6360     __ bfm(length, zr, 0, 1);
6361 
6362     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6363     __ cbz(isURL, ProcessData);
6364     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6365 
6366     __ BIND(ProcessData);
6367     __ mov(rscratch1, length);
6368     __ cmp(length, (u1)144); // 144 = 80 + 64
6369     __ br(Assembler::LT, Process4B);
6370 
6371     // In the MIME case, the line length cannot be more than 76
6372     // bytes (see RFC 2045). This is too short a block for SIMD
6373     // to be worthwhile, so we use non-SIMD here.
6374     __ movw(rscratch1, 79);
6375 
6376     __ BIND(Process4B);
6377     __ ldrw(r14, __ post(src, 4));
6378     __ ubfxw(r10, r14, 0,  8);
6379     __ ubfxw(r11, r14, 8,  8);
6380     __ ubfxw(r12, r14, 16, 8);
6381     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values in the codec table
6383     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6384     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6385     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6386     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6387     // error detection, 255u indicates an illegal input
6388     __ orrw(r14, r10, r11);
6389     __ orrw(r15, r12, r13);
6390     __ orrw(r14, r14, r15);
6391     __ tbnz(r14, 7, Exit);
6392     // recover the data
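    // (hedged sketch of the bit-field sequence below: with the four decoded
    //  6-bit values d0..d3 in r10..r13, it produces b0 = d0 << 2 | d1 >> 4,
    //  b1 = d1 << 4 | d2 >> 2 and b2 = d2 << 6 | d3, stored via strh + strb)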
6393     __ lslw(r14, r10, 10);
6394     __ bfiw(r14, r11, 4, 6);
6395     __ bfmw(r14, r12, 2, 5);
6396     __ rev16w(r14, r14);
6397     __ bfiw(r13, r12, 6, 2);
6398     __ strh(r14, __ post(dst, 2));
6399     __ strb(r13, __ post(dst, 1));
6400     // non-simd loop
6401     __ subsw(rscratch1, rscratch1, 4);
6402     __ br(Assembler::GT, Process4B);
6403 
    // if exiting from the 80-byte (MIME) pre-processing loop above, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
6406     __ cbzw(rscratch1, Exit);
6407     __ sub(length, length, 80);
6408 
6409     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6410     __ cbz(isURL, SIMDEnter);
6411     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6412 
6413     __ BIND(SIMDEnter);
6414     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6415     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6416     __ mov(rscratch1, 63);
6417     __ dup(v27, __ T16B, rscratch1);
6418 
6419     __ BIND(Process64B);
6420     __ cmp(length, (u1)64);
6421     __ br(Assembler::LT, Process32B);
6422     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6423     __ sub(length, length, 64);
6424     __ b(Process64B);
6425 
6426     __ BIND(Process32B);
6427     __ cmp(length, (u1)32);
6428     __ br(Assembler::LT, SIMDExit);
6429     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6430     __ sub(length, length, 32);
6431     __ b(Process32B);
6432 
6433     __ BIND(SIMDExit);
6434     __ cbz(length, Exit);
6435     __ movw(rscratch1, length);
6436     __ b(Process4B);
6437 
6438     __ BIND(Exit);
6439     __ sub(c_rarg0, dst, doff);
6440 
6441     __ leave();
6442     __ ret(lr);
6443 
6444     return start;
6445   }
6446 
6447   // Support for spin waits.
6448   address generate_spin_wait() {
6449     __ align(CodeEntryAlignment);
6450     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6451     address start = __ pc();
6452 
6453     __ spin_wait();
6454     __ ret(lr);
6455 
6456     return start;
6457   }
6458 
6459 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6460 
6461   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6462   //
6463   // If LSE is in use, generate LSE versions of all the stubs. The
6464   // non-LSE versions are in atomic_aarch64.S.
6465 
6466   // class AtomicStubMark records the entry point of a stub and the
6467   // stub pointer which will point to it. The stub pointer is set to
6468   // the entry point when ~AtomicStubMark() is called, which must be
6469   // after ICache::invalidate_range. This ensures safe publication of
6470   // the generated code.
6471   class AtomicStubMark {
6472     address _entry_point;
6473     aarch64_atomic_stub_t *_stub;
6474     MacroAssembler *_masm;
6475   public:
6476     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6477       _masm = masm;
6478       __ align(32);
6479       _entry_point = __ pc();
6480       _stub = stub;
6481     }
6482     ~AtomicStubMark() {
6483       *_stub = (aarch64_atomic_stub_t)_entry_point;
6484     }
6485   };
6486 
6487   // NB: For memory_order_conservative we need a trailing membar after
6488   // LSE atomic operations but not a leading membar.
6489   //
6490   // We don't need a leading membar because a clause in the Arm ARM
6491   // says:
6492   //
6493   //   Barrier-ordered-before
6494   //
6495   //   Barrier instructions order prior Memory effects before subsequent
6496   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6500   //   instruction with both Acquire and Release semantics.
6501   //
6502   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6503   // and Release semantics, therefore we don't need a leading
6504   // barrier. However, there is no corresponding Barrier-ordered-after
6505   // relationship, therefore we need a trailing membar to prevent a
6506   // later store or load from being reordered with the store in an
6507   // atomic instruction.
6508   //
6509   // This was checked by using the herd7 consistency model simulator
6510   // (http://diy.inria.fr/) with this test case:
6511   //
6512   // AArch64 LseCas
6513   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6514   // P0 | P1;
6515   // LDR W4, [X2] | MOV W3, #0;
6516   // DMB LD       | MOV W4, #1;
6517   // LDR W3, [X1] | CASAL W3, W4, [X1];
6518   //              | DMB ISH;
6519   //              | STR W4, [X2];
6520   // exists
6521   // (0:X3=0 /\ 0:X4=1)
6522   //
6523   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6524   // with the store to x in P1. Without the DMB in P1 this may happen.
6525   //
6526   // At the time of writing we don't know of any AArch64 hardware that
6527   // reorders stores in this way, but the Reference Manual permits it.
6528 
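  // Hedged sketch of what gen_cas_entry() emits for the conservative memory
  // order (illustrative, not the literal instruction sequence):
  //
  //   prev = compare_val;
  //   CASAL prev, exchange_val, [ptr]      // LSE compare-and-swap, acquire+release
  //   membar(StoreStore|StoreLoad)         // trailing barrier, per the note above
  //   return prev;                         // old memory value, in r0/w0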
6529   void gen_cas_entry(Assembler::operand_size size,
6530                      atomic_memory_order order) {
6531     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6532       exchange_val = c_rarg2;
6533     bool acquire, release;
6534     switch (order) {
6535       case memory_order_relaxed:
6536         acquire = false;
6537         release = false;
6538         break;
6539       case memory_order_release:
6540         acquire = false;
6541         release = true;
6542         break;
6543       default:
6544         acquire = true;
6545         release = true;
6546         break;
6547     }
6548     __ mov(prev, compare_val);
6549     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6550     if (order == memory_order_conservative) {
6551       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6552     }
6553     if (size == Assembler::xword) {
6554       __ mov(r0, prev);
6555     } else {
6556       __ movw(r0, prev);
6557     }
6558     __ ret(lr);
6559   }
6560 
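  // Hedged sketch of the fetch-and-add stubs emitted below (illustrative):
  //
  //   prev = *addr; *addr += incr;         // atomically, via LDADD/LDADDAL
  //   if (order != relaxed) membar(StoreStore|StoreLoad);
  //   return prev;                         // in r0/w0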
6561   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6562     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6563     // If not relaxed, then default to conservative.  Relaxed is the only
6564     // case we use enough to be worth specializing.
6565     if (order == memory_order_relaxed) {
6566       __ ldadd(size, incr, prev, addr);
6567     } else {
6568       __ ldaddal(size, incr, prev, addr);
6569       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6570     }
6571     if (size == Assembler::xword) {
6572       __ mov(r0, prev);
6573     } else {
6574       __ movw(r0, prev);
6575     }
6576     __ ret(lr);
6577   }
6578 
6579   void gen_swpal_entry(Assembler::operand_size size) {
6580     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6581     __ swpal(size, incr, prev, addr);
6582     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6583     if (size == Assembler::xword) {
6584       __ mov(r0, prev);
6585     } else {
6586       __ movw(r0, prev);
6587     }
6588     __ ret(lr);
6589   }
6590 
6591   void generate_atomic_entry_points() {
6592     if (! UseLSE) {
6593       return;
6594     }
6595 
6596     __ align(CodeEntryAlignment);
6597     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6598     address first_entry = __ pc();
6599 
6600     // ADD, memory_order_conservative
6601     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6602     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6603     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6604     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6605 
6606     // ADD, memory_order_relaxed
6607     AtomicStubMark mark_fetch_add_4_relaxed
6608       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6609     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6610     AtomicStubMark mark_fetch_add_8_relaxed
6611       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6612     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6613 
6614     // XCHG, memory_order_conservative
6615     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6616     gen_swpal_entry(Assembler::word);
6617     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6618     gen_swpal_entry(Assembler::xword);
6619 
6620     // CAS, memory_order_conservative
6621     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6622     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6623     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6624     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6625     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6626     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6627 
6628     // CAS, memory_order_relaxed
6629     AtomicStubMark mark_cmpxchg_1_relaxed
6630       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6631     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6632     AtomicStubMark mark_cmpxchg_4_relaxed
6633       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6634     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6635     AtomicStubMark mark_cmpxchg_8_relaxed
6636       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6637     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6638 
6639     AtomicStubMark mark_cmpxchg_4_release
6640       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6641     gen_cas_entry(MacroAssembler::word, memory_order_release);
6642     AtomicStubMark mark_cmpxchg_8_release
6643       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6644     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6645 
6646     AtomicStubMark mark_cmpxchg_4_seq_cst
6647       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6648     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6649     AtomicStubMark mark_cmpxchg_8_seq_cst
6650       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6651     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6652 
6653     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6654   }
#endif // defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6656 
6657   // Pass object argument in r0 (which has to be preserved outside this stub)
6658   // Pass back result in r0
6659   // Clobbers rscratch1
6660   address generate_load_nklass() {
6661     __ align(CodeEntryAlignment);
6662     StubCodeMark mark(this, "StubRoutines", "load_nklass");
6663 
6664     address start = __ pc();
6665 
6666     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
6667     __ enter();
6668     __ push(RegSet::of(rscratch1, rscratch2), sp);
6669     __ push_call_clobbered_registers_except(r0);
6670     __ call_VM_leaf(CAST_FROM_FN_PTR(address, oopDesc::load_nklass_runtime), 1);
6671     __ pop_call_clobbered_registers_except(r0);
6672     __ pop(RegSet::of(rscratch1, rscratch2), sp);
6673     __ leave();
6674     __ reset_last_Java_frame(true);
6675     __ ret(lr);
6676 
6677     return start;
6678   }
6679 
6680   address generate_cont_thaw(Continuation::thaw_kind kind) {
6681     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6682     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6683 
6684     address start = __ pc();
6685 
6686     if (return_barrier) {
6687       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6688       __ mov(sp, rscratch1);
6689     }
6690     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6691 
6692     if (return_barrier) {
6693       // preserve possible return value from a method returning to the return barrier
6694       __ fmovd(rscratch1, v0);
6695       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6696     }
6697 
6698     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6699     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6700     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6701 
6702     if (return_barrier) {
6703       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6704       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6705       __ fmovd(v0, rscratch1);
6706     }
6707     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6708 
6709 
6710     Label thaw_success;
6711     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6712     __ cbnz(rscratch2, thaw_success);
6713     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6714     __ br(rscratch1);
6715     __ bind(thaw_success);
6716 
6717     // make room for the thawed frames
6718     __ sub(rscratch1, sp, rscratch2);
6719     __ andr(rscratch1, rscratch1, -16); // align
6720     __ mov(sp, rscratch1);
6721 
6722     if (return_barrier) {
6723       // save original return value -- again
6724       __ fmovd(rscratch1, v0);
6725       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6726     }
6727 
6728     // If we want, we can templatize thaw by kind, and have three different entries
6729     __ movw(c_rarg1, (uint32_t)kind);
6730 
6731     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6732     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6733 
6734     if (return_barrier) {
6735       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6736       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6737       __ fmovd(v0, rscratch1);
6738     } else {
6739       __ mov(r0, zr); // return 0 (success) from doYield
6740     }
6741 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
6743     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6744     __ mov(rfp, sp);
6745 
6746     if (return_barrier_exception) {
6747       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6748       __ verify_oop(r0);
      __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6750 
6751       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6752 
6753       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6754       // __ reinitialize_ptrue();
6755 
6756       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6757 
6758       __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
6760       __ verify_oop(r0);
6761 
6762       __ leave();
6763       __ mov(r3, lr);
6764       __ br(r1); // the exception handler
6765     } else {
6766       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6767       __ leave();
6768       __ ret(lr);
6769     }
6770 
6771     return start;
6772   }
6773 
6774   address generate_cont_thaw() {
6775     if (!Continuations::enabled()) return nullptr;
6776 
6777     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6778     address start = __ pc();
6779     generate_cont_thaw(Continuation::thaw_top);
6780     return start;
6781   }
6782 
6783   address generate_cont_returnBarrier() {
6784     if (!Continuations::enabled()) return nullptr;
6785 
6786     // TODO: will probably need multiple return barriers depending on return type
6787     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6788     address start = __ pc();
6789 
6790     generate_cont_thaw(Continuation::thaw_return_barrier);
6791 
6792     return start;
6793   }
6794 
6795   address generate_cont_returnBarrier_exception() {
6796     if (!Continuations::enabled()) return nullptr;
6797 
6798     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6799     address start = __ pc();
6800 
6801     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
6802 
6803     return start;
6804   }
6805 
6806 #if INCLUDE_JFR
6807 
6808   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
6809     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6810     __ mov(c_rarg0, thread);
6811   }
6812 
6813   // The handle is dereferenced through a load barrier.
6814   static void jfr_epilogue(MacroAssembler* _masm) {
6815     __ reset_last_Java_frame(true);
6816     Label null_jobject;
6817     __ cbz(r0, null_jobject);
6818     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6819     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6820     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2);
6821     __ bind(null_jobject);
6822   }
6823 
6824   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
6825   // It returns a jobject handle to the event writer.
6826   // The handle is dereferenced and the return value is the event writer oop.
6827   static RuntimeStub* generate_jfr_write_checkpoint() {
6828     enum layout {
6829       rbp_off,
6830       rbpH_off,
6831       return_off,
6832       return_off2,
6833       framesize // inclusive of return address
6834     };
6835 
6836     int insts_size = 512;
6837     int locs_size = 64;
6838     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
6839     OopMapSet* oop_maps = new OopMapSet();
6840     MacroAssembler* masm = new MacroAssembler(&code);
6841     MacroAssembler* _masm = masm;
6842 
6843     address start = __ pc();
6844     __ enter();
6845     int frame_complete = __ pc() - start;
6846     address the_pc = __ pc();
6847     jfr_prologue(the_pc, _masm, rthread);
6848     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
6849     jfr_epilogue(_masm);
6850     __ leave();
6851     __ ret(lr);
6852 
6853     OopMap* map = new OopMap(framesize, 1); // rfp
6854     oop_maps->add_gc_map(the_pc - start, map);
6855 
6856     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6857       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
6858                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6859                                     oop_maps, false);
6860     return stub;
6861   }
6862 
6863 #endif // INCLUDE_JFR
6864 
6865   // Continuation point for throwing of implicit exceptions that are
6866   // not handled in the current activation. Fabricates an exception
6867   // oop and initiates normal exception dispatching in this
6868   // frame. Since we need to preserve callee-saved values (currently
6869   // only for C2, but done for C1 as well) we need a callee-saved oop
6870   // map and therefore have to make these stubs into RuntimeStubs
6871   // rather than BufferBlobs.  If the compiler needs all registers to
6872   // be preserved between the fault point and the exception handler
6873   // then it must assume responsibility for that in
6874   // AbstractCompiler::continuation_for_implicit_null_exception or
6875   // continuation_for_implicit_division_by_zero_exception. All other
6876   // implicit exceptions (e.g., NullPointerException or
6877   // AbstractMethodError on entry) are either at call sites or
6878   // otherwise assume that stack unwinding will be initiated, so
6879   // caller saved registers were assumed volatile in the compiler.
6880 
6881 #undef __
6882 #define __ masm->
6883 
6884   address generate_throw_exception(const char* name,
6885                                    address runtime_entry,
6886                                    Register arg1 = noreg,
6887                                    Register arg2 = noreg) {
6888     // Information about frame layout at time of blocking runtime call.
6889     // Note that we only have to preserve callee-saved registers since
6890     // the compilers are responsible for supplying a continuation point
6891     // if they expect all registers to be preserved.
6892     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6893     enum layout {
6894       rfp_off = 0,
6895       rfp_off2,
6896       return_off,
6897       return_off2,
6898       framesize // inclusive of return address
6899     };
6900 
6901     int insts_size = 512;
6902     int locs_size  = 64;
6903 
6904     CodeBuffer code(name, insts_size, locs_size);
6905     OopMapSet* oop_maps  = new OopMapSet();
6906     MacroAssembler* masm = new MacroAssembler(&code);
6907 
6908     address start = __ pc();
6909 
6910     // This is an inlined and slightly modified version of call_VM
6911     // which has the ability to fetch the return PC out of
6912     // thread-local storage and also sets up last_Java_sp slightly
6913     // differently than the real call_VM
6914 
6915     __ enter(); // Save FP and LR before call
6916 
6917     assert(is_even(framesize/2), "sp not 16-byte aligned");
6918 
6919     // lr and fp are already in place
6920     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
6921 
6922     int frame_complete = __ pc() - start;
6923 
6924     // Set up last_Java_sp and last_Java_fp
6925     address the_pc = __ pc();
6926     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6927 
6928     // Call runtime
6929     if (arg1 != noreg) {
6930       assert(arg2 != c_rarg1, "clobbered");
6931       __ mov(c_rarg1, arg1);
6932     }
6933     if (arg2 != noreg) {
6934       __ mov(c_rarg2, arg2);
6935     }
6936     __ mov(c_rarg0, rthread);
6937     BLOCK_COMMENT("call runtime_entry");
6938     __ mov(rscratch1, runtime_entry);
6939     __ blr(rscratch1);
6940 
6941     // Generate oop map
6942     OopMap* map = new OopMap(framesize, 0);
6943 
6944     oop_maps->add_gc_map(the_pc - start, map);
6945 
6946     __ reset_last_Java_frame(true);
6947 
6948     // Reinitialize the ptrue predicate register, in case the external runtime
6949     // call clobbers ptrue reg, as we may return to SVE compiled code.
6950     __ reinitialize_ptrue();
6951 
6952     __ leave();
6953 
6954     // check for pending exceptions
6955 #ifdef ASSERT
6956     Label L;
6957     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6958     __ cbnz(rscratch1, L);
6959     __ should_not_reach_here();
6960     __ bind(L);
6961 #endif // ASSERT
6962     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6963 
6964     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6965     RuntimeStub* stub =
6966       RuntimeStub::new_runtime_stub(name,
6967                                     &code,
6968                                     frame_complete,
6969                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6970                                     oop_maps, false);
6971     return stub->entry_point();
6972   }
6973 
6974   class MontgomeryMultiplyGenerator : public MacroAssembler {
6975 
6976     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6977       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6978 
6979     RegSet _toSave;
6980     bool _squaring;
6981 
6982   public:
6983     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6984       : MacroAssembler(as->code()), _squaring(squaring) {
6985 
6986       // Register allocation
6987 
6988       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6989       Pa_base = *regs;       // Argument registers
6990       if (squaring)
6991         Pb_base = Pa_base;
6992       else
6993         Pb_base = *++regs;
6994       Pn_base = *++regs;
6995       Rlen = *++regs;
6996       inv = *++regs;
6997       Pm_base = *++regs;
6998 
6999                           // Working registers:
7000       Ra =  *++regs;        // The current digit of a, b, n, and m.
7001       Rb =  *++regs;
7002       Rm =  *++regs;
7003       Rn =  *++regs;
7004 
7005       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7006       Pb =  *++regs;
7007       Pm =  *++regs;
7008       Pn =  *++regs;
7009 
7010       t0 =  *++regs;        // Three registers which form a
7011       t1 =  *++regs;        // triple-precision accumulator.
7012       t2 =  *++regs;
7013 
7014       Ri =  *++regs;        // Inner and outer loop indexes.
7015       Rj =  *++regs;
7016 
7017       Rhi_ab = *++regs;     // Product registers: low and high parts
7018       Rlo_ab = *++regs;     // of a*b and m*n.
7019       Rhi_mn = *++regs;
7020       Rlo_mn = *++regs;
7021 
7022       // r19 and up are callee-saved.
7023       _toSave = RegSet::range(r19, *regs) + Pm_base;
7024     }
7025 
7026   private:
7027     void save_regs() {
7028       push(_toSave, sp);
7029     }
7030 
7031     void restore_regs() {
7032       pop(_toSave, sp);
7033     }
7034 
7035     template <typename T>
7036     void unroll_2(Register count, T block) {
7037       Label loop, end, odd;
7038       tbnz(count, 0, odd);
7039       cbz(count, end);
7040       align(16);
7041       bind(loop);
7042       (this->*block)();
7043       bind(odd);
7044       (this->*block)();
7045       subs(count, count, 2);
7046       br(Assembler::GT, loop);
7047       bind(end);
7048     }
7049 
7050     template <typename T>
7051     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7052       Label loop, end, odd;
7053       tbnz(count, 0, odd);
7054       cbz(count, end);
7055       align(16);
7056       bind(loop);
7057       (this->*block)(d, s, tmp);
7058       bind(odd);
7059       (this->*block)(d, s, tmp);
7060       subs(count, count, 2);
7061       br(Assembler::GT, loop);
7062       bind(end);
7063     }
7064 
7065     void pre1(RegisterOrConstant i) {
7066       block_comment("pre1");
7067       // Pa = Pa_base;
7068       // Pb = Pb_base + i;
7069       // Pm = Pm_base;
7070       // Pn = Pn_base + i;
7071       // Ra = *Pa;
7072       // Rb = *Pb;
7073       // Rm = *Pm;
7074       // Rn = *Pn;
7075       ldr(Ra, Address(Pa_base));
7076       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7077       ldr(Rm, Address(Pm_base));
7078       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7079       lea(Pa, Address(Pa_base));
7080       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7081       lea(Pm, Address(Pm_base));
7082       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7083 
7084       // Zero the m*n result.
7085       mov(Rhi_mn, zr);
7086       mov(Rlo_mn, zr);
7087     }
7088 
7089     // The core multiply-accumulate step of a Montgomery
7090     // multiplication.  The idea is to schedule operations as a
7091     // pipeline so that instructions with long latencies (loads and
7092     // multiplies) have time to complete before their results are
7093     // used.  This most benefits in-order implementations of the
7094     // architecture but out-of-order ones also benefit.
7095     void step() {
7096       block_comment("step");
7097       // MACC(Ra, Rb, t0, t1, t2);
7098       // Ra = *++Pa;
7099       // Rb = *--Pb;
7100       umulh(Rhi_ab, Ra, Rb);
7101       mul(Rlo_ab, Ra, Rb);
7102       ldr(Ra, pre(Pa, wordSize));
7103       ldr(Rb, pre(Pb, -wordSize));
7104       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7105                                        // previous iteration.
7106       // MACC(Rm, Rn, t0, t1, t2);
7107       // Rm = *++Pm;
7108       // Rn = *--Pn;
7109       umulh(Rhi_mn, Rm, Rn);
7110       mul(Rlo_mn, Rm, Rn);
7111       ldr(Rm, pre(Pm, wordSize));
7112       ldr(Rn, pre(Pn, -wordSize));
7113       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7114     }
7115 
7116     void post1() {
7117       block_comment("post1");
7118 
7119       // MACC(Ra, Rb, t0, t1, t2);
7120       // Ra = *++Pa;
7121       // Rb = *--Pb;
7122       umulh(Rhi_ab, Ra, Rb);
7123       mul(Rlo_ab, Ra, Rb);
7124       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7125       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7126 
7127       // *Pm = Rm = t0 * inv;
7128       mul(Rm, t0, inv);
7129       str(Rm, Address(Pm));
7130 
7131       // MACC(Rm, Rn, t0, t1, t2);
7132       // t0 = t1; t1 = t2; t2 = 0;
7133       umulh(Rhi_mn, Rm, Rn);
7134 
7135 #ifndef PRODUCT
7136       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7137       {
7138         mul(Rlo_mn, Rm, Rn);
7139         add(Rlo_mn, t0, Rlo_mn);
7140         Label ok;
7141         cbz(Rlo_mn, ok); {
7142           stop("broken Montgomery multiply");
7143         } bind(ok);
7144       }
7145 #endif
7146       // We have very carefully set things up so that
7147       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7148       // the lower half of Rm * Rn because we know the result already:
7149       // it must be -t0.  t0 + (-t0) must generate a carry iff
7150       // t0 != 0.  So, rather than do a mul and an adds we just set
7151       // the carry flag iff t0 is nonzero.
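     // For example, with t0 == 0 the "subs zr, t0, #1" below borrows and
     // leaves the carry flag clear, while any nonzero t0 does not borrow
     // and sets the carry -- the same carry that "adds zr, t0, -t0" would
     // have produced.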
7152       //
7153       // mul(Rlo_mn, Rm, Rn);
7154       // adds(zr, t0, Rlo_mn);
7155       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7156       adcs(t0, t1, Rhi_mn);
7157       adc(t1, t2, zr);
7158       mov(t2, zr);
7159     }
7160 
7161     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7162       block_comment("pre2");
7163       // Pa = Pa_base + i-len;
7164       // Pb = Pb_base + len;
7165       // Pm = Pm_base + i-len;
7166       // Pn = Pn_base + len;
7167 
7168       if (i.is_register()) {
7169         sub(Rj, i.as_register(), len);
7170       } else {
7171         mov(Rj, i.as_constant());
7172         sub(Rj, Rj, len);
7173       }
7174       // Rj == i-len
7175 
7176       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7177       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7178       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7179       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7180 
7181       // Ra = *++Pa;
7182       // Rb = *--Pb;
7183       // Rm = *++Pm;
7184       // Rn = *--Pn;
7185       ldr(Ra, pre(Pa, wordSize));
7186       ldr(Rb, pre(Pb, -wordSize));
7187       ldr(Rm, pre(Pm, wordSize));
7188       ldr(Rn, pre(Pn, -wordSize));
7189 
7190       mov(Rhi_mn, zr);
7191       mov(Rlo_mn, zr);
7192     }
7193 
7194     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7195       block_comment("post2");
7196       if (i.is_constant()) {
7197         mov(Rj, i.as_constant()-len.as_constant());
7198       } else {
7199         sub(Rj, i.as_register(), len);
7200       }
7201 
7202       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7203 
7204       // As soon as we know the least significant digit of our result,
7205       // store it.
7206       // Pm_base[i-len] = t0;
7207       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7208 
7209       // t0 = t1; t1 = t2; t2 = 0;
7210       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7211       adc(t1, t2, zr);
7212       mov(t2, zr);
7213     }
7214 
7215     // A carry in t0 after Montgomery multiplication means that we
7216     // should subtract multiples of n from our result in m.  We'll
7217     // keep doing that until there is no carry.
7218     void normalize(RegisterOrConstant len) {
7219       block_comment("normalize");
7220       // while (t0)
7221       //   t0 = sub(Pm_base, Pn_base, t0, len);
7222       Label loop, post, again;
7223       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7224       cbz(t0, post); {
7225         bind(again); {
7226           mov(i, zr);
7227           mov(cnt, len);
7228           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7229           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7230           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7231           align(16);
7232           bind(loop); {
7233             sbcs(Rm, Rm, Rn);
7234             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7235             add(i, i, 1);
7236             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7237             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7238             sub(cnt, cnt, 1);
7239           } cbnz(cnt, loop);
7240           sbc(t0, t0, zr);
7241         } cbnz(t0, again);
7242       } bind(post);
7243     }
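
     // The sub() referred to in the comment above is, in C, approximately
     // as follows (a minimal sketch of the semantics implied by the
     // assembly in normalize(): subtract n from m with borrow propagation
     // and return the incoming carry minus the final borrow):
     //
     //   static julong sub(julong m[], julong n[], julong carry, int len) {
     //     julong borrow = 0;
     //     for (int i = 0; i < len; i++) {
     //       julong d = m[i] - n[i] - borrow;
     //       borrow = (d > m[i]) || (borrow && d == m[i]); // did we borrow?
     //       m[i] = d;
     //     }
     //     return carry - borrow;
     //   }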
7244 
7245     // Move memory at s to d, reversing words.
7246     //    Increments d to end of copied memory
7247     //    Destroys tmp1, tmp2
7248     //    Preserves len
7249     //    Leaves s pointing to the address which was in d at start
7250     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7251       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7252       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7253 
7254       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7255       mov(tmp1, len);
7256       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7257       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7258     }
7259     // where
7260     void reverse1(Register d, Register s, Register tmp) {
7261       ldr(tmp, pre(s, -wordSize));
7262       ror(tmp, tmp, 32);
7263       str(tmp, post(d, wordSize));
7264     }
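
     // In C, approximately (a minimal sketch of what reverse()/reverse1()
     // copy: len 64-bit words from s to d in reverse order, with the two
     // 32-bit halves of each word swapped; register side effects are
     // described in the comment above):
     //
     //   void reverse(julong *d, const julong *s, int len) {
     //     for (int i = 0; i < len; i++) {
     //       julong w = s[len - 1 - i];
     //       d[i] = (w >> 32) | (w << 32);   // ror #32
     //     }
     //   }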
7265 
7266     void step_squaring() {
7267       // An extra ACC
7268       step();
7269       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7270     }
7271 
7272     void last_squaring(RegisterOrConstant i) {
7273       Label dont;
7274       // if ((i & 1) == 0) {
7275       tbnz(i.as_register(), 0, dont); {
7276         // MACC(Ra, Rb, t0, t1, t2);
7277         // Ra = *++Pa;
7278         // Rb = *--Pb;
7279         umulh(Rhi_ab, Ra, Rb);
7280         mul(Rlo_ab, Ra, Rb);
7281         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7282       } bind(dont);
7283     }
7284 
7285     void extra_step_squaring() {
7286       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7287 
7288       // MACC(Rm, Rn, t0, t1, t2);
7289       // Rm = *++Pm;
7290       // Rn = *--Pn;
7291       umulh(Rhi_mn, Rm, Rn);
7292       mul(Rlo_mn, Rm, Rn);
7293       ldr(Rm, pre(Pm, wordSize));
7294       ldr(Rn, pre(Pn, -wordSize));
7295     }
7296 
7297     void post1_squaring() {
7298       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7299 
7300       // *Pm = Rm = t0 * inv;
7301       mul(Rm, t0, inv);
7302       str(Rm, Address(Pm));
7303 
7304       // MACC(Rm, Rn, t0, t1, t2);
7305       // t0 = t1; t1 = t2; t2 = 0;
7306       umulh(Rhi_mn, Rm, Rn);
7307 
7308 #ifndef PRODUCT
7309       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7310       {
7311         mul(Rlo_mn, Rm, Rn);
7312         add(Rlo_mn, t0, Rlo_mn);
7313         Label ok;
7314         cbz(Rlo_mn, ok); {
7315           stop("broken Montgomery multiply");
7316         } bind(ok);
7317       }
7318 #endif
7319       // We have very carefully set things up so that
7320       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7321       // the lower half of Rm * Rn because we know the result already:
7322       // it must be -t0.  t0 + (-t0) must generate a carry iff
7323       // t0 != 0.  So, rather than do a mul and an adds we just set
7324       // the carry flag iff t0 is nonzero.
7325       //
7326       // mul(Rlo_mn, Rm, Rn);
7327       // adds(zr, t0, Rlo_mn);
7328       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7329       adcs(t0, t1, Rhi_mn);
7330       adc(t1, t2, zr);
7331       mov(t2, zr);
7332     }
7333 
7334     void acc(Register Rhi, Register Rlo,
7335              Register t0, Register t1, Register t2) {
7336       adds(t0, t0, Rlo);
7337       adcs(t1, t1, Rhi);
7338       adc(t2, t2, zr);
7339     }
7340 
7341   public:
7342     /**
7343      * Fast Montgomery multiplication.  The derivation of the
7344      * algorithm is in A Cryptographic Library for the Motorola
7345      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7346      *
7347      * Arguments:
7348      *
7349      * Inputs for multiplication:
7350      *   c_rarg0   - int array elements a
7351      *   c_rarg1   - int array elements b
7352      *   c_rarg2   - int array elements n (the modulus)
7353      *   c_rarg3   - int length
7354      *   c_rarg4   - int inv
7355      *   c_rarg5   - int array elements m (the result)
7356      *
7357      * Inputs for squaring:
7358      *   c_rarg0   - int array elements a
7359      *   c_rarg1   - int array elements n (the modulus)
7360      *   c_rarg2   - int length
7361      *   c_rarg3   - int inv
7362      *   c_rarg4   - int array elements m (the result)
7363      *
7364      */
7365     address generate_multiply() {
7366       Label argh, nothing;
7367       bind(argh);
7368       stop("MontgomeryMultiply total_allocation must be <= 8192");
7369 
7370       align(CodeEntryAlignment);
7371       address entry = pc();
7372 
7373       cbzw(Rlen, nothing);
7374 
7375       enter();
7376 
7377       // Make room.
7378       cmpw(Rlen, 512);
7379       br(Assembler::HI, argh);
7380       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7381       andr(sp, Ra, -2 * wordSize);
7382 
7383       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7384 
7385       {
7386         // Copy input args, reversing as we go.  We use Ra as a
7387         // temporary variable.
7388         reverse(Ra, Pa_base, Rlen, t0, t1);
7389         if (!_squaring)
7390           reverse(Ra, Pb_base, Rlen, t0, t1);
7391         reverse(Ra, Pn_base, Rlen, t0, t1);
7392       }
7393 
7394       // Push all call-saved registers and also Pm_base which we'll need
7395       // at the end.
7396       save_regs();
7397 
7398 #ifndef PRODUCT
7399       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7400       {
7401         ldr(Rn, Address(Pn_base, 0));
7402         mul(Rlo_mn, Rn, inv);
7403         subs(zr, Rlo_mn, -1);
7404         Label ok;
7405         br(EQ, ok); {
7406           stop("broken inverse in Montgomery multiply");
7407         } bind(ok);
7408       }
7409 #endif
7410 
7411       mov(Pm_base, Ra);
7412 
7413       mov(t0, zr);
7414       mov(t1, zr);
7415       mov(t2, zr);
7416 
7417       block_comment("for (int i = 0; i < len; i++) {");
7418       mov(Ri, zr); {
7419         Label loop, end;
7420         cmpw(Ri, Rlen);
7421         br(Assembler::GE, end);
7422 
7423         bind(loop);
7424         pre1(Ri);
7425 
7426         block_comment("  for (j = i; j; j--) {"); {
7427           movw(Rj, Ri);
7428           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7429         } block_comment("  } // j");
7430 
7431         post1();
7432         addw(Ri, Ri, 1);
7433         cmpw(Ri, Rlen);
7434         br(Assembler::LT, loop);
7435         bind(end);
7436         block_comment("} // i");
7437       }
7438 
7439       block_comment("for (int i = len; i < 2*len; i++) {");
7440       mov(Ri, Rlen); {
7441         Label loop, end;
7442         cmpw(Ri, Rlen, Assembler::LSL, 1);
7443         br(Assembler::GE, end);
7444 
7445         bind(loop);
7446         pre2(Ri, Rlen);
7447 
7448         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7449           lslw(Rj, Rlen, 1);
7450           subw(Rj, Rj, Ri);
7451           subw(Rj, Rj, 1);
7452           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7453         } block_comment("  } // j");
7454 
7455         post2(Ri, Rlen);
7456         addw(Ri, Ri, 1);
7457         cmpw(Ri, Rlen, Assembler::LSL, 1);
7458         br(Assembler::LT, loop);
7459         bind(end);
7460       }
7461       block_comment("} // i");
7462 
7463       normalize(Rlen);
7464 
7465       mov(Ra, Pm_base);  // Save Pm_base in Ra
7466       restore_regs();  // Restore caller's Pm_base
7467 
7468       // Copy our result into caller's Pm_base
7469       reverse(Pm_base, Ra, Rlen, t0, t1);
7470 
7471       leave();
7472       bind(nothing);
7473       ret(lr);
7474 
7475       return entry;
7476     }
7477     // In C, approximately:
7478 
7479     // void
7480     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7481     //                     julong Pn_base[], julong Pm_base[],
7482     //                     julong inv, int len) {
7483     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7484     //   julong *Pa, *Pb, *Pn, *Pm;
7485     //   julong Ra, Rb, Rn, Rm;
7486 
7487     //   int i;
7488 
7489     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7490 
7491     //   for (i = 0; i < len; i++) {
7492     //     int j;
7493 
7494     //     Pa = Pa_base;
7495     //     Pb = Pb_base + i;
7496     //     Pm = Pm_base;
7497     //     Pn = Pn_base + i;
7498 
7499     //     Ra = *Pa;
7500     //     Rb = *Pb;
7501     //     Rm = *Pm;
7502     //     Rn = *Pn;
7503 
7504     //     int iters = i;
7505     //     for (j = 0; iters--; j++) {
7506     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7507     //       MACC(Ra, Rb, t0, t1, t2);
7508     //       Ra = *++Pa;
7509     //       Rb = *--Pb;
7510     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7511     //       MACC(Rm, Rn, t0, t1, t2);
7512     //       Rm = *++Pm;
7513     //       Rn = *--Pn;
7514     //     }
7515 
7516     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7517     //     MACC(Ra, Rb, t0, t1, t2);
7518     //     *Pm = Rm = t0 * inv;
7519     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7520     //     MACC(Rm, Rn, t0, t1, t2);
7521 
7522     //     assert(t0 == 0, "broken Montgomery multiply");
7523 
7524     //     t0 = t1; t1 = t2; t2 = 0;
7525     //   }
7526 
7527     //   for (i = len; i < 2*len; i++) {
7528     //     int j;
7529 
7530     //     Pa = Pa_base + i-len;
7531     //     Pb = Pb_base + len;
7532     //     Pm = Pm_base + i-len;
7533     //     Pn = Pn_base + len;
7534 
7535     //     Ra = *++Pa;
7536     //     Rb = *--Pb;
7537     //     Rm = *++Pm;
7538     //     Rn = *--Pn;
7539 
7540     //     int iters = len*2-i-1;
7541     //     for (j = i-len+1; iters--; j++) {
7542     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7543     //       MACC(Ra, Rb, t0, t1, t2);
7544     //       Ra = *++Pa;
7545     //       Rb = *--Pb;
7546     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7547     //       MACC(Rm, Rn, t0, t1, t2);
7548     //       Rm = *++Pm;
7549     //       Rn = *--Pn;
7550     //     }
7551 
7552     //     Pm_base[i-len] = t0;
7553     //     t0 = t1; t1 = t2; t2 = 0;
7554     //   }
7555 
7556     //   while (t0)
7557     //     t0 = sub(Pm_base, Pn_base, t0, len);
7558     // }
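
     // The MACC/MACC2 helpers used in the model above (and in the squaring
     // model below) add a 64x64->128-bit product into the triple-precision
     // accumulator t2:t1:t0.  A minimal sketch, assuming a 128-bit unsigned
     // type is available:
     //
     //   static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
     //     unsigned __int128 p  = (unsigned __int128)a * b;
     //     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p;
     //     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64)
     //                            + (julong)(lo >> 64);
     //     t0 = (julong)lo;
     //     t1 = (julong)hi;
     //     t2 += (julong)(hi >> 64);
     //   }
     //
     //   // MACC2 accumulates 2*a*b in the same way; the squaring code uses
     //   // it for the off-diagonal products, which occur twice in a square.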
7559 
7560     /**
7561      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7562      * multiplies than Montgomery multiplication so it should be up to
7563      * 25% faster.  However, its loop control is more complex and it
7564      * may actually run slower on some machines.
7565      *
7566      * Arguments:
7567      *
7568      * Inputs:
7569      *   c_rarg0   - int array elements a
7570      *   c_rarg1   - int array elements n (the modulus)
7571      *   c_rarg2   - int length
7572      *   c_rarg3   - int inv
7573      *   c_rarg4   - int array elements m (the result)
7574      *
7575      */
7576     address generate_square() {
7577       Label argh;
7578       bind(argh);
7579       stop("MontgomeryMultiply total_allocation must be <= 8192");
7580 
7581       align(CodeEntryAlignment);
7582       address entry = pc();
7583 
7584       enter();
7585 
7586       // Make room.
7587       cmpw(Rlen, 512);
7588       br(Assembler::HI, argh);
7589       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7590       andr(sp, Ra, -2 * wordSize);
7591 
7592       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7593 
7594       {
7595         // Copy input args, reversing as we go.  We use Ra as a
7596         // temporary variable.
7597         reverse(Ra, Pa_base, Rlen, t0, t1);
7598         reverse(Ra, Pn_base, Rlen, t0, t1);
7599       }
7600 
7601       // Push all call-saved registers and also Pm_base which we'll need
7602       // at the end.
7603       save_regs();
7604 
7605       mov(Pm_base, Ra);
7606 
7607       mov(t0, zr);
7608       mov(t1, zr);
7609       mov(t2, zr);
7610 
7611       block_comment("for (int i = 0; i < len; i++) {");
7612       mov(Ri, zr); {
7613         Label loop, end;
7614         bind(loop);
7615         cmp(Ri, Rlen);
7616         br(Assembler::GE, end);
7617 
7618         pre1(Ri);
7619 
7620         block_comment("for (j = (i+1)/2; j; j--) {"); {
7621           add(Rj, Ri, 1);
7622           lsr(Rj, Rj, 1);
7623           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7624         } block_comment("  } // j");
7625 
7626         last_squaring(Ri);
7627 
7628         block_comment("  for (j = i/2; j; j--) {"); {
7629           lsr(Rj, Ri, 1);
7630           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7631         } block_comment("  } // j");
7632 
7633         post1_squaring();
7634         add(Ri, Ri, 1);
7635         cmp(Ri, Rlen);
7636         br(Assembler::LT, loop);
7637 
7638         bind(end);
7639         block_comment("} // i");
7640       }
7641 
7642       block_comment("for (int i = len; i < 2*len; i++) {");
7643       mov(Ri, Rlen); {
7644         Label loop, end;
7645         bind(loop);
7646         cmp(Ri, Rlen, Assembler::LSL, 1);
7647         br(Assembler::GE, end);
7648 
7649         pre2(Ri, Rlen);
7650 
7651         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7652           lsl(Rj, Rlen, 1);
7653           sub(Rj, Rj, Ri);
7654           sub(Rj, Rj, 1);
7655           lsr(Rj, Rj, 1);
7656           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7657         } block_comment("  } // j");
7658 
7659         last_squaring(Ri);
7660 
7661         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7662           lsl(Rj, Rlen, 1);
7663           sub(Rj, Rj, Ri);
7664           lsr(Rj, Rj, 1);
7665           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7666         } block_comment("  } // j");
7667 
7668         post2(Ri, Rlen);
7669         add(Ri, Ri, 1);
7670         cmp(Ri, Rlen, Assembler::LSL, 1);
7671 
7672         br(Assembler::LT, loop);
7673         bind(end);
7674         block_comment("} // i");
7675       }
7676 
7677       normalize(Rlen);
7678 
7679       mov(Ra, Pm_base);  // Save Pm_base in Ra
7680       restore_regs();  // Restore caller's Pm_base
7681 
7682       // Copy our result into caller's Pm_base
7683       reverse(Pm_base, Ra, Rlen, t0, t1);
7684 
7685       leave();
7686       ret(lr);
7687 
7688       return entry;
7689     }
7690     // In C, approximately:
7691 
7692     // void
7693     // montgomery_square(julong Pa_base[], julong Pn_base[],
7694     //                   julong Pm_base[], julong inv, int len) {
7695     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7696     //   julong *Pa, *Pb, *Pn, *Pm;
7697     //   julong Ra, Rb, Rn, Rm;
7698 
7699     //   int i;
7700 
7701     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7702 
7703     //   for (i = 0; i < len; i++) {
7704     //     int j;
7705 
7706     //     Pa = Pa_base;
7707     //     Pb = Pa_base + i;
7708     //     Pm = Pm_base;
7709     //     Pn = Pn_base + i;
7710 
7711     //     Ra = *Pa;
7712     //     Rb = *Pb;
7713     //     Rm = *Pm;
7714     //     Rn = *Pn;
7715 
7716     //     int iters = (i+1)/2;
7717     //     for (j = 0; iters--; j++) {
7718     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7719     //       MACC2(Ra, Rb, t0, t1, t2);
7720     //       Ra = *++Pa;
7721     //       Rb = *--Pb;
7722     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7723     //       MACC(Rm, Rn, t0, t1, t2);
7724     //       Rm = *++Pm;
7725     //       Rn = *--Pn;
7726     //     }
7727     //     if ((i & 1) == 0) {
7728     //       assert(Ra == Pa_base[j], "must be");
7729     //       MACC(Ra, Ra, t0, t1, t2);
7730     //     }
7731     //     iters = i/2;
7732     //     assert(iters == i-j, "must be");
7733     //     for (; iters--; j++) {
7734     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7735     //       MACC(Rm, Rn, t0, t1, t2);
7736     //       Rm = *++Pm;
7737     //       Rn = *--Pn;
7738     //     }
7739 
7740     //     *Pm = Rm = t0 * inv;
7741     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7742     //     MACC(Rm, Rn, t0, t1, t2);
7743 
7744     //     assert(t0 == 0, "broken Montgomery multiply");
7745 
7746     //     t0 = t1; t1 = t2; t2 = 0;
7747     //   }
7748 
7749     //   for (i = len; i < 2*len; i++) {
7750     //     int start = i-len+1;
7751     //     int end = start + (len - start)/2;
7752     //     int j;
7753 
7754     //     Pa = Pa_base + i-len;
7755     //     Pb = Pa_base + len;
7756     //     Pm = Pm_base + i-len;
7757     //     Pn = Pn_base + len;
7758 
7759     //     Ra = *++Pa;
7760     //     Rb = *--Pb;
7761     //     Rm = *++Pm;
7762     //     Rn = *--Pn;
7763 
7764     //     int iters = (2*len-i-1)/2;
7765     //     assert(iters == end-start, "must be");
7766     //     for (j = start; iters--; j++) {
7767     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7768     //       MACC2(Ra, Rb, t0, t1, t2);
7769     //       Ra = *++Pa;
7770     //       Rb = *--Pb;
7771     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7772     //       MACC(Rm, Rn, t0, t1, t2);
7773     //       Rm = *++Pm;
7774     //       Rn = *--Pn;
7775     //     }
7776     //     if ((i & 1) == 0) {
7777     //       assert(Ra == Pa_base[j], "must be");
7778     //       MACC(Ra, Ra, t0, t1, t2);
7779     //     }
7780     //     iters =  (2*len-i)/2;
7781     //     assert(iters == len-j, "must be");
7782     //     for (; iters--; j++) {
7783     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7784     //       MACC(Rm, Rn, t0, t1, t2);
7785     //       Rm = *++Pm;
7786     //       Rn = *--Pn;
7787     //     }
7788     //     Pm_base[i-len] = t0;
7789     //     t0 = t1; t1 = t2; t2 = 0;
7790     //   }
7791 
7792     //   while (t0)
7793     //     t0 = sub(Pm_base, Pn_base, t0, len);
7794     // }
7795   };
7796 
7797 
7798   // Initialization
7799   void generate_initial() {
7800     // Generates initial stubs and initializes the entry points.
7801 
7802     // Entry points that exist on all platforms.  Note: This is code
7803     // that could be shared among different platforms - however the
7804     // benefit seems to be smaller than the disadvantage of having a
7805     // much more complicated generator structure. See also comment in
7806     // stubRoutines.hpp.
7807 
7808     StubRoutines::_forward_exception_entry = generate_forward_exception();
7809 
7810     StubRoutines::_call_stub_entry =
7811       generate_call_stub(StubRoutines::_call_stub_return_address);
7812 
7813     // The catch-exception entry is referenced by megamorphic calls.
7814     StubRoutines::_catch_exception_entry = generate_catch_exception();
7815 
7816     // Build this early so it's available for the interpreter.
7817     StubRoutines::_throw_StackOverflowError_entry =
7818       generate_throw_exception("StackOverflowError throw_exception",
7819                                CAST_FROM_FN_PTR(address,
7820                                                 SharedRuntime::throw_StackOverflowError));
7821     StubRoutines::_throw_delayed_StackOverflowError_entry =
7822       generate_throw_exception("delayed StackOverflowError throw_exception",
7823                                CAST_FROM_FN_PTR(address,
7824                                                 SharedRuntime::throw_delayed_StackOverflowError));
7825     if (UseCRC32Intrinsics) {
7826       // Set the table address before generating the stubs that use it.
7827       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7828       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7829     }
7830 
7831     if (UseCRC32CIntrinsics) {
7832       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7833     }
7834 
7835     // Disabled until JDK-8210858 is fixed
7836     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7837     //   StubRoutines::_dlog = generate_dlog();
7838     // }
7839 
7840     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7841       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7842     }
7843 
7844     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7845       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7846     }
7847 
7848     StubRoutines::_load_nklass = generate_load_nklass();
7849   }
7850 
7851   void generate_phase1() {
7852     // Continuation stubs:
7853     StubRoutines::_cont_thaw          = generate_cont_thaw();
7854     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7855     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7856 
7857     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
7858     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
7859   }
7860 
7861   void generate_all() {
7862     // support for verify_oop (must happen after universe_init)
7863     if (VerifyOops) {
7864       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7865     }
7866     StubRoutines::_throw_AbstractMethodError_entry =
7867       generate_throw_exception("AbstractMethodError throw_exception",
7868                                CAST_FROM_FN_PTR(address,
7869                                                 SharedRuntime::
7870                                                 throw_AbstractMethodError));
7871 
7872     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7873       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7874                                CAST_FROM_FN_PTR(address,
7875                                                 SharedRuntime::
7876                                                 throw_IncompatibleClassChangeError));
7877 
7878     StubRoutines::_throw_NullPointerException_at_call_entry =
7879       generate_throw_exception("NullPointerException at call throw_exception",
7880                                CAST_FROM_FN_PTR(address,
7881                                                 SharedRuntime::
7882                                                 throw_NullPointerException_at_call));
7883 
7884     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7885 
7886     // arraycopy stubs used by compilers
7887     generate_arraycopy_stubs();
7888 
7889     // countPositives stub for large arrays.
7890     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7891 
7892     // array equals stub for large arrays.
7893     if (!UseSimpleArrayEquals) {
7894       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7895     }
7896 
7897     generate_compare_long_strings();
7898 
7899     generate_string_indexof_stubs();
7900 
7901     // byte_array_inflate stub for large arrays.
7902     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7903 
7904     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7905     if (bs_nm != NULL) {
7906       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7907     }
7908 #ifdef COMPILER2
7909     if (UseMultiplyToLenIntrinsic) {
7910       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7911     }
7912 
7913     if (UseSquareToLenIntrinsic) {
7914       StubRoutines::_squareToLen = generate_squareToLen();
7915     }
7916 
7917     if (UseMulAddIntrinsic) {
7918       StubRoutines::_mulAdd = generate_mulAdd();
7919     }
7920 
7921     if (UseSIMDForBigIntegerShiftIntrinsics) {
7922       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7923       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7924     }
7925 
7926     if (UseMontgomeryMultiplyIntrinsic) {
7927       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7928       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7929       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7930     }
7931 
7932     if (UseMontgomerySquareIntrinsic) {
7933       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7934       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7935       // We use generate_multiply() rather than generate_square()
7936       // because it's faster for the sizes of modulus we care about.
7937       StubRoutines::_montgomerySquare = g.generate_multiply();
7938     }
7939 #endif // COMPILER2
7940 
7941     if (UseBASE64Intrinsics) {
7942         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7943         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7944     }
7945 
7946     // data cache line writeback
7947     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7948     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7949 
7950     if (UseAESIntrinsics) {
7951       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7952       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7953       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7954       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7955       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7956     }
7957     if (UseGHASHIntrinsics) {
7958       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7959       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7960     }
7961     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7962       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7963     }
7964 
7965     if (UseMD5Intrinsics) {
7966       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7967       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7968     }
7969     if (UseSHA1Intrinsics) {
7970       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7971       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7972     }
7973     if (UseSHA256Intrinsics) {
7974       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7975       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7976     }
7977     if (UseSHA512Intrinsics) {
7978       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7979       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7980     }
7981     if (UseSHA3Intrinsics) {
7982       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7983       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7984     }
7985 
7986     // generate Adler32 intrinsics code
7987     if (UseAdler32Intrinsics) {
7988       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7989     }
7990 
7991     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7992 
7993 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7994 
7995     generate_atomic_entry_points();
7996 
7997 #endif // LINUX && !__ARM_FEATURE_ATOMICS
7998 
7999     StubRoutines::aarch64::set_completed();
8000   }
8001 
8002  public:
8003   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8004     if (phase == 0) {
8005       generate_initial();
8006     } else if (phase == 1) {
8007       generate_phase1(); // stubs that must be available for the interpreter
8008     } else {
8009       generate_all();
8010     }
8011   }
8012 }; // end class declaration
8013 
8014 #define UCM_TABLE_MAX_ENTRIES 8
8015 void StubGenerator_generate(CodeBuffer* code, int phase) {
8016   if (UnsafeCopyMemory::_table == NULL) {
8017     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8018   }
8019   StubGenerator g(code, phase);
8020 }
8021 
8022 
8023 #if defined (LINUX)
8024 
8025 // Define pointers to atomic stubs and initialize them to point to the
8026 // code in atomic_aarch64.S.
8027 
8028 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8029   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8030     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8031   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8032     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
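
     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) below expands to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;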
8033 
8034 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8035 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8036 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8037 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8038 DEFAULT_ATOMIC_OP(xchg, 4, )
8039 DEFAULT_ATOMIC_OP(xchg, 8, )
8040 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8041 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8042 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8043 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8044 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8045 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8046 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8047 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8048 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8049 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8050 
8051 #undef DEFAULT_ATOMIC_OP
8052 
8053 #endif // LINUX
8054 
8055 
8056 #undef __
8057 #define __ masm->
8058 
8059 // on exit, sp points to the ContinuationEntry
8060 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
8061   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
8062   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
8063   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
8064 
8065   stack_slots += (int)ContinuationEntry::size()/wordSize;
8066   __ sub(sp, sp, (int)ContinuationEntry::size()); // place Continuation metadata
8067 
8068   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize)/ VMRegImpl::stack_slot_size, 0 /* arg_slots*/);
8069   ContinuationEntry::setup_oopmap(map);
8070 
8071   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8072   __ str(rscratch1, Address(sp, ContinuationEntry::parent_offset()));
8073   __ mov(rscratch1, sp); // we can't use sp as the source in str
8074   __ str(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8075 
8076   return map;
8077 }
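
     // In rough C-style pseudo-code (OopMap bookkeeping omitted; field
     // names follow the offsets used above):
     //
     //   sp -= ContinuationEntry::size();
     //   ((ContinuationEntry*)sp)->parent = thread->cont_entry;
     //   thread->cont_entry = (ContinuationEntry*)sp;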
8078 
8079 // on entry c_rarg1 points to the continuation
8080 //          sp points to ContinuationEntry
8081 //          c_rarg3 -- isVirtualThread
8082 void fill_continuation_entry(MacroAssembler* masm) {
8083 #ifdef ASSERT
8084   __ movw(rscratch1, ContinuationEntry::cookie_value());
8085   __ strw(rscratch1, Address(sp, ContinuationEntry::cookie_offset()));
8086 #endif
8087 
8088   __ str (c_rarg1, Address(sp, ContinuationEntry::cont_offset()));
8089   __ strw(c_rarg3, Address(sp, ContinuationEntry::flags_offset()));
8090   __ str (zr,      Address(sp, ContinuationEntry::chunk_offset()));
8091   __ strw(zr,      Address(sp, ContinuationEntry::argsize_offset()));
8092   __ strw(zr,      Address(sp, ContinuationEntry::pin_count_offset()));
8093 
8094   __ ldr(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
8095   __ str(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
8096   __ ldr(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
8097   __ str(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
8098 
8099   __ str(zr, Address(rthread, JavaThread::cont_fastpath_offset()));
8100   __ str(zr, Address(rthread, JavaThread::held_monitor_count_offset()));
8101 }
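
     // In rough C-style pseudo-code (field names mirror the offsets used
     // above):
     //
     //   ContinuationEntry* e = (ContinuationEntry*)sp;
     //   e->cont      = c_rarg1;          // the continuation oop
     //   e->flags     = c_rarg3;          // isVirtualThread
     //   e->chunk     = NULL;
     //   e->argsize   = 0;
     //   e->pin_count = 0;
     //   e->parent_cont_fastpath      = thread->cont_fastpath;
     //   e->parent_held_monitor_count = thread->held_monitor_count;
     //   thread->cont_fastpath        = NULL;
     //   thread->held_monitor_count   = 0;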
8102 
8103 // on entry, sp points to the ContinuationEntry
8104 // on exit, rfp points to the spilled rfp in the entry frame
8105 void continuation_enter_cleanup(MacroAssembler* masm) {
8106 #ifndef PRODUCT
8107   Label OK;
8108   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8109   __ cmp(sp, rscratch1);
8110   __ br(Assembler::EQ, OK);
8111   __ stop("incorrect sp1");
8112   __ bind(OK);
8113 #endif
8114 
8115   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
8116   __ str(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
8117   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
8118   __ str(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
8119 
8120   __ ldr(rscratch2, Address(sp, ContinuationEntry::parent_offset()));
8121   __ str(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
8122   __ add(rfp, sp, (int)ContinuationEntry::size());
8123 }
8124 
8125 #undef __