1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
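// A rough reading of TIMES_OOP: it is the scaled-index extend used when
// addressing oop array elements. The 32-bit index is sign-extended (sxtw)
// and shifted left by log2 of the oop size, i.e. by 2 with compressed oops
// (4-byte narrow oops) and by 3 otherwise, so Address(base, index, TIMES_OOP)
// resolves to base + index * oop_size.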
  70 
  71 #ifdef PRODUCT
  72 #define BLOCK_COMMENT(str) /* nothing */
  73 #else
  74 #define BLOCK_COMMENT(str) __ block_comment(str)
  75 #endif
  76 
  77 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  78 
  79 // Stub Code definitions
  80 
  81 class StubGenerator: public StubCodeGenerator {
  82  private:
  83 
  84 #ifdef PRODUCT
  85 #define inc_counter_np(counter) ((void)0)
  86 #else
  87   void inc_counter_np_(int& counter) {
  88     __ lea(rscratch2, ExternalAddress((address)&counter));
  89     __ ldrw(rscratch1, Address(rscratch2));
  90     __ addw(rscratch1, rscratch1, 1);
  91     __ strw(rscratch1, Address(rscratch2));
  92   }
  93 #define inc_counter_np(counter) \
  94   BLOCK_COMMENT("inc_counter " #counter); \
  95   inc_counter_np_(counter);
  96 #endif
  97 
  98   // Call stubs are used to call Java from C
  99   //
 100   // Arguments:
 101   //    c_rarg0:   call wrapper address                   address
 102   //    c_rarg1:   result                                 address
 103   //    c_rarg2:   result type                            BasicType
 104   //    c_rarg3:   method                                 Method*
 105   //    c_rarg4:   (interpreter) entry point              address
 106   //    c_rarg5:   parameters                             intptr_t*
 107   //    c_rarg6:   parameter size (in words)              int
 108   //    c_rarg7:   thread                                 Thread*
 109   //
 110   // There is no return from the stub itself as any Java result
 111   // is written to result
 112   //
 113   // we save r30 (lr) as the return PC at the base of the frame and
 114   // link r29 (fp) below it as the frame pointer installing sp (r31)
 115   // into fp.
 116   //
 117   // we save r0-r7, which accounts for all the c arguments.
 118   //
 119   // TODO: strictly do we need to save them all? they are treated as
 120   // volatile by C so could we omit saving the ones we are going to
 121   // place in global registers (thread? method?) or those we only use
 122   // during setup of the Java call?
 123   //
 124   // we don't need to save r8 which C uses as an indirect result location
 125   // return register.
 126   //
 127   // we don't need to save r9-r15 which both C and Java treat as
 128   // volatile
 129   //
 130   // we don't need to save r16-18 because Java does not use them
 131   //
 132   // we save r19-r28 which Java uses as scratch registers and C
 133   // expects to be callee-save
 134   //
 135   // we save the bottom 64 bits of each value stored in v8-v15; it is
 136   // the responsibility of the caller to preserve larger values.
 137   //
 138   // so the stub frame looks like this when we enter Java code
 139   //
 140   //     [ return_from_Java     ] <--- sp
 141   //     [ argument word n      ]
 142   //      ...
 143   // -27 [ argument word 1      ]
 144   // -26 [ saved v15            ] <--- sp_after_call
 145   // -25 [ saved v14            ]
 146   // -24 [ saved v13            ]
 147   // -23 [ saved v12            ]
 148   // -22 [ saved v11            ]
 149   // -21 [ saved v10            ]
 150   // -20 [ saved v9             ]
 151   // -19 [ saved v8             ]
 152   // -18 [ saved r28            ]
 153   // -17 [ saved r27            ]
 154   // -16 [ saved r26            ]
 155   // -15 [ saved r25            ]
 156   // -14 [ saved r24            ]
 157   // -13 [ saved r23            ]
 158   // -12 [ saved r22            ]
 159   // -11 [ saved r21            ]
 160   // -10 [ saved r20            ]
 161   //  -9 [ saved r19            ]
 162   //  -8 [ call wrapper    (r0) ]
 163   //  -7 [ result          (r1) ]
 164   //  -6 [ result type     (r2) ]
 165   //  -5 [ method          (r3) ]
 166   //  -4 [ entry point     (r4) ]
 167   //  -3 [ parameters      (r5) ]
 168   //  -2 [ parameter size  (r6) ]
 169   //  -1 [ thread (r7)          ]
 170   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 171   //   1 [ saved lr       (r30) ]
 172 
 173   // Call stub stack layout word offsets from fp
 174   enum call_stub_layout {
 175     sp_after_call_off = -26,
 176 
 177     d15_off            = -26,
 178     d13_off            = -24,
 179     d11_off            = -22,
 180     d9_off             = -20,
 181 
 182     r28_off            = -18,
 183     r26_off            = -16,
 184     r24_off            = -14,
 185     r22_off            = -12,
 186     r20_off            = -10,
 187     call_wrapper_off   =  -8,
 188     result_off         =  -7,
 189     result_type_off    =  -6,
 190     method_off         =  -5,
 191     entry_point_off    =  -4,
 192     parameter_size_off =  -2,
 193     thread_off         =  -1,
 194     fp_f               =   0,
 195     retaddr_off        =   1,
 196   };
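
  // For example, with wordSize == 8 the result slot lives at
  // rfp + result_off * wordSize == rfp - 56 and the last saved FP register
  // slot (d15_off) at rfp - 208; generate_call_stub() below builds each
  // Address from these offsets in exactly that way.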
 197 
 198   address generate_call_stub(address& return_address) {
 199     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 200            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 201            "adjust this code");
 202 
 203     StubCodeMark mark(this, "StubRoutines", "call_stub");
 204     address start = __ pc();
 205 
 206     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 207 
 208     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 209     const Address result        (rfp, result_off         * wordSize);
 210     const Address result_type   (rfp, result_type_off    * wordSize);
 211     const Address method        (rfp, method_off         * wordSize);
 212     const Address entry_point   (rfp, entry_point_off    * wordSize);
 213     const Address parameter_size(rfp, parameter_size_off * wordSize);
 214 
 215     const Address thread        (rfp, thread_off         * wordSize);
 216 
 217     const Address d15_save      (rfp, d15_off * wordSize);
 218     const Address d13_save      (rfp, d13_off * wordSize);
 219     const Address d11_save      (rfp, d11_off * wordSize);
 220     const Address d9_save       (rfp, d9_off * wordSize);
 221 
 222     const Address r28_save      (rfp, r28_off * wordSize);
 223     const Address r26_save      (rfp, r26_off * wordSize);
 224     const Address r24_save      (rfp, r24_off * wordSize);
 225     const Address r22_save      (rfp, r22_off * wordSize);
 226     const Address r20_save      (rfp, r20_off * wordSize);
 227 
 228     // stub code
 229 
 230     address aarch64_entry = __ pc();
 231 
 232     // set up frame and move sp to end of save area
 233     __ enter();
 234     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 235 
 236     // save register parameters and Java scratch/global registers
 237     // n.b. we save thread even though it gets installed in
 238     // rthread because we want to sanity check rthread later
 239     __ str(c_rarg7,  thread);
 240     __ strw(c_rarg6, parameter_size);
 241     __ stp(c_rarg4, c_rarg5,  entry_point);
 242     __ stp(c_rarg2, c_rarg3,  result_type);
 243     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 244 
 245     __ stp(r20, r19,   r20_save);
 246     __ stp(r22, r21,   r22_save);
 247     __ stp(r24, r23,   r24_save);
 248     __ stp(r26, r25,   r26_save);
 249     __ stp(r28, r27,   r28_save);
 250 
 251     __ stpd(v9,  v8,   d9_save);
 252     __ stpd(v11, v10,  d11_save);
 253     __ stpd(v13, v12,  d13_save);
 254     __ stpd(v15, v14,  d15_save);
 255 
 256     // install Java thread in global register now we have saved
 257     // whatever value it held
 258     __ mov(rthread, c_rarg7);
 259     // And method
 260     __ mov(rmethod, c_rarg3);
 261 
 262     // set up the heapbase register
 263     __ reinit_heapbase();
 264 
 265 #ifdef ASSERT
 266     // make sure we have no pending exceptions
 267     {
 268       Label L;
 269       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 270       __ cmp(rscratch1, (u1)NULL_WORD);
 271       __ br(Assembler::EQ, L);
 272       __ stop("StubRoutines::call_stub: entered with pending exception");
 273       __ BIND(L);
 274     }
 275 #endif
 276     // pass parameters if any
 277     __ mov(esp, sp);
 278     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 279     __ andr(sp, rscratch1, -2 * wordSize);
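    // e.g. with 3 parameters the new sp is (old sp - 3 * wordSize) rounded
    // down to a 16-byte boundary, preserving the AArch64 requirement that
    // sp stay 16-byte aligned.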
 280 
 281     BLOCK_COMMENT("pass parameters if any");
 282     Label parameters_done;
 283     // parameter count is still in c_rarg6
 284     // and parameter pointer identifying param 1 is in c_rarg5
 285     __ cbzw(c_rarg6, parameters_done);
 286 
 287     address loop = __ pc();
 288     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 289     __ subsw(c_rarg6, c_rarg6, 1);
 290     __ push(rscratch1);
 291     __ br(Assembler::GT, loop);
 292 
 293     __ BIND(parameters_done);
 294 
    // call Java entry -- passing Method* and current sp
 296     //      rmethod: Method*
 297     //      r19_sender_sp: sender sp
 298     BLOCK_COMMENT("call Java function");
 299     __ mov(r19_sender_sp, sp);
 300     __ blr(c_rarg4);
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     // save current address for use by exception handling code
 312 
 313     return_address = __ pc();
 314 
 315     // store result depending on type (everything that is not
 316     // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 317     // n.b. this assumes Java returns an integral result in r0
 318     // and a floating result in j_farg0
 319     // All of j_rargN may be used to return inline type fields so be careful
 320     // not to clobber those.
 321     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 322     // assignment of Rresult below.
 323     Register Rresult = r14, Rresult_type = r15;
 324     __ ldr(Rresult, result);
 325     Label is_long, is_float, is_double, check_prim, exit;
 326     __ ldr(Rresult_type, result_type);
 327     __ cmp(Rresult_type, (u1)T_OBJECT);
 328     __ br(Assembler::EQ, check_prim);
 329     __ cmp(Rresult_type, (u1)T_PRIMITIVE_OBJECT);
 330     __ br(Assembler::EQ, check_prim);
 331     __ cmp(Rresult_type, (u1)T_LONG);
 332     __ br(Assembler::EQ, is_long);
 333     __ cmp(Rresult_type, (u1)T_FLOAT);
 334     __ br(Assembler::EQ, is_float);
 335     __ cmp(Rresult_type, (u1)T_DOUBLE);
 336     __ br(Assembler::EQ, is_double);
 337 
 338     // handle T_INT case
 339     __ strw(r0, Address(Rresult));
 340 
 341     __ BIND(exit);
 342 
 343     // pop parameters
 344     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 345 
 346 #ifdef ASSERT
 347     // verify that threads correspond
 348     {
 349       Label L, S;
 350       __ ldr(rscratch1, thread);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::NE, S);
 353       __ get_thread(rscratch1);
 354       __ cmp(rthread, rscratch1);
 355       __ br(Assembler::EQ, L);
 356       __ BIND(S);
 357       __ stop("StubRoutines::call_stub: threads must correspond");
 358       __ BIND(L);
 359     }
 360 #endif
 361 
 362     __ pop_cont_fastpath(rthread);
 363 
 364     // restore callee-save registers
 365     __ ldpd(v15, v14,  d15_save);
 366     __ ldpd(v13, v12,  d13_save);
 367     __ ldpd(v11, v10,  d11_save);
 368     __ ldpd(v9,  v8,   d9_save);
 369 
 370     __ ldp(r28, r27,   r28_save);
 371     __ ldp(r26, r25,   r26_save);
 372     __ ldp(r24, r23,   r24_save);
 373     __ ldp(r22, r21,   r22_save);
 374     __ ldp(r20, r19,   r20_save);
 375 
 376     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 377     __ ldrw(c_rarg2, result_type);
 378     __ ldr(c_rarg3,  method);
 379     __ ldp(c_rarg4, c_rarg5,  entry_point);
 380     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 381 
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387     __ BIND(check_prim);
 388     if (InlineTypeReturnedAsFields) {
 389       // Check for scalarized return value
 390       __ tbz(r0, 0, is_long);
 391       // Load pack handler address
 392       __ andr(rscratch1, r0, -2);
 393       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 394       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 395       __ blr(rscratch1);
 396       __ b(exit);
 397     }
 398 
 399     __ BIND(is_long);
 400     __ str(r0, Address(Rresult, 0));
 401     __ br(Assembler::AL, exit);
 402 
 403     __ BIND(is_float);
 404     __ strs(j_farg0, Address(Rresult, 0));
 405     __ br(Assembler::AL, exit);
 406 
 407     __ BIND(is_double);
 408     __ strd(j_farg0, Address(Rresult, 0));
 409     __ br(Assembler::AL, exit);
 410 
 411     return start;
 412   }
 413 
 414   // Return point for a Java call if there's an exception thrown in
 415   // Java code.  The exception is caught and transformed into a
 416   // pending exception stored in JavaThread that can be tested from
 417   // within the VM.
 418   //
 419   // Note: Usually the parameters are removed by the callee. In case
 420   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the stack pointer.
 423   //
 424   // r0: exception oop
 425 
 426   address generate_catch_exception() {
 427     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 428     address start = __ pc();
 429 
 430     // same as in generate_call_stub():
 431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 432     const Address thread        (rfp, thread_off         * wordSize);
 433 
 434 #ifdef ASSERT
 435     // verify that threads correspond
 436     {
 437       Label L, S;
 438       __ ldr(rscratch1, thread);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::NE, S);
 441       __ get_thread(rscratch1);
 442       __ cmp(rthread, rscratch1);
 443       __ br(Assembler::EQ, L);
 444       __ bind(S);
 445       __ stop("StubRoutines::catch_exception: threads must correspond");
 446       __ bind(L);
 447     }
 448 #endif
 449 
 450     // set pending exception
 451     __ verify_oop(r0);
 452 
 453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 454     __ mov(rscratch1, (address)__FILE__);
 455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 456     __ movw(rscratch1, (int)__LINE__);
 457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 458 
 459     // complete return to VM
 460     assert(StubRoutines::_call_stub_return_address != NULL,
 461            "_call_stub_return_address must have been generated before");
 462     __ b(StubRoutines::_call_stub_return_address);
 463 
 464     return start;
 465   }
 466 
 467   // Continuation point for runtime calls returning with a pending
 468   // exception.  The pending exception check happened in the runtime
 469   // or native call stub.  The pending exception in Thread is
 470   // converted into a Java-level exception.
 471   //
 472   // Contract with Java-level exception handlers:
 473   // r0: exception
 474   // r3: throwing pc
 475   //
 476   // NOTE: At entry of this stub, exception-pc must be in LR !!
 477 
 478   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 480 
 481   address generate_forward_exception() {
 482     StubCodeMark mark(this, "StubRoutines", "forward exception");
 483     address start = __ pc();
 484 
 485     // Upon entry, LR points to the return address returning into
 486     // Java (interpreted or compiled) code; i.e., the return address
 487     // becomes the throwing pc.
 488     //
 489     // Arguments pushed before the runtime call are still on the stack
 490     // but the exception handler will reset the stack pointer ->
 491     // ignore them.  A potential result in registers can be ignored as
 492     // well.
 493 
 494 #ifdef ASSERT
 495     // make sure this code is only executed if there is a pending exception
 496     {
 497       Label L;
 498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 499       __ cbnz(rscratch1, L);
 500       __ stop("StubRoutines::forward exception: no pending exception (1)");
 501       __ bind(L);
 502     }
 503 #endif
 504 
 505     // compute exception handler into r19
 506 
 507     // call the VM to find the handler address associated with the
 508     // caller address. pass thread in r0 and caller pc (ret address)
 509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 510     // the stack.
 511     __ mov(c_rarg1, lr);
 512     // lr will be trashed by the VM call so we move it to R19
 513     // (callee-saved) because we also need to pass it to the handler
 514     // returned by this call.
 515     __ mov(r19, lr);
 516     BLOCK_COMMENT("call exception_handler_for_return_address");
 517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 518                          SharedRuntime::exception_handler_for_return_address),
 519                     rthread, c_rarg1);
 520     // Reinitialize the ptrue predicate register, in case the external runtime
 521     // call clobbers ptrue reg, as we may return to SVE compiled code.
 522     __ reinitialize_ptrue();
 523 
 524     // we should not really care that lr is no longer the callee
 525     // address. we saved the value the handler needs in r19 so we can
 526     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 528     // the PC for the frame above the handler belongs to a compiled
 529     // Java method. So, we restore lr here to satisfy that assert.
 530     __ mov(lr, r19);
 531     // setup r0 & r3 & clear pending exception
 532     __ mov(r3, r19);
 533     __ mov(r19, r0);
 534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 536 
 537 #ifdef ASSERT
 538     // make sure exception is set
 539     {
 540       Label L;
 541       __ cbnz(r0, L);
 542       __ stop("StubRoutines::forward exception: no pending exception (2)");
 543       __ bind(L);
 544     }
 545 #endif
 546 
 547     // continue at exception handler
 548     // r0: exception
 549     // r3: throwing pc
 550     // r19: exception handler
 551     __ verify_oop(r0);
 552     __ br(r19);
 553 
 554     return start;
 555   }
 556 
 557   // Non-destructive plausibility checks for oops
 558   //
 559   // Arguments:
 560   //    r0: oop to verify
 561   //    rscratch1: error message
 562   //
 563   // Stack after saving c_rarg3:
 564   //    [tos + 0]: saved c_rarg3
 565   //    [tos + 1]: saved c_rarg2
 566   //    [tos + 2]: saved lr
 567   //    [tos + 3]: saved rscratch2
 568   //    [tos + 4]: saved r0
 569   //    [tos + 5]: saved rscratch1
 570   address generate_verify_oop() {
 571 
 572     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 573     address start = __ pc();
 574 
 575     Label exit, error;
 576 
 577     // save c_rarg2 and c_rarg3
 578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 579 
 580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 582     __ ldr(c_rarg3, Address(c_rarg2));
 583     __ add(c_rarg3, c_rarg3, 1);
 584     __ str(c_rarg3, Address(c_rarg2));
 585 
 586     // object is in r0
 587     // make sure object is 'reasonable'
 588     __ cbz(r0, exit); // if obj is NULL it is OK
 589 
 590 #if INCLUDE_ZGC
 591     if (UseZGC) {
 592       // Check if mask is good.
 593       // verifies that ZAddressBadMask & r0 == 0
 594       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 595       __ andr(c_rarg2, r0, c_rarg3);
 596       __ cbnz(c_rarg2, error);
 597     }
 598 #endif
 599 
 600     // Check if the oop is in the right area of memory
 601     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 602     __ andr(c_rarg2, r0, c_rarg3);
 603     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 604 
 605     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 606     // instruction here because the flags register is live.
 607     __ eor(c_rarg2, c_rarg2, c_rarg3);
 608     __ cbnz(c_rarg2, error);
 609 
    // make sure klass is 'reasonable', i.e. not zero.
 611     __ load_klass(r0, r0);  // get klass
 612     __ cbz(r0, error);      // if klass is NULL it is broken
 613 
 614     // return if everything seems ok
 615     __ bind(exit);
 616 
 617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 618     __ ret(lr);
 619 
 620     // handle errors
 621     __ bind(error);
 622     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 623 
 624     __ push(RegSet::range(r0, r29), sp);
 625     // debug(char* msg, int64_t pc, int64_t regs[])
 626     __ mov(c_rarg0, rscratch1);      // pass address of error message
 627     __ mov(c_rarg1, lr);             // pass return address
 628     __ mov(c_rarg2, sp);             // pass address of regs on stack
 629 #ifndef PRODUCT
 630     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 631 #endif
 632     BLOCK_COMMENT("call MacroAssembler::debug");
 633     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 634     __ blr(rscratch1);
 635     __ hlt(0);
 636 
 637     return start;
 638   }
 639 
 640   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 641 
 642   // Generate indices for iota vector.
 643   address generate_iota_indices(const char *stub_name) {
 644     __ align(CodeEntryAlignment);
 645     StubCodeMark mark(this, "StubRoutines", stub_name);
 646     address start = __ pc();
 647     // B
 648     __ emit_data64(0x0706050403020100, relocInfo::none);
 649     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 650     // H
 651     __ emit_data64(0x0003000200010000, relocInfo::none);
 652     __ emit_data64(0x0007000600050004, relocInfo::none);
 653     // S
 654     __ emit_data64(0x0000000100000000, relocInfo::none);
 655     __ emit_data64(0x0000000300000002, relocInfo::none);
 656     // D
 657     __ emit_data64(0x0000000000000000, relocInfo::none);
 658     __ emit_data64(0x0000000000000001, relocInfo::none);
 659     // S - FP
 660     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 661     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 662     // D - FP
 663     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 664     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 665     return start;
 666   }
 667 
 668   // The inner part of zero_words().  This is the bulk operation,
 669   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 670   // caller is responsible for zeroing the last few words.
 671   //
 672   // Inputs:
 673   // r10: the HeapWord-aligned base address of an array to zero.
 674   // r11: the count in HeapWords, r11 > 0.
 675   //
 676   // Returns r10 and r11, adjusted for the caller to clear.
 677   // r10: the base address of the tail of words left to clear.
 678   // r11: the number of words in the tail.
 679   //      r11 < MacroAssembler::zero_words_block_size.
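  //
  // For example (taking the non-ZVA path): called with r10 = base and
  // r11 = 203, the unrolled stp loop below clears whole blocks of
  // zero_words_block_size words, leaving r11 == 203 % zero_words_block_size
  // (3 for a block size of 8) and r10 advanced past everything cleared.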
 680 
 681   address generate_zero_blocks() {
 682     Label done;
 683     Label base_aligned;
 684 
 685     Register base = r10, cnt = r11;
 686 
 687     __ align(CodeEntryAlignment);
 688     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 689     address start = __ pc();
 690 
 691     if (UseBlockZeroing) {
 692       int zva_length = VM_Version::zva_length();
 693 
 694       // Ensure ZVA length can be divided by 16. This is required by
 695       // the subsequent operations.
 696       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 697 
 698       __ tbz(base, 3, base_aligned);
 699       __ str(zr, Address(__ post(base, 8)));
 700       __ sub(cnt, cnt, 1);
 701       __ bind(base_aligned);
 702 
 703       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 704       // alignment.
 705       Label small;
 706       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 707       __ subs(rscratch1, cnt, low_limit >> 3);
 708       __ br(Assembler::LT, small);
 709       __ zero_dcache_blocks(base, cnt);
 710       __ bind(small);
 711     }
 712 
 713     {
 714       // Number of stp instructions we'll unroll
 715       const int unroll =
 716         MacroAssembler::zero_words_block_size / 2;
 717       // Clear the remaining blocks.
 718       Label loop;
 719       __ subs(cnt, cnt, unroll * 2);
 720       __ br(Assembler::LT, done);
 721       __ bind(loop);
 722       for (int i = 0; i < unroll; i++)
 723         __ stp(zr, zr, __ post(base, 16));
 724       __ subs(cnt, cnt, unroll * 2);
 725       __ br(Assembler::GE, loop);
 726       __ bind(done);
 727       __ add(cnt, cnt, unroll * 2);
 728     }
 729 
 730     __ ret(lr);
 731 
 732     return start;
 733   }
 734 
 735 
 736   typedef enum {
 737     copy_forwards = 1,
 738     copy_backwards = -1
 739   } copy_direction;
 740 
 741   // Bulk copy of blocks of 8 words.
 742   //
 743   // count is a count of words.
 744   //
 745   // Precondition: count >= 8
 746   //
 747   // Postconditions:
 748   //
 749   // The least significant bit of count contains the remaining count
 750   // of words to copy.  The rest of count is trash.
 751   //
 752   // s and d are adjusted to point to the remaining words to copy
 753   //
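  // For example, a call with count == 23 copies 22 words: 16 via the main
  // loop and drain, then the 4- and 2-word tails selected by bits 2 and 1
  // of count. The odd word, flagged by bit 0 of count, is left for the
  // caller.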
 754   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 755                            copy_direction direction) {
 756     int unit = wordSize * direction;
 757     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 758 
 759     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 760       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 761     const Register stride = r13;
 762 
 763     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 764     assert_different_registers(s, d, count, rscratch1);
 765 
 766     Label again, drain;
 767     const char *stub_name;
 768     if (direction == copy_forwards)
 769       stub_name = "forward_copy_longs";
 770     else
 771       stub_name = "backward_copy_longs";
 772 
 773     __ align(CodeEntryAlignment);
 774 
 775     StubCodeMark mark(this, "StubRoutines", stub_name);
 776 
 777     __ bind(start);
 778 
 779     Label unaligned_copy_long;
 780     if (AvoidUnalignedAccesses) {
 781       __ tbnz(d, 3, unaligned_copy_long);
 782     }
 783 
 784     if (direction == copy_forwards) {
 785       __ sub(s, s, bias);
 786       __ sub(d, d, bias);
 787     }
 788 
 789 #ifdef ASSERT
 790     // Make sure we are never given < 8 words
 791     {
 792       Label L;
 793       __ cmp(count, (u1)8);
 794       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 796       __ bind(L);
 797     }
 798 #endif
 799 
 800     // Fill 8 registers
 801     if (UseSIMDForMemoryOps) {
 802       __ ldpq(v0, v1, Address(s, 4 * unit));
 803       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 804     } else {
 805       __ ldp(t0, t1, Address(s, 2 * unit));
 806       __ ldp(t2, t3, Address(s, 4 * unit));
 807       __ ldp(t4, t5, Address(s, 6 * unit));
 808       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 809     }
 810 
 811     __ subs(count, count, 16);
 812     __ br(Assembler::LO, drain);
 813 
 814     int prefetch = PrefetchCopyIntervalInBytes;
 815     bool use_stride = false;
 816     if (direction == copy_backwards) {
 817        use_stride = prefetch > 256;
 818        prefetch = -prefetch;
 819        if (use_stride) __ mov(stride, prefetch);
 820     }
 821 
 822     __ bind(again);
 823 
 824     if (PrefetchCopyIntervalInBytes > 0)
 825       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 826 
 827     if (UseSIMDForMemoryOps) {
 828       __ stpq(v0, v1, Address(d, 4 * unit));
 829       __ ldpq(v0, v1, Address(s, 4 * unit));
 830       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 831       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 832     } else {
 833       __ stp(t0, t1, Address(d, 2 * unit));
 834       __ ldp(t0, t1, Address(s, 2 * unit));
 835       __ stp(t2, t3, Address(d, 4 * unit));
 836       __ ldp(t2, t3, Address(s, 4 * unit));
 837       __ stp(t4, t5, Address(d, 6 * unit));
 838       __ ldp(t4, t5, Address(s, 6 * unit));
 839       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 840       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 841     }
 842 
 843     __ subs(count, count, 8);
 844     __ br(Assembler::HS, again);
 845 
 846     // Drain
 847     __ bind(drain);
 848     if (UseSIMDForMemoryOps) {
 849       __ stpq(v0, v1, Address(d, 4 * unit));
 850       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 851     } else {
 852       __ stp(t0, t1, Address(d, 2 * unit));
 853       __ stp(t2, t3, Address(d, 4 * unit));
 854       __ stp(t4, t5, Address(d, 6 * unit));
 855       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 856     }
 857 
 858     {
 859       Label L1, L2;
 860       __ tbz(count, exact_log2(4), L1);
 861       if (UseSIMDForMemoryOps) {
 862         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 863         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 864       } else {
 865         __ ldp(t0, t1, Address(s, 2 * unit));
 866         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 867         __ stp(t0, t1, Address(d, 2 * unit));
 868         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 869       }
 870       __ bind(L1);
 871 
 872       if (direction == copy_forwards) {
 873         __ add(s, s, bias);
 874         __ add(d, d, bias);
 875       }
 876 
 877       __ tbz(count, 1, L2);
 878       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 879       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 880       __ bind(L2);
 881     }
 882 
 883     __ ret(lr);
 884 
 885     if (AvoidUnalignedAccesses) {
 886       Label drain, again;
 887       // Register order for storing. Order is different for backward copy.
 888 
 889       __ bind(unaligned_copy_long);
 890 
 891       // source address is even aligned, target odd aligned
 892       //
 893       // when forward copying word pairs we read long pairs at offsets
 894       // {0, 2, 4, 6} (in long words). when backwards copying we read
 895       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 896       // address by -2 in the forwards case so we can compute the
 897       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 898       // or -1.
 899       //
 900       // when forward copying we need to store 1 word, 3 pairs and
 901       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
 906       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 907       // offsets {1, 3, 5, 7, 8} * unit.
 908 
 909       if (direction == copy_forwards) {
 910         __ sub(s, s, 16);
 911         __ sub(d, d, 8);
 912       }
 913 
 914       // Fill 8 registers
 915       //
 916       // for forwards copy s was offset by -16 from the original input
 917       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 919       // and so on for each successive 64 byte block when s is updated
 920       //
 921       // t0 at offset 0,  t1 at offset 8
 922       // t2 at offset 16, t3 at offset 24
 923       // t4 at offset 32, t5 at offset 40
 924       // t6 at offset 48, t7 at offset 56
 925 
 926       // for backwards copy s was not offset so the register contents
 927       // are at these offsets into the preceding 64 byte block
 928       // relative to that original input and so on for each successive
 929       // preceding 64 byte block when s is updated. this explains the
 930       // slightly counter-intuitive looking pattern of register usage
 931       // in the stp instructions for backwards copy.
 932       //
 933       // t0 at offset -16, t1 at offset -8
 934       // t2 at offset -32, t3 at offset -24
 935       // t4 at offset -48, t5 at offset -40
 936       // t6 at offset -64, t7 at offset -56
 937 
 938       __ ldp(t0, t1, Address(s, 2 * unit));
 939       __ ldp(t2, t3, Address(s, 4 * unit));
 940       __ ldp(t4, t5, Address(s, 6 * unit));
 941       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 942 
 943       __ subs(count, count, 16);
 944       __ br(Assembler::LO, drain);
 945 
 946       int prefetch = PrefetchCopyIntervalInBytes;
 947       bool use_stride = false;
 948       if (direction == copy_backwards) {
 949          use_stride = prefetch > 256;
 950          prefetch = -prefetch;
 951          if (use_stride) __ mov(stride, prefetch);
 952       }
 953 
 954       __ bind(again);
 955 
 956       if (PrefetchCopyIntervalInBytes > 0)
 957         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 958 
 959       if (direction == copy_forwards) {
 960        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 962        // offsets
 963        //
 964        // t0 at offset 0
 965        // t1 at offset 8,  t2 at offset 16
 966        // t3 at offset 24, t4 at offset 32
 967        // t5 at offset 40, t6 at offset 48
 968        // t7 at offset 56
 969 
 970         __ str(t0, Address(d, 1 * unit));
 971         __ stp(t1, t2, Address(d, 2 * unit));
 972         __ ldp(t0, t1, Address(s, 2 * unit));
 973         __ stp(t3, t4, Address(d, 4 * unit));
 974         __ ldp(t2, t3, Address(s, 4 * unit));
 975         __ stp(t5, t6, Address(d, 6 * unit));
 976         __ ldp(t4, t5, Address(s, 6 * unit));
 977         __ str(t7, Address(__ pre(d, 8 * unit)));
 978         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 979       } else {
 980        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 982        // offsets
 983        //
 984        // t1 at offset -8
 985        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 987        // t7 at offset -56, t4 at offset -48
 988        //                   t6 at offset -64
 989        //
 990        // note that this matches the offsets previously noted for the
 991        // loads
 992 
 993         __ str(t1, Address(d, 1 * unit));
 994         __ stp(t3, t0, Address(d, 3 * unit));
 995         __ ldp(t0, t1, Address(s, 2 * unit));
 996         __ stp(t5, t2, Address(d, 5 * unit));
 997         __ ldp(t2, t3, Address(s, 4 * unit));
 998         __ stp(t7, t4, Address(d, 7 * unit));
 999         __ ldp(t4, t5, Address(s, 6 * unit));
1000         __ str(t6, Address(__ pre(d, 8 * unit)));
1001         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1002       }
1003 
1004       __ subs(count, count, 8);
1005       __ br(Assembler::HS, again);
1006 
1007       // Drain
1008       //
1009       // this uses the same pattern of offsets and register arguments
1010       // as above
1011       __ bind(drain);
1012       if (direction == copy_forwards) {
1013         __ str(t0, Address(d, 1 * unit));
1014         __ stp(t1, t2, Address(d, 2 * unit));
1015         __ stp(t3, t4, Address(d, 4 * unit));
1016         __ stp(t5, t6, Address(d, 6 * unit));
1017         __ str(t7, Address(__ pre(d, 8 * unit)));
1018       } else {
1019         __ str(t1, Address(d, 1 * unit));
1020         __ stp(t3, t0, Address(d, 3 * unit));
1021         __ stp(t5, t2, Address(d, 5 * unit));
1022         __ stp(t7, t4, Address(d, 7 * unit));
1023         __ str(t6, Address(__ pre(d, 8 * unit)));
1024       }
      // now we need to copy any remaining partial block, which may
      // include a 4 word subblock and/or a 2 word subblock.
1027       // bits 2 and 1 in the count are the tell-tale for whether we
1028       // have each such subblock
1029       {
1030         Label L1, L2;
1031         __ tbz(count, exact_log2(4), L1);
1032        // this is the same as above but copying only 4 longs hence
1033        // with only one intervening stp between the str instructions
1034        // but note that the offsets and registers still follow the
1035        // same pattern
1036         __ ldp(t0, t1, Address(s, 2 * unit));
1037         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1038         if (direction == copy_forwards) {
1039           __ str(t0, Address(d, 1 * unit));
1040           __ stp(t1, t2, Address(d, 2 * unit));
1041           __ str(t3, Address(__ pre(d, 4 * unit)));
1042         } else {
1043           __ str(t1, Address(d, 1 * unit));
1044           __ stp(t3, t0, Address(d, 3 * unit));
1045           __ str(t2, Address(__ pre(d, 4 * unit)));
1046         }
1047         __ bind(L1);
1048 
1049         __ tbz(count, 1, L2);
1050        // this is the same as above but copying only 2 longs hence
1051        // there is no intervening stp between the str instructions
1052        // but note that the offset and register patterns are still
1053        // the same
1054         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1055         if (direction == copy_forwards) {
1056           __ str(t0, Address(d, 1 * unit));
1057           __ str(t1, Address(__ pre(d, 2 * unit)));
1058         } else {
1059           __ str(t1, Address(d, 1 * unit));
1060           __ str(t0, Address(__ pre(d, 2 * unit)));
1061         }
1062         __ bind(L2);
1063 
1064        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d point just past the last words written
1066 
1067        if (direction == copy_forwards) {
1068          __ add(s, s, 16);
1069          __ add(d, d, 8);
1070        }
1071 
1072       }
1073 
1074       __ ret(lr);
    }
1076   }
1077 
1078   // Small copy: less than 16 bytes.
1079   //
1080   // NB: Ignores all of the bits of count which represent more than 15
1081   // bytes, so a caller doesn't have to mask them.
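  //
  // For example, a byte copy (step == 1 or -1) with count == 27 moves
  // 8 + 2 + 1 == 11 bytes, driven by bits 3..0 of count; bit 4 and above
  // are ignored as noted above.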
1082 
1083   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1084     bool is_backwards = step < 0;
1085     size_t granularity = uabs(step);
1086     int direction = is_backwards ? -1 : 1;
1087     int unit = wordSize * direction;
1088 
1089     Label Lword, Lint, Lshort, Lbyte;
1090 
1091     assert(granularity
1092            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1093 
1094     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1095 
1096     // ??? I don't know if this bit-test-and-branch is the right thing
1097     // to do.  It does a lot of jumping, resulting in several
1098     // mispredicted branches.  It might make more sense to do this
1099     // with something like Duff's device with a single computed branch.
1100 
1101     __ tbz(count, 3 - exact_log2(granularity), Lword);
1102     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1103     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1104     __ bind(Lword);
1105 
1106     if (granularity <= sizeof (jint)) {
1107       __ tbz(count, 2 - exact_log2(granularity), Lint);
1108       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1109       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1110       __ bind(Lint);
1111     }
1112 
1113     if (granularity <= sizeof (jshort)) {
1114       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1115       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1116       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1117       __ bind(Lshort);
1118     }
1119 
1120     if (granularity <= sizeof (jbyte)) {
1121       __ tbz(count, 0, Lbyte);
1122       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1123       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1124       __ bind(Lbyte);
1125     }
1126   }
1127 
1128   Label copy_f, copy_b;
1129 
1130   // All-singing all-dancing memory copy.
1131   //
1132   // Copy count units of memory from s to d.  The size of a unit is
1133   // step, which can be positive or negative depending on the direction
1134   // of copy.  If is_aligned is false, we align the source address.
1135   //
1136 
1137   void copy_memory(bool is_aligned, Register s, Register d,
1138                    Register count, Register tmp, int step) {
1139     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1140     bool is_backwards = step < 0;
1141     unsigned int granularity = uabs(step);
1142     const Register t0 = r3, t1 = r4;
1143 
1144     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1145     // load all the data before writing anything
1146     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1147     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1148     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1149     const Register send = r17, dend = r16;
1150 
1151     if (PrefetchCopyIntervalInBytes > 0)
1152       __ prfm(Address(s, 0), PLDL1KEEP);
1153     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1154     __ br(Assembler::HI, copy_big);
1155 
1156     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1157     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1158 
1159     __ cmp(count, u1(16/granularity));
1160     __ br(Assembler::LS, copy16);
1161 
1162     __ cmp(count, u1(64/granularity));
1163     __ br(Assembler::HI, copy80);
1164 
1165     __ cmp(count, u1(32/granularity));
1166     __ br(Assembler::LS, copy32);
1167 
1168     // 33..64 bytes
1169     if (UseSIMDForMemoryOps) {
1170       __ ldpq(v0, v1, Address(s, 0));
1171       __ ldpq(v2, v3, Address(send, -32));
1172       __ stpq(v0, v1, Address(d, 0));
1173       __ stpq(v2, v3, Address(dend, -32));
1174     } else {
1175       __ ldp(t0, t1, Address(s, 0));
1176       __ ldp(t2, t3, Address(s, 16));
1177       __ ldp(t4, t5, Address(send, -32));
1178       __ ldp(t6, t7, Address(send, -16));
1179 
1180       __ stp(t0, t1, Address(d, 0));
1181       __ stp(t2, t3, Address(d, 16));
1182       __ stp(t4, t5, Address(dend, -32));
1183       __ stp(t6, t7, Address(dend, -16));
1184     }
1185     __ b(finish);
1186 
1187     // 17..32 bytes
1188     __ bind(copy32);
1189     __ ldp(t0, t1, Address(s, 0));
1190     __ ldp(t2, t3, Address(send, -16));
1191     __ stp(t0, t1, Address(d, 0));
1192     __ stp(t2, t3, Address(dend, -16));
1193     __ b(finish);
1194 
1195     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1197     __ bind(copy80);
1198     if (UseSIMDForMemoryOps) {
1199       __ ldpq(v0, v1, Address(s, 0));
1200       __ ldpq(v2, v3, Address(s, 32));
1201       // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers for arrays of jint are at least
      // 4 byte aligned and pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using a pair of ldp/stp instead of the third pair of
1207       // ldpq/stpq fixes the performance issue.
1208       if (granularity < sizeof (jint)) {
1209         Label copy96;
1210         __ cmp(count, u1(80/granularity));
1211         __ br(Assembler::HI, copy96);
1212         __ ldp(t0, t1, Address(send, -16));
1213 
1214         __ stpq(v0, v1, Address(d, 0));
1215         __ stpq(v2, v3, Address(d, 32));
1216         __ stp(t0, t1, Address(dend, -16));
1217         __ b(finish);
1218 
1219         __ bind(copy96);
1220       }
1221       __ ldpq(v4, v5, Address(send, -32));
1222 
1223       __ stpq(v0, v1, Address(d, 0));
1224       __ stpq(v2, v3, Address(d, 32));
1225       __ stpq(v4, v5, Address(dend, -32));
1226     } else {
1227       __ ldp(t0, t1, Address(s, 0));
1228       __ ldp(t2, t3, Address(s, 16));
1229       __ ldp(t4, t5, Address(s, 32));
1230       __ ldp(t6, t7, Address(s, 48));
1231       __ ldp(t8, t9, Address(send, -16));
1232 
1233       __ stp(t0, t1, Address(d, 0));
1234       __ stp(t2, t3, Address(d, 16));
1235       __ stp(t4, t5, Address(d, 32));
1236       __ stp(t6, t7, Address(d, 48));
1237       __ stp(t8, t9, Address(dend, -16));
1238     }
1239     __ b(finish);
1240 
1241     // 0..16 bytes
1242     __ bind(copy16);
1243     __ cmp(count, u1(8/granularity));
1244     __ br(Assembler::LO, copy8);
1245 
1246     // 8..16 bytes
1247     __ ldr(t0, Address(s, 0));
1248     __ ldr(t1, Address(send, -8));
1249     __ str(t0, Address(d, 0));
1250     __ str(t1, Address(dend, -8));
1251     __ b(finish);
1252 
1253     if (granularity < 8) {
1254       // 4..7 bytes
1255       __ bind(copy8);
1256       __ tbz(count, 2 - exact_log2(granularity), copy4);
1257       __ ldrw(t0, Address(s, 0));
1258       __ ldrw(t1, Address(send, -4));
1259       __ strw(t0, Address(d, 0));
1260       __ strw(t1, Address(dend, -4));
1261       __ b(finish);
1262       if (granularity < 4) {
1263         // 0..3 bytes
1264         __ bind(copy4);
1265         __ cbz(count, finish); // get rid of 0 case
1266         if (granularity == 2) {
1267           __ ldrh(t0, Address(s, 0));
1268           __ strh(t0, Address(d, 0));
1269         } else { // granularity == 1
1270           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1271           // the first and last byte.
1272           // Handle the 3 byte case by loading and storing base + count/2
1273           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1275           // byte 3 times.
1276           __ lsr(count, count, 1);
1277           __ ldrb(t0, Address(s, 0));
1278           __ ldrb(t1, Address(send, -1));
1279           __ ldrb(t2, Address(s, count));
1280           __ strb(t0, Address(d, 0));
1281           __ strb(t1, Address(dend, -1));
1282           __ strb(t2, Address(d, count));
1283         }
1284         __ b(finish);
1285       }
1286     }
1287 
1288     __ bind(copy_big);
1289     if (is_backwards) {
1290       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1291       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1292     }
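    // For a backwards copy s and d still arrive pointing at the start of each
    // array, so the leas above move them just past the end and the copy then
    // walks back down using pre-decrement addressing.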
1293 
    // Now that we've got the small case out of the way we can align the
1295     // source address on a 2-word boundary.
1296 
1297     Label aligned;
1298 
1299     if (is_aligned) {
1300       // We may have to adjust by 1 word to get s 2-word-aligned.
1301       __ tbz(s, exact_log2(wordSize), aligned);
1302       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1303       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1304       __ sub(count, count, wordSize/granularity);
1305     } else {
1306       if (is_backwards) {
1307         __ andr(rscratch2, s, 2 * wordSize - 1);
1308       } else {
1309         __ neg(rscratch2, s);
1310         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1311       }
1312       // rscratch2 is the byte adjustment needed to align s.
1313       __ cbz(rscratch2, aligned);
1314       int shift = exact_log2(granularity);
1315       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1316       __ sub(count, count, rscratch2);
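      // For example, in a forward byte copy with s ending in binary ...0011
      // the neg/andr above leaves rscratch2 == 13, the number of bytes needed
      // to reach the next 2-word (16-byte) boundary; copy_memory_small() below
      // copies those leading elements and count has been reduced to match.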
1317 
1318 #if 0
1319       // ?? This code is only correct for a disjoint copy.  It may or
1320       // may not make sense to use it in that case.
1321 
1322       // Copy the first pair; s and d may not be aligned.
1323       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1324       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1325 
1326       // Align s and d, adjust count
1327       if (is_backwards) {
1328         __ sub(s, s, rscratch2);
1329         __ sub(d, d, rscratch2);
1330       } else {
1331         __ add(s, s, rscratch2);
1332         __ add(d, d, rscratch2);
1333       }
1334 #else
1335       copy_memory_small(s, d, rscratch2, rscratch1, step);
1336 #endif
1337     }
1338 
1339     __ bind(aligned);
1340 
1341     // s is now 2-word-aligned.
1342 
1343     // We have a count of units and some trailing bytes.  Adjust the
1344     // count and do a bulk copy of words.
1345     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1346     if (direction == copy_forwards)
1347       __ bl(copy_f);
1348     else
1349       __ bl(copy_b);
1350 
1351     // And the tail.
1352     copy_memory_small(s, d, count, tmp, step);
1353 
1354     if (granularity >= 8) __ bind(copy8);
1355     if (granularity >= 4) __ bind(copy4);
1356     __ bind(finish);
1357   }
1358 
1359 
1360   void clobber_registers() {
1361 #ifdef ASSERT
1362     RegSet clobbered
1363       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1364     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1365     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1366     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1367       __ mov(*it, rscratch1);
1368     }
1369 #endif
1370 
1371   }
1372 
1373   // Scan over array at a for count oops, verifying each one.
1374   // Preserves a and count, clobbers rscratch1 and rscratch2.
1375   void verify_oop_array (int size, Register a, Register count, Register temp) {
1376     Label loop, end;
1377     __ mov(rscratch1, a);
1378     __ mov(rscratch2, zr);
1379     __ bind(loop);
1380     __ cmp(rscratch2, count);
1381     __ br(Assembler::HS, end);
1382     if (size == wordSize) {
1383       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1384       __ verify_oop(temp);
1385     } else {
1386       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1387       __ decode_heap_oop(temp); // calls verify_oop
1388     }
1389     __ add(rscratch2, rscratch2, 1);
1390     __ b(loop);
1391     __ bind(end);
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   is_oop  - true => oop array, so generate store check code
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as ssize_t, can be zero
1404   //
1405   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1406   // the hardware handle it.  The two dwords within qwords that span
1407   // cache line boundaries will still be loaded and stored atomically.
1408   //
1409   // Side Effects:
1410   //   disjoint_int_copy_entry is set to the no-overlap entry point
1411   //   used by generate_conjoint_int_oop_copy().
1412   //
1413   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1414                                   const char *name, bool dest_uninitialized = false) {
1415     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1416     RegSet saved_reg = RegSet::of(s, d, count);
1417     __ align(CodeEntryAlignment);
1418     StubCodeMark mark(this, "StubRoutines", name);
1419     address start = __ pc();
1420     __ enter();
1421 
1422     if (entry != NULL) {
1423       *entry = __ pc();
1424       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1425       BLOCK_COMMENT("Entry:");
1426     }
1427 
1428     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1429     if (dest_uninitialized) {
1430       decorators |= IS_DEST_UNINITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     {
1444       // UnsafeCopyMemory page error: continue after ucm
1445       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1446       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1447       copy_memory(aligned, s, d, count, rscratch1, size);
1448     }
1449 
1450     if (is_oop) {
1451       __ pop(RegSet::of(d, count), sp);
1452       if (VerifyOops)
1453         verify_oop_array(size, d, count, r16);
1454     }
1455 
1456     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1457 
1458     __ leave();
1459     __ mov(r0, zr); // return 0
1460     __ ret(lr);
1461     return start;
1462   }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   is_oop  - true => oop array, so generate store check code
1468   //   name    - stub name string
1469   //
1470   // Inputs:
1471   //   c_rarg0   - source array address
1472   //   c_rarg1   - destination array address
1473   //   c_rarg2   - element count, treated as ssize_t, can be zero
1474   //
1475   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1476   // the hardware handle it.  The two dwords within qwords that span
1477   // cache line boundaries will still be loaded and stored atomically.
1478   //
1479   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1480                                  address *entry, const char *name,
1481                                  bool dest_uninitialized = false) {
1482     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1483     RegSet saved_regs = RegSet::of(s, d, count);
1484     StubCodeMark mark(this, "StubRoutines", name);
1485     address start = __ pc();
1486     __ enter();
1487 
1488     if (entry != NULL) {
1489       *entry = __ pc();
1490       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1491       BLOCK_COMMENT("Entry:");
1492     }
1493 
1494     // use fwd copy when (d-s) above_equal (count*size)
1495     __ sub(rscratch1, d, s);
1496     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1497     __ br(Assembler::HS, nooverlap_target);
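         // The unsigned (HS) comparison handles two cases at once: when d >= s
         // and the gap is at least count*size bytes the regions do not overlap,
         // and when d < s the subtraction wraps to a huge unsigned value, so the
         // forward copy is also taken (copying low-to-high is safe when the
         // destination starts below the source). Illustrative values for size == 4:
         //   s = 0x1000, d = 0x1008, count = 2 -> d-s = 8 >= 8   : forward copy
         //   s = 0x1000, d = 0x1004, count = 2 -> d-s = 4 <  8   : backward copy
         //   s = 0x1008, d = 0x1000, count = 2 -> d-s wraps high : forward copy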
1498 
1499     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1500     if (dest_uninitialized) {
1501       decorators |= IS_DEST_UNINITIALIZED;
1502     }
1503     if (aligned) {
1504       decorators |= ARRAYCOPY_ALIGNED;
1505     }
1506 
1507     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1508     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1509 
1510     if (is_oop) {
1511       // save regs before copy_memory
1512       __ push(RegSet::of(d, count), sp);
1513     }
1514     {
1515       // UnsafeCopyMemory page error: continue after ucm
1516       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1517       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1518       copy_memory(aligned, s, d, count, rscratch1, -size);
1519     }
1520     if (is_oop) {
1521       __ pop(RegSet::of(d, count), sp);
1522       if (VerifyOops)
1523         verify_oop_array(size, d, count, r16);
1524     }
1525     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1526     __ leave();
1527     __ mov(r0, zr); // return 0
1528     __ ret(lr);
1529     return start;
1530   }
1531 
1532   // Arguments:
1533   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1534   //             ignored
1535   //   name    - stub name string
1536   //
1537   // Inputs:
1538   //   c_rarg0   - source array address
1539   //   c_rarg1   - destination array address
1540   //   c_rarg2   - element count, treated as ssize_t, can be zero
1541   //
1542   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1543   // we let the hardware handle it.  The one to eight bytes within words,
1544   // dwords or qwords that span cache line boundaries will still be loaded
1545   // and stored atomically.
1546   //
1547   // Side Effects:
1555   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1556   //   used by generate_conjoint_byte_copy().
1557   //
1558   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1559     const bool not_oop = false;
1560     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1561   }
1562 
1563   // Arguments:
1564   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1565   //             ignored
1566   //   name    - stub name string
1567   //
1568   // Inputs:
1569   //   c_rarg0   - source array address
1570   //   c_rarg1   - destination array address
1571   //   c_rarg2   - element count, treated as ssize_t, can be zero
1572   //
1573   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1574   // we let the hardware handle it.  The one to eight bytes within words,
1575   // dwords or qwords that span cache line boundaries will still be loaded
1576   // and stored atomically.
1577   //
1578   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1579                                       address* entry, const char *name) {
1580     const bool not_oop = false;
1581     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1582   }
1583 
1584   // Arguments:
1585   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1586   //             ignored
1587   //   name    - stub name string
1588   //
1589   // Inputs:
1590   //   c_rarg0   - source array address
1591   //   c_rarg1   - destination array address
1592   //   c_rarg2   - element count, treated as ssize_t, can be zero
1593   //
1594   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1595   // let the hardware handle it.  The two or four words within dwords
1596   // or qwords that span cache line boundaries will still be loaded
1597   // and stored atomically.
1598   //
1599   // Side Effects:
1600   //   disjoint_short_copy_entry is set to the no-overlap entry point
1601   //   used by generate_conjoint_short_copy().
1602   //
1603   address generate_disjoint_short_copy(bool aligned,
1604                                        address* entry, const char *name) {
1605     const bool not_oop = false;
1606     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as ssize_t, can be zero
1618   //
1619   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1620   // let the hardware handle it.  The two or four words within dwords
1621   // or qwords that span cache line boundaries will still be loaded
1622   // and stored atomically.
1623   //
1624   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1625                                        address *entry, const char *name) {
1626     const bool not_oop = false;
1627     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1628   }
1629 
1630   // Arguments:
1631   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1632   //             ignored
1633   //   name    - stub name string
1634   //
1635   // Inputs:
1636   //   c_rarg0   - source array address
1637   //   c_rarg1   - destination array address
1638   //   c_rarg2   - element count, treated as ssize_t, can be zero
1639   //
1640   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1641   // the hardware handle it.  The two dwords within qwords that span
1642   // cache line boundaries will still be loaded and stored atomically.
1643   //
1644   // Side Effects:
1645   //   disjoint_int_copy_entry is set to the no-overlap entry point
1646   //   used by generate_conjoint_int_oop_copy().
1647   //
1648   address generate_disjoint_int_copy(bool aligned, address *entry,
1649                                          const char *name, bool dest_uninitialized = false) {
1650     const bool not_oop = false;
1651     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1652   }
1653 
1654   // Arguments:
1655   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1656   //             ignored
1657   //   name    - stub name string
1658   //
1659   // Inputs:
1660   //   c_rarg0   - source array address
1661   //   c_rarg1   - destination array address
1662   //   c_rarg2   - element count, treated as ssize_t, can be zero
1663   //
1664   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1665   // the hardware handle it.  The two dwords within qwords that span
1666   // cache line boundaries will still be loaded and stored atomically.
1667   //
1668   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1669                                      address *entry, const char *name,
1670                                      bool dest_uninitialized = false) {
1671     const bool not_oop = false;
1672     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1673   }
1674 
1675 
1676   // Arguments:
1677   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1678   //             ignored
1679   //   name    - stub name string
1680   //
1681   // Inputs:
1682   //   c_rarg0   - source array address
1683   //   c_rarg1   - destination array address
1684   //   c_rarg2   - element count, treated as size_t, can be zero
1685   //
1686   // Side Effects:
1687   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1688   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1689   //
1690   address generate_disjoint_long_copy(bool aligned, address *entry,
1691                                           const char *name, bool dest_uninitialized = false) {
1692     const bool not_oop = false;
1693     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1694   }
1695 
1696   // Arguments:
1697   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1698   //             ignored
1699   //   name    - stub name string
1700   //
1701   // Inputs:
1702   //   c_rarg0   - source array address
1703   //   c_rarg1   - destination array address
1704   //   c_rarg2   - element count, treated as size_t, can be zero
1705   //
1706   address generate_conjoint_long_copy(bool aligned,
1707                                       address nooverlap_target, address *entry,
1708                                       const char *name, bool dest_uninitialized = false) {
1709     const bool not_oop = false;
1710     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1711   }
1712 
1713   // Arguments:
1714   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1715   //             ignored
1716   //   name    - stub name string
1717   //
1718   // Inputs:
1719   //   c_rarg0   - source array address
1720   //   c_rarg1   - destination array address
1721   //   c_rarg2   - element count, treated as size_t, can be zero
1722   //
1723   // Side Effects:
1724   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1725   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1726   //
1727   address generate_disjoint_oop_copy(bool aligned, address *entry,
1728                                      const char *name, bool dest_uninitialized) {
1729     const bool is_oop = true;
1730     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1731     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1732   }
1733 
1734   // Arguments:
1735   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1736   //             ignored
1737   //   name    - stub name string
1738   //
1739   // Inputs:
1740   //   c_rarg0   - source array address
1741   //   c_rarg1   - destination array address
1742   //   c_rarg2   - element count, treated as size_t, can be zero
1743   //
1744   address generate_conjoint_oop_copy(bool aligned,
1745                                      address nooverlap_target, address *entry,
1746                                      const char *name, bool dest_uninitialized) {
1747     const bool is_oop = true;
1748     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1749     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1750                                   name, dest_uninitialized);
1751   }
1752 
1753 
1754   // Helper for generating a dynamic type check.
1755   // Smashes rscratch1, rscratch2.
1756   void generate_type_check(Register sub_klass,
1757                            Register super_check_offset,
1758                            Register super_klass,
1759                            Label& L_success) {
1760     assert_different_registers(sub_klass, super_check_offset, super_klass);
1761 
1762     BLOCK_COMMENT("type_check:");
1763 
1764     Label L_miss;
1765 
1766     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1767                                      super_check_offset);
1768     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1769 
1770     // Fall through on failure!
1771     __ BIND(L_miss);
1772   }
1773 
1774   //
1775   //  Generate checkcasting array copy stub
1776   //
1777   //  Input:
1778   //    c_rarg0   - source array address
1779   //    c_rarg1   - destination array address
1780   //    c_rarg2   - element count, treated as ssize_t, can be zero
1781   //    c_rarg3   - size_t ckoff (super_check_offset)
1782   //    c_rarg4   - oop ckval (super_klass)
1783   //
1784   //  Output:
1785   //    r0 ==  0  -  success
1786   //    r0 == -1^K - failure, where K is partial transfer count
1787   //
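       //  For example, if a type check fails after K == 3 elements have been
       //  copied, the stub returns ~3 == -4 in r0 and the caller recovers K as
       //  ~r0 (the 'eon' with zr below computes this bitwise NOT).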
1788   address generate_checkcast_copy(const char *name, address *entry,
1789                                   bool dest_uninitialized = false) {
1790 
1791     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1792 
1793     // Input registers (after setup_arg_regs)
1794     const Register from        = c_rarg0;   // source array address
1795     const Register to          = c_rarg1;   // destination array address
1796     const Register count       = c_rarg2;   // elements count
1797     const Register ckoff       = c_rarg3;   // super_check_offset
1798     const Register ckval       = c_rarg4;   // super_klass
1799 
1800     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1801     RegSet wb_post_saved_regs = RegSet::of(count);
1802 
1803     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1804     const Register copied_oop  = r22;       // actual oop copied
1805     const Register count_save  = r21;       // orig elements count
1806     const Register start_to    = r20;       // destination array start address
1807     const Register r19_klass   = r19;       // oop._klass
1808 
1809     //---------------------------------------------------------------
1810     // Assembler stub will be used for this call to arraycopy
1811     // if the two arrays are subtypes of Object[] but the
1812     // destination array type is not equal to or a supertype
1813     // of the source type.  Each element must be separately
1814     // checked.
1815 
1816     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1817                                copied_oop, r19_klass, count_save);
1818 
1819     __ align(CodeEntryAlignment);
1820     StubCodeMark mark(this, "StubRoutines", name);
1821     address start = __ pc();
1822 
1823     __ enter(); // required for proper stackwalking of RuntimeStub frame
1824 
1825 #ifdef ASSERT
1826     // caller guarantees that the arrays really are different
1827     // otherwise, we would have to make conjoint checks
1828     { Label L;
1829       array_overlap_test(L, TIMES_OOP);
1830       __ stop("checkcast_copy within a single array");
1831       __ bind(L);
1832     }
1833 #endif //ASSERT
1834 
1835     // Caller of this entry point must set up the argument registers.
1836     if (entry != NULL) {
1837       *entry = __ pc();
1838       BLOCK_COMMENT("Entry:");
1839     }
1840 
1841      // Empty array:  Nothing to do.
1842     __ cbz(count, L_done);
1843     __ push(RegSet::of(r19, r20, r21, r22), sp);
1844 
1845 #ifdef ASSERT
1846     BLOCK_COMMENT("assert consistent ckoff/ckval");
1847     // The ckoff and ckval must be mutually consistent,
1848     // even though caller generates both.
1849     { Label L;
1850       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1851       __ ldrw(start_to, Address(ckval, sco_offset));
1852       __ cmpw(ckoff, start_to);
1853       __ br(Assembler::EQ, L);
1854       __ stop("super_check_offset inconsistent");
1855       __ bind(L);
1856     }
1857 #endif //ASSERT
1858 
1859     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1860     bool is_oop = true;
1861     if (dest_uninitialized) {
1862       decorators |= IS_DEST_UNINITIALIZED;
1863     }
1864 
1865     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1866     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1867 
1868     // save the original count
1869     __ mov(count_save, count);
1870 
1871     // Copy from low to high addresses
1872     __ mov(start_to, to);              // Save destination array start address
1873     __ b(L_load_element);
1874 
1875     // ======== begin loop ========
1876     // (Loop is rotated; its entry is L_load_element.)
1877     // Loop control:
1878     //   for (; count != 0; count--) {
1879     //     copied_oop = load_heap_oop(from++);
1880     //     ... generate_type_check ...;
1881     //     store_heap_oop(to++, copied_oop);
1882     //   }
1883     __ align(OptoLoopAlignment);
1884 
1885     __ BIND(L_store_element);
1886     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1887     __ sub(count, count, 1);
1888     __ cbz(count, L_do_card_marks);
1889 
1890     // ======== loop entry is here ========
1891     __ BIND(L_load_element);
1892     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1893     __ cbz(copied_oop, L_store_element);
1894 
1895     __ load_klass(r19_klass, copied_oop);// query the object klass
1896     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1897     // ======== end loop ========
1898 
1899     // It was a real error; we must depend on the caller to finish the job.
1900     // Register count = remaining oops, count_save = total oops.
1901     // Emit GC store barriers for the oops we have copied and report
1902     // their number to the caller.
1903 
1904     __ subs(count, count_save, count);     // K = partially copied oop count
1905     __ eon(count, count, zr);                   // report (-1^K) to caller
1906     __ br(Assembler::EQ, L_done_pop);
1907 
1908     __ BIND(L_do_card_marks);
1909     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1910 
1911     __ bind(L_done_pop);
1912     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1913     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1914 
1915     __ bind(L_done);
1916     __ mov(r0, count);
1917     __ leave();
1918     __ ret(lr);
1919 
1920     return start;
1921   }
1922 
1923   // Perform range checks on the proposed arraycopy.
1924   // Kills temp, but nothing else.
1925   // Also, clean the sign bits of src_pos and dst_pos.
1926   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1927                               Register src_pos, // source position (c_rarg1)
1928                               Register dst,     // destination array oop (c_rarg2)
1929                               Register dst_pos, // destination position (c_rarg3)
1930                               Register length,
1931                               Register temp,
1932                               Label& L_failed) {
1933     BLOCK_COMMENT("arraycopy_range_checks:");
1934 
1935     assert_different_registers(rscratch1, temp);
1936 
1937     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1938     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1939     __ addw(temp, length, src_pos);
1940     __ cmpw(temp, rscratch1);
1941     __ br(Assembler::HI, L_failed);
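         // e.g. with src.length == 10, src_pos == 7 and length == 4 this computes
         // 7 + 4 == 11 > 10 (unsigned) and branches to L_failed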
1942 
1943     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1944     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1945     __ addw(temp, length, dst_pos);
1946     __ cmpw(temp, rscratch1);
1947     __ br(Assembler::HI, L_failed);
1948 
1949     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1950     __ movw(src_pos, src_pos);
1951     __ movw(dst_pos, dst_pos);
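         // (a 32-bit register-to-register move zero-extends into the full 64-bit
         //  register, so this is the cheapest way to clear bits 63..32)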
1952 
1953     BLOCK_COMMENT("arraycopy_range_checks done");
1954   }
1955 
1956   // These stubs get called from some dumb test routine.
1957   // I'll write them properly when they're called from
1958   // something that's actually doing something.
1959   static void fake_arraycopy_stub(address src, address dst, int count) {
1960     assert(count == 0, "huh?");
1961   }
1962 
1963 
1964   //
1965   //  Generate 'unsafe' array copy stub
1966   //  Though just as safe as the other stubs, it takes an unscaled
1967   //  size_t argument instead of an element count.
1968   //
1969   //  Input:
1970   //    c_rarg0   - source array address
1971   //    c_rarg1   - destination array address
1972   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1973   //
1974   // Examines the alignment of the operands and dispatches
1975   // to a long, int, short, or byte copy loop.
1976   //
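       // Illustrative dispatch rule (matching the orr/andr tests below): the two
       // addresses and the byte count are OR-ed together, so one value captures
       // the worst alignment of the three. If its low three bits are clear,
       // everything is 8-byte aligned and the long copy is used; low two bits
       // clear selects the int copy; bit 0 clear selects the short copy;
       // otherwise the byte copy is used.
       // E.g. src = 0x1008, dst = 0x2004, count = 12 -> OR = 0x300c, so not
       // 8-byte aligned but 4-byte aligned -> int copy.
       //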
1977   address generate_unsafe_copy(const char *name,
1978                                address byte_copy_entry,
1979                                address short_copy_entry,
1980                                address int_copy_entry,
1981                                address long_copy_entry) {
1982     Label L_long_aligned, L_int_aligned, L_short_aligned;
1983     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1984 
1985     __ align(CodeEntryAlignment);
1986     StubCodeMark mark(this, "StubRoutines", name);
1987     address start = __ pc();
1988     __ enter(); // required for proper stackwalking of RuntimeStub frame
1989 
1990     // bump this on entry, not on exit:
1991     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1992 
1993     __ orr(rscratch1, s, d);
1994     __ orr(rscratch1, rscratch1, count);
1995 
1996     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1997     __ cbz(rscratch1, L_long_aligned);
1998     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1999     __ cbz(rscratch1, L_int_aligned);
2000     __ tbz(rscratch1, 0, L_short_aligned);
2001     __ b(RuntimeAddress(byte_copy_entry));
2002 
2003     __ BIND(L_short_aligned);
2004     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2005     __ b(RuntimeAddress(short_copy_entry));
2006     __ BIND(L_int_aligned);
2007     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2008     __ b(RuntimeAddress(int_copy_entry));
2009     __ BIND(L_long_aligned);
2010     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2011     __ b(RuntimeAddress(long_copy_entry));
2012 
2013     return start;
2014   }
2015 
2016   //
2017   //  Generate generic array copy stubs
2018   //
2019   //  Input:
2020   //    c_rarg0    -  src oop
2021   //    c_rarg1    -  src_pos (32-bits)
2022   //    c_rarg2    -  dst oop
2023   //    c_rarg3    -  dst_pos (32-bits)
2024   //    c_rarg4    -  element count (32-bits)
2025   //
2026   //  Output:
2027   //    r0 ==  0  -  success
2028   //    r0 == -1^K - failure, where K is partial transfer count
2029   //
2030   address generate_generic_copy(const char *name,
2031                                 address byte_copy_entry, address short_copy_entry,
2032                                 address int_copy_entry, address oop_copy_entry,
2033                                 address long_copy_entry, address checkcast_copy_entry) {
2034 
2035     Label L_failed, L_objArray;
2036     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2037 
2038     // Input registers
2039     const Register src        = c_rarg0;  // source array oop
2040     const Register src_pos    = c_rarg1;  // source position
2041     const Register dst        = c_rarg2;  // destination array oop
2042     const Register dst_pos    = c_rarg3;  // destination position
2043     const Register length     = c_rarg4;
2044 
2045 
2046     // Registers used as temps
2047     const Register dst_klass  = c_rarg5;
2048 
2049     __ align(CodeEntryAlignment);
2050 
2051     StubCodeMark mark(this, "StubRoutines", name);
2052 
2053     address start = __ pc();
2054 
2055     __ enter(); // required for proper stackwalking of RuntimeStub frame
2056 
2057     // bump this on entry, not on exit:
2058     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2059 
2060     //-----------------------------------------------------------------------
2061     // Assembler stub will be used for this call to arraycopy
2062     // if the following conditions are met:
2063     //
2064     // (1) src and dst must not be null.
2065     // (2) src_pos must not be negative.
2066     // (3) dst_pos must not be negative.
2067     // (4) length  must not be negative.
2068     // (5) src klass and dst klass should be the same and not NULL.
2069     // (6) src and dst should be arrays.
2070     // (7) src_pos + length must not exceed length of src.
2071     // (8) dst_pos + length must not exceed length of dst.
2072     //
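         // If any of these checks fails, control transfers to L_failed and the
         // stub returns -1 in r0, leaving the copy to be handled elsewhere.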
2073 
2074     //  if (src == NULL) return -1;
2075     __ cbz(src, L_failed);
2076 
2077     //  if (src_pos < 0) return -1;
2078     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2079 
2080     //  if (dst == NULL) return -1;
2081     __ cbz(dst, L_failed);
2082 
2083     //  if (dst_pos < 0) return -1;
2084     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2085 
2086     // registers used as temp
2087     const Register scratch_length    = r16; // elements count to copy
2088     const Register scratch_src_klass = r17; // array klass
2089     const Register lh                = r15; // layout helper
2090 
2091     //  if (length < 0) return -1;
2092     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2093     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2094 
2095     __ load_klass(scratch_src_klass, src);
2096 #ifdef ASSERT
2097     //  assert(src->klass() != NULL);
2098     {
2099       BLOCK_COMMENT("assert klasses not null {");
2100       Label L1, L2;
2101       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2102       __ bind(L1);
2103       __ stop("broken null klass");
2104       __ bind(L2);
2105       __ load_klass(rscratch1, dst);
2106       __ cbz(rscratch1, L1);     // this would be broken also
2107       BLOCK_COMMENT("} assert klasses not null done");
2108     }
2109 #endif
2110 
2111     // Load layout helper (32-bits)
2112     //
2113     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2114     // 32        30    24            16              8     2                 0
2115     //
2116     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2117     //
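         // For instance, a typeArray of jints has array_tag == 0x3 and
         // log2_element_size == 2, while an objArray has array_tag == 0x2 and
         // log2_element_size == LogBytesPerHeapOop; the comparison below uses
         // the exact constant from Klass::array_layout_helper(T_OBJECT) rather
         // than hand-built bit patterns.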
2118 
2119     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2120 
2121     // Handle objArrays completely differently...
2122     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2123     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2124     __ movw(rscratch1, objArray_lh);
2125     __ eorw(rscratch2, lh, rscratch1);
2126     __ cbzw(rscratch2, L_objArray);
2127 
2128     //  if (src->klass() != dst->klass()) return -1;
2129     __ load_klass(rscratch2, dst);
2130     __ eor(rscratch2, rscratch2, scratch_src_klass);
2131     __ cbnz(rscratch2, L_failed);
2132 
2133     // Check for flat inline type array -> return -1
2134     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2135     __ br(Assembler::NE, L_failed);
2136 
2137     // Check for null-free (non-flat) inline type array -> return -1
2138     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2139     __ br(Assembler::NE, L_failed);
2140 
2141     //  if (!src->is_Array()) return -1;
2142     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2143 
2144     // At this point, it is known to be a typeArray (array_tag 0x3).
2145 #ifdef ASSERT
2146     {
2147       BLOCK_COMMENT("assert primitive array {");
2148       Label L;
2149       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2150       __ cmpw(lh, rscratch2);
2151       __ br(Assembler::GE, L);
2152       __ stop("must be a primitive array");
2153       __ bind(L);
2154       BLOCK_COMMENT("} assert primitive array done");
2155     }
2156 #endif
2157 
2158     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2159                            rscratch2, L_failed);
2160 
2161     // TypeArrayKlass
2162     //
2163     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2164     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2165     //
2166 
2167     const Register rscratch1_offset = rscratch1;    // array offset
2168     const Register r15_elsize = lh; // element size
2169 
2170     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2171            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2172     __ add(src, src, rscratch1_offset);           // src array offset
2173     __ add(dst, dst, rscratch1_offset);           // dst array offset
2174     BLOCK_COMMENT("choose copy loop based on element size");
2175 
2176     // next registers should be set before the jump to corresponding stub
2177     const Register from     = c_rarg0;  // source array address
2178     const Register to       = c_rarg1;  // destination array address
2179     const Register count    = c_rarg2;  // elements count
2180 
2181     // 'from', 'to' and 'count' must be set in this order since they alias
2182     // 'src', 'src_pos' and 'dst', which are consumed as each one is computed.
2183 
2184     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2185 
2186     // The possible values of elsize are 0-3, i.e. exact_log2(element
2187     // size in bytes).  We do a simple bitwise binary search.
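         // Dispatch implied by the two tbnz tests below (bit 1, bit 0 of elsize):
         //   0b00 -> byte copy, 0b01 -> short copy, 0b10 -> int copy, 0b11 -> long copy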
2188   __ BIND(L_copy_bytes);
2189     __ tbnz(r15_elsize, 1, L_copy_ints);
2190     __ tbnz(r15_elsize, 0, L_copy_shorts);
2191     __ lea(from, Address(src, src_pos));// src_addr
2192     __ lea(to,   Address(dst, dst_pos));// dst_addr
2193     __ movw(count, scratch_length); // length
2194     __ b(RuntimeAddress(byte_copy_entry));
2195 
2196   __ BIND(L_copy_shorts);
2197     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2198     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2199     __ movw(count, scratch_length); // length
2200     __ b(RuntimeAddress(short_copy_entry));
2201 
2202   __ BIND(L_copy_ints);
2203     __ tbnz(r15_elsize, 0, L_copy_longs);
2204     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2205     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2206     __ movw(count, scratch_length); // length
2207     __ b(RuntimeAddress(int_copy_entry));
2208 
2209   __ BIND(L_copy_longs);
2210 #ifdef ASSERT
2211     {
2212       BLOCK_COMMENT("assert long copy {");
2213       Label L;
2214       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2215       __ cmpw(r15_elsize, LogBytesPerLong);
2216       __ br(Assembler::EQ, L);
2217       __ stop("must be long copy, but elsize is wrong");
2218       __ bind(L);
2219       BLOCK_COMMENT("} assert long copy done");
2220     }
2221 #endif
2222     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2223     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2224     __ movw(count, scratch_length); // length
2225     __ b(RuntimeAddress(long_copy_entry));
2226 
2227     // ObjArrayKlass
2228   __ BIND(L_objArray);
2229     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2230 
2231     Label L_plain_copy, L_checkcast_copy;
2232     //  test array classes for subtyping
2233     __ load_klass(r15, dst);
2234     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2235     __ br(Assembler::NE, L_checkcast_copy);
2236 
2237     // Identically typed arrays can be copied without element-wise checks.
2238     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2239                            rscratch2, L_failed);
2240 
2241     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2242     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2243     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2244     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2245     __ movw(count, scratch_length); // length
2246   __ BIND(L_plain_copy);
2247     __ b(RuntimeAddress(oop_copy_entry));
2248 
2249   __ BIND(L_checkcast_copy);
2250     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2251     {
2252       // Before looking at dst.length, make sure dst is also an objArray.
2253       __ ldrw(rscratch1, Address(r15, lh_offset));
2254       __ movw(rscratch2, objArray_lh);
2255       __ eorw(rscratch1, rscratch1, rscratch2);
2256       __ cbnzw(rscratch1, L_failed);
2257 
2258       // It is safe to examine both src.length and dst.length.
2259       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2260                              r15, L_failed);
2261 
2262       __ load_klass(dst_klass, dst); // reload
2263 
2264       // Marshal the base address arguments now, freeing registers.
2265       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2266       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2267       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2268       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2269       __ movw(count, length);           // length (reloaded)
2270       Register sco_temp = c_rarg3;      // this register is free now
2271       assert_different_registers(from, to, count, sco_temp,
2272                                  dst_klass, scratch_src_klass);
2273       // assert_clean_int(count, sco_temp);
2274 
2275       // Generate the type check.
2276       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2277       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2278 
2279       // Smashes rscratch1, rscratch2
2280       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2281 
2282       // Fetch destination element klass from the ObjArrayKlass header.
2283       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2284       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2285       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2286 
2287       // the checkcast_copy loop needs two extra arguments:
2288       assert(c_rarg3 == sco_temp, "#3 already in place");
2289       // Set up arguments for checkcast_copy_entry.
2290       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2291       __ b(RuntimeAddress(checkcast_copy_entry));
2292     }
2293 
2294   __ BIND(L_failed);
2295     __ mov(r0, -1);
2296     __ leave();   // required for proper stackwalking of RuntimeStub frame
2297     __ ret(lr);
2298 
2299     return start;
2300   }
2301 
2302   //
2303   // Generate stub for array fill. If "aligned" is true, the
2304   // "to" address is assumed to be heapword aligned.
2305   //
2306   // Arguments for generated stub:
2307   //   to:    c_rarg0
2308   //   value: c_rarg1
2309   //   count: c_rarg2 treated as signed
2310   //
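       // The fill value is widened below with bfi so that whole 64-bit words can
       // be stored at a time: e.g. a byte value 0xAB becomes 0xABAB (8->16 bits),
       // then 0xABABABAB (16->32 bits) and finally 0xABABABABABABABAB
       // (32->64 bits); short and int values join the same chain further down.
       //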
2311   address generate_fill(BasicType t, bool aligned, const char *name) {
2312     __ align(CodeEntryAlignment);
2313     StubCodeMark mark(this, "StubRoutines", name);
2314     address start = __ pc();
2315 
2316     BLOCK_COMMENT("Entry:");
2317 
2318     const Register to        = c_rarg0;  // destination array address
2319     const Register value     = c_rarg1;  // value
2320     const Register count     = c_rarg2;  // elements count
2321 
2322     const Register bz_base = r10;        // base for block_zero routine
2323     const Register cnt_words = r11;      // temp register
2324 
2325     __ enter();
2326 
2327     Label L_fill_elements, L_exit1;
2328 
2329     int shift = -1;
2330     switch (t) {
2331       case T_BYTE:
2332         shift = 0;
2333         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2334         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2335         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2336         __ br(Assembler::LO, L_fill_elements);
2337         break;
2338       case T_SHORT:
2339         shift = 1;
2340         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2341         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2342         __ br(Assembler::LO, L_fill_elements);
2343         break;
2344       case T_INT:
2345         shift = 2;
2346         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2347         __ br(Assembler::LO, L_fill_elements);
2348         break;
2349       default: ShouldNotReachHere();
2350     }
2351 
2352     // Align source address at 8 bytes address boundary.
2353     Label L_skip_align1, L_skip_align2, L_skip_align4;
2354     if (!aligned) {
2355       switch (t) {
2356         case T_BYTE:
2357           // One byte misalignment happens only for byte arrays.
2358           __ tbz(to, 0, L_skip_align1);
2359           __ strb(value, Address(__ post(to, 1)));
2360           __ subw(count, count, 1);
2361           __ bind(L_skip_align1);
2362           // Fallthrough
2363         case T_SHORT:
2364           // Two bytes misalignment happens only for byte and short (char) arrays.
2365           __ tbz(to, 1, L_skip_align2);
2366           __ strh(value, Address(__ post(to, 2)));
2367           __ subw(count, count, 2 >> shift);
2368           __ bind(L_skip_align2);
2369           // Fallthrough
2370         case T_INT:
2371           // Align to 8 bytes, we know we are 4 byte aligned to start.
2372           __ tbz(to, 2, L_skip_align4);
2373           __ strw(value, Address(__ post(to, 4)));
2374           __ subw(count, count, 4 >> shift);
2375           __ bind(L_skip_align4);
2376           break;
2377         default: ShouldNotReachHere();
2378       }
2379     }
2380 
2381     //
2382     //  Fill large chunks
2383     //
2384     __ lsrw(cnt_words, count, 3 - shift); // number of words
2385     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2386     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2387     if (UseBlockZeroing) {
2388       Label non_block_zeroing, rest;
2389       // If the fill value is zero we can use the fast zero_words().
2390       __ cbnz(value, non_block_zeroing);
2391       __ mov(bz_base, to);
2392       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2393       __ zero_words(bz_base, cnt_words);
2394       __ b(rest);
2395       __ bind(non_block_zeroing);
2396       __ fill_words(to, cnt_words, value);
2397       __ bind(rest);
2398     } else {
2399       __ fill_words(to, cnt_words, value);
2400     }
2401 
2402     // Remaining count is less than 8 bytes. Fill it by a single store.
2403     // Note that the total length is no less than 8 bytes.
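         // The single 8-byte store below is placed so that it ends exactly at the
         // end of the array, deliberately overlapping elements that were already
         // filled: e.g. with 3 trailing bytes left it rewrites the preceding 5
         // bytes as well, which already hold the (identical) fill value.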
2404     if (t == T_BYTE || t == T_SHORT) {
2405       Label L_exit1;
2406       __ cbzw(count, L_exit1);
2407       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2408       __ str(value, Address(to, -8));    // overwrite some elements
2409       __ bind(L_exit1);
2410       __ leave();
2411       __ ret(lr);
2412     }
2413 
2414     // Handle fills of less than 8 bytes.
2415     Label L_fill_2, L_fill_4, L_exit2;
2416     __ bind(L_fill_elements);
2417     switch (t) {
2418       case T_BYTE:
2419         __ tbz(count, 0, L_fill_2);
2420         __ strb(value, Address(__ post(to, 1)));
2421         __ bind(L_fill_2);
2422         __ tbz(count, 1, L_fill_4);
2423         __ strh(value, Address(__ post(to, 2)));
2424         __ bind(L_fill_4);
2425         __ tbz(count, 2, L_exit2);
2426         __ strw(value, Address(to));
2427         break;
2428       case T_SHORT:
2429         __ tbz(count, 0, L_fill_4);
2430         __ strh(value, Address(__ post(to, 2)));
2431         __ bind(L_fill_4);
2432         __ tbz(count, 1, L_exit2);
2433         __ strw(value, Address(to));
2434         break;
2435       case T_INT:
2436         __ cbzw(count, L_exit2);
2437         __ strw(value, Address(to));
2438         break;
2439       default: ShouldNotReachHere();
2440     }
2441     __ bind(L_exit2);
2442     __ leave();
2443     __ ret(lr);
2444     return start;
2445   }
2446 
2447   address generate_data_cache_writeback() {
2448     const Register line        = c_rarg0;  // address of line to write back
2449 
2450     __ align(CodeEntryAlignment);
2451 
2452     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2453 
2454     address start = __ pc();
2455     __ enter();
2456     __ cache_wb(Address(line, 0));
2457     __ leave();
2458     __ ret(lr);
2459 
2460     return start;
2461   }
2462 
2463   address generate_data_cache_writeback_sync() {
2464     const Register is_pre     = c_rarg0;  // pre or post sync
2465 
2466     __ align(CodeEntryAlignment);
2467 
2468     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2469 
2470     // pre wbsync is a no-op
2471     // post wbsync translates to a memory barrier (the AArch64 analogue of an x86 sfence)
2472 
2473     Label skip;
2474     address start = __ pc();
2475     __ enter();
2476     __ cbnz(is_pre, skip);
2477     __ cache_wbsync(false);
2478     __ bind(skip);
2479     __ leave();
2480     __ ret(lr);
2481 
2482     return start;
2483   }
2484 
2485   void generate_arraycopy_stubs() {
2486     address entry;
2487     address entry_jbyte_arraycopy;
2488     address entry_jshort_arraycopy;
2489     address entry_jint_arraycopy;
2490     address entry_oop_arraycopy;
2491     address entry_jlong_arraycopy;
2492     address entry_checkcast_arraycopy;
2493 
2494     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2495     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2496 
2497     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2498 
2499     //*** jbyte
2500     // Always need aligned and unaligned versions
2501     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2502                                                                                   "jbyte_disjoint_arraycopy");
2503     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2504                                                                                   &entry_jbyte_arraycopy,
2505                                                                                   "jbyte_arraycopy");
2506     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2507                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2508     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2509                                                                                   "arrayof_jbyte_arraycopy");
2510 
2511     //*** jshort
2512     // Always need aligned and unaligned versions
2513     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2514                                                                                     "jshort_disjoint_arraycopy");
2515     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2516                                                                                     &entry_jshort_arraycopy,
2517                                                                                     "jshort_arraycopy");
2518     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2519                                                                                     "arrayof_jshort_disjoint_arraycopy");
2520     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2521                                                                                     "arrayof_jshort_arraycopy");
2522 
2523     //*** jint
2524     // Aligned versions
2525     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2526                                                                                 "arrayof_jint_disjoint_arraycopy");
2527     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2528                                                                                 "arrayof_jint_arraycopy");
2529     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2530     // entry_jint_arraycopy always points to the unaligned version
2531     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2532                                                                                 "jint_disjoint_arraycopy");
2533     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2534                                                                                 &entry_jint_arraycopy,
2535                                                                                 "jint_arraycopy");
2536 
2537     //*** jlong
2538     // It is always aligned
2539     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2540                                                                                   "arrayof_jlong_disjoint_arraycopy");
2541     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2542                                                                                   "arrayof_jlong_arraycopy");
2543     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2544     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2545 
2546     //*** oops
2547     {
2548       // With compressed oops we need unaligned versions; notice that
2549       // we overwrite entry_oop_arraycopy.
2550       bool aligned = !UseCompressedOops;
2551 
2552       StubRoutines::_arrayof_oop_disjoint_arraycopy
2553         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2554                                      /*dest_uninitialized*/false);
2555       StubRoutines::_arrayof_oop_arraycopy
2556         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2557                                      /*dest_uninitialized*/false);
2558       // Aligned versions without pre-barriers
2559       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2560         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2561                                      /*dest_uninitialized*/true);
2562       StubRoutines::_arrayof_oop_arraycopy_uninit
2563         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2564                                      /*dest_uninitialized*/true);
2565     }
2566 
2567     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2568     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2569     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2570     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2571 
2572     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2573     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2574                                                                         /*dest_uninitialized*/true);
2575 
2576     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2577                                                               entry_jbyte_arraycopy,
2578                                                               entry_jshort_arraycopy,
2579                                                               entry_jint_arraycopy,
2580                                                               entry_jlong_arraycopy);
2581 
2582     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2583                                                                entry_jbyte_arraycopy,
2584                                                                entry_jshort_arraycopy,
2585                                                                entry_jint_arraycopy,
2586                                                                entry_oop_arraycopy,
2587                                                                entry_jlong_arraycopy,
2588                                                                entry_checkcast_arraycopy);
2589 
2590     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2591     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2592     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2593     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2594     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2595     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2596   }
2597 
2598   void generate_math_stubs() { Unimplemented(); }
2599 
2600   // Arguments:
2601   //
2602   // Inputs:
2603   //   c_rarg0   - source byte array address
2604   //   c_rarg1   - destination byte array address
2605   //   c_rarg2   - K (key) in little endian int array
2606   //
2607   address generate_aescrypt_encryptBlock() {
2608     __ align(CodeEntryAlignment);
2609     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2610 
2611     const Register from        = c_rarg0;  // source array address
2612     const Register to          = c_rarg1;  // destination array address
2613     const Register key         = c_rarg2;  // key array address
2614     const Register keylen      = rscratch1;
2615 
2616     address start = __ pc();
2617     __ enter();
2618 
2619     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2620 
2621     __ aesenc_loadkeys(key, keylen);
2622     __ aesecb_encrypt(from, to, keylen);
2623 
2624     __ mov(r0, 0);
2625 
2626     __ leave();
2627     __ ret(lr);
2628 
2629     return start;
2630   }
2631 
2632   // Arguments:
2633   //
2634   // Inputs:
2635   //   c_rarg0   - source byte array address
2636   //   c_rarg1   - destination byte array address
2637   //   c_rarg2   - K (key) in little endian int array
2638   //
2639   address generate_aescrypt_decryptBlock() {
2640     assert(UseAES, "need AES cryptographic extension support");
2641     __ align(CodeEntryAlignment);
2642     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2643     Label L_doLast;
2644 
2645     const Register from        = c_rarg0;  // source array address
2646     const Register to          = c_rarg1;  // destination array address
2647     const Register key         = c_rarg2;  // key array address
2648     const Register keylen      = rscratch1;
2649 
2650     address start = __ pc();
2651     __ enter(); // required for proper stackwalking of RuntimeStub frame
2652 
2653     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2654 
2655     __ aesecb_decrypt(from, to, key, keylen);
2656 
2657     __ mov(r0, 0);
2658 
2659     __ leave();
2660     __ ret(lr);
2661 
2662     return start;
2663   }
2664 
2665   // Arguments:
2666   //
2667   // Inputs:
2668   //   c_rarg0   - source byte array address
2669   //   c_rarg1   - destination byte array address
2670   //   c_rarg2   - K (key) in little endian int array
2671   //   c_rarg3   - r vector byte array address
2672   //   c_rarg4   - input length
2673   //
2674   // Output:
2675   //   x0        - input length
2676   //
2677   address generate_cipherBlockChaining_encryptAESCrypt() {
2678     assert(UseAES, "need AES cryptographic extension support");
2679     __ align(CodeEntryAlignment);
2680     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2681 
2682     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2683 
2684     const Register from        = c_rarg0;  // source array address
2685     const Register to          = c_rarg1;  // destination array address
2686     const Register key         = c_rarg2;  // key array address
2687     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2688                                            // and left with the results of the last encryption block
2689     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2690     const Register keylen      = rscratch1;
2691 
2692     address start = __ pc();
2693 
2694       __ enter();
2695 
2696       __ movw(rscratch2, len_reg);
2697 
2698       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
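           // keylen holds the expanded key length in ints: 44 for AES-128,
           // 52 for AES-192, 60 for AES-256. The comparison against 52 below
           // selects where the key-loading fall-through ladder is entered: the
           // 256-bit path loads v17/v18 and falls into L_loadkeys_52, which
           // loads v19/v20 and falls into L_loadkeys_44 for the keys common
           // to all key sizes.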
2699 
2700       __ ld1(v0, __ T16B, rvec);
2701 
2702       __ cmpw(keylen, 52);
2703       __ br(Assembler::CC, L_loadkeys_44);
2704       __ br(Assembler::EQ, L_loadkeys_52);
2705 
2706       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2707       __ rev32(v17, __ T16B, v17);
2708       __ rev32(v18, __ T16B, v18);
2709     __ BIND(L_loadkeys_52);
2710       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2711       __ rev32(v19, __ T16B, v19);
2712       __ rev32(v20, __ T16B, v20);
2713     __ BIND(L_loadkeys_44);
2714       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2715       __ rev32(v21, __ T16B, v21);
2716       __ rev32(v22, __ T16B, v22);
2717       __ rev32(v23, __ T16B, v23);
2718       __ rev32(v24, __ T16B, v24);
2719       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2720       __ rev32(v25, __ T16B, v25);
2721       __ rev32(v26, __ T16B, v26);
2722       __ rev32(v27, __ T16B, v27);
2723       __ rev32(v28, __ T16B, v28);
2724       __ ld1(v29, v30, v31, __ T16B, key);
2725       __ rev32(v29, __ T16B, v29);
2726       __ rev32(v30, __ T16B, v30);
2727       __ rev32(v31, __ T16B, v31);
2728 
2729     __ BIND(L_aes_loop);
2730       __ ld1(v1, __ T16B, __ post(from, 16));
2731       __ eor(v0, __ T16B, v0, v1);
2732 
2733       __ br(Assembler::CC, L_rounds_44);
2734       __ br(Assembler::EQ, L_rounds_52);
2735 
2736       __ aese(v0, v17); __ aesmc(v0, v0);
2737       __ aese(v0, v18); __ aesmc(v0, v0);
2738     __ BIND(L_rounds_52);
2739       __ aese(v0, v19); __ aesmc(v0, v0);
2740       __ aese(v0, v20); __ aesmc(v0, v0);
2741     __ BIND(L_rounds_44);
2742       __ aese(v0, v21); __ aesmc(v0, v0);
2743       __ aese(v0, v22); __ aesmc(v0, v0);
2744       __ aese(v0, v23); __ aesmc(v0, v0);
2745       __ aese(v0, v24); __ aesmc(v0, v0);
2746       __ aese(v0, v25); __ aesmc(v0, v0);
2747       __ aese(v0, v26); __ aesmc(v0, v0);
2748       __ aese(v0, v27); __ aesmc(v0, v0);
2749       __ aese(v0, v28); __ aesmc(v0, v0);
2750       __ aese(v0, v29); __ aesmc(v0, v0);
2751       __ aese(v0, v30);
2752       __ eor(v0, __ T16B, v0, v31);
2753 
2754       __ st1(v0, __ T16B, __ post(to, 16));
2755 
2756       __ subw(len_reg, len_reg, 16);
2757       __ cbnzw(len_reg, L_aes_loop);
2758 
2759       __ st1(v0, __ T16B, rvec);
2760 
2761       __ mov(r0, rscratch2);
2762 
2763       __ leave();
2764       __ ret(lr);
2765 
2766     return start;
2767   }
2768 
2769   // Arguments:
2770   //
2771   // Inputs:
2772   //   c_rarg0   - source byte array address
2773   //   c_rarg1   - destination byte array address
2774   //   c_rarg2   - K (key) in little endian int array
2775   //   c_rarg3   - r vector byte array address
2776   //   c_rarg4   - input length
2777   //
2778   // Output:
2779   //   r0        - input length
2780   //
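       // In outline, the stub below performs standard CBC decryption; v2 carries
       // the previous ciphertext block (initially the IV) and v1 keeps a copy of
       // the current ciphertext so it can become the next chaining value.
       // Illustrative pseudocode only (informal helper names):
       //
       //    v2 = load16Bytes(rvec);                  // IV / previous ciphertext
       //    while (len > 0) {
       //        v1 = load16Bytes(from);              // current ciphertext block
       //        store16Bytes(AES_decrypt(v1, K) ^ v2, to);
       //        v2 = v1;
       //        from += 16; to += 16; len -= 16;
       //    }
       //    store16Bytes(v2, rvec);                  // saved for the next call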
2781   address generate_cipherBlockChaining_decryptAESCrypt() {
2782     assert(UseAES, "need AES cryptographic extension support");
2783     __ align(CodeEntryAlignment);
2784     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2785 
2786     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2787 
2788     const Register from        = c_rarg0;  // source array address
2789     const Register to          = c_rarg1;  // destination array address
2790     const Register key         = c_rarg2;  // key array address
2791     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2792                                            // and left holding the last input ciphertext block on exit
2793     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2794     const Register keylen      = rscratch1;
2795 
2796     address start = __ pc();
2797 
2798       __ enter();
2799 
2800       __ movw(rscratch2, len_reg);
2801 
2802       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2803 
2804       __ ld1(v2, __ T16B, rvec);
2805 
2806       __ ld1(v31, __ T16B, __ post(key, 16));
2807       __ rev32(v31, __ T16B, v31);
2808 
2809       __ cmpw(keylen, 52);
2810       __ br(Assembler::CC, L_loadkeys_44);
2811       __ br(Assembler::EQ, L_loadkeys_52);
2812 
2813       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2814       __ rev32(v17, __ T16B, v17);
2815       __ rev32(v18, __ T16B, v18);
2816     __ BIND(L_loadkeys_52);
2817       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2818       __ rev32(v19, __ T16B, v19);
2819       __ rev32(v20, __ T16B, v20);
2820     __ BIND(L_loadkeys_44);
2821       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2822       __ rev32(v21, __ T16B, v21);
2823       __ rev32(v22, __ T16B, v22);
2824       __ rev32(v23, __ T16B, v23);
2825       __ rev32(v24, __ T16B, v24);
2826       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2827       __ rev32(v25, __ T16B, v25);
2828       __ rev32(v26, __ T16B, v26);
2829       __ rev32(v27, __ T16B, v27);
2830       __ rev32(v28, __ T16B, v28);
2831       __ ld1(v29, v30, __ T16B, key);
2832       __ rev32(v29, __ T16B, v29);
2833       __ rev32(v30, __ T16B, v30);
2834 
2835     __ BIND(L_aes_loop);
2836       __ ld1(v0, __ T16B, __ post(from, 16));
2837       __ orr(v1, __ T16B, v0, v0);
2838 
2839       __ br(Assembler::CC, L_rounds_44);
2840       __ br(Assembler::EQ, L_rounds_52);
2841 
2842       __ aesd(v0, v17); __ aesimc(v0, v0);
2843       __ aesd(v0, v18); __ aesimc(v0, v0);
2844     __ BIND(L_rounds_52);
2845       __ aesd(v0, v19); __ aesimc(v0, v0);
2846       __ aesd(v0, v20); __ aesimc(v0, v0);
2847     __ BIND(L_rounds_44);
2848       __ aesd(v0, v21); __ aesimc(v0, v0);
2849       __ aesd(v0, v22); __ aesimc(v0, v0);
2850       __ aesd(v0, v23); __ aesimc(v0, v0);
2851       __ aesd(v0, v24); __ aesimc(v0, v0);
2852       __ aesd(v0, v25); __ aesimc(v0, v0);
2853       __ aesd(v0, v26); __ aesimc(v0, v0);
2854       __ aesd(v0, v27); __ aesimc(v0, v0);
2855       __ aesd(v0, v28); __ aesimc(v0, v0);
2856       __ aesd(v0, v29); __ aesimc(v0, v0);
2857       __ aesd(v0, v30);
2858       __ eor(v0, __ T16B, v0, v31);
2859       __ eor(v0, __ T16B, v0, v2);
2860 
2861       __ st1(v0, __ T16B, __ post(to, 16));
2862       __ orr(v2, __ T16B, v1, v1);
2863 
2864       __ subw(len_reg, len_reg, 16);
2865       __ cbnzw(len_reg, L_aes_loop);
2866 
2867       __ st1(v2, __ T16B, rvec);
2868 
2869       __ mov(r0, rscratch2);
2870 
2871       __ leave();
2872       __ ret(lr);
2873 
2874     return start;
2875   }
2876 
2877   // CTR AES crypt.
2878   // Arguments:
2879   //
2880   // Inputs:
2881   //   c_rarg0   - source byte array address
2882   //   c_rarg1   - destination byte array address
2883   //   c_rarg2   - K (key) in little endian int array
2884   //   c_rarg3   - counter vector byte array address
2885   //   c_rarg4   - input length
2886   //   c_rarg5   - saved encryptedCounter start
2887   //   c_rarg6   - saved used length
2888   //
2889   // Output:
2890   //   r0       - input length
2891   //
2892   address generate_counterMode_AESCrypt() {
2893     const Register in = c_rarg0;
2894     const Register out = c_rarg1;
2895     const Register key = c_rarg2;
2896     const Register counter = c_rarg3;
2897     const Register saved_len = c_rarg4, len = r10;
2898     const Register saved_encrypted_ctr = c_rarg5;
2899     const Register used_ptr = c_rarg6, used = r12;
2900 
2901     const Register offset = r7;
2902     const Register keylen = r11;
2903 
2904     const unsigned char block_size = 16;
2905     const int bulk_width = 4;
2906     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2907     // performance with larger data sizes, but it also means that the
2908     // fast path isn't used until you have at least 8 blocks, so up
2909     // to 127 bytes of data are processed on the slow path. For
2910     // that reason, and also so as not to blow away too much icache, 4
2911     // blocks seems like a sensible compromise.
2912 
2913     // Algorithm:
2914     //
2915     //    if (len == 0) {
2916     //        goto DONE;
2917     //    }
2918     //    int result = len;
2919     //    do {
2920     //        if (used >= blockSize) {
2921     //            if (len >= bulk_width * blockSize) {
2922     //                CTR_large_block();
2923     //                if (len == 0)
2924     //                    goto DONE;
2925     //            }
2926     //            for (;;) {
2927     //                16ByteVector v0 = counter;
2928     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2929     //                used = 0;
2930     //                if (len < blockSize)
2931     //                    break;    /* goto NEXT */
2932     //                16ByteVector v1 = load16Bytes(in, offset);
2933     //                v1 = v1 ^ encryptedCounter;
2934     //                store16Bytes(v1, out, offset);
2935     //                used = blockSize;
2936     //                offset += blockSize;
2937     //                len -= blockSize;
2938     //                if (len == 0)
2939     //                    goto DONE;
2940     //            }
2941     //        }
2942     //      NEXT:
2943     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2944     //        len--;
2945     //    } while (len != 0);
2946     //  DONE:
2947     //    return result;
2948     //
2949     // CTR_large_block()
2950     //    Wide bulk encryption of whole blocks.
2951 
2952     __ align(CodeEntryAlignment);
2953     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2954     const address start = __ pc();
2955     __ enter();
2956 
2957     Label DONE, CTR_large_block, large_block_return;
2958     __ ldrw(used, Address(used_ptr));
2959     __ cbzw(saved_len, DONE);
2960 
2961     __ mov(len, saved_len);
2962     __ mov(offset, 0);
2963 
2964     // Compute #rounds for AES based on the length of the key array
2965     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2966 
2967     __ aesenc_loadkeys(key, keylen);
2968 
2969     {
2970       Label L_CTR_loop, NEXT;
2971 
2972       __ bind(L_CTR_loop);
2973 
2974       __ cmp(used, block_size);
2975       __ br(__ LO, NEXT);
2976 
2977       // Maybe we have a lot of data
2978       __ subsw(rscratch1, len, bulk_width * block_size);
2979       __ br(__ HS, CTR_large_block);
2980       __ BIND(large_block_return);
2981       __ cbzw(len, DONE);
2982 
2983       // Setup the counter
2984       __ movi(v4, __ T4S, 0);
2985       __ movi(v5, __ T4S, 1);
2986       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2987 
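           // The counter block is kept big-endian in memory. To step it we
           // byte-reverse each 32-bit lane (rev32), add {0, 0, 0, 1} so that
           // only the last lane is incremented, and reverse back before
           // storing. Note that only the low 32 bits of the counter are
           // incremented; a single call is assumed not to process enough
           // blocks to wrap them.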
2988       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2989       __ rev32(v16, __ T16B, v0);
2990       __ addv(v16, __ T4S, v16, v4);
2991       __ rev32(v16, __ T16B, v16);
2992       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2993 
2994       {
2995         // We have fewer than bulk_width blocks of data left. Encrypt
2996         // them one by one until there is less than a full block
2997         // remaining, being careful to save both the encrypted counter
2998         // and the counter.
2999 
3000         Label inner_loop;
3001         __ bind(inner_loop);
3002         // Counter to encrypt is in v0
3003         __ aesecb_encrypt(noreg, noreg, keylen);
3004         __ st1(v0, __ T16B, saved_encrypted_ctr);
3005 
3006         // Do we have a remaining full block?
3007 
3008         __ mov(used, 0);
3009         __ cmp(len, block_size);
3010         __ br(__ LO, NEXT);
3011 
3012         // Yes, we have a full block
3013         __ ldrq(v1, Address(in, offset));
3014         __ eor(v1, __ T16B, v1, v0);
3015         __ strq(v1, Address(out, offset));
3016         __ mov(used, block_size);
3017         __ add(offset, offset, block_size);
3018 
3019         __ subw(len, len, block_size);
3020         __ cbzw(len, DONE);
3021 
3022         // Increment the counter, store it back
3023         __ orr(v0, __ T16B, v16, v16);
3024         __ rev32(v16, __ T16B, v16);
3025         __ addv(v16, __ T4S, v16, v4);
3026         __ rev32(v16, __ T16B, v16);
3027         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3028 
3029         __ b(inner_loop);
3030       }
3031 
3032       __ BIND(NEXT);
3033 
3034       // Encrypt a single byte, and loop.
3035       // We expect this to be a rare event.
3036       __ ldrb(rscratch1, Address(in, offset));
3037       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3038       __ eor(rscratch1, rscratch1, rscratch2);
3039       __ strb(rscratch1, Address(out, offset));
3040       __ add(offset, offset, 1);
3041       __ add(used, used, 1);
3042       __ subw(len, len, 1);
3043       __ cbnzw(len, L_CTR_loop);
3044     }
3045 
3046     __ bind(DONE);
3047     __ strw(used, Address(used_ptr));
3048     __ mov(r0, saved_len);
3049 
3050     __ leave(); // required for proper stackwalking of RuntimeStub frame
3051     __ ret(lr);
3052 
3053     // Bulk encryption
3054 
3055     __ BIND(CTR_large_block);
3056     assert(bulk_width == 4 || bulk_width == 8, "must be");
3057 
3058     if (bulk_width == 8) {
3059       __ sub(sp, sp, 4 * 16);
3060       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3061     }
3062     __ sub(sp, sp, 4 * 16);
3063     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3064     RegSet saved_regs = (RegSet::of(in, out, offset)
3065                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3066     __ push(saved_regs, sp);
3067     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3068     __ add(in, in, offset);
3069     __ add(out, out, offset);
3070 
3071     // Keys should already be loaded into the correct registers
3072 
3073     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3074     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3075 
3076     // AES/CTR loop
3077     {
3078       Label L_CTR_loop;
3079       __ BIND(L_CTR_loop);
3080 
3081       // Setup the counters
3082       __ movi(v8, __ T4S, 0);
3083       __ movi(v9, __ T4S, 1);
3084       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3085 
3086       for (int i = 0; i < bulk_width; i++) {
3087         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3088         __ rev32(v0_ofs, __ T16B, v16);
3089         __ addv(v16, __ T4S, v16, v8);
3090       }
3091 
3092       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3093 
3094       // Encrypt the counters
3095       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3096 
3097       if (bulk_width == 8) {
3098         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3099       }
3100 
3101       // XOR the encrypted counters with the inputs
3102       for (int i = 0; i < bulk_width; i++) {
3103         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3104         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3105         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3106       }
3107 
3108       // Write the encrypted data
3109       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3110       if (bulk_width == 8) {
3111         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3112       }
3113 
3114       __ subw(len, len, 16 * bulk_width);
3115       __ cbnzw(len, L_CTR_loop);
3116     }
3117 
3118     // Save the counter back where it goes
3119     __ rev32(v16, __ T16B, v16);
3120     __ st1(v16, __ T16B, counter);
3121 
3122     __ pop(saved_regs, sp);
3123 
3124     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3125     if (bulk_width == 8) {
3126       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3127     }
3128 
3129     __ andr(rscratch1, len, -16 * bulk_width);
3130     __ sub(len, len, rscratch1);
3131     __ add(offset, offset, rscratch1);
3132     __ mov(used, 16);
3133     __ strw(used, Address(used_ptr));
3134     __ b(large_block_return);
3135 
3136     return start;
3137   }
3138 
3139   // Vector AES Galois Counter Mode implementation. Parameters:
3140   //
3141   // in = c_rarg0
3142   // len = c_rarg1
3143   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3144   // out = c_rarg3
3145   // key = c_rarg4
3146   // state = c_rarg5 - GHASH.state
3147   // subkeyHtbl = c_rarg6 - powers of H
3148   // counter = c_rarg7 - 16 bytes of CTR
3149   // return - number of processed bytes
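       //
       // The stub encrypts the bulk of the input with AES/CTR, eight blocks per
       // iteration, and then folds the resulting ciphertext into the GHASH
       // state. In outline (illustrative pseudocode only; the unprocessed tail
       // and the final tag are assumed to be handled by the Java caller):
       //
       //    len &= -128;                              // whole 8-block groups only
       //    while (len > 0) {
       //        for (i = 0; i < 8; i++) {
       //            E[i] = AES_encrypt(counter, K); counter = inc32(counter);
       //        }
       //        out[0..7] = in[0..7] ^ E[0..7];
       //        len -= 128;
       //    }
       //    state = GHASH(state, subkeyHtbl, ct);     // absorb the ciphertext
       //    return the number of bytes processed;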
3150   address generate_galoisCounterMode_AESCrypt() {
3151     address ghash_polynomial = __ pc();
3152     __ emit_int64(0x87);  // The low-order terms of the field
3153                           // polynomial z^128 + z^7 + z^2 + z + 1
3154                           // (i.e. 0x87), repeated in the low and high
3155                           // halves of a 128-bit vector
3156     __ emit_int64(0x87);
3157 
3158     __ align(CodeEntryAlignment);
3159     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3160     address start = __ pc();
3161     __ enter();
3162 
3163     const Register in = c_rarg0;
3164     const Register len = c_rarg1;
3165     const Register ct = c_rarg2;
3166     const Register out = c_rarg3;
3168 
3169     const Register key = c_rarg4;
3170     const Register state = c_rarg5;
3171 
3172     const Register subkeyHtbl = c_rarg6;
3173 
3174     const Register counter = c_rarg7;  // counter block, updated with the incremented counter on exit
3175 
3176     const Register keylen = r10;
3177     // Save state before entering routine
3178     __ sub(sp, sp, 4 * 16);
3179     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3180     __ sub(sp, sp, 4 * 16);
3181     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3182 
3184     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3185     __ str(len, __ pre(sp, -2 * wordSize));
3186 
3187     Label DONE;
3188     __ cbz(len, DONE);
3189 
3190     // Compute #rounds for AES based on the length of the key array
3191     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3192 
3193     __ aesenc_loadkeys(key, keylen);
3194     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3195     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3196 
3197     // AES/CTR loop
3198     {
3199       Label L_CTR_loop;
3200       __ BIND(L_CTR_loop);
3201 
3202       // Setup the counters
3203       __ movi(v8, __ T4S, 0);
3204       __ movi(v9, __ T4S, 1);
3205       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3206 
3207       assert(v0->encoding() < v8->encoding(), "");
3208       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3209         FloatRegister f = as_FloatRegister(i);
3210         __ rev32(f, __ T16B, v16);
3211         __ addv(v16, __ T4S, v16, v8);
3212       }
3213 
3214       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3215 
3216       // Encrypt the counters
3217       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3218 
3219       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3220 
3221       // XOR the encrypted counters with the inputs
3222       for (int i = 0; i < 8; i++) {
3223         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3224         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3225         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3226       }
3227       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3228       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3229 
3230       __ subw(len, len, 16 * 8);
3231       __ cbnzw(len, L_CTR_loop);
3232     }
3233 
3234     __ rev32(v16, __ T16B, v16);
3235     __ st1(v16, __ T16B, counter);
3236 
3237     __ ldr(len, Address(sp));
3238     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3239 
3240     // GHASH/CTR loop
3241     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3242                                 len, /*unrolls*/4);
3243 
3244 #ifdef ASSERT
3245     { Label L;
3246       __ cmp(len, (unsigned char)0);
3247       __ br(Assembler::EQ, L);
3248       __ stop("stubGenerator: abort");
3249       __ bind(L);
3250     }
3251 #endif
3252 
3253     __ bind(DONE);
3254     // Return the number of bytes processed
3255     __ ldr(r0, __ post(sp, 2 * wordSize));
3256 
3257     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3258     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3259 
3260     __ leave(); // required for proper stackwalking of RuntimeStub frame
3261     __ ret(lr);
3262     return start;
3263   }
3264 
3265   // Utility routines for md5.
3266   // Each helper clobbers rscratch1, rscratch2, r10 and r11.
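       //
       // Each helper implements one step of the corresponding MD5 round
       // (RFC 1321):
       //
       //    a = b + rotl32(a + f(b, c, d) + X[k] + t, s)
       //
       // where f is, respectively,
       //
       //    FF: F(b, c, d) = (b & c) | (~b & d)   computed as ((c ^ d) & b) ^ d
       //    GG: G(b, c, d) = (b & d) | (c & ~d)   computed as ((b ^ c) & d) ^ c
       //    HH: H(b, c, d) = b ^ c ^ d
       //    II: I(b, c, d) = c ^ (b | ~d)
       //
       // The ((x ^ y) & m) ^ y forms are the usual mux-style rewrites of F and G.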
3267   void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3268               int k, int s, int t) {
3269     Register rscratch3 = r10;
3270     Register rscratch4 = r11;
3271 
3272     __ eorw(rscratch3, r3, r4);
3273     __ movw(rscratch2, t);
3274     __ andw(rscratch3, rscratch3, r2);
3275     __ addw(rscratch4, r1, rscratch2);
3276     __ ldrw(rscratch1, Address(buf, k*4));
3277     __ eorw(rscratch3, rscratch3, r4);
3278     __ addw(rscratch3, rscratch3, rscratch1);
3279     __ addw(rscratch3, rscratch3, rscratch4);
3280     __ rorw(rscratch2, rscratch3, 32 - s);
3281     __ addw(r1, rscratch2, r2);
3282   }
3283 
3284   void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3285               int k, int s, int t) {
3286     Register rscratch3 = r10;
3287     Register rscratch4 = r11;
3288 
3289     __ eorw(rscratch2, r2, r3);
3290     __ ldrw(rscratch1, Address(buf, k*4));
3291     __ andw(rscratch3, rscratch2, r4);
3292     __ movw(rscratch2, t);
3293     __ eorw(rscratch3, rscratch3, r3);
3294     __ addw(rscratch4, r1, rscratch2);
3295     __ addw(rscratch3, rscratch3, rscratch1);
3296     __ addw(rscratch3, rscratch3, rscratch4);
3297     __ rorw(rscratch2, rscratch3, 32 - s);
3298     __ addw(r1, rscratch2, r2);
3299   }
3300 
3301   void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3302               int k, int s, int t) {
3303     Register rscratch3 = r10;
3304     Register rscratch4 = r11;
3305 
3306     __ eorw(rscratch3, r3, r4);
3307     __ movw(rscratch2, t);
3308     __ addw(rscratch4, r1, rscratch2);
3309     __ ldrw(rscratch1, Address(buf, k*4));
3310     __ eorw(rscratch3, rscratch3, r2);
3311     __ addw(rscratch3, rscratch3, rscratch1);
3312     __ addw(rscratch3, rscratch3, rscratch4);
3313     __ rorw(rscratch2, rscratch3, 32 - s);
3314     __ addw(r1, rscratch2, r2);
3315   }
3316 
3317   void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3318               int k, int s, int t) {
3319     Register rscratch3 = r10;
3320     Register rscratch4 = r11;
3321 
3322     __ movw(rscratch3, t);
3323     __ ornw(rscratch2, r2, r4);
3324     __ addw(rscratch4, r1, rscratch3);
3325     __ ldrw(rscratch1, Address(buf, k*4));
3326     __ eorw(rscratch3, rscratch2, r3);
3327     __ addw(rscratch3, rscratch3, rscratch1);
3328     __ addw(rscratch3, rscratch3, rscratch4);
3329     __ rorw(rscratch2, rscratch3, 32 - s);
3330     __ addw(r1, rscratch2, r2);
3331   }
3332 
3333   // Arguments:
3334   //
3335   // Inputs:
3336   //   c_rarg0   - byte[]  source+offset
3337   //   c_rarg1   - int[]   MD5.state
3338   //   c_rarg2   - int     offset
3339   //   c_rarg3   - int     limit
3340   //
3341   address generate_md5_implCompress(bool multi_block, const char *name) {
3342     __ align(CodeEntryAlignment);
3343     StubCodeMark mark(this, "StubRoutines", name);
3344     address start = __ pc();
3345 
3346     Register buf       = c_rarg0;
3347     Register state     = c_rarg1;
3348     Register ofs       = c_rarg2;
3349     Register limit     = c_rarg3;
3350     Register a         = r4;
3351     Register b         = r5;
3352     Register c         = r6;
3353     Register d         = r7;
3354     Register rscratch3 = r10;
3355     Register rscratch4 = r11;
3356 
3357     Label md5_loop;
3358     __ BIND(md5_loop);
3359 
3360     // Save hash values for addition after rounds
3361     __ ldrw(a, Address(state,  0));
3362     __ ldrw(b, Address(state,  4));
3363     __ ldrw(c, Address(state,  8));
3364     __ ldrw(d, Address(state, 12));
3365 
3366     // Round 1
3367     md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
3368     md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
3369     md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
3370     md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
3371     md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
3372     md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
3373     md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
3374     md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
3375     md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
3376     md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
3377     md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3378     md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3379     md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
3380     md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3381     md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3382     md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3383 
3384     // Round 2
3385     md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
3386     md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
3387     md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3388     md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
3389     md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
3390     md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
3391     md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3392     md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
3393     md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
3394     md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
3395     md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
3396     md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
3397     md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
3398     md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
3399     md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
3400     md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3401 
3402     // Round 3
3403     md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
3404     md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
3405     md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3406     md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3407     md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
3408     md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
3409     md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
3410     md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3411     md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
3412     md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
3413     md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
3414     md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
3415     md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
3416     md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3417     md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3418     md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);
3419 
3420     // Round 4
3421     md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
3422     md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
3423     md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3424     md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
3425     md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
3426     md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
3427     md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3428     md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
3429     md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
3430     md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3431     md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
3432     md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3433     md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
3434     md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3435     md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
3436     md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);
3437 
3438     // write hash values back in the correct order
3439     __ ldrw(rscratch1, Address(state,  0));
3440     __ addw(rscratch1, rscratch1, a);
3441     __ strw(rscratch1, Address(state,  0));
3442 
3443     __ ldrw(rscratch2, Address(state,  4));
3444     __ addw(rscratch2, rscratch2, b);
3445     __ strw(rscratch2, Address(state,  4));
3446 
3447     __ ldrw(rscratch3, Address(state,  8));
3448     __ addw(rscratch3, rscratch3, c);
3449     __ strw(rscratch3, Address(state,  8));
3450 
3451     __ ldrw(rscratch4, Address(state, 12));
3452     __ addw(rscratch4, rscratch4, d);
3453     __ strw(rscratch4, Address(state, 12));
3454 
3455     if (multi_block) {
3456       __ add(buf, buf, 64);
3457       __ add(ofs, ofs, 64);
3458       __ cmp(ofs, limit);
3459       __ br(Assembler::LE, md5_loop);
3460       __ mov(c_rarg0, ofs); // return ofs
3461     }
3462 
3463     __ ret(lr);
3464 
3465     return start;
3466   }
3467 
3468   // Arguments:
3469   //
3470   // Inputs:
3471   //   c_rarg0   - byte[]  source+offset
3472   //   c_rarg1   - int[]   SHA.state
3473   //   c_rarg2   - int     offset
3474   //   c_rarg3   - int     limit
3475   //
3476   address generate_sha1_implCompress(bool multi_block, const char *name) {
3477     __ align(CodeEntryAlignment);
3478     StubCodeMark mark(this, "StubRoutines", name);
3479     address start = __ pc();
3480 
3481     Register buf   = c_rarg0;
3482     Register state = c_rarg1;
3483     Register ofs   = c_rarg2;
3484     Register limit = c_rarg3;
3485 
3486     Label keys;
3487     Label sha1_loop;
3488 
3489     // load the keys into v0..v3
3490     __ adr(rscratch1, keys);
3491     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3492     // load the 5-word (160-bit) state into v6, v7
3493     __ ldrq(v6, Address(state, 0));
3494     __ ldrs(v7, Address(state, 16));
3495 
3496 
3497     __ BIND(sha1_loop);
3498     // load 64 bytes of data into v16..v19
3499     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3500     __ rev32(v16, __ T16B, v16);
3501     __ rev32(v17, __ T16B, v17);
3502     __ rev32(v18, __ T16B, v18);
3503     __ rev32(v19, __ T16B, v19);
3504 
3505     // do the sha1
3506     __ addv(v4, __ T4S, v16, v0);
3507     __ orr(v20, __ T16B, v6, v6);
3508 
3509     FloatRegister d0 = v16;
3510     FloatRegister d1 = v17;
3511     FloatRegister d2 = v18;
3512     FloatRegister d3 = v19;
3513 
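         // SHA-1 has 80 rounds; each sha1c/sha1p/sha1m instruction performs four
         // of them, so the 20 iterations below cover the whole schedule. The
         // round function is Ch for rounds 0..19 (sha1c), Parity for rounds
         // 20..39 and 60..79 (sha1p), and Maj for rounds 40..59 (sha1m), hence
         // the selection on 'round'. v0..v3 hold the four round constants, each
         // used for five iterations, and the addv guarded by 'round < 19'
         // pre-computes the message + constant operand for the next iteration.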
3514     for (int round = 0; round < 20; round++) {
3515       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3516       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3517       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3518       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3519       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3520 
3521       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3522       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3523       __ sha1h(tmp2, __ T4S, v20);
3524       if (round < 5)
3525         __ sha1c(v20, __ T4S, tmp3, tmp4);
3526       else if (round < 10 || round >= 15)
3527         __ sha1p(v20, __ T4S, tmp3, tmp4);
3528       else
3529         __ sha1m(v20, __ T4S, tmp3, tmp4);
3530       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3531 
3532       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3533     }
3534 
3535     __ addv(v7, __ T2S, v7, v21);
3536     __ addv(v6, __ T4S, v6, v20);
3537 
3538     if (multi_block) {
3539       __ add(ofs, ofs, 64);
3540       __ cmp(ofs, limit);
3541       __ br(Assembler::LE, sha1_loop);
3542       __ mov(c_rarg0, ofs); // return ofs
3543     }
3544 
3545     __ strq(v6, Address(state, 0));
3546     __ strs(v7, Address(state, 16));
3547 
3548     __ ret(lr);
3549 
3550     __ bind(keys);
3551     __ emit_int32(0x5a827999);
3552     __ emit_int32(0x6ed9eba1);
3553     __ emit_int32(0x8f1bbcdc);
3554     __ emit_int32(0xca62c1d6);
3555 
3556     return start;
3557   }
3558 
3559 
3560   // Arguments:
3561   //
3562   // Inputs:
3563   //   c_rarg0   - byte[]  source+offset
3564   //   c_rarg1   - int[]   SHA.state
3565   //   c_rarg2   - int     offset
3566   //   c_rarg3   - int     limit
3567   //
3568   address generate_sha256_implCompress(bool multi_block, const char *name) {
3569     static const uint32_t round_consts[64] = {
3570       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3571       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3572       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3573       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3574       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3575       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3576       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3577       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3578       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3579       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3580       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3581       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3582       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3583       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3584       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3585       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3586     };
3587     __ align(CodeEntryAlignment);
3588     StubCodeMark mark(this, "StubRoutines", name);
3589     address start = __ pc();
3590 
3591     Register buf   = c_rarg0;
3592     Register state = c_rarg1;
3593     Register ofs   = c_rarg2;
3594     Register limit = c_rarg3;
3595 
3596     Label sha256_loop;
3597 
3598     __ stpd(v8, v9, __ pre(sp, -32));
3599     __ stpd(v10, v11, Address(sp, 16));
3600 
3601     // dga == v0
3602     // dgb == v1
3603     // dg0 == v2
3604     // dg1 == v3
3605     // dg2 == v4
3606     // t0 == v6
3607     // t1 == v7
3608 
3609     // load 16 keys to v16..v31
3610     __ lea(rscratch1, ExternalAddress((address)round_consts));
3611     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3612     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3613     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3614     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3615 
3616     // load 8 words (256 bits) state
3617     __ ldpq(v0, v1, state);
3618 
3619     __ BIND(sha256_loop);
3620     // load 64 bytes of data into v8..v11
3621     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3622     __ rev32(v8, __ T16B, v8);
3623     __ rev32(v9, __ T16B, v9);
3624     __ rev32(v10, __ T16B, v10);
3625     __ rev32(v11, __ T16B, v11);
3626 
3627     __ addv(v6, __ T4S, v8, v16);
3628     __ orr(v2, __ T16B, v0, v0);
3629     __ orr(v3, __ T16B, v1, v1);
3630 
3631     FloatRegister d0 = v8;
3632     FloatRegister d1 = v9;
3633     FloatRegister d2 = v10;
3634     FloatRegister d3 = v11;
3635 
3636 
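         // SHA-256 has 64 rounds; each sha256h/sha256h2 pair performs four of
         // them, so the 16 iterations below cover the whole schedule. v16..v31
         // hold the 64 round constants, four per register; the addv guarded by
         // 'round < 15' pre-adds the next message vector to its constants, and
         // sha256su0/sha256su1 extend the message schedule during the first 12
         // iterations (the final 16 schedule words need no successors).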
3637     for (int round = 0; round < 16; round++) {
3638       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3639       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3640       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3641       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3642 
3643       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3644        __ orr(v4, __ T16B, v2, v2);
3645       if (round < 15)
3646         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3647       __ sha256h(v2, __ T4S, v3, tmp2);
3648       __ sha256h2(v3, __ T4S, v4, tmp2);
3649       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3650 
3651       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3652     }
3653 
3654     __ addv(v0, __ T4S, v0, v2);
3655     __ addv(v1, __ T4S, v1, v3);
3656 
3657     if (multi_block) {
3658       __ add(ofs, ofs, 64);
3659       __ cmp(ofs, limit);
3660       __ br(Assembler::LE, sha256_loop);
3661       __ mov(c_rarg0, ofs); // return ofs
3662     }
3663 
3664     __ ldpd(v10, v11, Address(sp, 16));
3665     __ ldpd(v8, v9, __ post(sp, 32));
3666 
3667     __ stpq(v0, v1, state);
3668 
3669     __ ret(lr);
3670 
3671     return start;
3672   }
3673 
3674   // Double rounds for sha512.
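       // Each call performs two of SHA-512's 80 rounds, so the 40 calls in the
       // compression loop below cover the whole schedule. vrc0 holds the pair
       // of round constants consumed here; for dr < 36 the next pair is loaded
       // into vrc1 (four pairs are pre-loaded before the loop, so 4 + 36 = 40
       // pairs in total). For dr < 32, sha512su0/sha512su1 extend the two-word
       // message schedule vectors (the last eight double rounds need no new
       // schedule words).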
3675   void sha512_dround(int dr,
3676                      FloatRegister vi0, FloatRegister vi1,
3677                      FloatRegister vi2, FloatRegister vi3,
3678                      FloatRegister vi4, FloatRegister vrc0,
3679                      FloatRegister vrc1, FloatRegister vin0,
3680                      FloatRegister vin1, FloatRegister vin2,
3681                      FloatRegister vin3, FloatRegister vin4) {
3682       if (dr < 36) {
3683         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3684       }
3685       __ addv(v5, __ T2D, vrc0, vin0);
3686       __ ext(v6, __ T16B, vi2, vi3, 8);
3687       __ ext(v5, __ T16B, v5, v5, 8);
3688       __ ext(v7, __ T16B, vi1, vi2, 8);
3689       __ addv(vi3, __ T2D, vi3, v5);
3690       if (dr < 32) {
3691         __ ext(v5, __ T16B, vin3, vin4, 8);
3692         __ sha512su0(vin0, __ T2D, vin1);
3693       }
3694       __ sha512h(vi3, __ T2D, v6, v7);
3695       if (dr < 32) {
3696         __ sha512su1(vin0, __ T2D, vin2, v5);
3697       }
3698       __ addv(vi4, __ T2D, vi1, vi3);
3699       __ sha512h2(vi3, __ T2D, vi1, vi0);
3700   }
3701 
3702   // Arguments:
3703   //
3704   // Inputs:
3705   //   c_rarg0   - byte[]  source+offset
3706   //   c_rarg1   - int[]   SHA.state
3707   //   c_rarg2   - int     offset
3708   //   c_rarg3   - int     limit
3709   //
3710   address generate_sha512_implCompress(bool multi_block, const char *name) {
3711     static const uint64_t round_consts[80] = {
3712       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3713       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3714       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3715       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3716       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3717       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3718       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3719       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3720       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3721       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3722       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3723       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3724       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3725       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3726       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3727       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3728       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3729       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3730       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3731       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3732       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3733       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3734       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3735       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3736       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3737       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3738       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3739     };
3740 
3741     __ align(CodeEntryAlignment);
3742     StubCodeMark mark(this, "StubRoutines", name);
3743     address start = __ pc();
3744 
3745     Register buf   = c_rarg0;
3746     Register state = c_rarg1;
3747     Register ofs   = c_rarg2;
3748     Register limit = c_rarg3;
3749 
3750     __ stpd(v8, v9, __ pre(sp, -64));
3751     __ stpd(v10, v11, Address(sp, 16));
3752     __ stpd(v12, v13, Address(sp, 32));
3753     __ stpd(v14, v15, Address(sp, 48));
3754 
3755     Label sha512_loop;
3756 
3757     // load state
3758     __ ld1(v8, v9, v10, v11, __ T2D, state);
3759 
3760     // load first 4 round constants
3761     __ lea(rscratch1, ExternalAddress((address)round_consts));
3762     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3763 
3764     __ BIND(sha512_loop);
3765     // load 128B of data into v12..v19
3766     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3767     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3768     __ rev64(v12, __ T16B, v12);
3769     __ rev64(v13, __ T16B, v13);
3770     __ rev64(v14, __ T16B, v14);
3771     __ rev64(v15, __ T16B, v15);
3772     __ rev64(v16, __ T16B, v16);
3773     __ rev64(v17, __ T16B, v17);
3774     __ rev64(v18, __ T16B, v18);
3775     __ rev64(v19, __ T16B, v19);
3776 
3777     __ mov(rscratch2, rscratch1);
3778 
3779     __ mov(v0, __ T16B, v8);
3780     __ mov(v1, __ T16B, v9);
3781     __ mov(v2, __ T16B, v10);
3782     __ mov(v3, __ T16B, v11);
3783 
3784     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3785     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3786     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3787     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3788     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3789     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3790     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3791     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3792     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3793     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3794     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3795     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3796     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3797     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3798     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3799     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3800     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3801     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3802     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3803     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3804     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3805     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3806     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3807     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3808     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3809     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3810     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3811     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3812     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3813     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3814     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3815     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3816     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3817     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3818     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3819     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3820     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3821     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3822     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3823     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3824 
3825     __ addv(v8, __ T2D, v8, v0);
3826     __ addv(v9, __ T2D, v9, v1);
3827     __ addv(v10, __ T2D, v10, v2);
3828     __ addv(v11, __ T2D, v11, v3);
3829 
3830     if (multi_block) {
3831       __ add(ofs, ofs, 128);
3832       __ cmp(ofs, limit);
3833       __ br(Assembler::LE, sha512_loop);
3834       __ mov(c_rarg0, ofs); // return ofs
3835     }
3836 
3837     __ st1(v8, v9, v10, v11, __ T2D, state);
3838 
3839     __ ldpd(v14, v15, Address(sp, 48));
3840     __ ldpd(v12, v13, Address(sp, 32));
3841     __ ldpd(v10, v11, Address(sp, 16));
3842     __ ldpd(v8, v9, __ post(sp, 64));
3843 
3844     __ ret(lr);
3845 
3846     return start;
3847   }
3848 
3849   // Arguments:
3850   //
3851   // Inputs:
3852   //   c_rarg0   - byte[]  source+offset
3853   //   c_rarg1   - byte[]   SHA.state
3854   //   c_rarg2   - int     digest_length
3855   //   c_rarg3   - int     offset
3856   //   c_rarg4   - int     limit
3857   //
3858   address generate_sha3_implCompress(bool multi_block, const char *name) {
3859     static const uint64_t round_consts[24] = {
3860       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3861       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3862       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3863       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3864       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3865       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3866       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3867       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3868     };
3869 
3870     __ align(CodeEntryAlignment);
3871     StubCodeMark mark(this, "StubRoutines", name);
3872     address start = __ pc();
3873 
3874     Register buf           = c_rarg0;
3875     Register state         = c_rarg1;
3876     Register digest_length = c_rarg2;
3877     Register ofs           = c_rarg3;
3878     Register limit         = c_rarg4;
3879 
3880     Label sha3_loop, rounds24_loop;
3881     Label sha3_512, sha3_384_or_224, sha3_256;
3882 
3883     __ stpd(v8, v9, __ pre(sp, -64));
3884     __ stpd(v10, v11, Address(sp, 16));
3885     __ stpd(v12, v13, Address(sp, 32));
3886     __ stpd(v14, v15, Address(sp, 48));
3887 
3888     // load state
3889     __ add(rscratch1, state, 32);
3890     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3891     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3892     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3893     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3894     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3895     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3896     __ ld1(v24, __ T1D, rscratch1);
3897 
3898     __ BIND(sha3_loop);
3899 
3900     // 24 keccak rounds
3901     __ movw(rscratch2, 24);
3902 
3903     // load round_constants base
3904     __ lea(rscratch1, ExternalAddress((address) round_consts));
3905 
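         // Absorb one block of input into the state. The rate (block size) is
         // 200 - 2 * digest_length bytes, so the number of 64-bit lanes xor-ed
         // in below depends on the digest length: 9 lanes (72 bytes) for
         // SHA3-512, 13 (104 bytes) for SHA3-384, 17 (136 bytes) for SHA3-256
         // and 18 (144 bytes) for SHA3-224. Padding of the final partial block
         // is assumed to be done by the caller.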
3906     // load input
3907     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3908     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3909     __ eor(v0, __ T8B, v0, v25);
3910     __ eor(v1, __ T8B, v1, v26);
3911     __ eor(v2, __ T8B, v2, v27);
3912     __ eor(v3, __ T8B, v3, v28);
3913     __ eor(v4, __ T8B, v4, v29);
3914     __ eor(v5, __ T8B, v5, v30);
3915     __ eor(v6, __ T8B, v6, v31);
3916 
3917     // digest_length == 64, SHA3-512
3918     __ tbnz(digest_length, 6, sha3_512);
3919 
3920     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3921     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3922     __ eor(v7, __ T8B, v7, v25);
3923     __ eor(v8, __ T8B, v8, v26);
3924     __ eor(v9, __ T8B, v9, v27);
3925     __ eor(v10, __ T8B, v10, v28);
3926     __ eor(v11, __ T8B, v11, v29);
3927     __ eor(v12, __ T8B, v12, v30);
3928 
3929     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3930     __ tbnz(digest_length, 4, sha3_384_or_224);
3931 
3932     // SHA3-256
3933     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3934     __ eor(v13, __ T8B, v13, v25);
3935     __ eor(v14, __ T8B, v14, v26);
3936     __ eor(v15, __ T8B, v15, v27);
3937     __ eor(v16, __ T8B, v16, v28);
3938     __ b(rounds24_loop);
3939 
3940     __ BIND(sha3_384_or_224);
3941     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3942 
3943     // SHA3-224
3944     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3945     __ ld1(v29, __ T8B, __ post(buf, 8));
3946     __ eor(v13, __ T8B, v13, v25);
3947     __ eor(v14, __ T8B, v14, v26);
3948     __ eor(v15, __ T8B, v15, v27);
3949     __ eor(v16, __ T8B, v16, v28);
3950     __ eor(v17, __ T8B, v17, v29);
3951     __ b(rounds24_loop);
3952 
3953     __ BIND(sha3_512);
3954     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3955     __ eor(v7, __ T8B, v7, v25);
3956     __ eor(v8, __ T8B, v8, v26);
3957 
3958     __ BIND(rounds24_loop);
3959     __ subw(rscratch2, rscratch2, 1);
3960 
3961     __ eor3(v29, __ T16B, v4, v9, v14);
3962     __ eor3(v26, __ T16B, v1, v6, v11);
3963     __ eor3(v28, __ T16B, v3, v8, v13);
3964     __ eor3(v25, __ T16B, v0, v5, v10);
3965     __ eor3(v27, __ T16B, v2, v7, v12);
3966     __ eor3(v29, __ T16B, v29, v19, v24);
3967     __ eor3(v26, __ T16B, v26, v16, v21);
3968     __ eor3(v28, __ T16B, v28, v18, v23);
3969     __ eor3(v25, __ T16B, v25, v15, v20);
3970     __ eor3(v27, __ T16B, v27, v17, v22);
3971 
3972     __ rax1(v30, __ T2D, v29, v26);
3973     __ rax1(v26, __ T2D, v26, v28);
3974     __ rax1(v28, __ T2D, v28, v25);
3975     __ rax1(v25, __ T2D, v25, v27);
3976     __ rax1(v27, __ T2D, v27, v29);
3977 
3978     __ eor(v0, __ T16B, v0, v30);
3979     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3980     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3981     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3982     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3983     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3984     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3985     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3986     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3987     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3988     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3989     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3990     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3991     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3992     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3993     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3994     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3995     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3996     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3997     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3998     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3999     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4000     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4001     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4002     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4003 
4004     __ bcax(v20, __ T16B, v31, v22, v8);
4005     __ bcax(v21, __ T16B, v8,  v23, v22);
4006     __ bcax(v22, __ T16B, v22, v24, v23);
4007     __ bcax(v23, __ T16B, v23, v31, v24);
4008     __ bcax(v24, __ T16B, v24, v8,  v31);
4009 
4010     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4011 
4012     __ bcax(v17, __ T16B, v25, v19, v3);
4013     __ bcax(v18, __ T16B, v3,  v15, v19);
4014     __ bcax(v19, __ T16B, v19, v16, v15);
4015     __ bcax(v15, __ T16B, v15, v25, v16);
4016     __ bcax(v16, __ T16B, v16, v3,  v25);
4017 
4018     __ bcax(v10, __ T16B, v29, v12, v26);
4019     __ bcax(v11, __ T16B, v26, v13, v12);
4020     __ bcax(v12, __ T16B, v12, v14, v13);
4021     __ bcax(v13, __ T16B, v13, v29, v14);
4022     __ bcax(v14, __ T16B, v14, v26, v29);
4023 
4024     __ bcax(v7, __ T16B, v30, v9,  v4);
4025     __ bcax(v8, __ T16B, v4,  v5,  v9);
4026     __ bcax(v9, __ T16B, v9,  v6,  v5);
4027     __ bcax(v5, __ T16B, v5,  v30, v6);
4028     __ bcax(v6, __ T16B, v6,  v4,  v30);
4029 
4030     __ bcax(v3, __ T16B, v27, v0,  v28);
4031     __ bcax(v4, __ T16B, v28, v1,  v0);
4032     __ bcax(v0, __ T16B, v0,  v2,  v1);
4033     __ bcax(v1, __ T16B, v1,  v27, v2);
4034     __ bcax(v2, __ T16B, v2,  v28, v27);
4035 
4036     __ eor(v0, __ T16B, v0, v31);
4037 
4038     __ cbnzw(rscratch2, rounds24_loop);
4039 
4040     if (multi_block) {
4041       // block_size =  200 - 2 * digest_length, ofs += block_size
4042       __ add(ofs, ofs, 200);
4043       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
4044 
4045       __ cmp(ofs, limit);
4046       __ br(Assembler::LE, sha3_loop);
4047       __ mov(c_rarg0, ofs); // return ofs
4048     }
4049 
4050     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4051     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4052     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4053     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4054     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4055     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4056     __ st1(v24, __ T1D, state);
4057 
4058     __ ldpd(v14, v15, Address(sp, 48));
4059     __ ldpd(v12, v13, Address(sp, 32));
4060     __ ldpd(v10, v11, Address(sp, 16));
4061     __ ldpd(v8, v9, __ post(sp, 64));
4062 
4063     __ ret(lr);
4064 
4065     return start;
4066   }
4067 
4068   /**
4069    *  Arguments:
4070    *
4071    * Inputs:
4072    *   c_rarg0   - int crc
4073    *   c_rarg1   - byte* buf
4074    *   c_rarg2   - int length
4075    *
4076    * Output:
4077    *       r0   - int crc result
4078    */
4079   address generate_updateBytesCRC32() {
4080     assert(UseCRC32Intrinsics, "what are we doing here?");
4081 
4082     __ align(CodeEntryAlignment);
4083     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4084 
4085     address start = __ pc();
4086 
4087     const Register crc   = c_rarg0;  // crc
4088     const Register buf   = c_rarg1;  // source java byte array address
4089     const Register len   = c_rarg2;  // length
4090     const Register table0 = c_rarg3; // crc_table address
4091     const Register table1 = c_rarg4;
4092     const Register table2 = c_rarg5;
4093     const Register table3 = c_rarg6;
4094     const Register tmp3 = c_rarg7;
4095 
4096     BLOCK_COMMENT("Entry:");
4097     __ enter(); // required for proper stackwalking of RuntimeStub frame
4098 
4099     __ kernel_crc32(crc, buf, len,
4100               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4101 
4102     __ leave(); // required for proper stackwalking of RuntimeStub frame
4103     __ ret(lr);
4104 
4105     return start;
4106   }
4107 
4108   /**
4109    *  Arguments:
4110    *
4111    * Inputs:
4112    *   c_rarg0   - int crc
4113    *   c_rarg1   - byte* buf
4114    *   c_rarg2   - int length
4115    *   c_rarg3   - int* table
4116    *
4117    * Output:
4118    *       r0   - int crc result
4119    */
4120   address generate_updateBytesCRC32C() {
4121     assert(UseCRC32CIntrinsics, "what are we doing here?");
4122 
4123     __ align(CodeEntryAlignment);
4124     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4125 
4126     address start = __ pc();
4127 
4128     const Register crc   = c_rarg0;  // crc
4129     const Register buf   = c_rarg1;  // source java byte array address
4130     const Register len   = c_rarg2;  // length
4131     const Register table0 = c_rarg3; // crc_table address
4132     const Register table1 = c_rarg4;
4133     const Register table2 = c_rarg5;
4134     const Register table3 = c_rarg6;
4135     const Register tmp3 = c_rarg7;
4136 
4137     BLOCK_COMMENT("Entry:");
4138     __ enter(); // required for proper stackwalking of RuntimeStub frame
4139 
4140     __ kernel_crc32c(crc, buf, len,
4141               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4142 
4143     __ leave(); // required for proper stackwalking of RuntimeStub frame
4144     __ ret(lr);
4145 
4146     return start;
4147   }
4148 
4149   /**
4150    *  Arguments:
4151    *
4152    *  Inputs:
4153    *   c_rarg0   - int   adler
4154    *   c_rarg1   - byte* buff
4155    *   c_rarg2   - int   len
4156    *
4157    * Output:
4158    *   c_rarg0   - int adler result
4159    */
4160   address generate_updateBytesAdler32() {
4161     __ align(CodeEntryAlignment);
4162     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4163     address start = __ pc();
4164 
4165     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4166 
4167     // Aliases
4168     Register adler  = c_rarg0;
4169     Register s1     = c_rarg0;
4170     Register s2     = c_rarg3;
4171     Register buff   = c_rarg1;
4172     Register len    = c_rarg2;
4173     Register nmax  = r4;
4174     Register base  = r5;
4175     Register count = r6;
4176     Register temp0 = rscratch1;
4177     Register temp1 = rscratch2;
4178     FloatRegister vbytes = v0;
4179     FloatRegister vs1acc = v1;
4180     FloatRegister vs2acc = v2;
4181     FloatRegister vtable = v3;
4182 
4183     // Max number of bytes we can process before having to take the mod
4184     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4185     uint64_t BASE = 0xfff1;
4186     uint64_t NMAX = 0x15B0;
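         // The modular reductions below rely on 2^16 == 15 (mod BASE): x mod BASE
         // is reduced by folding, x -> (x & 0xffff) + 15 * (x >> 16), applied once
         // or twice as needed (15 * t is computed as (t << 4) - t), followed by a
         // final conditional subtract of BASE.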
4187 
4188     __ mov(base, BASE);
4189     __ mov(nmax, NMAX);
4190 
4191     // Load accumulation coefficients for the upper 16 bits
4192     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4193     __ ld1(vtable, __ T16B, Address(temp0));
4194 
4195     // s1 is initialized to the lower 16 bits of adler
4196     // s2 is initialized to the upper 16 bits of adler
4197     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4198     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4199 
4200     // The pipelined loop needs at least 16 elements for one iteration;
4201     // it would check this itself, but branching short inputs straight to the cleanup loop is cheaper
4202     __ cmp(len, (u1)16);
4203     __ br(Assembler::HS, L_nmax);
4204     __ cbz(len, L_combine);
4205 
4206     __ bind(L_simple_by1_loop);
4207     __ ldrb(temp0, Address(__ post(buff, 1)));
4208     __ add(s1, s1, temp0);
4209     __ add(s2, s2, s1);
4210     __ subs(len, len, 1);
4211     __ br(Assembler::HI, L_simple_by1_loop);
4212 
4213     // s1 = s1 % BASE
4214     __ subs(temp0, s1, base);
4215     __ csel(s1, temp0, s1, Assembler::HS);
4216 
4217     // s2 = s2 % BASE
4218     __ lsr(temp0, s2, 16);
4219     __ lsl(temp1, temp0, 4);
4220     __ sub(temp1, temp1, temp0);
4221     __ add(s2, temp1, s2, ext::uxth);
4222 
4223     __ subs(temp0, s2, base);
4224     __ csel(s2, temp0, s2, Assembler::HS);
4225 
4226     __ b(L_combine);
4227 
4228     __ bind(L_nmax);
4229     __ subs(len, len, nmax);
4230     __ sub(count, nmax, 16);
4231     __ br(Assembler::LO, L_by16);
4232 
4233     __ bind(L_nmax_loop);
4234 
4235     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4236                                       vbytes, vs1acc, vs2acc, vtable);
4237 
4238     __ subs(count, count, 16);
4239     __ br(Assembler::HS, L_nmax_loop);
4240 
4241     // s1 = s1 % BASE
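    // After an NMAX-byte block both sums are well above 2^16 (s2 can approach
    // 2^32), so the 2^16 folding step is applied twice before the final
    // conditional subtraction.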
4242     __ lsr(temp0, s1, 16);
4243     __ lsl(temp1, temp0, 4);
4244     __ sub(temp1, temp1, temp0);
4245     __ add(temp1, temp1, s1, ext::uxth);
4246 
4247     __ lsr(temp0, temp1, 16);
4248     __ lsl(s1, temp0, 4);
4249     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4251 
4252     __ subs(temp0, s1, base);
4253     __ csel(s1, temp0, s1, Assembler::HS);
4254 
4255     // s2 = s2 % BASE
4256     __ lsr(temp0, s2, 16);
4257     __ lsl(temp1, temp0, 4);
4258     __ sub(temp1, temp1, temp0);
4259     __ add(temp1, temp1, s2, ext::uxth);
4260 
4261     __ lsr(temp0, temp1, 16);
4262     __ lsl(s2, temp0, 4);
4263     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4265 
4266     __ subs(temp0, s2, base);
4267     __ csel(s2, temp0, s2, Assembler::HS);
4268 
4269     __ subs(len, len, nmax);
4270     __ sub(count, nmax, 16);
4271     __ br(Assembler::HS, L_nmax_loop);
4272 
4273     __ bind(L_by16);
4274     __ adds(len, len, count);
4275     __ br(Assembler::LO, L_by1);
4276 
4277     __ bind(L_by16_loop);
4278 
4279     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4280                                       vbytes, vs1acc, vs2acc, vtable);
4281 
4282     __ subs(len, len, 16);
4283     __ br(Assembler::HS, L_by16_loop);
4284 
4285     __ bind(L_by1);
4286     __ adds(len, len, 15);
4287     __ br(Assembler::LO, L_do_mod);
4288 
4289     __ bind(L_by1_loop);
4290     __ ldrb(temp0, Address(__ post(buff, 1)));
4291     __ add(s1, temp0, s1);
4292     __ add(s2, s2, s1);
4293     __ subs(len, len, 1);
4294     __ br(Assembler::HS, L_by1_loop);
4295 
4296     __ bind(L_do_mod);
4297     // s1 = s1 % BASE
4298     __ lsr(temp0, s1, 16);
4299     __ lsl(temp1, temp0, 4);
4300     __ sub(temp1, temp1, temp0);
4301     __ add(temp1, temp1, s1, ext::uxth);
4302 
4303     __ lsr(temp0, temp1, 16);
4304     __ lsl(s1, temp0, 4);
4305     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4307 
4308     __ subs(temp0, s1, base);
4309     __ csel(s1, temp0, s1, Assembler::HS);
4310 
4311     // s2 = s2 % BASE
4312     __ lsr(temp0, s2, 16);
4313     __ lsl(temp1, temp0, 4);
4314     __ sub(temp1, temp1, temp0);
4315     __ add(temp1, temp1, s2, ext::uxth);
4316 
4317     __ lsr(temp0, temp1, 16);
4318     __ lsl(s2, temp0, 4);
4319     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4321 
4322     __ subs(temp0, s2, base);
4323     __ csel(s2, temp0, s2, Assembler::HS);
4324 
4325     // Combine lower bits and higher bits
4326     __ bind(L_combine);
4327     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4328 
4329     __ ret(lr);
4330 
4331     return start;
4332   }
4333 
4334   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4335           Register temp0, Register temp1, FloatRegister vbytes,
4336           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4337     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4338     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4339     // In non-vectorized code, we update s1 and s2 as:
4340     //   s1 <- s1 + b1
4341     //   s2 <- s2 + s1
4342     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
4344     //   ...
4345     //   s1 <- s1 + b16
4346     //   s2 <- s2 + s1
4347     // Putting above assignments together, we have:
4348     //   s1_new = s1 + b1 + b2 + ... + b16
4349     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4350     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4351     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4352     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4353 
4354     // s2 = s2 + s1 * 16
4355     __ add(s2, s2, s1, Assembler::LSL, 4);
4356 
4357     // vs1acc = b1 + b2 + b3 + ... + b16
4358     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
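    // vtable is expected to hold the per-byte weights (16, 15, ..., 1) loaded
    // from _adler_table above; the umullv/umlalv pair forms the weighted byte
    // products for the two 8-byte halves, and the uaddlv reductions below sum
    // the byte and product lanes into scalars.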
4359     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4360     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4361     __ uaddlv(vs1acc, __ T16B, vbytes);
4362     __ uaddlv(vs2acc, __ T8H, vs2acc);
4363 
4364     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4365     __ fmovd(temp0, vs1acc);
4366     __ fmovd(temp1, vs2acc);
4367     __ add(s1, s1, temp0);
4368     __ add(s2, s2, temp1);
4369   }
4370 
4371   /**
4372    *  Arguments:
4373    *
4374    *  Input:
4375    *    c_rarg0   - x address
4376    *    c_rarg1   - x length
4377    *    c_rarg2   - y address
4378    *    c_rarg3   - y length
4379    *    c_rarg4   - z address
4380    *    c_rarg5   - z length
4381    */
4382   address generate_multiplyToLen() {
4383     __ align(CodeEntryAlignment);
4384     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4385 
4386     address start = __ pc();
4387     const Register x     = r0;
4388     const Register xlen  = r1;
4389     const Register y     = r2;
4390     const Register ylen  = r3;
4391     const Register z     = r4;
4392     const Register zlen  = r5;
4393 
4394     const Register tmp1  = r10;
4395     const Register tmp2  = r11;
4396     const Register tmp3  = r12;
4397     const Register tmp4  = r13;
4398     const Register tmp5  = r14;
4399     const Register tmp6  = r15;
4400     const Register tmp7  = r16;
4401 
4402     BLOCK_COMMENT("Entry:");
4403     __ enter(); // required for proper stackwalking of RuntimeStub frame
4404     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4405     __ leave(); // required for proper stackwalking of RuntimeStub frame
4406     __ ret(lr);
4407 
4408     return start;
4409   }
4410 
4411   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in Java code is
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better results overall.
4415     __ align(CodeEntryAlignment);
4416     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4417     address start = __ pc();
4418 
4419     const Register x     = r0;
4420     const Register xlen  = r1;
4421     const Register z     = r2;
4422     const Register zlen  = r3;
4423     const Register y     = r4; // == x
4424     const Register ylen  = r5; // == xlen
4425 
4426     const Register tmp1  = r10;
4427     const Register tmp2  = r11;
4428     const Register tmp3  = r12;
4429     const Register tmp4  = r13;
4430     const Register tmp5  = r14;
4431     const Register tmp6  = r15;
4432     const Register tmp7  = r16;
4433 
4434     RegSet spilled_regs = RegSet::of(y, ylen);
4435     BLOCK_COMMENT("Entry:");
4436     __ enter();
4437     __ push(spilled_regs, sp);
4438     __ mov(y, x);
4439     __ mov(ylen, xlen);
4440     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4441     __ pop(spilled_regs, sp);
4442     __ leave();
4443     __ ret(lr);
4444     return start;
4445   }
4446 
4447   address generate_mulAdd() {
4448     __ align(CodeEntryAlignment);
4449     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4450 
4451     address start = __ pc();
4452 
4453     const Register out     = r0;
4454     const Register in      = r1;
4455     const Register offset  = r2;
4456     const Register len     = r3;
4457     const Register k       = r4;
4458 
4459     BLOCK_COMMENT("Entry:");
4460     __ enter();
4461     __ mul_add(out, in, offset, len, k);
4462     __ leave();
4463     __ ret(lr);
4464 
4465     return start;
4466   }
4467 
4468   // Arguments:
4469   //
4470   // Input:
4471   //   c_rarg0   - newArr address
4472   //   c_rarg1   - oldArr address
4473   //   c_rarg2   - newIdx
4474   //   c_rarg3   - shiftCount
4475   //   c_rarg4   - numIter
4476   //
4477   address generate_bigIntegerRightShift() {
4478     __ align(CodeEntryAlignment);
4479     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4480     address start = __ pc();
4481 
4482     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4483 
4484     Register newArr        = c_rarg0;
4485     Register oldArr        = c_rarg1;
4486     Register newIdx        = c_rarg2;
4487     Register shiftCount    = c_rarg3;
4488     Register numIter       = c_rarg4;
4489     Register idx           = numIter;
4490 
4491     Register newArrCur     = rscratch1;
4492     Register shiftRevCount = rscratch2;
4493     Register oldArrCur     = r13;
4494     Register oldArrNext    = r14;
4495 
4496     FloatRegister oldElem0        = v0;
4497     FloatRegister oldElem1        = v1;
4498     FloatRegister newElem         = v2;
4499     FloatRegister shiftVCount     = v3;
4500     FloatRegister shiftVRevCount  = v4;
4501 
4502     __ cbz(idx, Exit);
4503 
4504     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4505 
4506     // left shift count
4507     __ movw(shiftRevCount, 32);
4508     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4509 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4511     __ cmp(numIter, (u1)4);
4512     __ br(Assembler::LT, ShiftThree);
4513 
4514     __ dup(shiftVCount,    __ T4S, shiftCount);
4515     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4516     __ negr(shiftVCount,   __ T4S, shiftVCount);
4517 
4518     __ BIND(ShiftSIMDLoop);
4519 
4520     // Calculate the load addresses
4521     __ sub(idx, idx, 4);
4522     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4523     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4524     __ add(oldArrCur,  oldArrNext, 4);
4525 
4526     // Load 4 words and process
4527     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4528     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4529     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4530     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4531     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4532     __ st1(newElem,   __ T4S,  Address(newArrCur));
4533 
4534     __ cmp(idx, (u1)4);
4535     __ br(Assembler::LT, ShiftTwoLoop);
4536     __ b(ShiftSIMDLoop);
4537 
4538     __ BIND(ShiftTwoLoop);
4539     __ cbz(idx, Exit);
4540     __ cmp(idx, (u1)1);
4541     __ br(Assembler::EQ, ShiftOne);
4542 
4543     // Calculate the load addresses
4544     __ sub(idx, idx, 2);
4545     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4546     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4547     __ add(oldArrCur,  oldArrNext, 4);
4548 
4549     // Load 2 words and process
4550     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4551     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4552     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4553     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4554     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4555     __ st1(newElem,   __ T2S, Address(newArrCur));
4556     __ b(ShiftTwoLoop);
4557 
4558     __ BIND(ShiftThree);
4559     __ tbz(idx, 1, ShiftOne);
4560     __ tbz(idx, 0, ShiftTwo);
4561     __ ldrw(r10,  Address(oldArr, 12));
4562     __ ldrw(r11,  Address(oldArr, 8));
4563     __ lsrvw(r10, r10, shiftCount);
4564     __ lslvw(r11, r11, shiftRevCount);
4565     __ orrw(r12,  r10, r11);
4566     __ strw(r12,  Address(newArr, 8));
4567 
4568     __ BIND(ShiftTwo);
4569     __ ldrw(r10,  Address(oldArr, 8));
4570     __ ldrw(r11,  Address(oldArr, 4));
4571     __ lsrvw(r10, r10, shiftCount);
4572     __ lslvw(r11, r11, shiftRevCount);
4573     __ orrw(r12,  r10, r11);
4574     __ strw(r12,  Address(newArr, 4));
4575 
4576     __ BIND(ShiftOne);
4577     __ ldrw(r10,  Address(oldArr, 4));
4578     __ ldrw(r11,  Address(oldArr));
4579     __ lsrvw(r10, r10, shiftCount);
4580     __ lslvw(r11, r11, shiftRevCount);
4581     __ orrw(r12,  r10, r11);
4582     __ strw(r12,  Address(newArr));
4583 
4584     __ BIND(Exit);
4585     __ ret(lr);
4586 
4587     return start;
4588   }
4589 
4590   // Arguments:
4591   //
4592   // Input:
4593   //   c_rarg0   - newArr address
4594   //   c_rarg1   - oldArr address
4595   //   c_rarg2   - newIdx
4596   //   c_rarg3   - shiftCount
4597   //   c_rarg4   - numIter
4598   //
4599   address generate_bigIntegerLeftShift() {
4600     __ align(CodeEntryAlignment);
4601     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4602     address start = __ pc();
4603 
4604     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4605 
4606     Register newArr        = c_rarg0;
4607     Register oldArr        = c_rarg1;
4608     Register newIdx        = c_rarg2;
4609     Register shiftCount    = c_rarg3;
4610     Register numIter       = c_rarg4;
4611 
4612     Register shiftRevCount = rscratch1;
4613     Register oldArrNext    = rscratch2;
4614 
4615     FloatRegister oldElem0        = v0;
4616     FloatRegister oldElem1        = v1;
4617     FloatRegister newElem         = v2;
4618     FloatRegister shiftVCount     = v3;
4619     FloatRegister shiftVRevCount  = v4;
4620 
4621     __ cbz(numIter, Exit);
4622 
4623     __ add(oldArrNext, oldArr, 4);
4624     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4625 
4626     // right shift count
4627     __ movw(shiftRevCount, 32);
4628     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4629 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4631     __ cmp(numIter, (u1)4);
4632     __ br(Assembler::LT, ShiftThree);
4633 
4634     __ dup(shiftVCount,     __ T4S, shiftCount);
4635     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4636     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4637 
4638     __ BIND(ShiftSIMDLoop);
4639 
4640     // load 4 words and process
4641     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4642     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4643     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4644     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4645     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4646     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4647     __ sub(numIter,   numIter, 4);
4648 
4649     __ cmp(numIter, (u1)4);
4650     __ br(Assembler::LT, ShiftTwoLoop);
4651     __ b(ShiftSIMDLoop);
4652 
4653     __ BIND(ShiftTwoLoop);
4654     __ cbz(numIter, Exit);
4655     __ cmp(numIter, (u1)1);
4656     __ br(Assembler::EQ, ShiftOne);
4657 
4658     // load 2 words and process
4659     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4660     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4661     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4662     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4663     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4664     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4665     __ sub(numIter,   numIter, 2);
4666     __ b(ShiftTwoLoop);
4667 
4668     __ BIND(ShiftThree);
4669     __ ldrw(r10,  __ post(oldArr, 4));
4670     __ ldrw(r11,  __ post(oldArrNext, 4));
4671     __ lslvw(r10, r10, shiftCount);
4672     __ lsrvw(r11, r11, shiftRevCount);
4673     __ orrw(r12,  r10, r11);
4674     __ strw(r12,  __ post(newArr, 4));
4675     __ tbz(numIter, 1, Exit);
4676     __ tbz(numIter, 0, ShiftOne);
4677 
4678     __ BIND(ShiftTwo);
4679     __ ldrw(r10,  __ post(oldArr, 4));
4680     __ ldrw(r11,  __ post(oldArrNext, 4));
4681     __ lslvw(r10, r10, shiftCount);
4682     __ lsrvw(r11, r11, shiftRevCount);
4683     __ orrw(r12,  r10, r11);
4684     __ strw(r12,  __ post(newArr, 4));
4685 
4686     __ BIND(ShiftOne);
4687     __ ldrw(r10,  Address(oldArr));
4688     __ ldrw(r11,  Address(oldArrNext));
4689     __ lslvw(r10, r10, shiftCount);
4690     __ lsrvw(r11, r11, shiftRevCount);
4691     __ orrw(r12,  r10, r11);
4692     __ strw(r12,  Address(newArr));
4693 
4694     __ BIND(Exit);
4695     __ ret(lr);
4696 
4697     return start;
4698   }
4699 
4700   address generate_count_positives(address &count_positives_long) {
4701     const u1 large_loop_size = 64;
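    // Each byte of UPPER_BIT_MASK selects the sign bit of the corresponding
    // loaded byte, so "tst x, UPPER_BIT_MASK" is non-zero iff any byte is
    // negative.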
4702     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4703     int dcache_line = VM_Version::dcache_line_size();
4704 
4705     Register ary1 = r1, len = r2, result = r0;
4706 
4707     __ align(CodeEntryAlignment);
4708 
4709     StubCodeMark mark(this, "StubRoutines", "count_positives");
4710 
4711     address entry = __ pc();
4712 
4713     __ enter();
4714     // precondition: a copy of len is already in result
4715     // __ mov(result, len);
4716 
4717   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4718         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4719 
4720   __ cmp(len, (u1)15);
4721   __ br(Assembler::GT, LEN_OVER_15);
  // Execution only falls into this code when the pointer is near the end of a
  // memory page and we must avoid reading into the next page.
4724   __ add(ary1, ary1, len);
4725   __ subs(len, len, 8);
4726   __ br(Assembler::GT, LEN_OVER_8);
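  // len now holds (original length - 8) <= 0 and ary1 points just past the
  // array, so load the 8 bytes ending at the array end and shift right by
  // (8 - original length) * 8 bits to drop the bytes that precede the array
  // before the sign-bit test.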
4727   __ ldr(rscratch2, Address(ary1, -8));
4728   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4729   __ lsrv(rscratch2, rscratch2, rscratch1);
4730   __ tst(rscratch2, UPPER_BIT_MASK);
4731   __ csel(result, zr, result, Assembler::NE);
4732   __ leave();
4733   __ ret(lr);
4734   __ bind(LEN_OVER_8);
4735   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4737   __ tst(rscratch2, UPPER_BIT_MASK);
4738   __ br(Assembler::NE, RET_NO_POP);
4739   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4740   __ lsrv(rscratch1, rscratch1, rscratch2);
4741   __ tst(rscratch1, UPPER_BIT_MASK);
4742   __ bind(RET_NO_POP);
4743   __ csel(result, zr, result, Assembler::NE);
4744   __ leave();
4745   __ ret(lr);
4746 
4747   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4748   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4749 
4750   count_positives_long = __ pc(); // 2nd entry point
4751 
4752   __ enter();
4753 
4754   __ bind(LEN_OVER_15);
4755     __ push(spilled_regs, sp);
4756     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4757     __ cbz(rscratch2, ALIGNED);
4758     __ ldp(tmp6, tmp1, Address(ary1));
4759     __ mov(tmp5, 16);
4760     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4761     __ add(ary1, ary1, rscratch1);
4762     __ orr(tmp6, tmp6, tmp1);
4763     __ tst(tmp6, UPPER_BIT_MASK);
4764     __ br(Assembler::NE, RET_ADJUST);
4765     __ sub(len, len, rscratch1);
4766 
4767   __ bind(ALIGNED);
4768     __ cmp(len, large_loop_size);
4769     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case where an initially aligned large array has negative values in its
    // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in
    // the worst case, which is slower. Cases with negative bytes further ahead
    // are not affected much; in fact they become faster due to early loads,
    // fewer instructions and fewer branches in LARGE_LOOP.
4776     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4777     __ sub(len, len, 16);
4778     __ orr(tmp6, tmp6, tmp1);
4779     __ tst(tmp6, UPPER_BIT_MASK);
4780     __ br(Assembler::NE, RET_ADJUST_16);
4781     __ cmp(len, large_loop_size);
4782     __ br(Assembler::LT, CHECK_16);
4783 
4784     if (SoftwarePrefetchHintDistance >= 0
4785         && SoftwarePrefetchHintDistance >= dcache_line) {
4786       // initial prefetch
4787       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4788     }
4789   __ bind(LARGE_LOOP);
4790     if (SoftwarePrefetchHintDistance >= 0) {
4791       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4792     }
    // Issue the load instructions first, since this can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
    // saves 3 instructions per iteration and has fewer branches. The downside
    // is that early return is disabled, so all 64 bytes are loaded and checked
    // every time.
4798     __ ldp(tmp2, tmp3, Address(ary1));
4799     __ ldp(tmp4, tmp5, Address(ary1, 16));
4800     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4801     __ ldp(tmp6, tmp1, Address(ary1, 48));
4802     __ add(ary1, ary1, large_loop_size);
4803     __ sub(len, len, large_loop_size);
4804     __ orr(tmp2, tmp2, tmp3);
4805     __ orr(tmp4, tmp4, tmp5);
4806     __ orr(rscratch1, rscratch1, rscratch2);
4807     __ orr(tmp6, tmp6, tmp1);
4808     __ orr(tmp2, tmp2, tmp4);
4809     __ orr(rscratch1, rscratch1, tmp6);
4810     __ orr(tmp2, tmp2, rscratch1);
4811     __ tst(tmp2, UPPER_BIT_MASK);
4812     __ br(Assembler::NE, RET_ADJUST_LONG);
4813     __ cmp(len, large_loop_size);
4814     __ br(Assembler::GE, LARGE_LOOP);
4815 
4816   __ bind(CHECK_16); // small 16-byte load pre-loop
4817     __ cmp(len, (u1)16);
4818     __ br(Assembler::LT, POST_LOOP16);
4819 
4820   __ bind(LOOP16); // small 16-byte load loop
4821     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4822     __ sub(len, len, 16);
4823     __ orr(tmp2, tmp2, tmp3);
4824     __ tst(tmp2, UPPER_BIT_MASK);
4825     __ br(Assembler::NE, RET_ADJUST_16);
4826     __ cmp(len, (u1)16);
4827     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4828 
4829   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4830     __ cmp(len, (u1)8);
4831     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4832     __ ldr(tmp3, Address(__ post(ary1, 8)));
4833     __ tst(tmp3, UPPER_BIT_MASK);
4834     __ br(Assembler::NE, RET_ADJUST);
4835     __ sub(len, len, 8);
4836 
4837   __ bind(POST_LOOP16_LOAD_TAIL);
4838     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
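    // Only the low "len" bytes of the 8-byte load below belong to the array;
    // shifting left by (8 - len) * 8 bits discards the bytes read past the
    // end before the sign-bit test.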
4839     __ ldr(tmp1, Address(ary1));
4840     __ mov(tmp2, 64);
4841     __ sub(tmp4, tmp2, len, __ LSL, 3);
4842     __ lslv(tmp1, tmp1, tmp4);
4843     __ tst(tmp1, UPPER_BIT_MASK);
4844     __ br(Assembler::NE, RET_ADJUST);
4845     // Fallthrough
4846 
4847   __ bind(RET_LEN);
4848     __ pop(spilled_regs, sp);
4849     __ leave();
4850     __ ret(lr);
4851 
    // The difference result - len is the count of bytes guaranteed to be
    // positive.
4854 
4855   __ bind(RET_ADJUST_LONG);
4856     __ add(len, len, (u1)(large_loop_size - 16));
4857   __ bind(RET_ADJUST_16);
4858     __ add(len, len, 16);
4859   __ bind(RET_ADJUST);
4860     __ pop(spilled_regs, sp);
4861     __ leave();
4862     __ sub(result, result, len);
4863     __ ret(lr);
4864 
4865     return entry;
4866   }
4867 
4868   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4869         bool usePrefetch, Label &NOT_EQUAL) {
4870     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4871         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4872         tmp7 = r12, tmp8 = r13;
4873     Label LOOP;
4874 
4875     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4876     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4877     __ bind(LOOP);
4878     if (usePrefetch) {
4879       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4880       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4881     }
4882     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4883     __ eor(tmp1, tmp1, tmp2);
4884     __ eor(tmp3, tmp3, tmp4);
4885     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4886     __ orr(tmp1, tmp1, tmp3);
4887     __ cbnz(tmp1, NOT_EQUAL);
4888     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4889     __ eor(tmp5, tmp5, tmp6);
4890     __ eor(tmp7, tmp7, tmp8);
4891     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4892     __ orr(tmp5, tmp5, tmp7);
4893     __ cbnz(tmp5, NOT_EQUAL);
4894     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4895     __ eor(tmp1, tmp1, tmp2);
4896     __ eor(tmp3, tmp3, tmp4);
4897     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4898     __ orr(tmp1, tmp1, tmp3);
4899     __ cbnz(tmp1, NOT_EQUAL);
4900     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4901     __ eor(tmp5, tmp5, tmp6);
4902     __ sub(cnt1, cnt1, 8 * wordSize);
4903     __ eor(tmp7, tmp7, tmp8);
4904     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
4907     __ subs(tmp6, cnt1, loopThreshold);
4908     __ orr(tmp5, tmp5, tmp7);
4909     __ cbnz(tmp5, NOT_EQUAL);
4910     __ br(__ GE, LOOP);
4911     // post-loop
4912     __ eor(tmp1, tmp1, tmp2);
4913     __ eor(tmp3, tmp3, tmp4);
4914     __ orr(tmp1, tmp1, tmp3);
4915     __ sub(cnt1, cnt1, 2 * wordSize);
4916     __ cbnz(tmp1, NOT_EQUAL);
4917   }
4918 
4919   void generate_large_array_equals_loop_simd(int loopThreshold,
4920         bool usePrefetch, Label &NOT_EQUAL) {
4921     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4922         tmp2 = rscratch2;
4923     Label LOOP;
4924 
4925     __ bind(LOOP);
4926     if (usePrefetch) {
4927       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4928       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4929     }
4930     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4931     __ sub(cnt1, cnt1, 8 * wordSize);
4932     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4933     __ subs(tmp1, cnt1, loopThreshold);
4934     __ eor(v0, __ T16B, v0, v4);
4935     __ eor(v1, __ T16B, v1, v5);
4936     __ eor(v2, __ T16B, v2, v6);
4937     __ eor(v3, __ T16B, v3, v7);
4938     __ orr(v0, __ T16B, v0, v1);
4939     __ orr(v1, __ T16B, v2, v3);
4940     __ orr(v0, __ T16B, v0, v1);
4941     __ umov(tmp1, v0, __ D, 0);
4942     __ umov(tmp2, v0, __ D, 1);
4943     __ orr(tmp1, tmp1, tmp2);
4944     __ cbnz(tmp1, NOT_EQUAL);
4945     __ br(__ GE, LOOP);
4946   }
4947 
4948   // a1 = r1 - array1 address
4949   // a2 = r2 - array2 address
4950   // result = r0 - return value. Already contains "false"
4951   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4952   // r3-r5 are reserved temporary registers
4953   address generate_large_array_equals() {
4954     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4955         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4956         tmp7 = r12, tmp8 = r13;
4957     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4958         SMALL_LOOP, POST_LOOP;
4959     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4960     // calculate if at least 32 prefetched bytes are used
4961     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4962     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4963     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4964     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4965         tmp5, tmp6, tmp7, tmp8);
4966 
4967     __ align(CodeEntryAlignment);
4968 
4969     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4970 
4971     address entry = __ pc();
4972     __ enter();
4973     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4974     // also advance pointers to use post-increment instead of pre-increment
4975     __ add(a1, a1, wordSize);
4976     __ add(a2, a2, wordSize);
4977     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, do an additional
      // 8-byte load to make at least the first address 16-byte aligned.
4983       Label ALIGNED16;
4984       __ tbz(a1, 3, ALIGNED16);
4985       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4986       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4987       __ sub(cnt1, cnt1, wordSize);
4988       __ eor(tmp1, tmp1, tmp2);
4989       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4990       __ bind(ALIGNED16);
4991     }
4992     if (UseSIMDForArrayEquals) {
4993       if (SoftwarePrefetchHintDistance >= 0) {
4994         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4995         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4996         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4997             /* prfm = */ true, NOT_EQUAL);
4998         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4999         __ br(__ LT, TAIL);
5000       }
5001       __ bind(NO_PREFETCH_LARGE_LOOP);
5002       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5003           /* prfm = */ false, NOT_EQUAL);
5004     } else {
5005       __ push(spilled_regs, sp);
5006       if (SoftwarePrefetchHintDistance >= 0) {
5007         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5008         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5009         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5010             /* prfm = */ true, NOT_EQUAL);
5011         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5012         __ br(__ LT, TAIL);
5013       }
5014       __ bind(NO_PREFETCH_LARGE_LOOP);
5015       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5016           /* prfm = */ false, NOT_EQUAL);
5017     }
5018     __ bind(TAIL);
5019       __ cbz(cnt1, EQUAL);
5020       __ subs(cnt1, cnt1, wordSize);
5021       __ br(__ LE, POST_LOOP);
5022     __ bind(SMALL_LOOP);
5023       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5024       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5025       __ subs(cnt1, cnt1, wordSize);
5026       __ eor(tmp1, tmp1, tmp2);
5027       __ cbnz(tmp1, NOT_EQUAL);
5028       __ br(__ GT, SMALL_LOOP);
5029     __ bind(POST_LOOP);
5030       __ ldr(tmp1, Address(a1, cnt1));
5031       __ ldr(tmp2, Address(a2, cnt1));
5032       __ eor(tmp1, tmp1, tmp2);
5033       __ cbnz(tmp1, NOT_EQUAL);
5034     __ bind(EQUAL);
5035       __ mov(result, true);
5036     __ bind(NOT_EQUAL);
5037       if (!UseSIMDForArrayEquals) {
5038         __ pop(spilled_regs, sp);
5039       }
5040     __ bind(NOT_EQUAL_NO_POP);
5041     __ leave();
5042     __ ret(lr);
5043     return entry;
5044   }
5045 
5046   address generate_dsin_dcos(bool isCos) {
5047     __ align(CodeEntryAlignment);
5048     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5049     address start = __ pc();
5050     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5051         (address)StubRoutines::aarch64::_two_over_pi,
5052         (address)StubRoutines::aarch64::_pio2,
5053         (address)StubRoutines::aarch64::_dsin_coef,
5054         (address)StubRoutines::aarch64::_dcos_coef);
5055     return start;
5056   }
5057 
5058   address generate_dlog() {
5059     __ align(CodeEntryAlignment);
5060     StubCodeMark mark(this, "StubRoutines", "dlog");
5061     address entry = __ pc();
5062     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5063         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5064     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5065     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5066         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5067     return entry;
5068   }
5069 
5070 
5071   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5072   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5073       Label &DIFF2) {
5074     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5075     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5076 
5077     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5078     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5079     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // vtmp3 now holds the first 8 Latin1 chars widened to UTF-16 (16 bytes);
    // the remaining 8 are widened from vtmp by the zip2 below.
5081 
5082     __ fmovd(tmpL, vtmp3);
5083     __ eor(rscratch2, tmp3, tmpL);
5084     __ cbnz(rscratch2, DIFF2);
5085 
5086     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5087     __ umov(tmpL, vtmp3, __ D, 1);
5088     __ eor(rscratch2, tmpU, tmpL);
5089     __ cbnz(rscratch2, DIFF1);
5090 
5091     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5092     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5093     __ fmovd(tmpL, vtmp);
5094     __ eor(rscratch2, tmp3, tmpL);
5095     __ cbnz(rscratch2, DIFF2);
5096 
5097     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5098     __ umov(tmpL, vtmp, __ D, 1);
5099     __ eor(rscratch2, tmpU, tmpL);
5100     __ cbnz(rscratch2, DIFF1);
5101   }
5102 
5103   // r0  = result
5104   // r1  = str1
5105   // r2  = cnt1
5106   // r3  = str2
5107   // r4  = cnt2
5108   // r10 = tmp1
5109   // r11 = tmp2
5110   address generate_compare_long_string_different_encoding(bool isLU) {
5111     __ align(CodeEntryAlignment);
5112     StubCodeMark mark(this, "StubRoutines", isLU
5113         ? "compare_long_string_different_encoding LU"
5114         : "compare_long_string_different_encoding UL");
5115     address entry = __ pc();
5116     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5117         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5118         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5119     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5120         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5121     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5122     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5123 
5124     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5125 
5126     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5129     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5130     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5131     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5132     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5133     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5134     __ eor(rscratch2, tmp1, tmp2);
5135     __ mov(rscratch1, tmp2);
5136     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5137     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5138              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5139     __ push(spilled_regs, sp);
5140     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5141     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5142 
5143     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5144 
5145     if (SoftwarePrefetchHintDistance >= 0) {
5146       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5147       __ br(__ LT, NO_PREFETCH);
5148       __ bind(LARGE_LOOP_PREFETCH);
5149         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5150         __ mov(tmp4, 2);
5151         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5152         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5153           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5154           __ subs(tmp4, tmp4, 1);
5155           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5156           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5157           __ mov(tmp4, 2);
5158         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5159           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5160           __ subs(tmp4, tmp4, 1);
5161           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5162           __ sub(cnt2, cnt2, 64);
5163           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5164           __ br(__ GE, LARGE_LOOP_PREFETCH);
5165     }
5166     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5167     __ bind(NO_PREFETCH);
5168     __ subs(cnt2, cnt2, 16);
5169     __ br(__ LT, TAIL);
5170     __ align(OptoLoopAlignment);
5171     __ bind(SMALL_LOOP); // smaller loop
5172       __ subs(cnt2, cnt2, 16);
5173       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5174       __ br(__ GE, SMALL_LOOP);
5175       __ cmn(cnt2, (u1)16);
5176       __ br(__ EQ, LOAD_LAST);
5177     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5178       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5179       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5180       __ ldr(tmp3, Address(cnt1, -8));
5181       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5182       __ b(LOAD_LAST);
5183     __ bind(DIFF2);
5184       __ mov(tmpU, tmp3);
5185     __ bind(DIFF1);
5186       __ pop(spilled_regs, sp);
5187       __ b(CALCULATE_DIFFERENCE);
5188     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU; no need to load them again.
5191       __ mov(tmpU, tmp3);
5192       __ pop(spilled_regs, sp);
5193 
5194       // tmp2 points to the address of the last 4 Latin1 characters right now
5195       __ ldrs(vtmp, Address(tmp2));
5196       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5197       __ fmovd(tmpL, vtmp);
5198 
5199       __ eor(rscratch2, tmpU, tmpL);
5200       __ cbz(rscratch2, DONE);
5201 
5202     // Find the first different characters in the longwords and
5203     // compute their difference.
5204     __ bind(CALCULATE_DIFFERENCE);
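      // rscratch2 holds the XOR of the two words being compared (tmp1 and
      // rscratch1). rev + clz gives the bit offset, from the low end, of the
      // first differing byte; "andr ..., -16" rounds that down to a 16-bit
      // char boundary, and lsrv then shifts the differing char of each
      // operand into its low 16 bits for the subtraction.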
5205       __ rev(rscratch2, rscratch2);
5206       __ clz(rscratch2, rscratch2);
5207       __ andr(rscratch2, rscratch2, -16);
5208       __ lsrv(tmp1, tmp1, rscratch2);
5209       __ uxthw(tmp1, tmp1);
5210       __ lsrv(rscratch1, rscratch1, rscratch2);
5211       __ uxthw(rscratch1, rscratch1);
5212       __ subw(result, tmp1, rscratch1);
5213     __ bind(DONE);
5214       __ ret(lr);
5215     return entry;
5216   }
5217 
5218   address generate_method_entry_barrier() {
5219     __ align(CodeEntryAlignment);
5220     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5221 
5222     Label deoptimize_label;
5223 
5224     address start = __ pc();
5225 
5226     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5227 
5228     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5229       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5230       // We can get here despite the nmethod being good, if we have not
5231       // yet applied our cross modification fence (or data fence).
5232       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
5233       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5234       __ ldrw(rscratch2, rscratch2);
5235       __ strw(rscratch2, thread_epoch_addr);
5236       __ isb();
5237       __ membar(__ LoadLoad);
5238     }
5239 
5240     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5241 
5242     __ enter();
5243     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5244 
5245     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5246 
5247     __ push_call_clobbered_registers();
5248 
5249     __ mov(c_rarg0, rscratch2);
5250     __ call_VM_leaf
5251          (CAST_FROM_FN_PTR
5252           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5253 
5254     __ reset_last_Java_frame(true);
5255 
5256     __ mov(rscratch1, r0);
5257 
5258     __ pop_call_clobbered_registers();
5259 
5260     __ cbnz(rscratch1, deoptimize_label);
5261 
5262     __ leave();
5263     __ ret(lr);
5264 
5265     __ BIND(deoptimize_label);
5266 
5267     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5268     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5269 
5270     __ mov(sp, rscratch1);
5271     __ br(rscratch2);
5272 
5273     return start;
5274   }
5275 
5276   // r0  = result
5277   // r1  = str1
5278   // r2  = cnt1
5279   // r3  = str2
5280   // r4  = cnt2
5281   // r10 = tmp1
5282   // r11 = tmp2
5283   address generate_compare_long_string_same_encoding(bool isLL) {
5284     __ align(CodeEntryAlignment);
5285     StubCodeMark mark(this, "StubRoutines", isLL
5286         ? "compare_long_string_same_encoding LL"
5287         : "compare_long_string_same_encoding UU");
5288     address entry = __ pc();
5289     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5290         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5291 
5292     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5293 
    // exit the large loop when fewer than 64 bytes are left to read or we are
    // about to prefetch memory past the end of the array
5296     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5297 
    // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
5299     __ eor(rscratch2, tmp1, tmp2);
5300     __ cbnz(rscratch2, CAL_DIFFERENCE);
5301 
5302     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5303     // update pointers, because of previous read
5304     __ add(str1, str1, wordSize);
5305     __ add(str2, str2, wordSize);
5306     if (SoftwarePrefetchHintDistance >= 0) {
5307       __ align(OptoLoopAlignment);
5308       __ bind(LARGE_LOOP_PREFETCH);
5309         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5310         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5311 
5312         for (int i = 0; i < 4; i++) {
5313           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5314           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5315           __ cmp(tmp1, tmp2);
5316           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5317           __ br(Assembler::NE, DIFF);
5318         }
5319         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5320         __ add(str1, str1, 64);
5321         __ add(str2, str2, 64);
5322         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5323         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5324         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5325     }
5326 
5327     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5328     __ br(Assembler::LE, LESS16);
5329     __ align(OptoLoopAlignment);
5330     __ bind(LOOP_COMPARE16);
5331       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5332       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5333       __ cmp(tmp1, tmp2);
5334       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5335       __ br(Assembler::NE, DIFF);
5336       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5337       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5338       __ br(Assembler::LT, LESS16);
5339 
5340       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5341       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5342       __ cmp(tmp1, tmp2);
5343       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5344       __ br(Assembler::NE, DIFF);
5345       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5346       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5347       __ br(Assembler::GE, LOOP_COMPARE16);
5348       __ cbz(cnt2, LENGTH_DIFF);
5349 
5350     __ bind(LESS16);
      // compare 8 bytes at a time
5352       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5353       __ br(Assembler::LE, LESS8);
5354       __ ldr(tmp1, Address(__ post(str1, 8)));
5355       __ ldr(tmp2, Address(__ post(str2, 8)));
5356       __ eor(rscratch2, tmp1, tmp2);
5357       __ cbnz(rscratch2, CAL_DIFFERENCE);
5358       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5359 
5360     __ bind(LESS8); // directly load last 8 bytes
5361       if (!isLL) {
5362         __ add(cnt2, cnt2, cnt2);
5363       }
5364       __ ldr(tmp1, Address(str1, cnt2));
5365       __ ldr(tmp2, Address(str2, cnt2));
5366       __ eor(rscratch2, tmp1, tmp2);
5367       __ cbz(rscratch2, LENGTH_DIFF);
5368       __ b(CAL_DIFFERENCE);
5369 
5370     __ bind(DIFF);
5371       __ cmp(tmp1, tmp2);
5372       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5373       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5374       // reuse rscratch2 register for the result of eor instruction
5375       __ eor(rscratch2, tmp1, tmp2);
5376 
5377     __ bind(CAL_DIFFERENCE);
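      // Same trick as in the different-encoding stub: rev + clz locates the
      // first differing byte from the low end, andr rounds the bit offset
      // down to a byte (LL) or char (UU) boundary, and both operands are
      // shifted and zero-extended before the subtraction.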
5378       __ rev(rscratch2, rscratch2);
5379       __ clz(rscratch2, rscratch2);
5380       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5381       __ lsrv(tmp1, tmp1, rscratch2);
5382       __ lsrv(tmp2, tmp2, rscratch2);
5383       if (isLL) {
5384         __ uxtbw(tmp1, tmp1);
5385         __ uxtbw(tmp2, tmp2);
5386       } else {
5387         __ uxthw(tmp1, tmp1);
5388         __ uxthw(tmp2, tmp2);
5389       }
5390       __ subw(result, tmp1, tmp2);
5391 
5392     __ bind(LENGTH_DIFF);
5393       __ ret(lr);
5394     return entry;
5395   }
5396 
5397   enum string_compare_mode {
5398     LL,
5399     LU,
5400     UL,
5401     UU,
5402   };
5403 
5404   // The following registers are declared in aarch64.ad
5405   // r0  = result
5406   // r1  = str1
5407   // r2  = cnt1
5408   // r3  = str2
5409   // r4  = cnt2
5410   // r10 = tmp1
5411   // r11 = tmp2
5412   // z0  = ztmp1
5413   // z1  = ztmp2
5414   // p0  = pgtmp1
5415   // p1  = pgtmp2
5416   address generate_compare_long_string_sve(string_compare_mode mode) {
5417     __ align(CodeEntryAlignment);
5418     address entry = __ pc();
5419     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5420              tmp1 = r10, tmp2 = r11;
5421 
5422     Label LOOP, DONE, MISMATCH;
5423     Register vec_len = tmp1;
5424     Register idx = tmp2;
5425     // The minimum of the string lengths has been stored in cnt2.
5426     Register cnt = cnt2;
5427     FloatRegister ztmp1 = z0, ztmp2 = z1;
5428     PRegister pgtmp1 = p0, pgtmp2 = p1;
5429 
5430 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5431     switch (mode) {                                                            \
5432       case LL:                                                                 \
5433         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5434         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5435         break;                                                                 \
5436       case LU:                                                                 \
5437         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5438         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5439         break;                                                                 \
5440       case UL:                                                                 \
5441         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5442         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5443         break;                                                                 \
5444       case UU:                                                                 \
5445         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5446         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5447         break;                                                                 \
5448       default:                                                                 \
5449         ShouldNotReachHere();                                                  \
5450     }
5451 
5452     const char* stubname;
5453     switch (mode) {
5454       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5455       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5456       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5457       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5458       default: ShouldNotReachHere();
5459     }
5460 
5461     StubCodeMark mark(this, "StubRoutines", stubname);
5462 
5463     __ mov(idx, 0);
5464     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5465 
5466     if (mode == LL) {
5467       __ sve_cntb(vec_len);
5468     } else {
5469       __ sve_cnth(vec_len);
5470     }
5471 
5472     __ sub(rscratch1, cnt, vec_len);
5473 
5474     __ bind(LOOP);
5475 
5476       // main loop
5477       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5478       __ add(idx, idx, vec_len);
5479       // Compare strings.
5480       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5481       __ br(__ NE, MISMATCH);
5482       __ cmp(idx, rscratch1);
5483       __ br(__ LT, LOOP);
5484 
5485     // post loop, last iteration
5486     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5487 
5488     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5489     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5490     __ br(__ EQ, DONE);
5491 
5492     __ bind(MISMATCH);
5493 
5494     // Crop the vector to find its location.
5495     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5496     // Extract the first different characters of each string.
5497     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5498     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5499 
5500     // Compute the difference of the first different characters.
5501     __ sub(result, rscratch1, rscratch2);
5502 
5503     __ bind(DONE);
5504     __ ret(lr);
5505 #undef LOAD_PAIR
5506     return entry;
5507   }
5508 
5509   void generate_compare_long_strings() {
5510     if (UseSVE == 0) {
5511       StubRoutines::aarch64::_compare_long_string_LL
5512           = generate_compare_long_string_same_encoding(true);
5513       StubRoutines::aarch64::_compare_long_string_UU
5514           = generate_compare_long_string_same_encoding(false);
5515       StubRoutines::aarch64::_compare_long_string_LU
5516           = generate_compare_long_string_different_encoding(true);
5517       StubRoutines::aarch64::_compare_long_string_UL
5518           = generate_compare_long_string_different_encoding(false);
5519     } else {
5520       StubRoutines::aarch64::_compare_long_string_LL
5521           = generate_compare_long_string_sve(LL);
5522       StubRoutines::aarch64::_compare_long_string_UU
5523           = generate_compare_long_string_sve(UU);
5524       StubRoutines::aarch64::_compare_long_string_LU
5525           = generate_compare_long_string_sve(LU);
5526       StubRoutines::aarch64::_compare_long_string_UL
5527           = generate_compare_long_string_sve(UL);
5528     }
5529   }
5530 
5531   // R0 = result
5532   // R1 = str2
5533   // R2 = cnt1
5534   // R3 = str1
5535   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use the "fast" algorithm for finding the first occurrence of a
  // single character, with fewer branches (1 branch per loaded register
  // instead of a branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st pattern character,
  // saving a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
5550   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5551     const char* stubName = str1_isL
5552         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5553         : "indexof_linear_uu";
5554     __ align(CodeEntryAlignment);
5555     StubCodeMark mark(this, "StubRoutines", stubName);
5556     address entry = __ pc();
5557 
5558     int str1_chr_size = str1_isL ? 1 : 2;
5559     int str2_chr_size = str2_isL ? 1 : 2;
5560     int str1_chr_shift = str1_isL ? 0 : 1;
5561     int str2_chr_shift = str2_isL ? 0 : 1;
5562     bool isL = str1_isL && str2_isL;
5563    // parameters
5564     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5565     // temporary registers
5566     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5567     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5568     // redefinitions
5569     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5570 
5571     __ push(spilled_regs, sp);
5572     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5573         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5574         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5575         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5576         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5577         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5578     // Read whole register from str1. It is safe, because length >=8 here
5579     __ ldr(ch1, Address(str1));
5580     // Read whole register from str2. It is safe, because length >=8 here
5581     __ ldr(ch2, Address(str2));
5582     __ sub(cnt2, cnt2, cnt1);
5583     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5584     if (str1_isL != str2_isL) {
5585       __ eor(v0, __ T16B, v0, v0);
5586     }
5587     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5588     __ mul(first, first, tmp1);
5589     // check if we have less than 1 register to check
5590     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5591     if (str1_isL != str2_isL) {
5592       __ fmovd(v1, ch1);
5593     }
5594     __ br(__ LE, L_SMALL);
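    // SWAR candidate search: the eor below zeroes each lane of ch2 where str2
    // holds the first pattern character; the following
    // (x - 0x01..01) & ~(x | 0x7f..7f) sequence sets the top bit of the lowest
    // such lane (higher set bits may be spurious), and every candidate is
    // verified by the character-by-character comparison loops.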
5595     __ eor(ch2, first, ch2);
5596     if (str1_isL != str2_isL) {
5597       __ zip1(v1, __ T16B, v1, v0);
5598     }
5599     __ sub(tmp2, ch2, tmp1);
5600     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5601     __ bics(tmp2, tmp2, ch2);
5602     if (str1_isL != str2_isL) {
5603       __ fmovd(ch1, v1);
5604     }
5605     __ br(__ NE, L_HAS_ZERO);
5606     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5607     __ add(result, result, wordSize/str2_chr_size);
5608     __ add(str2, str2, wordSize);
5609     __ br(__ LT, L_POST_LOOP);
5610     __ BIND(L_LOOP);
5611       __ ldr(ch2, Address(str2));
5612       __ eor(ch2, first, ch2);
5613       __ sub(tmp2, ch2, tmp1);
5614       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5615       __ bics(tmp2, tmp2, ch2);
5616       __ br(__ NE, L_HAS_ZERO);
5617     __ BIND(L_LOOP_PROCEED);
5618       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5619       __ add(str2, str2, wordSize);
5620       __ add(result, result, wordSize/str2_chr_size);
5621       __ br(__ GE, L_LOOP);
5622     __ BIND(L_POST_LOOP);
5623       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5624       __ br(__ LE, NOMATCH);
5625       __ ldr(ch2, Address(str2));
5626       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5627       __ eor(ch2, first, ch2);
5628       __ sub(tmp2, ch2, tmp1);
5629       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5630       __ mov(tmp4, -1); // all bits set
5631       __ b(L_SMALL_PROCEED);
5632     __ align(OptoLoopAlignment);
5633     __ BIND(L_SMALL);
5634       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5635       __ eor(ch2, first, ch2);
5636       if (str1_isL != str2_isL) {
5637         __ zip1(v1, __ T16B, v1, v0);
5638       }
5639       __ sub(tmp2, ch2, tmp1);
5640       __ mov(tmp4, -1); // all bits set
5641       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5642       if (str1_isL != str2_isL) {
5643         __ fmovd(ch1, v1); // move converted 4 symbols
5644       }
5645     __ BIND(L_SMALL_PROCEED);
5646       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5647       __ bic(tmp2, tmp2, ch2);
5648       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5649       __ rbit(tmp2, tmp2);
5650       __ br(__ EQ, NOMATCH);
5651     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5652       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5653       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5654       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5655       if (str2_isL) { // LL
5656         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5657         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5658         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5659         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5660         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5661       } else {
5662         __ mov(ch2, 0xE); // all bits in byte set except last one
5663         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5664         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5665         __ lslv(tmp2, tmp2, tmp4);
5666         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5667         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5668         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5669         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5670       }
5671       __ cmp(ch1, ch2);
5672       __ mov(tmp4, wordSize/str2_chr_size);
5673       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5674     __ BIND(L_SMALL_CMP_LOOP);
5675       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5676                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5677       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5678                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5679       __ add(tmp4, tmp4, 1);
5680       __ cmp(tmp4, cnt1);
5681       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5682       __ cmp(first, ch2);
5683       __ br(__ EQ, L_SMALL_CMP_LOOP);
5684     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5685       __ cbz(tmp2, NOMATCH); // no more matches. exit
5686       __ clz(tmp4, tmp2);
5687       __ add(result, result, 1); // advance index
5688       __ add(str2, str2, str2_chr_size); // advance pointer
5689       __ b(L_SMALL_HAS_ZERO_LOOP);
5690     __ align(OptoLoopAlignment);
5691     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5692       __ cmp(first, ch2);
5693       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5694       __ b(DONE);
5695     __ align(OptoLoopAlignment);
5696     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5697       if (str2_isL) { // LL
5698         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5699         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5700         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5701         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5702         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5703       } else {
5704         __ mov(ch2, 0xE); // all bits in byte set except last one
5705         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5706         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5707         __ lslv(tmp2, tmp2, tmp4);
5708         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5709         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5710         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5711         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5712       }
5713       __ cmp(ch1, ch2);
5714       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5715       __ b(DONE);
5716     __ align(OptoLoopAlignment);
5717     __ BIND(L_HAS_ZERO);
5718       __ rbit(tmp2, tmp2);
5719       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5720       // Now, compress the two counters (cnt2 and cnt1) into one register.
5721       // This is fine because both counters are 32-bit and are not changed in
5722       // this loop; they are restored on exit, so cnt1 can be re-used here.
5723       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5724       __ sub(result, result, 1);
5725     __ BIND(L_HAS_ZERO_LOOP);
5726       __ mov(cnt1, wordSize/str2_chr_size);
5727       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5728       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5729       if (str2_isL) {
5730         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5731         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5732         __ lslv(tmp2, tmp2, tmp4);
5733         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5734         __ add(tmp4, tmp4, 1);
5735         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5736         __ lsl(tmp2, tmp2, 1);
5737         __ mov(tmp4, wordSize/str2_chr_size);
5738       } else {
5739         __ mov(ch2, 0xE);
5740         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5741         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5742         __ lslv(tmp2, tmp2, tmp4);
5743         __ add(tmp4, tmp4, 1);
5744         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5745         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5746         __ lsl(tmp2, tmp2, 1);
5747         __ mov(tmp4, wordSize/str2_chr_size);
5748         __ sub(str2, str2, str2_chr_size);
5749       }
5750       __ cmp(ch1, ch2);
5751       __ mov(tmp4, wordSize/str2_chr_size);
5752       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5753     __ BIND(L_CMP_LOOP);
5754       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5755                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5756       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5757                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5758       __ add(tmp4, tmp4, 1);
5759       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5760       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5761       __ cmp(cnt1, ch2);
5762       __ br(__ EQ, L_CMP_LOOP);
5763     __ BIND(L_CMP_LOOP_NOMATCH);
5764       // here we're not matched
5765       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5766       __ clz(tmp4, tmp2);
5767       __ add(str2, str2, str2_chr_size); // advance pointer
5768       __ b(L_HAS_ZERO_LOOP);
5769     __ align(OptoLoopAlignment);
5770     __ BIND(L_CMP_LOOP_LAST_CMP);
5771       __ cmp(cnt1, ch2);
5772       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5773       __ b(DONE);
5774     __ align(OptoLoopAlignment);
5775     __ BIND(L_CMP_LOOP_LAST_CMP2);
5776       if (str2_isL) {
5777         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5778         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5779         __ lslv(tmp2, tmp2, tmp4);
5780         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5781         __ add(tmp4, tmp4, 1);
5782         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5783         __ lsl(tmp2, tmp2, 1);
5784       } else {
5785         __ mov(ch2, 0xE);
5786         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5787         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5788         __ lslv(tmp2, tmp2, tmp4);
5789         __ add(tmp4, tmp4, 1);
5790         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5791         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5792         __ lsl(tmp2, tmp2, 1);
5793         __ sub(str2, str2, str2_chr_size);
5794       }
5795       __ cmp(ch1, ch2);
5796       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5797       __ b(DONE);
5798     __ align(OptoLoopAlignment);
5799     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5800       // 1) Restore the "result" index. It was a multiple of
5801       // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
5802       // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
5803       // wordSize/str2_chr_size - 1 and its higher bits were not changed.
5804       // L_LOOP_PROCEED will increase result by the number of analyzed
5805       // characters, so we can simply reset the lower bits of result here:
5806       // clear the 2 lower bits for UU/UL and 3 bits for LL.
5807       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5808       // 3) Advance str2 to the next octet: result & 7 (LL) or result & 3 (UU/UL)
5809       // is the index inside the current octet, so str2 must move past the octet.
5810       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5811       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5812       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5813       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5814       __ movw(cnt2, cnt2);
5815       __ b(L_LOOP_PROCEED);
5816     __ align(OptoLoopAlignment);
5817     __ BIND(NOMATCH);
5818       __ mov(result, -1);
5819     __ BIND(DONE);
5820       __ pop(spilled_regs, sp);
5821       __ ret(lr);
5822     return entry;
5823   }
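
  // A minimal scalar sketch (editor's illustration, not used by the stub) of
  // the SWAR trick the linear indexof code above relies on in the LL (byte)
  // flavour: XOR eight str2 bytes against the first needle character
  // replicated across a 64-bit word, then find a zero byte with the classic
  // (x - 0x01..01) & ~x & 0x80..80 test. The helper name and fixed-width
  // types are assumptions made for the example only.
  static int indexof_first_char_lane_ll(uint64_t eight_bytes, uint8_t first_char) {
    const uint64_t lows  = 0x0101010101010101ULL;
    const uint64_t highs = 0x8080808080808080ULL;
    uint64_t x = eight_bytes ^ (lows * first_char); // matching byte becomes 0x00
    uint64_t z = (x - lows) & ~x & highs;           // 0x80 marks candidate lanes; lanes above
                                                    // the first hit may be false positives,
                                                    // which the stub re-checks in its loop
    if (z == 0) return -1;                          // no candidate in this octet
    int lane = 0;                                   // lowest marked lane is a true match
    while (((z >> (8 * lane)) & 0x80) == 0) lane++;
    return lane;
  }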
5824 
5825   void generate_string_indexof_stubs() {
5826     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5827     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5828     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5829   }
5830 
5831   void inflate_and_store_2_fp_registers(bool generatePrfm,
5832       FloatRegister src1, FloatRegister src2) {
5833     Register dst = r1;
5834     __ zip1(v1, __ T16B, src1, v0);
5835     __ zip2(v2, __ T16B, src1, v0);
5836     if (generatePrfm) {
5837       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5838     }
5839     __ zip1(v3, __ T16B, src2, v0);
5840     __ zip2(v4, __ T16B, src2, v0);
5841     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5842   }
5843 
5844   // R0 = src
5845   // R1 = dst
5846   // R2 = len
5847   // R3 = len >> 3
5848   // V0 = 0
5849   // v1 = loaded 8 bytes
5850   address generate_large_byte_array_inflate() {
5851     __ align(CodeEntryAlignment);
5852     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5853     address entry = __ pc();
5854     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5855     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5856     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5857 
5858     // Do one more 8-byte read so the source address is 16-byte aligned in most
5859     // cases, and so a single store instruction can be used for the first chunk.
5860     __ ldrd(v2, __ post(src, 8));
5861     __ sub(octetCounter, octetCounter, 2);
5862     __ zip1(v1, __ T16B, v1, v0);
5863     __ zip1(v2, __ T16B, v2, v0);
5864     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5865     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5866     __ subs(rscratch1, octetCounter, large_loop_threshold);
5867     __ br(__ LE, LOOP_START);
5868     __ b(LOOP_PRFM_START);
5869     __ bind(LOOP_PRFM);
5870       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5871     __ bind(LOOP_PRFM_START);
5872       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5873       __ sub(octetCounter, octetCounter, 8);
5874       __ subs(rscratch1, octetCounter, large_loop_threshold);
5875       inflate_and_store_2_fp_registers(true, v3, v4);
5876       inflate_and_store_2_fp_registers(true, v5, v6);
5877       __ br(__ GT, LOOP_PRFM);
5878       __ cmp(octetCounter, (u1)8);
5879       __ br(__ LT, DONE);
5880     __ bind(LOOP);
5881       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5882       __ bind(LOOP_START);
5883       __ sub(octetCounter, octetCounter, 8);
5884       __ cmp(octetCounter, (u1)8);
5885       inflate_and_store_2_fp_registers(false, v3, v4);
5886       inflate_and_store_2_fp_registers(false, v5, v6);
5887       __ br(__ GE, LOOP);
5888     __ bind(DONE);
5889       __ ret(lr);
5890     return entry;
5891   }
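
  // Scalar sketch (editor's illustration, not part of the stub) of what the
  // zip1/zip2-with-zero sequences above compute: each Latin-1 source byte is
  // interleaved with a zero byte, which on this little-endian target is the
  // same as widening it to a 16-bit char. The helper name and types are
  // assumptions made for the example only.
  static void inflate_latin1_scalar(const uint8_t* src, uint16_t* dst, int len) {
    for (int i = 0; i < len; i++) {
      dst[i] = (uint16_t)src[i];   // low byte = source byte, high byte = 0
    }
  }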
5892 
5893   /**
5894    *  Arguments:
5895    *
5896    *  Input:
5897    *  c_rarg0   - current state address
5898    *  c_rarg1   - H key address
5899    *  c_rarg2   - data address
5900    *  c_rarg3   - number of blocks
5901    *
5902    *  Output:
5903    *  Updated state at c_rarg0
5904    */
5905   address generate_ghash_processBlocks() {
5906     // Bafflingly, GCM uses little-endian for the byte order, but
5907     // big-endian for the bit order.  For example, the polynomial 1 is
5908     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5909     //
5910     // So, we must either reverse the bytes in each word and do
5911     // everything big-endian or reverse the bits in each byte and do
5912     // it little-endian.  On AArch64 it's more idiomatic to reverse
5913     // the bits in each byte (we have an instruction, RBIT, to do
5914     // that) and keep the data in little-endian bit order through the
5915     // calculation, bit-reversing the inputs and outputs.
5916 
5917     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5918     __ align(wordSize * 2);
5919     address p = __ pc();
5920     __ emit_int64(0x87);  // The low-order bits of the field
5921                           // polynomial (i.e. p = z^7+z^2+z+1)
5922                           // repeated in the low and high parts of a
5923                           // 128-bit vector
5924     __ emit_int64(0x87);
5925 
5926     __ align(CodeEntryAlignment);
5927     address start = __ pc();
5928 
5929     Register state   = c_rarg0;
5930     Register subkeyH = c_rarg1;
5931     Register data    = c_rarg2;
5932     Register blocks  = c_rarg3;
5933 
5934     FloatRegister vzr = v30;
5935     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5936 
5937     __ ldrq(v24, p);    // The field polynomial
5938 
5939     __ ldrq(v0, Address(state));
5940     __ ldrq(v1, Address(subkeyH));
5941 
5942     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5943     __ rbit(v0, __ T16B, v0);
5944     __ rev64(v1, __ T16B, v1);
5945     __ rbit(v1, __ T16B, v1);
5946 
5947     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5948     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5949 
5950     {
5951       Label L_ghash_loop;
5952       __ bind(L_ghash_loop);
5953 
5954       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5955                                                  // reversing each byte
5956       __ rbit(v2, __ T16B, v2);
5957       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5958 
5959       // Multiply state in v2 by subkey in v1
5960       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5961                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5962                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5963       // Reduce v7:v5 by the field polynomial
5964       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5965 
5966       __ sub(blocks, blocks, 1);
5967       __ cbnz(blocks, L_ghash_loop);
5968     }
5969 
5970     // The bit-reversed result is at this point in v0
5971     __ rev64(v0, __ T16B, v0);
5972     __ rbit(v0, __ T16B, v0);
5973 
5974     __ st1(v0, __ T16B, state);
5975     __ ret(lr);
5976 
5977     return start;
5978   }
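
  // Editor's illustrative helper (not used by the stub): the scalar
  // equivalent of the per-byte RBIT transformation described in the comment
  // above. Once every byte has its bits reversed, GCM's "big-endian bit
  // order" values can be handled as ordinary little-endian values; e.g. the
  // polynomial 1, stored as 0x80 in the first byte, becomes 0x01.
  static uint8_t ghash_reverse_bits_in_byte(uint8_t b) {
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
      r = (uint8_t)((r << 1) | ((b >> i) & 1)); // move bit i of b to bit (7 - i) of r
    }
    return r;
  }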
5979 
5980   address generate_ghash_processBlocks_wide() {
5981     address small = generate_ghash_processBlocks();
5982 
5983     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5984     __ align(wordSize * 2);
5985     address p = __ pc();
5986     __ emit_int64(0x87);  // The low-order bits of the field
5987                           // polynomial (i.e. p = z^7+z^2+z+1)
5988                           // repeated in the low and high parts of a
5989                           // 128-bit vector
5990     __ emit_int64(0x87);
5991 
5992     __ align(CodeEntryAlignment);
5993     address start = __ pc();
5994 
5995     Register state   = c_rarg0;
5996     Register subkeyH = c_rarg1;
5997     Register data    = c_rarg2;
5998     Register blocks  = c_rarg3;
5999 
6000     const int unroll = 4;
6001 
6002     __ cmp(blocks, (unsigned char)(unroll * 2));
6003     __ br(__ LT, small);
6004 
6005     if (unroll > 1) {
6006       // Save callee-saved SIMD registers (v8..v15) before entering the routine
6007       __ sub(sp, sp, 4 * 16);
6008       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6009       __ sub(sp, sp, 4 * 16);
6010       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6011     }
6012 
6013     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6014 
6015     if (unroll > 1) {
6016       // And restore state
6017       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6018       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6019     }
6020 
6021     __ cmp(blocks, (unsigned char)0);
6022     __ br(__ GT, small);
6023 
6024     __ ret(lr);
6025 
6026     return start;
6027   }
6028 
6029   void generate_base64_encode_simdround(Register src, Register dst,
6030         FloatRegister codec, u8 size) {
6031 
6032     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6033     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6034     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6035 
6036     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6037 
6038     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6039 
6040     __ ushr(ind0, arrangement, in0,  2);
6041 
6042     __ ushr(ind1, arrangement, in1,  2);
6043     __ shl(in0,   arrangement, in0,  6);
6044     __ orr(ind1,  arrangement, ind1, in0);
6045     __ ushr(ind1, arrangement, ind1, 2);
6046 
6047     __ ushr(ind2, arrangement, in2,  4);
6048     __ shl(in1,   arrangement, in1,  4);
6049     __ orr(ind2,  arrangement, in1,  ind2);
6050     __ ushr(ind2, arrangement, ind2, 2);
6051 
6052     __ shl(ind3,  arrangement, in2,  2);
6053     __ ushr(ind3, arrangement, ind3, 2);
6054 
6055     __ tbl(out0,  arrangement, codec,  4, ind0);
6056     __ tbl(out1,  arrangement, codec,  4, ind1);
6057     __ tbl(out2,  arrangement, codec,  4, ind2);
6058     __ tbl(out3,  arrangement, codec,  4, ind3);
6059 
6060     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6061   }
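
  // Scalar sketch (editor's illustration only) of the index extraction that
  // the ushr/shl/orr sequence above performs per lane, and that the Process3B
  // tail below performs with ubfmw: three input bytes yield four 6-bit
  // indices into the 64-entry codec table. The helper name and plain array
  // parameters are assumptions made for the example.
  static void base64_encode_3_bytes_scalar(const uint8_t in[3], const char codec[64], char out[4]) {
    uint32_t bits = ((uint32_t)in[0] << 16) | ((uint32_t)in[1] << 8) | in[2];
    out[0] = codec[(bits >> 18) & 0x3f];
    out[1] = codec[(bits >> 12) & 0x3f];
    out[2] = codec[(bits >>  6) & 0x3f];
    out[3] = codec[ bits        & 0x3f];
  }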
6062 
6063    /**
6064    *  Arguments:
6065    *
6066    *  Input:
6067    *  c_rarg0   - src_start
6068    *  c_rarg1   - src_offset
6069    *  c_rarg2   - src_length
6070    *  c_rarg3   - dest_start
6071    *  c_rarg4   - dest_offset
6072    *  c_rarg5   - isURL
6073    *
6074    */
6075   address generate_base64_encodeBlock() {
6076 
6077     static const char toBase64[64] = {
6078       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6079       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6080       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6081       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6082       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6083     };
6084 
6085     static const char toBase64URL[64] = {
6086       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6087       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6088       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6089       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6090       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6091     };
6092 
6093     __ align(CodeEntryAlignment);
6094     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6095     address start = __ pc();
6096 
6097     Register src   = c_rarg0;  // source array
6098     Register soff  = c_rarg1;  // source start offset
6099     Register send  = c_rarg2;  // source end offset
6100     Register dst   = c_rarg3;  // dest array
6101     Register doff  = c_rarg4;  // position for writing to dest array
6102     Register isURL = c_rarg5;  // Base64 or URL character set
6103 
6104     // c_rarg6 and c_rarg7 are free to use as temps
6105     Register codec  = c_rarg6;
6106     Register length = c_rarg7;
6107 
6108     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6109 
6110     __ add(src, src, soff);
6111     __ add(dst, dst, doff);
6112     __ sub(length, send, soff);
6113 
6114     // load the codec base address
6115     __ lea(codec, ExternalAddress((address) toBase64));
6116     __ cbz(isURL, ProcessData);
6117     __ lea(codec, ExternalAddress((address) toBase64URL));
6118 
6119     __ BIND(ProcessData);
6120 
6121     // too short to form a SIMD loop; fall back to the scalar Process3B loop
6122     __ cmp(length, (u1)24);
6123     __ br(Assembler::LT, Process3B);
6124 
6125     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6126 
6127     __ BIND(Process48B);
6128     __ cmp(length, (u1)48);
6129     __ br(Assembler::LT, Process24B);
6130     generate_base64_encode_simdround(src, dst, v0, 16);
6131     __ sub(length, length, 48);
6132     __ b(Process48B);
6133 
6134     __ BIND(Process24B);
6135     __ cmp(length, (u1)24);
6136     __ br(Assembler::LT, SIMDExit);
6137     generate_base64_encode_simdround(src, dst, v0, 8);
6138     __ sub(length, length, 24);
6139 
6140     __ BIND(SIMDExit);
6141     __ cbz(length, Exit);
6142 
6143     __ BIND(Process3B);
6144     //  3 src bytes, 24 bits
6145     __ ldrb(r10, __ post(src, 1));
6146     __ ldrb(r11, __ post(src, 1));
6147     __ ldrb(r12, __ post(src, 1));
6148     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6149     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6150     // codec index
6151     __ ubfmw(r15, r12, 18, 23);
6152     __ ubfmw(r14, r12, 12, 17);
6153     __ ubfmw(r13, r12, 6,  11);
6154     __ andw(r12,  r12, 63);
6155     // get the code based on the codec
6156     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6157     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6158     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6159     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6160     __ strb(r15, __ post(dst, 1));
6161     __ strb(r14, __ post(dst, 1));
6162     __ strb(r13, __ post(dst, 1));
6163     __ strb(r12, __ post(dst, 1));
6164     __ sub(length, length, 3);
6165     __ cbnz(length, Process3B);
6166 
6167     __ BIND(Exit);
6168     __ ret(lr);
6169 
6170     return start;
6171   }
6172 
6173   void generate_base64_decode_simdround(Register src, Register dst,
6174         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6175 
6176     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6177     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6178 
6179     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6180     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6181 
6182     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6183 
6184     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6185 
6186     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6187 
6188     // We need an unsigned saturating subtract to make sure all input values
6189     // in the range [0, 63] produce index 0 for the higher-half lookup
6190     __ uqsubv(decH0, __ T16B, in0, v27);
6191     __ uqsubv(decH1, __ T16B, in1, v27);
6192     __ uqsubv(decH2, __ T16B, in2, v27);
6193     __ uqsubv(decH3, __ T16B, in3, v27);
6194 
6195     // lower half lookup
6196     __ tbl(decL0, arrangement, codecL, 4, in0);
6197     __ tbl(decL1, arrangement, codecL, 4, in1);
6198     __ tbl(decL2, arrangement, codecL, 4, in2);
6199     __ tbl(decL3, arrangement, codecL, 4, in3);
6200 
6201     // higher half lookup
6202     __ tbx(decH0, arrangement, codecH, 4, decH0);
6203     __ tbx(decH1, arrangement, codecH, 4, decH1);
6204     __ tbx(decH2, arrangement, codecH, 4, decH2);
6205     __ tbx(decH3, arrangement, codecH, 4, decH3);
6206 
6207     // combine lower and higher
6208     __ orr(decL0, arrangement, decL0, decH0);
6209     __ orr(decL1, arrangement, decL1, decH1);
6210     __ orr(decL2, arrangement, decL2, decH2);
6211     __ orr(decL3, arrangement, decL3, decH3);
6212 
6213     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6214     __ cmhi(decH0, arrangement, decL0, v27);
6215     __ cmhi(decH1, arrangement, decL1, v27);
6216     __ cmhi(decH2, arrangement, decL2, v27);
6217     __ cmhi(decH3, arrangement, decL3, v27);
6218     __ orr(in0, arrangement, decH0, decH1);
6219     __ orr(in1, arrangement, decH2, decH3);
6220     __ orr(in2, arrangement, in0,   in1);
6221     __ umaxv(in3, arrangement, in2);
6222     __ umov(rscratch2, in3, __ B, 0);
6223 
6224     // get the data to output
6225     __ shl(out0,  arrangement, decL0, 2);
6226     __ ushr(out1, arrangement, decL1, 4);
6227     __ orr(out0,  arrangement, out0,  out1);
6228     __ shl(out1,  arrangement, decL1, 4);
6229     __ ushr(out2, arrangement, decL2, 2);
6230     __ orr(out1,  arrangement, out1,  out2);
6231     __ shl(out2,  arrangement, decL2, 6);
6232     __ orr(out2,  arrangement, out2,  decL3);
6233 
6234     __ cbz(rscratch2, NoIllegalData);
6235 
6236     // handle illegal input
6237     __ umov(r10, in2, __ D, 0);
6238     if (size == 16) {
6239       __ cbnz(r10, ErrorInLowerHalf);
6240 
6241       // illegal input is in higher half, store the lower half now.
6242       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6243 
6244       __ umov(r10, in2,  __ D, 1);
6245       __ umov(r11, out0, __ D, 1);
6246       __ umov(r12, out1, __ D, 1);
6247       __ umov(r13, out2, __ D, 1);
6248       __ b(StoreLegalData);
6249 
6250       __ BIND(ErrorInLowerHalf);
6251     }
6252     __ umov(r11, out0, __ D, 0);
6253     __ umov(r12, out1, __ D, 0);
6254     __ umov(r13, out2, __ D, 0);
6255 
6256     __ BIND(StoreLegalData);
6257     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6258     __ strb(r11, __ post(dst, 1));
6259     __ strb(r12, __ post(dst, 1));
6260     __ strb(r13, __ post(dst, 1));
6261     __ lsr(r10, r10, 8);
6262     __ lsr(r11, r11, 8);
6263     __ lsr(r12, r12, 8);
6264     __ lsr(r13, r13, 8);
6265     __ b(StoreLegalData);
6266 
6267     __ BIND(NoIllegalData);
6268     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6269   }
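
  // Scalar sketch (editor's illustration only) of the per-lane shl/ushr/orr
  // recombination above: four decoded 6-bit values are packed back into three
  // output bytes. The helper name and types are assumptions made for the
  // example.
  static void base64_decode_4_sextets_scalar(const uint8_t s[4], uint8_t out[3]) {
    out[0] = (uint8_t)((s[0] << 2) | (s[1] >> 4));
    out[1] = (uint8_t)((s[1] << 4) | (s[2] >> 2));
    out[2] = (uint8_t)((s[2] << 6) |  s[3]);
  }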
6270 
6271 
6272    /**
6273    *  Arguments:
6274    *
6275    *  Input:
6276    *  c_rarg0   - src_start
6277    *  c_rarg1   - src_offset
6278    *  c_rarg2   - src_length
6279    *  c_rarg3   - dest_start
6280    *  c_rarg4   - dest_offset
6281    *  c_rarg5   - isURL
6282    *  c_rarg6   - isMIME
6283    *
6284    */
6285   address generate_base64_decodeBlock() {
6286 
6287     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6288     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6289     // titled "Base64 decoding".
6290 
6291     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
6292     // java.util.Base64, except that the trailing character '=' is also treated as an illegal
6293     // value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
6294     static const uint8_t fromBase64ForNoSIMD[256] = {
6295       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6296       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6297       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6298        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6299       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6300        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6301       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6302        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6303       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6304       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6305       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6306       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6307       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6308       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6309       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6310       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6311     };
6312 
6313     static const uint8_t fromBase64URLForNoSIMD[256] = {
6314       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6315       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6316       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6317        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6318       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6319        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6320       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6321        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6322       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6323       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6324       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6325       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6326       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6327       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6328       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6329       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6330     };
6331 
6332     // A legal base64 code value is in the range [0, 127]. We need two table
6333     // lookups with tbl/tbx and combine the results to get the decoded data.
6334     // The 1st table vector lookup uses tbl; out-of-range indices are set to 0
6335     // in the destination. The 2nd table vector lookup uses tbx; out-of-range
6336     // indices leave the destination unchanged. Input [64..126] is mapped to
6337     // index [65, 127] in the second lookup. The value at index 64 is 0, so we
6338     // know the decoded data was already obtained by the 1st lookup.
6339     static const uint8_t fromBase64ForSIMD[128] = {
6340       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6341       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6342       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6343        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6344         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6345        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6346       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6347        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6348     };
6349 
6350     static const uint8_t fromBase64URLForSIMD[128] = {
6351       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6353       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6354        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6355         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6356        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6357        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6358        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6359     };
6360 
6361     __ align(CodeEntryAlignment);
6362     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6363     address start = __ pc();
6364 
6365     Register src    = c_rarg0;  // source array
6366     Register soff   = c_rarg1;  // source start offset
6367     Register send   = c_rarg2;  // source end offset
6368     Register dst    = c_rarg3;  // dest array
6369     Register doff   = c_rarg4;  // position for writing to dest array
6370     Register isURL  = c_rarg5;  // Base64 or URL character set
6371     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6372 
6373     Register length = send;    // reuse send as length of source data to process
6374 
6375     Register simd_codec   = c_rarg6;
6376     Register nosimd_codec = c_rarg7;
6377 
6378     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6379 
6380     __ enter();
6381 
6382     __ add(src, src, soff);
6383     __ add(dst, dst, doff);
6384 
6385     __ mov(doff, dst);
6386 
6387     __ sub(length, send, soff);
6388     __ bfm(length, zr, 0, 1); // clear the two low bits: round length down to a multiple of 4
6389 
6390     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6391     __ cbz(isURL, ProcessData);
6392     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6393 
6394     __ BIND(ProcessData);
6395     __ mov(rscratch1, length);
6396     __ cmp(length, (u1)144); // 144 = 80 + 64
6397     __ br(Assembler::LT, Process4B);
6398 
6399     // In the MIME case, the line length cannot be more than 76
6400     // bytes (see RFC 2045). This is too short a block for SIMD
6401     // to be worthwhile, so we use non-SIMD here.
6402     __ movw(rscratch1, 79);
6403 
6404     __ BIND(Process4B);
6405     __ ldrw(r14, __ post(src, 4));
6406     __ ubfxw(r10, r14, 0,  8);
6407     __ ubfxw(r11, r14, 8,  8);
6408     __ ubfxw(r12, r14, 16, 8);
6409     __ ubfxw(r13, r14, 24, 8);
6410     // look up the decoded 6-bit values
6411     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6412     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6413     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6414     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6415     // error detection, 255u indicates an illegal input
6416     __ orrw(r14, r10, r11);
6417     __ orrw(r15, r12, r13);
6418     __ orrw(r14, r14, r15);
6419     __ tbnz(r14, 7, Exit);
6420     // recover the data
6421     __ lslw(r14, r10, 10);
6422     __ bfiw(r14, r11, 4, 6);
6423     __ bfmw(r14, r12, 2, 5);
6424     __ rev16w(r14, r14);
6425     __ bfiw(r13, r12, 6, 2);
6426     __ strh(r14, __ post(dst, 2));
6427     __ strb(r13, __ post(dst, 1));
6428     // non-simd loop
6429     __ subsw(rscratch1, rscratch1, 4);
6430     __ br(Assembler::GT, Process4B);
6431 
6432     // If we got here from the initial 80-byte pre-processing pass above
6433     // (rscratch1 started at 79), rscratch1 == -1; otherwise rscratch1 == 0.
6434     __ cbzw(rscratch1, Exit);
6435     __ sub(length, length, 80);
6436 
6437     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6438     __ cbz(isURL, SIMDEnter);
6439     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6440 
6441     __ BIND(SIMDEnter);
6442     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6443     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6444     __ mov(rscratch1, 63);
6445     __ dup(v27, __ T16B, rscratch1);
6446 
6447     __ BIND(Process64B);
6448     __ cmp(length, (u1)64);
6449     __ br(Assembler::LT, Process32B);
6450     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6451     __ sub(length, length, 64);
6452     __ b(Process64B);
6453 
6454     __ BIND(Process32B);
6455     __ cmp(length, (u1)32);
6456     __ br(Assembler::LT, SIMDExit);
6457     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6458     __ sub(length, length, 32);
6459     __ b(Process32B);
6460 
6461     __ BIND(SIMDExit);
6462     __ cbz(length, Exit);
6463     __ movw(rscratch1, length);
6464     __ b(Process4B);
6465 
6466     __ BIND(Exit);
6467     __ sub(c_rarg0, dst, doff);
6468 
6469     __ leave();
6470     __ ret(lr);
6471 
6472     return start;
6473   }
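
  // Scalar model (editor's illustration only; hypothetical helper) of the
  // two-stage tbl/tbx lookup described before the fromBase64*ForSIMD tables
  // above: the first 64 table bytes act as the "low" table, the last 64 as
  // the "high" table, and a saturating subtract of 63 biases the index for
  // the second lookup. Values larger than 63 in the combined result flag
  // illegal input, just as the cmhi-against-63 check does in the SIMD round.
  static uint8_t base64_split_table_lookup_scalar(uint8_t c, const uint8_t table128[128]) {
    const uint8_t* low  = table128;        // entries   0..63
    const uint8_t* high = table128 + 64;   // entries  64..127
    uint8_t lo = (c < 64) ? low[c] : 0;                  // tbl: out-of-range index -> 0
    uint8_t hi_idx = (uint8_t)(c > 63 ? c - 63 : 0);     // uqsub(c, 63)
    uint8_t hi = (hi_idx < 64) ? high[hi_idx] : hi_idx;  // tbx: out-of-range keeps the index
    return (uint8_t)(lo | hi);
  }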
6474 
6475   // Support for spin waits.
6476   address generate_spin_wait() {
6477     __ align(CodeEntryAlignment);
6478     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6479     address start = __ pc();
6480 
6481     __ spin_wait();
6482     __ ret(lr);
6483 
6484     return start;
6485   }
6486 
6487 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6488 
6489   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6490   //
6491   // If LSE is in use, generate LSE versions of all the stubs. The
6492   // non-LSE versions are in atomic_aarch64.S.
6493 
6494   // class AtomicStubMark records the entry point of a stub and the
6495   // stub pointer which will point to it. The stub pointer is set to
6496   // the entry point when ~AtomicStubMark() is called, which must be
6497   // after ICache::invalidate_range. This ensures safe publication of
6498   // the generated code.
6499   class AtomicStubMark {
6500     address _entry_point;
6501     aarch64_atomic_stub_t *_stub;
6502     MacroAssembler *_masm;
6503   public:
6504     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6505       _masm = masm;
6506       __ align(32);
6507       _entry_point = __ pc();
6508       _stub = stub;
6509     }
6510     ~AtomicStubMark() {
6511       *_stub = (aarch64_atomic_stub_t)_entry_point;
6512     }
6513   };
6514 
6515   // NB: For memory_order_conservative we need a trailing membar after
6516   // LSE atomic operations but not a leading membar.
6517   //
6518   // We don't need a leading membar because a clause in the Arm ARM
6519   // says:
6520   //
6521   //   Barrier-ordered-before
6522   //
6523   //   Barrier instructions order prior Memory effects before subsequent
6524   //   Memory effects generated by the same Observer. A read or a write
6525   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6526   //   Observer if and only if RW1 appears in program order before RW2
6527   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6528   //   instruction with both Acquire and Release semantics.
6529   //
6530   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6531   // and Release semantics, therefore we don't need a leading
6532   // barrier. However, there is no corresponding Barrier-ordered-after
6533   // relationship, therefore we need a trailing membar to prevent a
6534   // later store or load from being reordered with the store in an
6535   // atomic instruction.
6536   //
6537   // This was checked by using the herd7 consistency model simulator
6538   // (http://diy.inria.fr/) with this test case:
6539   //
6540   // AArch64 LseCas
6541   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6542   // P0 | P1;
6543   // LDR W4, [X2] | MOV W3, #0;
6544   // DMB LD       | MOV W4, #1;
6545   // LDR W3, [X1] | CASAL W3, W4, [X1];
6546   //              | DMB ISH;
6547   //              | STR W4, [X2];
6548   // exists
6549   // (0:X3=0 /\ 0:X4=1)
6550   //
6551   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6552   // with the store to x in P1. Without the DMB in P1 this may happen.
6553   //
6554   // At the time of writing we don't know of any AArch64 hardware that
6555   // reorders stores in this way, but the Reference Manual permits it.
6556 
6557   void gen_cas_entry(Assembler::operand_size size,
6558                      atomic_memory_order order) {
6559     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6560       exchange_val = c_rarg2;
6561     bool acquire, release;
6562     switch (order) {
6563       case memory_order_relaxed:
6564         acquire = false;
6565         release = false;
6566         break;
6567       case memory_order_release:
6568         acquire = false;
6569         release = true;
6570         break;
6571       default:
6572         acquire = true;
6573         release = true;
6574         break;
6575     }
6576     __ mov(prev, compare_val);
6577     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6578     if (order == memory_order_conservative) {
6579       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6580     }
6581     if (size == Assembler::xword) {
6582       __ mov(r0, prev);
6583     } else {
6584       __ movw(r0, prev);
6585     }
6586     __ ret(lr);
6587   }
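
  // Editor's illustrative restatement (not used by the stubs) of the switch
  // above: relaxed needs neither acquire nor release, release needs only
  // release, and every stronger order collapses to acquire+release, with
  // memory_order_conservative additionally getting the trailing membar
  // emitted in gen_cas_entry.
  static void order_to_acquire_release(atomic_memory_order order, bool& acquire, bool& release) {
    release = (order != memory_order_relaxed);
    acquire = release && (order != memory_order_release);
  }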
6588 
6589   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6590     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6591     // If not relaxed, then default to conservative.  Relaxed is the only
6592     // case we use enough to be worth specializing.
6593     if (order == memory_order_relaxed) {
6594       __ ldadd(size, incr, prev, addr);
6595     } else {
6596       __ ldaddal(size, incr, prev, addr);
6597       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6598     }
6599     if (size == Assembler::xword) {
6600       __ mov(r0, prev);
6601     } else {
6602       __ movw(r0, prev);
6603     }
6604     __ ret(lr);
6605   }
6606 
6607   void gen_swpal_entry(Assembler::operand_size size) {
6608     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6609     __ swpal(size, incr, prev, addr);
6610     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6611     if (size == Assembler::xword) {
6612       __ mov(r0, prev);
6613     } else {
6614       __ movw(r0, prev);
6615     }
6616     __ ret(lr);
6617   }
6618 
6619   void generate_atomic_entry_points() {
6620     if (!UseLSE) {
6621       return;
6622     }
6623 
6624     __ align(CodeEntryAlignment);
6625     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6626     address first_entry = __ pc();
6627 
6628     // ADD, memory_order_conservative
6629     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6630     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6631     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6632     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6633 
6634     // ADD, memory_order_relaxed
6635     AtomicStubMark mark_fetch_add_4_relaxed
6636       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6637     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6638     AtomicStubMark mark_fetch_add_8_relaxed
6639       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6640     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6641 
6642     // XCHG, memory_order_conservative
6643     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6644     gen_swpal_entry(Assembler::word);
6645     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6646     gen_swpal_entry(Assembler::xword);
6647 
6648     // CAS, memory_order_conservative
6649     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6650     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6651     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6652     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6653     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6654     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6655 
6656     // CAS, memory_order_relaxed
6657     AtomicStubMark mark_cmpxchg_1_relaxed
6658       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6659     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6660     AtomicStubMark mark_cmpxchg_4_relaxed
6661       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6662     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6663     AtomicStubMark mark_cmpxchg_8_relaxed
6664       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6665     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6666 
6667     AtomicStubMark mark_cmpxchg_4_release
6668       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6669     gen_cas_entry(MacroAssembler::word, memory_order_release);
6670     AtomicStubMark mark_cmpxchg_8_release
6671       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6672     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6673 
6674     AtomicStubMark mark_cmpxchg_4_seq_cst
6675       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6676     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6677     AtomicStubMark mark_cmpxchg_8_seq_cst
6678       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6679     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6680 
6681     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6682   }
6683 #endif // defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6684 
6685   address generate_cont_thaw(Continuation::thaw_kind kind) {
6686     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6687     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6688 
6689     address start = __ pc();
6690 
6691     if (return_barrier) {
6692       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6693       __ mov(sp, rscratch1);
6694     }
6695     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6696 
6697     if (return_barrier) {
6698       // preserve possible return value from a method returning to the return barrier
6699       __ fmovd(rscratch1, v0);
6700       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6701     }
6702 
6703     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6704     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6705     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6706 
6707     if (return_barrier) {
6708       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6709       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6710       __ fmovd(v0, rscratch1);
6711     }
6712     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6713 
6714 
6715     Label thaw_success;
6716     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6717     __ cbnz(rscratch2, thaw_success);
6718     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6719     __ br(rscratch1);
6720     __ bind(thaw_success);
6721 
6722     // make room for the thawed frames
6723     __ sub(rscratch1, sp, rscratch2);
6724     __ andr(rscratch1, rscratch1, -16); // align
6725     __ mov(sp, rscratch1);
6726 
6727     if (return_barrier) {
6728       // save original return value -- again
6729       __ fmovd(rscratch1, v0);
6730       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6731     }
6732 
6733     // If we want, we can templatize thaw by kind, and have three different entries
6734     __ movw(c_rarg1, (uint32_t)kind);
6735 
6736     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6737     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6738 
6739     if (return_barrier) {
6740       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6741       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6742       __ fmovd(v0, rscratch1);
6743     } else {
6744       __ mov(r0, zr); // return 0 (success) from doYield
6745     }
6746 
6747     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
6748     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6749     __ mov(rfp, sp);
6750 
6751     if (return_barrier_exception) {
6752       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6753       __ verify_oop(r0);
6754       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6755 
6756       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6757 
6758       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6759       // __ reinitialize_ptrue();
6760 
6761       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6762 
6763       __ mov(r1, r0); // the exception handler
6764       __ mov(r0, r19); // restore return value containing the exception oop
6765       __ verify_oop(r0);
6766 
6767       __ leave();
6768       __ mov(r3, lr);
6769       __ br(r1); // the exception handler
6770     } else {
6771       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6772       __ leave();
6773       __ ret(lr);
6774     }
6775 
6776     return start;
6777   }
6778 
6779   address generate_cont_thaw() {
6780     if (!Continuations::enabled()) return nullptr;
6781 
6782     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6783     address start = __ pc();
6784     generate_cont_thaw(Continuation::thaw_top);
6785     return start;
6786   }
6787 
6788   address generate_cont_returnBarrier() {
6789     if (!Continuations::enabled()) return nullptr;
6790 
6791     // TODO: will probably need multiple return barriers depending on return type
6792     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6793     address start = __ pc();
6794 
6795     generate_cont_thaw(Continuation::thaw_return_barrier);
6796 
6797     return start;
6798   }
6799 
6800   address generate_cont_returnBarrier_exception() {
6801     if (!Continuations::enabled()) return nullptr;
6802 
6803     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6804     address start = __ pc();
6805 
6806     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
6807 
6808     return start;
6809   }
6810 
6811 #if INCLUDE_JFR
6812 
6813   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
6814     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6815     __ mov(c_rarg0, thread);
6816   }
6817 
6818   // The handle is dereferenced through a load barrier.
6819   static void jfr_epilogue(MacroAssembler* _masm) {
6820     __ reset_last_Java_frame(true);
6821     Label null_jobject;
6822     __ cbz(r0, null_jobject);
6823     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6824     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6825     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2);
6826     __ bind(null_jobject);
6827   }
6828 
6829   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
6830   // It returns a jobject handle to the event writer.
6831   // The handle is dereferenced and the return value is the event writer oop.
6832   static RuntimeStub* generate_jfr_write_checkpoint() {
6833     enum layout {
6834       rbp_off,
6835       rbpH_off,
6836       return_off,
6837       return_off2,
6838       framesize // inclusive of return address
6839     };
6840 
6841     int insts_size = 512;
6842     int locs_size = 64;
6843     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
6844     OopMapSet* oop_maps = new OopMapSet();
6845     MacroAssembler* masm = new MacroAssembler(&code);
6846     MacroAssembler* _masm = masm;
6847 
6848     address start = __ pc();
6849     __ enter();
6850     int frame_complete = __ pc() - start;
6851     address the_pc = __ pc();
6852     jfr_prologue(the_pc, _masm, rthread);
6853     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
6854     jfr_epilogue(_masm);
6855     __ leave();
6856     __ ret(lr);
6857 
6858     OopMap* map = new OopMap(framesize, 1); // rfp
6859     oop_maps->add_gc_map(the_pc - start, map);
6860 
6861     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6862       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
6863                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6864                                     oop_maps, false);
6865     return stub;
6866   }
6867 
6868 #endif // INCLUDE_JFR
6869 
6870   // Continuation point for throwing of implicit exceptions that are
6871   // not handled in the current activation. Fabricates an exception
6872   // oop and initiates normal exception dispatching in this
6873   // frame. Since we need to preserve callee-saved values (currently
6874   // only for C2, but done for C1 as well) we need a callee-saved oop
6875   // map and therefore have to make these stubs into RuntimeStubs
6876   // rather than BufferBlobs.  If the compiler needs all registers to
6877   // be preserved between the fault point and the exception handler
6878   // then it must assume responsibility for that in
6879   // AbstractCompiler::continuation_for_implicit_null_exception or
6880   // continuation_for_implicit_division_by_zero_exception. All other
6881   // implicit exceptions (e.g., NullPointerException or
6882   // AbstractMethodError on entry) are either at call sites or
6883   // otherwise assume that stack unwinding will be initiated, so
6884   // caller saved registers were assumed volatile in the compiler.
6885 
6886 #undef __
6887 #define __ masm->
6888 
6889   address generate_throw_exception(const char* name,
6890                                    address runtime_entry,
6891                                    Register arg1 = noreg,
6892                                    Register arg2 = noreg) {
6893     // Information about frame layout at time of blocking runtime call.
6894     // Note that we only have to preserve callee-saved registers since
6895     // the compilers are responsible for supplying a continuation point
6896     // if they expect all registers to be preserved.
6897     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6898     enum layout {
6899       rfp_off = 0,
6900       rfp_off2,
6901       return_off,
6902       return_off2,
6903       framesize // inclusive of return address
6904     };
6905 
6906     int insts_size = 512;
6907     int locs_size  = 64;
6908 
6909     CodeBuffer code(name, insts_size, locs_size);
6910     OopMapSet* oop_maps  = new OopMapSet();
6911     MacroAssembler* masm = new MacroAssembler(&code);
6912 
6913     address start = __ pc();
6914 
6915     // This is an inlined and slightly modified version of call_VM
6916     // which has the ability to fetch the return PC out of
6917     // thread-local storage and also sets up last_Java_sp slightly
6918     // differently from the real call_VM.
6919 
6920     __ enter(); // Save FP and LR before call
6921 
6922     assert(is_even(framesize/2), "sp not 16-byte aligned");
6923 
6924     // lr and fp are already in place
6925     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
6926 
6927     int frame_complete = __ pc() - start;
6928 
6929     // Set up last_Java_sp and last_Java_fp
6930     address the_pc = __ pc();
6931     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6932 
6933     // Call runtime
6934     if (arg1 != noreg) {
6935       assert(arg2 != c_rarg1, "clobbered");
6936       __ mov(c_rarg1, arg1);
6937     }
6938     if (arg2 != noreg) {
6939       __ mov(c_rarg2, arg2);
6940     }
6941     __ mov(c_rarg0, rthread);
6942     BLOCK_COMMENT("call runtime_entry");
6943     __ mov(rscratch1, runtime_entry);
6944     __ blr(rscratch1);
6945 
6946     // Generate oop map
6947     OopMap* map = new OopMap(framesize, 0);
6948 
6949     oop_maps->add_gc_map(the_pc - start, map);
6950 
6951     __ reset_last_Java_frame(true);
6952 
6953     // Reinitialize the ptrue predicate register, in case the external runtime
6954     // call clobbers ptrue reg, as we may return to SVE compiled code.
6955     __ reinitialize_ptrue();
6956 
6957     __ leave();
6958 
6959     // check for pending exceptions
6960 #ifdef ASSERT
6961     Label L;
6962     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6963     __ cbnz(rscratch1, L);
6964     __ should_not_reach_here();
6965     __ bind(L);
6966 #endif // ASSERT
6967     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6968 
6969     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6970     RuntimeStub* stub =
6971       RuntimeStub::new_runtime_stub(name,
6972                                     &code,
6973                                     frame_complete,
6974                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6975                                     oop_maps, false);
6976     return stub->entry_point();
6977   }
6978 
6979   class MontgomeryMultiplyGenerator : public MacroAssembler {
6980 
6981     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6982       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6983 
6984     RegSet _toSave;
6985     bool _squaring;
6986 
6987   public:
6988     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6989       : MacroAssembler(as->code()), _squaring(squaring) {
6990 
6991       // Register allocation
6992 
6993       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6994       Pa_base = *regs;       // Argument registers
6995       if (squaring)
6996         Pb_base = Pa_base;
6997       else
6998         Pb_base = *++regs;
6999       Pn_base = *++regs;
7000       Rlen = *++regs;
7001       inv = *++regs;
7002       Pm_base = *++regs;
7003 
7004       // Working registers:
7005       Ra =  *++regs;        // The current digit of a, b, n, and m.
7006       Rb =  *++regs;
7007       Rm =  *++regs;
7008       Rn =  *++regs;
7009 
7010       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7011       Pb =  *++regs;
7012       Pm =  *++regs;
7013       Pn =  *++regs;
7014 
7015       t0 =  *++regs;        // Three registers which form a
7016       t1 =  *++regs;        // triple-precision accumulator.
7017       t2 =  *++regs;
7018 
7019       Ri =  *++regs;        // Inner and outer loop indexes.
7020       Rj =  *++regs;
7021 
7022       Rhi_ab = *++regs;     // Product registers: low and high parts
7023       Rlo_ab = *++regs;     // of a*b and m*n.
7024       Rhi_mn = *++regs;
7025       Rlo_mn = *++regs;
7026 
7027       // r19 and up are callee-saved.
7028       _toSave = RegSet::range(r19, *regs) + Pm_base;
7029     }
7030 
7031   private:
7032     void save_regs() {
7033       push(_toSave, sp);
7034     }
7035 
7036     void restore_regs() {
7037       pop(_toSave, sp);
7038     }
7039 
7040     template <typename T>
7041     void unroll_2(Register count, T block) {
7042       Label loop, end, odd;
7043       tbnz(count, 0, odd);
7044       cbz(count, end);
7045       align(16);
7046       bind(loop);
7047       (this->*block)();
7048       bind(odd);
7049       (this->*block)();
7050       subs(count, count, 2);
7051       br(Assembler::GT, loop);
7052       bind(end);
7053     }
7054 
7055     template <typename T>
7056     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7057       Label loop, end, odd;
7058       tbnz(count, 0, odd);
7059       cbz(count, end);
7060       align(16);
7061       bind(loop);
7062       (this->*block)(d, s, tmp);
7063       bind(odd);
7064       (this->*block)(d, s, tmp);
7065       subs(count, count, 2);
7066       br(Assembler::GT, loop);
7067       bind(end);
7068     }
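    //
    // Both unroll_2 overloads emit the same two-way unrolled loop; in C,
    // approximately (a sketch):
    //
    //   if (count & 1) goto odd;         // odd count: do a single block first
    //   if (count == 0) goto end;
    //  loop:
    //   block();
    //  odd:
    //   block();
    //   count -= 2;
    //   if ((long)count > 0) goto loop;  // count is tested as a signed value
    //  end:
    //   ;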
7069 
7070     void pre1(RegisterOrConstant i) {
7071       block_comment("pre1");
7072       // Pa = Pa_base;
7073       // Pb = Pb_base + i;
7074       // Pm = Pm_base;
7075       // Pn = Pn_base + i;
7076       // Ra = *Pa;
7077       // Rb = *Pb;
7078       // Rm = *Pm;
7079       // Rn = *Pn;
7080       ldr(Ra, Address(Pa_base));
7081       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7082       ldr(Rm, Address(Pm_base));
7083       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7084       lea(Pa, Address(Pa_base));
7085       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7086       lea(Pm, Address(Pm_base));
7087       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7088 
7089       // Zero the m*n result.
7090       mov(Rhi_mn, zr);
7091       mov(Rlo_mn, zr);
7092     }
7093 
7094     // The core multiply-accumulate step of a Montgomery
7095     // multiplication.  The idea is to schedule operations as a
7096     // pipeline so that instructions with long latencies (loads and
7097     // multiplies) have time to complete before their results are
7098     // used.  This benefits in-order implementations of the
7099     // architecture the most, but out-of-order ones also benefit.
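    //
    // For reference, MACC(A, B, t0, t1, t2) in the commented-out C here and
    // in the reference code further down accumulates the 128-bit product A*B
    // into the triple-precision accumulator (t2:t1:t0), and MACC2 accumulates
    // it twice.  A sketch (assuming a compiler that provides unsigned
    // __int128):
    //
    //   static void MACC(julong A, julong B,
    //                    julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 prod = (unsigned __int128)A * B;
    //     unsigned __int128 sum  = (unsigned __int128)t0 + (julong)prod;
    //     t0 = (julong)sum;                          // low word
    //     sum = (unsigned __int128)t1 + (julong)(prod >> 64) + (julong)(sum >> 64);
    //     t1 = (julong)sum;                          // high word plus carry
    //     t2 += (julong)(sum >> 64);                 // carry into the top word
    //   }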
7100     void step() {
7101       block_comment("step");
7102       // MACC(Ra, Rb, t0, t1, t2);
7103       // Ra = *++Pa;
7104       // Rb = *--Pb;
7105       umulh(Rhi_ab, Ra, Rb);
7106       mul(Rlo_ab, Ra, Rb);
7107       ldr(Ra, pre(Pa, wordSize));
7108       ldr(Rb, pre(Pb, -wordSize));
7109       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7110                                        // previous iteration.
7111       // MACC(Rm, Rn, t0, t1, t2);
7112       // Rm = *++Pm;
7113       // Rn = *--Pn;
7114       umulh(Rhi_mn, Rm, Rn);
7115       mul(Rlo_mn, Rm, Rn);
7116       ldr(Rm, pre(Pm, wordSize));
7117       ldr(Rn, pre(Pn, -wordSize));
7118       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7119     }
7120 
7121     void post1() {
7122       block_comment("post1");
7123 
7124       // MACC(Ra, Rb, t0, t1, t2);
7125       // Ra = *++Pa;
7126       // Rb = *--Pb;
7127       umulh(Rhi_ab, Ra, Rb);
7128       mul(Rlo_ab, Ra, Rb);
7129       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7130       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7131 
7132       // *Pm = Rm = t0 * inv;
7133       mul(Rm, t0, inv);
7134       str(Rm, Address(Pm));
7135 
7136       // MACC(Rm, Rn, t0, t1, t2);
7137       // t0 = t1; t1 = t2; t2 = 0;
7138       umulh(Rhi_mn, Rm, Rn);
7139 
7140 #ifndef PRODUCT
7141       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7142       {
7143         mul(Rlo_mn, Rm, Rn);
7144         add(Rlo_mn, t0, Rlo_mn);
7145         Label ok;
7146         cbz(Rlo_mn, ok); {
7147           stop("broken Montgomery multiply");
7148         } bind(ok);
7149       }
7150 #endif
7151       // We have very carefully set things up so that
7152       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7153       // the lower half of Rm * Rn because we know the result already:
7154       // it must be -t0.  t0 + (-t0) must generate a carry iff
7155       // t0 != 0.  So, rather than do a mul and an adds we just set
7156       // the carry flag iff t0 is nonzero.
7157       //
7158       // mul(Rlo_mn, Rm, Rn);
7159       // adds(zr, t0, Rlo_mn);
7160       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7161       adcs(t0, t1, Rhi_mn);
7162       adc(t1, t2, zr);
7163       mov(t2, zr);
7164     }
7165 
7166     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7167       block_comment("pre2");
7168       // Pa = Pa_base + i-len;
7169       // Pb = Pb_base + len;
7170       // Pm = Pm_base + i-len;
7171       // Pn = Pn_base + len;
7172 
7173       if (i.is_register()) {
7174         sub(Rj, i.as_register(), len);
7175       } else {
7176         mov(Rj, i.as_constant());
7177         sub(Rj, Rj, len);
7178       }
7179       // Rj == i-len
7180 
7181       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7182       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7183       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7184       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7185 
7186       // Ra = *++Pa;
7187       // Rb = *--Pb;
7188       // Rm = *++Pm;
7189       // Rn = *--Pn;
7190       ldr(Ra, pre(Pa, wordSize));
7191       ldr(Rb, pre(Pb, -wordSize));
7192       ldr(Rm, pre(Pm, wordSize));
7193       ldr(Rn, pre(Pn, -wordSize));
7194 
7195       mov(Rhi_mn, zr);
7196       mov(Rlo_mn, zr);
7197     }
7198 
7199     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7200       block_comment("post2");
7201       if (i.is_constant()) {
7202         mov(Rj, i.as_constant()-len.as_constant());
7203       } else {
7204         sub(Rj, i.as_register(), len);
7205       }
7206 
7207       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7208 
7209       // As soon as we know the least significant digit of our result,
7210       // store it.
7211       // Pm_base[i-len] = t0;
7212       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7213 
7214       // t0 = t1; t1 = t2; t2 = 0;
7215       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7216       adc(t1, t2, zr);
7217       mov(t2, zr);
7218     }
7219 
7220     // A carry in t0 after Montgomery multiplication means that we
7221     // should subtract multiples of n from our result in m.  We'll
7222     // keep doing that until there is no carry.
7223     void normalize(RegisterOrConstant len) {
7224       block_comment("normalize");
7225       // while (t0)
7226       //   t0 = sub(Pm_base, Pn_base, t0, len);
7227       Label loop, post, again;
7228       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7229       cbz(t0, post); {
7230         bind(again); {
7231           mov(i, zr);
7232           mov(cnt, len);
7233           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7234           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7235           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7236           align(16);
7237           bind(loop); {
7238             sbcs(Rm, Rm, Rn);
7239             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7240             add(i, i, 1);
7241             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7242             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7243             sub(cnt, cnt, 1);
7244           } cbnz(cnt, loop);
7245           sbc(t0, t0, zr);
7246         } cbnz(t0, again);
7247       } bind(post);
7248     }
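    //
    // The C reference code further down writes this step as a helper
    // sub(Pm_base, Pn_base, t0, len).  A sketch of what that helper computes
    // (assuming a compiler that provides unsigned __int128):
    //
    //   static julong sub(julong Pm_base[], julong Pn_base[],
    //                     julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned __int128 d =
    //         (unsigned __int128)Pm_base[i] - Pn_base[i] - borrow;
    //       Pm_base[i] = (julong)d;
    //       borrow = (julong)(d >> 64) & 1;   // 1 iff the subtraction borrowed
    //     }
    //     return t0 - borrow;                 // propagate the borrow out of m
    //   }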
7249 
7250     // Move memory at s to d, reversing words.
7251     //    Increments d to end of copied memory
7252     //    Destroys tmp1, tmp2
7253     //    Preserves len
7254     //    Leaves s pointing to the address which was in d at start
7255     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7256       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7257       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7258 
7259       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7260       mov(tmp1, len);
7261       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7262       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7263     }
7264     // where
7265     void reverse1(Register d, Register s, Register tmp) {
7266       ldr(tmp, pre(s, -wordSize));
7267       ror(tmp, tmp, 32);
7268       str(tmp, post(d, wordSize));
7269     }
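    //
    // In C, approximately (a sketch of the data movement only; the pointer
    // side effects are described above):
    //
    //   // Copy len 64-bit words from s to d in reverse word order, swapping
    //   // the two 32-bit halves of each word, which is equivalent to
    //   // reversing the underlying array of 32-bit ints.
    //   static void reverse_words(const julong *s, julong *d, int len) {
    //     for (int i = 0; i < len; i++) {
    //       julong w = s[len - 1 - i];
    //       d[i] = (w << 32) | (w >> 32);
    //     }
    //   }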
7270 
7271     void step_squaring() {
7272       // As step(), but accumulate the a*b product twice (MACC2).
7273       step();
7274       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7275     }
7276 
7277     void last_squaring(RegisterOrConstant i) {
7278       Label dont;
7279       // if ((i & 1) == 0) {
7280       tbnz(i.as_register(), 0, dont); {
7281         // MACC(Ra, Rb, t0, t1, t2);
7282         // Ra = *++Pa;
7283         // Rb = *--Pb;
7284         umulh(Rhi_ab, Ra, Rb);
7285         mul(Rlo_ab, Ra, Rb);
7286         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7287       } bind(dont);
7288     }
7289 
7290     void extra_step_squaring() {
7291       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7292 
7293       // MACC(Rm, Rn, t0, t1, t2);
7294       // Rm = *++Pm;
7295       // Rn = *--Pn;
7296       umulh(Rhi_mn, Rm, Rn);
7297       mul(Rlo_mn, Rm, Rn);
7298       ldr(Rm, pre(Pm, wordSize));
7299       ldr(Rn, pre(Pn, -wordSize));
7300     }
7301 
7302     void post1_squaring() {
7303       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7304 
7305       // *Pm = Rm = t0 * inv;
7306       mul(Rm, t0, inv);
7307       str(Rm, Address(Pm));
7308 
7309       // MACC(Rm, Rn, t0, t1, t2);
7310       // t0 = t1; t1 = t2; t2 = 0;
7311       umulh(Rhi_mn, Rm, Rn);
7312 
7313 #ifndef PRODUCT
7314       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7315       {
7316         mul(Rlo_mn, Rm, Rn);
7317         add(Rlo_mn, t0, Rlo_mn);
7318         Label ok;
7319         cbz(Rlo_mn, ok); {
7320           stop("broken Montgomery multiply");
7321         } bind(ok);
7322       }
7323 #endif
7324       // We have very carefully set things up so that
7325       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7326       // the lower half of Rm * Rn because we know the result already:
7327       // it must be -t0.  t0 + (-t0) must generate a carry iff
7328       // t0 != 0.  So, rather than do a mul and an adds we just set
7329       // the carry flag iff t0 is nonzero.
7330       //
7331       // mul(Rlo_mn, Rm, Rn);
7332       // adds(zr, t0, Rlo_mn);
7333       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7334       adcs(t0, t1, Rhi_mn);
7335       adc(t1, t2, zr);
7336       mov(t2, zr);
7337     }
7338 
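    // Accumulate the double word (Rhi:Rlo) into the triple-precision
    // accumulator (t2:t1:t0).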
7339     void acc(Register Rhi, Register Rlo,
7340              Register t0, Register t1, Register t2) {
7341       adds(t0, t0, Rlo);
7342       adcs(t1, t1, Rhi);
7343       adc(t2, t2, zr);
7344     }
7345 
7346   public:
7347     /**
7348      * Fast Montgomery multiplication.  The derivation of the
7349      * algorithm is in A Cryptographic Library for the Motorola
7350      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7351      *
7352      * Arguments:
7353      *
7354      * Inputs for multiplication:
7355      *   c_rarg0   - int array elements a
7356      *   c_rarg1   - int array elements b
7357      *   c_rarg2   - int array elements n (the modulus)
7358      *   c_rarg3   - int length
7359      *   c_rarg4   - int inv
7360      *   c_rarg5   - int array elements m (the result)
7361      *
7362      * Inputs for squaring:
7363      *   c_rarg0   - int array elements a
7364      *   c_rarg1   - int array elements n (the modulus)
7365      *   c_rarg2   - int length
7366      *   c_rarg3   - int inv
7367      *   c_rarg4   - int array elements m (the result)
7368      *
7369      */
7370     address generate_multiply() {
7371       Label argh, nothing;
7372       bind(argh);
7373       stop("MontgomeryMultiply total_allocation must be <= 8192");
7374 
7375       align(CodeEntryAlignment);
7376       address entry = pc();
7377 
7378       cbzw(Rlen, nothing);
7379 
7380       enter();
7381 
7382       // Make room.
7383       cmpw(Rlen, 512);
7384       br(Assembler::HI, argh);
7385       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7386       andr(sp, Ra, -2 * wordSize);
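      // The allocation above is Rlen * 4 * sizeof(jint) bytes: room for the
      // reversed copies of a, b and n plus the result m.  With Rlen capped
      // at 512 this is at most 8192 bytes, matching the check above.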
7387 
7388       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7389 
7390       {
7391         // Copy input args, reversing as we go.  We use Ra as a
7392         // temporary variable.
7393         reverse(Ra, Pa_base, Rlen, t0, t1);
7394         if (!_squaring)
7395           reverse(Ra, Pb_base, Rlen, t0, t1);
7396         reverse(Ra, Pn_base, Rlen, t0, t1);
7397       }
7398 
7399       // Push all callee-saved registers and also Pm_base which we'll need
7400       // at the end.
7401       save_regs();
7402 
7403 #ifndef PRODUCT
7404       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7405       {
7406         ldr(Rn, Address(Pn_base, 0));
7407         mul(Rlo_mn, Rn, inv);
7408         subs(zr, Rlo_mn, -1);
7409         Label ok;
7410         br(EQ, ok); {
7411           stop("broken inverse in Montgomery multiply");
7412         } bind(ok);
7413       }
7414 #endif
7415 
7416       mov(Pm_base, Ra);
7417 
7418       mov(t0, zr);
7419       mov(t1, zr);
7420       mov(t2, zr);
7421 
7422       block_comment("for (int i = 0; i < len; i++) {");
7423       mov(Ri, zr); {
7424         Label loop, end;
7425         cmpw(Ri, Rlen);
7426         br(Assembler::GE, end);
7427 
7428         bind(loop);
7429         pre1(Ri);
7430 
7431         block_comment("  for (j = i; j; j--) {"); {
7432           movw(Rj, Ri);
7433           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7434         } block_comment("  } // j");
7435 
7436         post1();
7437         addw(Ri, Ri, 1);
7438         cmpw(Ri, Rlen);
7439         br(Assembler::LT, loop);
7440         bind(end);
7441         block_comment("} // i");
7442       }
7443 
7444       block_comment("for (int i = len; i < 2*len; i++) {");
7445       mov(Ri, Rlen); {
7446         Label loop, end;
7447         cmpw(Ri, Rlen, Assembler::LSL, 1);
7448         br(Assembler::GE, end);
7449 
7450         bind(loop);
7451         pre2(Ri, Rlen);
7452 
7453         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7454           lslw(Rj, Rlen, 1);
7455           subw(Rj, Rj, Ri);
7456           subw(Rj, Rj, 1);
7457           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7458         } block_comment("  } // j");
7459 
7460         post2(Ri, Rlen);
7461         addw(Ri, Ri, 1);
7462         cmpw(Ri, Rlen, Assembler::LSL, 1);
7463         br(Assembler::LT, loop);
7464         bind(end);
7465       }
7466       block_comment("} // i");
7467 
7468       normalize(Rlen);
7469 
7470       mov(Ra, Pm_base);  // Save Pm_base in Ra
7471       restore_regs();  // Restore caller's Pm_base
7472 
7473       // Copy our result into caller's Pm_base
7474       reverse(Pm_base, Ra, Rlen, t0, t1);
7475 
7476       leave();
7477       bind(nothing);
7478       ret(lr);
7479 
7480       return entry;
7481     }
7482     // In C, approximately:
7483 
7484     // void
7485     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7486     //                     julong Pn_base[], julong Pm_base[],
7487     //                     julong inv, int len) {
7488     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7489     //   julong *Pa, *Pb, *Pn, *Pm;
7490     //   julong Ra, Rb, Rn, Rm;
7491 
7492     //   int i;
7493 
7494     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7495 
7496     //   for (i = 0; i < len; i++) {
7497     //     int j;
7498 
7499     //     Pa = Pa_base;
7500     //     Pb = Pb_base + i;
7501     //     Pm = Pm_base;
7502     //     Pn = Pn_base + i;
7503 
7504     //     Ra = *Pa;
7505     //     Rb = *Pb;
7506     //     Rm = *Pm;
7507     //     Rn = *Pn;
7508 
7509     //     int iters = i;
7510     //     for (j = 0; iters--; j++) {
7511     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7512     //       MACC(Ra, Rb, t0, t1, t2);
7513     //       Ra = *++Pa;
7514     //       Rb = *--Pb;
7515     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7516     //       MACC(Rm, Rn, t0, t1, t2);
7517     //       Rm = *++Pm;
7518     //       Rn = *--Pn;
7519     //     }
7520 
7521     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7522     //     MACC(Ra, Rb, t0, t1, t2);
7523     //     *Pm = Rm = t0 * inv;
7524     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7525     //     MACC(Rm, Rn, t0, t1, t2);
7526 
7527     //     assert(t0 == 0, "broken Montgomery multiply");
7528 
7529     //     t0 = t1; t1 = t2; t2 = 0;
7530     //   }
7531 
7532     //   for (i = len; i < 2*len; i++) {
7533     //     int j;
7534 
7535     //     Pa = Pa_base + i-len;
7536     //     Pb = Pb_base + len;
7537     //     Pm = Pm_base + i-len;
7538     //     Pn = Pn_base + len;
7539 
7540     //     Ra = *++Pa;
7541     //     Rb = *--Pb;
7542     //     Rm = *++Pm;
7543     //     Rn = *--Pn;
7544 
7545     //     int iters = len*2-i-1;
7546     //     for (j = i-len+1; iters--; j++) {
7547     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7548     //       MACC(Ra, Rb, t0, t1, t2);
7549     //       Ra = *++Pa;
7550     //       Rb = *--Pb;
7551     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7552     //       MACC(Rm, Rn, t0, t1, t2);
7553     //       Rm = *++Pm;
7554     //       Rn = *--Pn;
7555     //     }
7556 
7557     //     Pm_base[i-len] = t0;
7558     //     t0 = t1; t1 = t2; t2 = 0;
7559     //   }
7560 
7561     //   while (t0)
7562     //     t0 = sub(Pm_base, Pn_base, t0, len);
7563     // }
7564 
7565     /**
7566      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7567      * multiplies than Montgomery multiplication so it should be up to
7568      * 25% faster.  However, its loop control is more complex and it
7569      * may actually run slower on some machines.
7570      *
7571      * Arguments:
7572      *
7573      * Inputs:
7574      *   c_rarg0   - int array elements a
7575      *   c_rarg1   - int array elements n (the modulus)
7576      *   c_rarg2   - int length
7577      *   c_rarg3   - int inv
7578      *   c_rarg4   - int array elements m (the result)
7579      *
7580      */
7581     address generate_square() {
7582       Label argh;
7583       bind(argh);
7584       stop("MontgomeryMultiply total_allocation must be <= 8192");
7585 
7586       align(CodeEntryAlignment);
7587       address entry = pc();
7588 
7589       enter();
7590 
7591       // Make room.
7592       cmpw(Rlen, 512);
7593       br(Assembler::HI, argh);
7594       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7595       andr(sp, Ra, -2 * wordSize);
7596 
7597       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7598 
7599       {
7600         // Copy input args, reversing as we go.  We use Ra as a
7601         // temporary variable.
7602         reverse(Ra, Pa_base, Rlen, t0, t1);
7603         reverse(Ra, Pn_base, Rlen, t0, t1);
7604       }
7605 
7606       // Push all callee-saved registers and also Pm_base which we'll need
7607       // at the end.
7608       save_regs();
7609 
7610       mov(Pm_base, Ra);
7611 
7612       mov(t0, zr);
7613       mov(t1, zr);
7614       mov(t2, zr);
7615 
7616       block_comment("for (int i = 0; i < len; i++) {");
7617       mov(Ri, zr); {
7618         Label loop, end;
7619         bind(loop);
7620         cmp(Ri, Rlen);
7621         br(Assembler::GE, end);
7622 
7623         pre1(Ri);
7624 
7625         block_comment("for (j = (i+1)/2; j; j--) {"); {
7626           add(Rj, Ri, 1);
7627           lsr(Rj, Rj, 1);
7628           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7629         } block_comment("  } // j");
7630 
7631         last_squaring(Ri);
7632 
7633         block_comment("  for (j = i/2; j; j--) {"); {
7634           lsr(Rj, Ri, 1);
7635           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7636         } block_comment("  } // j");
7637 
7638         post1_squaring();
7639         add(Ri, Ri, 1);
7640         cmp(Ri, Rlen);
7641         br(Assembler::LT, loop);
7642 
7643         bind(end);
7644         block_comment("} // i");
7645       }
7646 
7647       block_comment("for (int i = len; i < 2*len; i++) {");
7648       mov(Ri, Rlen); {
7649         Label loop, end;
7650         bind(loop);
7651         cmp(Ri, Rlen, Assembler::LSL, 1);
7652         br(Assembler::GE, end);
7653 
7654         pre2(Ri, Rlen);
7655 
7656         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7657           lsl(Rj, Rlen, 1);
7658           sub(Rj, Rj, Ri);
7659           sub(Rj, Rj, 1);
7660           lsr(Rj, Rj, 1);
7661           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7662         } block_comment("  } // j");
7663 
7664         last_squaring(Ri);
7665 
7666         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7667           lsl(Rj, Rlen, 1);
7668           sub(Rj, Rj, Ri);
7669           lsr(Rj, Rj, 1);
7670           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7671         } block_comment("  } // j");
7672 
7673         post2(Ri, Rlen);
7674         add(Ri, Ri, 1);
7675         cmp(Ri, Rlen, Assembler::LSL, 1);
7677         br(Assembler::LT, loop);
7678         bind(end);
7679         block_comment("} // i");
7680       }
7681 
7682       normalize(Rlen);
7683 
7684       mov(Ra, Pm_base);  // Save Pm_base in Ra
7685       restore_regs();  // Restore caller's Pm_base
7686 
7687       // Copy our result into caller's Pm_base
7688       reverse(Pm_base, Ra, Rlen, t0, t1);
7689 
7690       leave();
7691       ret(lr);
7692 
7693       return entry;
7694     }
7695     // In C, approximately:
7696 
7697     // void
7698     // montgomery_square(julong Pa_base[], julong Pn_base[],
7699     //                   julong Pm_base[], julong inv, int len) {
7700     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7701     //   julong *Pa, *Pb, *Pn, *Pm;
7702     //   julong Ra, Rb, Rn, Rm;
7703 
7704     //   int i;
7705 
7706     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7707 
7708     //   for (i = 0; i < len; i++) {
7709     //     int j;
7710 
7711     //     Pa = Pa_base;
7712     //     Pb = Pa_base + i;
7713     //     Pm = Pm_base;
7714     //     Pn = Pn_base + i;
7715 
7716     //     Ra = *Pa;
7717     //     Rb = *Pb;
7718     //     Rm = *Pm;
7719     //     Rn = *Pn;
7720 
7721     //     int iters = (i+1)/2;
7722     //     for (j = 0; iters--; j++) {
7723     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7724     //       MACC2(Ra, Rb, t0, t1, t2);
7725     //       Ra = *++Pa;
7726     //       Rb = *--Pb;
7727     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7728     //       MACC(Rm, Rn, t0, t1, t2);
7729     //       Rm = *++Pm;
7730     //       Rn = *--Pn;
7731     //     }
7732     //     if ((i & 1) == 0) {
7733     //       assert(Ra == Pa_base[j], "must be");
7734     //       MACC(Ra, Ra, t0, t1, t2);
7735     //     }
7736     //     iters = i/2;
7737     //     assert(iters == i-j, "must be");
7738     //     for (; iters--; j++) {
7739     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7740     //       MACC(Rm, Rn, t0, t1, t2);
7741     //       Rm = *++Pm;
7742     //       Rn = *--Pn;
7743     //     }
7744 
7745     //     *Pm = Rm = t0 * inv;
7746     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7747     //     MACC(Rm, Rn, t0, t1, t2);
7748 
7749     //     assert(t0 == 0, "broken Montgomery multiply");
7750 
7751     //     t0 = t1; t1 = t2; t2 = 0;
7752     //   }
7753 
7754     //   for (i = len; i < 2*len; i++) {
7755     //     int start = i-len+1;
7756     //     int end = start + (len - start)/2;
7757     //     int j;
7758 
7759     //     Pa = Pa_base + i-len;
7760     //     Pb = Pa_base + len;
7761     //     Pm = Pm_base + i-len;
7762     //     Pn = Pn_base + len;
7763 
7764     //     Ra = *++Pa;
7765     //     Rb = *--Pb;
7766     //     Rm = *++Pm;
7767     //     Rn = *--Pn;
7768 
7769     //     int iters = (2*len-i-1)/2;
7770     //     assert(iters == end-start, "must be");
7771     //     for (j = start; iters--; j++) {
7772     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7773     //       MACC2(Ra, Rb, t0, t1, t2);
7774     //       Ra = *++Pa;
7775     //       Rb = *--Pb;
7776     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7777     //       MACC(Rm, Rn, t0, t1, t2);
7778     //       Rm = *++Pm;
7779     //       Rn = *--Pn;
7780     //     }
7781     //     if ((i & 1) == 0) {
7782     //       assert(Ra == Pa_base[j], "must be");
7783     //       MACC(Ra, Ra, t0, t1, t2);
7784     //     }
7785     //     iters =  (2*len-i)/2;
7786     //     assert(iters == len-j, "must be");
7787     //     for (; iters--; j++) {
7788     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7789     //       MACC(Rm, Rn, t0, t1, t2);
7790     //       Rm = *++Pm;
7791     //       Rn = *--Pn;
7792     //     }
7793     //     Pm_base[i-len] = t0;
7794     //     t0 = t1; t1 = t2; t2 = 0;
7795     //   }
7796 
7797     //   while (t0)
7798     //     t0 = sub(Pm_base, Pn_base, t0, len);
7799     // }
7800   };
7801 
7802 
7803   // Call here from the interpreter or compiled code either to load the
7804   // multiple return values of the inline type instance being returned
7805   // into registers, or to store the returned values into a newly
7806   // allocated inline type instance.
7807   address generate_return_value_stub(address destination, const char* name, bool has_res) {
7808     // We need to save all registers the calling convention may use so
7809     // that the runtime call can read or update those registers. This
7810     // needs to be in sync with SharedRuntime::java_return_convention().
7811     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7812     enum layout {
7813       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
7814       j_rarg6_off, j_rarg6_2,
7815       j_rarg5_off, j_rarg5_2,
7816       j_rarg4_off, j_rarg4_2,
7817       j_rarg3_off, j_rarg3_2,
7818       j_rarg2_off, j_rarg2_2,
7819       j_rarg1_off, j_rarg1_2,
7820       j_rarg0_off, j_rarg0_2,
7821 
7822       j_farg7_off, j_farg7_2,
7823       j_farg6_off, j_farg6_2,
7824       j_farg5_off, j_farg5_2,
7825       j_farg4_off, j_farg4_2,
7826       j_farg3_off, j_farg3_2,
7827       j_farg2_off, j_farg2_2,
7828       j_farg1_off, j_farg1_2,
7829       j_farg0_off, j_farg0_2,
7830 
7831       rfp_off, rfp_off2,
7832       return_off, return_off2,
7833 
7834       framesize // inclusive of return address
7835     };
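    // The layout above is counted in 32-bit slots: 16 register pairs plus
    // the rfp and return-address pairs give framesize == 36 slots, i.e.
    // 144 bytes or 18 words, as computed below.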
7836 
7837     CodeBuffer code(name, 512, 64);
7838     MacroAssembler* masm = new MacroAssembler(&code);
7839 
7840     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7841     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7842     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7843     int frame_size_in_words = frame_size_in_bytes / wordSize;
7844 
7845     OopMapSet* oop_maps = new OopMapSet();
7846     OopMap* map = new OopMap(frame_size_in_slots, 0);
7847 
7848     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
7849     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
7850     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7851     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7852     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7853     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7854     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7855     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7856 
7857     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7858     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7859     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7860     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7861     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7862     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7863     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7864     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7865 
7866     address start = __ pc();
7867 
7868     __ enter(); // Save FP and LR before call
7869 
7870     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
7871     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
7872     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
7873     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
7874 
7875     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
7876     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
7877     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
7878     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
7879 
7880     int frame_complete = __ offset();
7881 
7882     // Set up last_Java_sp and last_Java_fp
7883     address the_pc = __ pc();
7884     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
7885 
7886     // Call runtime
7887     __ mov(c_rarg1, r0);
7888     __ mov(c_rarg0, rthread);
7889 
7890     __ mov(rscratch1, destination);
7891     __ blr(rscratch1);
7892 
7893     oop_maps->add_gc_map(the_pc - start, map);
7894 
7895     __ reset_last_Java_frame(false);
7896 
7897     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
7898     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
7899     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
7900     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
7901 
7902     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
7903     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
7904     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
7905     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
7906 
7907     __ leave();
7908 
7909     // check for pending exceptions
7910     Label pending;
7911     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
7912     __ cbnz(rscratch1, pending);
7913 
7914     if (has_res) {
7915       __ get_vm_result(r0, rthread);
7916     }
7917 
7918     __ ret(lr);
7919 
7920     __ bind(pending);
7921     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7922 
7923     // -------------
7924     // make sure all code is generated
7925     masm->flush();
7926 
7927     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
7928     return stub->entry_point();
7929   }
7930 
7931   // Initialization
7932   void generate_initial() {
7933     // Generates the initial stubs and initializes the entry points
7934 
7935     // Entry points that exist on all platforms.  Note: this is code
7936     // that could be shared among different platforms - however the
7937     // benefit seems to be smaller than the disadvantage of having a
7938     // much more complicated generator structure. See also the comment
7939     // in stubRoutines.hpp.
7940 
7941     StubRoutines::_forward_exception_entry = generate_forward_exception();
7942 
7943     StubRoutines::_call_stub_entry =
7944       generate_call_stub(StubRoutines::_call_stub_return_address);
7945 
7946     // This entry is referenced by megamorphic calls.
7947     StubRoutines::_catch_exception_entry = generate_catch_exception();
7948 
7949     // Build this early so it's available for the interpreter.
7950     StubRoutines::_throw_StackOverflowError_entry =
7951       generate_throw_exception("StackOverflowError throw_exception",
7952                                CAST_FROM_FN_PTR(address,
7953                                                 SharedRuntime::throw_StackOverflowError));
7954     StubRoutines::_throw_delayed_StackOverflowError_entry =
7955       generate_throw_exception("delayed StackOverflowError throw_exception",
7956                                CAST_FROM_FN_PTR(address,
7957                                                 SharedRuntime::throw_delayed_StackOverflowError));
7958     if (UseCRC32Intrinsics) {
7959       // Set the table address before generating the stubs that use it.
7960       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7961       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7962     }
7963 
7964     if (UseCRC32CIntrinsics) {
7965       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7966     }
7967 
7968     // Disabled until JDK-8210858 is fixed
7969     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7970     //   StubRoutines::_dlog = generate_dlog();
7971     // }
7972 
7973     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7974       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7975     }
7976 
7977     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7978       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7979     }
7980 
7981     if (InlineTypeReturnedAsFields) {
7982       StubRoutines::_load_inline_type_fields_in_regs =
7983          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7984       StubRoutines::_store_inline_type_fields_to_buf =
7985          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7986     }
7987   }
7988 
7989   void generate_phase1() {
7990     // Continuation stubs:
7991     StubRoutines::_cont_thaw          = generate_cont_thaw();
7992     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7993     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7994 
7995     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
7996     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
7997   }
7998 
7999   void generate_all() {
8000     // support for verify_oop (must happen after universe_init)
8001     if (VerifyOops) {
8002       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8003     }
8004     StubRoutines::_throw_AbstractMethodError_entry =
8005       generate_throw_exception("AbstractMethodError throw_exception",
8006                                CAST_FROM_FN_PTR(address,
8007                                                 SharedRuntime::
8008                                                 throw_AbstractMethodError));
8009 
8010     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8011       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8012                                CAST_FROM_FN_PTR(address,
8013                                                 SharedRuntime::
8014                                                 throw_IncompatibleClassChangeError));
8015 
8016     StubRoutines::_throw_NullPointerException_at_call_entry =
8017       generate_throw_exception("NullPointerException at call throw_exception",
8018                                CAST_FROM_FN_PTR(address,
8019                                                 SharedRuntime::
8020                                                 throw_NullPointerException_at_call));
8021 
8022     if (UseSVE == 0) {
8023       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8024     }
8025 
8026     // arraycopy stubs used by compilers
8027     generate_arraycopy_stubs();
8028 
8029     // countPositives stub for large arrays.
8030     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8031 
8032     // array equals stub for large arrays.
8033     if (!UseSimpleArrayEquals) {
8034       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8035     }
8036 
8037     generate_compare_long_strings();
8038 
8039     generate_string_indexof_stubs();
8040 
8041     // byte_array_inflate stub for large arrays.
8042     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8043 
8044     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8045     if (bs_nm != NULL) {
8046       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
8047     }
8048 #ifdef COMPILER2
8049     if (UseMultiplyToLenIntrinsic) {
8050       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8051     }
8052 
8053     if (UseSquareToLenIntrinsic) {
8054       StubRoutines::_squareToLen = generate_squareToLen();
8055     }
8056 
8057     if (UseMulAddIntrinsic) {
8058       StubRoutines::_mulAdd = generate_mulAdd();
8059     }
8060 
8061     if (UseSIMDForBigIntegerShiftIntrinsics) {
8062       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8063       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8064     }
8065 
8066     if (UseMontgomeryMultiplyIntrinsic) {
8067       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8068       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8069       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8070     }
8071 
8072     if (UseMontgomerySquareIntrinsic) {
8073       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8074       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8075       // We use generate_multiply() rather than generate_square()
8076       // because it's faster for the sizes of modulus we care about.
8077       StubRoutines::_montgomerySquare = g.generate_multiply();
8078     }
8079 #endif // COMPILER2
8080 
8081     if (UseBASE64Intrinsics) {
8082         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8083         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8084     }
8085 
8086     // data cache line writeback
8087     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8088     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8089 
8090     if (UseAESIntrinsics) {
8091       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8092       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8093       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8094       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8095       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8096     }
8097     if (UseGHASHIntrinsics) {
8098       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8099       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8100     }
8101     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8102       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8103     }
8104 
8105     if (UseMD5Intrinsics) {
8106       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8107       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8108     }
8109     if (UseSHA1Intrinsics) {
8110       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8111       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8112     }
8113     if (UseSHA256Intrinsics) {
8114       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8115       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8116     }
8117     if (UseSHA512Intrinsics) {
8118       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8119       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8120     }
8121     if (UseSHA3Intrinsics) {
8122       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8123       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8124     }
8125 
8126     // generate Adler32 intrinsics code
8127     if (UseAdler32Intrinsics) {
8128       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8129     }
8130 
8131     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8132 
8133 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8134 
8135     generate_atomic_entry_points();
8136 
8137 #endif // LINUX
8138 
8139     StubRoutines::aarch64::set_completed();
8140   }
8141 
8142  public:
8143   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8144     if (phase == 0) {
8145       generate_initial();
8146     } else if (phase == 1) {
8147       generate_phase1(); // stubs that must be available for the interpreter
8148     } else {
8149       generate_all();
8150     }
8151   }
8152 }; // end class declaration
8153 
8154 #define UCM_TABLE_MAX_ENTRIES 8
8155 void StubGenerator_generate(CodeBuffer* code, int phase) {
8156   if (UnsafeCopyMemory::_table == NULL) {
8157     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8158   }
8159   StubGenerator g(code, phase);
8160 }
8161 
8162 
8163 #if defined (LINUX)
8164 
8165 // Define pointers to atomic stubs and initialize them to point to the
8166 // code in atomic_aarch64.S.
8167 
8168 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8169   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8170     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8171   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8172     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8173 
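// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//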
8174 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8175 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8176 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8177 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8178 DEFAULT_ATOMIC_OP(xchg, 4, )
8179 DEFAULT_ATOMIC_OP(xchg, 8, )
8180 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8181 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8182 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8183 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8184 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8185 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8186 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8187 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8188 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8189 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8190 
8191 #undef DEFAULT_ATOMIC_OP
8192 
8193 #endif // LINUX