1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #include "utilities/align.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
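     // e.g. "__ BIND(exit);" below expands to "__ bind(exit); BLOCK_COMMENT("exit" ":")",
     // so non-PRODUCT disassembly shows where each label is bound.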
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     __ lea(rscratch2, ExternalAddress((address)&counter));
  86     __ ldrw(rscratch1, Address(rscratch2));
  87     __ addw(rscratch1, rscratch1, 1);
  88     __ strw(rscratch1, Address(rscratch2));
  89   }
  90 #define inc_counter_np(counter) \
  91   BLOCK_COMMENT("inc_counter " #counter); \
  92   inc_counter_np_(counter);
  93 #endif
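       // A typical use, for illustration only (the counter shown is one of
       // the !PRODUCT copy counters declared by SharedRuntime):
       //
       //   inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);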
  94 
  95   // Call stubs are used to call Java from C
  96   //
  97   // Arguments:
  98   //    c_rarg0:   call wrapper address                   address
  99   //    c_rarg1:   result                                 address
 100   //    c_rarg2:   result type                            BasicType
 101   //    c_rarg3:   method                                 Method*
 102   //    c_rarg4:   (interpreter) entry point              address
 103   //    c_rarg5:   parameters                             intptr_t*
 104   //    c_rarg6:   parameter size (in words)              int
 105   //    c_rarg7:   thread                                 Thread*
 106   //
 107   // There is no return from the stub itself as any Java result
 108   // is written to result
 109   //
 110   // we save r30 (lr) as the return PC at the base of the frame and
 111   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 112   // into fp.
 113   //
 114   // we save r0-r7, which accounts for all the c arguments.
 115   //
 116   // TODO: strictly do we need to save them all? they are treated as
 117   // volatile by C so could we omit saving the ones we are going to
 118   // place in global registers (thread? method?) or those we only use
 119   // during setup of the Java call?
 120   //
 121   // we don't need to save r8 which C uses as an indirect result location
 122   // return register.
 123   //
 124   // we don't need to save r9-r15 which both C and Java treat as
 125   // volatile
 126   //
 127   // we don't need to save r16-18 because Java does not use them
 128   //
 129   // we save r19-r28 which Java uses as scratch registers and C
 130   // expects to be callee-save
 131   //
 132   // we save the bottom 64 bits of each value stored in v8-v15; it is
 133   // the responsibility of the caller to preserve larger values.
 134   //
 135   // so the stub frame looks like this when we enter Java code
 136   //
 137   //     [ return_from_Java     ] <--- sp
 138   //     [ argument word n      ]
 139   //      ...
 140   // -27 [ argument word 1      ]
 141   // -26 [ saved v15            ] <--- sp_after_call
 142   // -25 [ saved v14            ]
 143   // -24 [ saved v13            ]
 144   // -23 [ saved v12            ]
 145   // -22 [ saved v11            ]
 146   // -21 [ saved v10            ]
 147   // -20 [ saved v9             ]
 148   // -19 [ saved v8             ]
 149   // -18 [ saved r28            ]
 150   // -17 [ saved r27            ]
 151   // -16 [ saved r26            ]
 152   // -15 [ saved r25            ]
 153   // -14 [ saved r24            ]
 154   // -13 [ saved r23            ]
 155   // -12 [ saved r22            ]
 156   // -11 [ saved r21            ]
 157   // -10 [ saved r20            ]
 158   //  -9 [ saved r19            ]
 159   //  -8 [ call wrapper    (r0) ]
 160   //  -7 [ result          (r1) ]
 161   //  -6 [ result type     (r2) ]
 162   //  -5 [ method          (r3) ]
 163   //  -4 [ entry point     (r4) ]
 164   //  -3 [ parameters      (r5) ]
 165   //  -2 [ parameter size  (r6) ]
 166   //  -1 [ thread (r7)          ]
 167   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 168   //   1 [ saved lr       (r30) ]
 169 
 170   // Call stub stack layout word offsets from fp
 171   enum call_stub_layout {
 172     sp_after_call_off = -26,
 173 
 174     d15_off            = -26,
 175     d13_off            = -24,
 176     d11_off            = -22,
 177     d9_off             = -20,
 178 
 179     r28_off            = -18,
 180     r26_off            = -16,
 181     r24_off            = -14,
 182     r22_off            = -12,
 183     r20_off            = -10,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameter_size_off =  -2,
 190     thread_off         =  -1,
 191     fp_f               =   0,
 192     retaddr_off        =   1,
 193   };
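       // For illustration only: the C++ side reaches this stub through the
       // CallStub function pointer type declared in stubRoutines.hpp, whose
       // shape is roughly (hedged sketch; see that header for the real
       // declaration):
       //
       //   typedef void (*CallStub)(
       //     address   link,               // call wrapper      (c_rarg0)
       //     intptr_t* result,             // result            (c_rarg1)
       //     BasicType result_type,        // result type       (c_rarg2)
       //     Method*   method,             // method            (c_rarg3)
       //     address   entry_point,        // entry point       (c_rarg4)
       //     intptr_t* parameters,         // parameters        (c_rarg5)
       //     int       size_of_parameters, // parameter size    (c_rarg6)
       //     TRAPS);                       // current thread    (c_rarg7)
       //
       // Each enum value above is a word offset from rfp, e.g. the saved
       // thread pointer is addressed as Address(rfp, thread_off * wordSize),
       // one word below the saved fp.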
 194 
 195   address generate_call_stub(address& return_address) {
 196     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 197            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 198            "adjust this code");
 199 
 200     StubCodeMark mark(this, "StubRoutines", "call_stub");
 201     address start = __ pc();
 202 
 203     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 204 
 205     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 206     const Address result        (rfp, result_off         * wordSize);
 207     const Address result_type   (rfp, result_type_off    * wordSize);
 208     const Address method        (rfp, method_off         * wordSize);
 209     const Address entry_point   (rfp, entry_point_off    * wordSize);
 210     const Address parameter_size(rfp, parameter_size_off * wordSize);
 211 
 212     const Address thread        (rfp, thread_off         * wordSize);
 213 
 214     const Address d15_save      (rfp, d15_off * wordSize);
 215     const Address d13_save      (rfp, d13_off * wordSize);
 216     const Address d11_save      (rfp, d11_off * wordSize);
 217     const Address d9_save       (rfp, d9_off * wordSize);
 218 
 219     const Address r28_save      (rfp, r28_off * wordSize);
 220     const Address r26_save      (rfp, r26_off * wordSize);
 221     const Address r24_save      (rfp, r24_off * wordSize);
 222     const Address r22_save      (rfp, r22_off * wordSize);
 223     const Address r20_save      (rfp, r20_off * wordSize);
 224 
 225     // stub code
 226 
 227     address aarch64_entry = __ pc();
 228 
 229     // set up frame and move sp to end of save area
 230     __ enter();
 231     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 232 
 233     // save register parameters and Java scratch/global registers
 234     // n.b. we save thread even though it gets installed in
 235     // rthread because we want to sanity check rthread later
 236     __ str(c_rarg7,  thread);
 237     __ strw(c_rarg6, parameter_size);
 238     __ stp(c_rarg4, c_rarg5,  entry_point);
 239     __ stp(c_rarg2, c_rarg3,  result_type);
 240     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 241 
 242     __ stp(r20, r19,   r20_save);
 243     __ stp(r22, r21,   r22_save);
 244     __ stp(r24, r23,   r24_save);
 245     __ stp(r26, r25,   r26_save);
 246     __ stp(r28, r27,   r28_save);
 247 
 248     __ stpd(v9,  v8,   d9_save);
 249     __ stpd(v11, v10,  d11_save);
 250     __ stpd(v13, v12,  d13_save);
 251     __ stpd(v15, v14,  d15_save);
 252 
 253     // install Java thread in global register now we have saved
 254     // whatever value it held
 255     __ mov(rthread, c_rarg7);
 256     // And method
 257     __ mov(rmethod, c_rarg3);
 258 
 259     // set up the heapbase register
 260     __ reinit_heapbase();
 261 
 262 #ifdef ASSERT
 263     // make sure we have no pending exceptions
 264     {
 265       Label L;
 266       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 267       __ cmp(rscratch1, (u1)NULL_WORD);
 268       __ br(Assembler::EQ, L);
 269       __ stop("StubRoutines::call_stub: entered with pending exception");
 270       __ BIND(L);
 271     }
 272 #endif
 273     // pass parameters if any
 274     __ mov(esp, sp);
 275     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 276     __ andr(sp, rscratch1, -2 * wordSize);
 277 
 278     BLOCK_COMMENT("pass parameters if any");
 279     Label parameters_done;
 280     // parameter count is still in c_rarg6
 281     // and parameter pointer identifying param 1 is in c_rarg5
 282     __ cbzw(c_rarg6, parameters_done);
 283 
 284     address loop = __ pc();
 285     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 286     __ subsw(c_rarg6, c_rarg6, 1);
 287     __ push(rscratch1);
 288     __ br(Assembler::GT, loop);
 289 
 290     __ BIND(parameters_done);
 291 
 292     // call Java entry -- passing Method* and current sp
 293     //      rmethod: Method*
 294     //      r13: sender sp
 295     BLOCK_COMMENT("call Java function");
 296     __ mov(r13, sp);
 297     __ blr(c_rarg4);
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     // save current address for use by exception handling code
 309 
 310     return_address = __ pc();
 311 
 312     // store result depending on type (everything that is not
 313     // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 314     // n.b. this assumes Java returns an integral result in r0
 315     // and a floating result in j_farg0
 316     // All of j_rargN may be used to return inline type fields so be careful
 317     // not to clobber those.
 318     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 319     // assignment of Rresult below.
 320     Register Rresult = r14, Rresult_type = r15;
 321     __ ldr(Rresult, result);
 322     Label is_long, is_float, is_double, check_prim, exit;
 323     __ ldr(Rresult_type, result_type);
 324     __ cmp(Rresult_type, (u1)T_OBJECT);
 325     __ br(Assembler::EQ, check_prim);
 326     __ cmp(Rresult_type, (u1)T_PRIMITIVE_OBJECT);
 327     __ br(Assembler::EQ, check_prim);
 328     __ cmp(Rresult_type, (u1)T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(Rresult_type, (u1)T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(Rresult_type, (u1)T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(Rresult));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     // restore callee-save registers
 360     __ ldpd(v15, v14,  d15_save);
 361     __ ldpd(v13, v12,  d13_save);
 362     __ ldpd(v11, v10,  d11_save);
 363     __ ldpd(v9,  v8,   d9_save);
 364 
 365     __ ldp(r28, r27,   r28_save);
 366     __ ldp(r26, r25,   r26_save);
 367     __ ldp(r24, r23,   r24_save);
 368     __ ldp(r22, r21,   r22_save);
 369     __ ldp(r20, r19,   r20_save);
 370 
 371     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 372     __ ldrw(c_rarg2, result_type);
 373     __ ldr(c_rarg3,  method);
 374     __ ldp(c_rarg4, c_rarg5,  entry_point);
 375     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 376 
 377     // leave frame and return to caller
 378     __ leave();
 379     __ ret(lr);
 380 
 381     // handle return types different from T_INT
 382     __ BIND(check_prim);
 383     if (InlineTypeReturnedAsFields) {
 384       // Check for scalarized return value
 385       __ tbz(r0, 0, is_long);
 386       // Load pack handler address
 387       __ andr(rscratch1, r0, -2);
 388       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 389       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 390       __ blr(rscratch1);
 391       __ b(exit);
 392     }
 393 
 394     __ BIND(is_long);
 395     __ str(r0, Address(Rresult, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     __ BIND(is_float);
 399     __ strs(j_farg0, Address(Rresult, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     __ BIND(is_double);
 403     __ strd(j_farg0, Address(Rresult, 0));
 404     __ br(Assembler::AL, exit);
 405 
 406     return start;
 407   }
 408 
 409   // Return point for a Java call if there's an exception thrown in
 410   // Java code.  The exception is caught and transformed into a
 411   // pending exception stored in JavaThread that can be tested from
 412   // within the VM.
 413   //
 414   // Note: Usually the parameters are removed by the callee. In case
 415   // of an exception crossing an activation frame boundary, that is
 416   // not the case if the callee is compiled code => need to setup the
 417   // rsp.
 418   //
 419   // r0: exception oop
 420 
 421   address generate_catch_exception() {
 422     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 423     address start = __ pc();
 424 
 425     // same as in generate_call_stub():
 426     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 427     const Address thread        (rfp, thread_off         * wordSize);
 428 
 429 #ifdef ASSERT
 430     // verify that threads correspond
 431     {
 432       Label L, S;
 433       __ ldr(rscratch1, thread);
 434       __ cmp(rthread, rscratch1);
 435       __ br(Assembler::NE, S);
 436       __ get_thread(rscratch1);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::EQ, L);
 439       __ bind(S);
 440       __ stop("StubRoutines::catch_exception: threads must correspond");
 441       __ bind(L);
 442     }
 443 #endif
 444 
 445     // set pending exception
 446     __ verify_oop(r0);
 447 
 448     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 449     __ mov(rscratch1, (address)__FILE__);
 450     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 451     __ movw(rscratch1, (int)__LINE__);
 452     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 453 
 454     // complete return to VM
 455     assert(StubRoutines::_call_stub_return_address != NULL,
 456            "_call_stub_return_address must have been generated before");
 457     __ b(StubRoutines::_call_stub_return_address);
 458 
 459     return start;
 460   }
 461 
 462   // Continuation point for runtime calls returning with a pending
 463   // exception.  The pending exception check happened in the runtime
 464   // or native call stub.  The pending exception in Thread is
 465   // converted into a Java-level exception.
 466   //
 467   // Contract with Java-level exception handlers:
 468   // r0: exception
 469   // r3: throwing pc
 470   //
 471   // NOTE: At entry of this stub, exception-pc must be in LR !!
 472 
 473   // NOTE: this is always used as a jump target within generated code
 474   // so it just needs to be generated code with no prolog
 475 
 476   address generate_forward_exception() {
 477     StubCodeMark mark(this, "StubRoutines", "forward exception");
 478     address start = __ pc();
 479 
 480     // Upon entry, LR points to the return address returning into
 481     // Java (interpreted or compiled) code; i.e., the return address
 482     // becomes the throwing pc.
 483     //
 484     // Arguments pushed before the runtime call are still on the stack
 485     // but the exception handler will reset the stack pointer ->
 486     // ignore them.  A potential result in registers can be ignored as
 487     // well.
 488 
 489 #ifdef ASSERT
 490     // make sure this code is only executed if there is a pending exception
 491     {
 492       Label L;
 493       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 494       __ cbnz(rscratch1, L);
 495       __ stop("StubRoutines::forward exception: no pending exception (1)");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // compute exception handler into r19
 501 
 502     // call the VM to find the handler address associated with the
 503     // caller address. pass thread in r0 and caller pc (ret address)
 504     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 505     // the stack.
 506     __ mov(c_rarg1, lr);
 507     // lr will be trashed by the VM call so we move it to R19
 508     // (callee-saved) because we also need to pass it to the handler
 509     // returned by this call.
 510     __ mov(r19, lr);
 511     BLOCK_COMMENT("call exception_handler_for_return_address");
 512     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 513                          SharedRuntime::exception_handler_for_return_address),
 514                     rthread, c_rarg1);
 515     // Reinitialize the ptrue predicate register, in case the external runtime
 516     // call clobbers ptrue reg, as we may return to SVE compiled code.
 517     __ reinitialize_ptrue();
 518 
 519     // we should not really care that lr is no longer the callee
 520     // address. we saved the value the handler needs in r19 so we can
 521     // just copy it to r3. however, the C2 handler will push its own
 522     // frame and then call into the VM, and the VM code asserts that
 523     // the PC for the frame above the handler belongs to a compiled
 524     // Java method. So, we restore lr here to satisfy that assert.
 525     __ mov(lr, r19);
 526     // setup r0 & r3 & clear pending exception
 527     __ mov(r3, r19);
 528     __ mov(r19, r0);
 529     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 530     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 531 
 532 #ifdef ASSERT
 533     // make sure exception is set
 534     {
 535       Label L;
 536       __ cbnz(r0, L);
 537       __ stop("StubRoutines::forward exception: no pending exception (2)");
 538       __ bind(L);
 539     }
 540 #endif
 541 
 542     // continue at exception handler
 543     // r0: exception
 544     // r3: throwing pc
 545     // r19: exception handler
 546     __ verify_oop(r0);
 547     __ br(r19);
 548 
 549     return start;
 550   }
 551 
 552   // Non-destructive plausibility checks for oops
 553   //
 554   // Arguments:
 555   //    r0: oop to verify
 556   //    rscratch1: error message
 557   //
 558   // Stack after saving c_rarg3:
 559   //    [tos + 0]: saved c_rarg3
 560   //    [tos + 1]: saved c_rarg2
 561   //    [tos + 2]: saved lr
 562   //    [tos + 3]: saved rscratch2
 563   //    [tos + 4]: saved r0
 564   //    [tos + 5]: saved rscratch1
 565   address generate_verify_oop() {
 566 
 567     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 568     address start = __ pc();
 569 
 570     Label exit, error;
 571 
 572     // save c_rarg2 and c_rarg3
 573     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 574 
 575     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 577     __ ldr(c_rarg3, Address(c_rarg2));
 578     __ add(c_rarg3, c_rarg3, 1);
 579     __ str(c_rarg3, Address(c_rarg2));
 580 
 581     // object is in r0
 582     // make sure object is 'reasonable'
 583     __ cbz(r0, exit); // if obj is NULL it is OK
 584 
 585 #if INCLUDE_ZGC
 586     if (UseZGC) {
 587       // Check if mask is good.
 588       // verifies that ZAddressBadMask & r0 == 0
 589       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 590       __ andr(c_rarg2, r0, c_rarg3);
 591       __ cbnz(c_rarg2, error);
 592     }
 593 #endif
 594 
 595     // Check if the oop is in the right area of memory
 596     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 597     __ andr(c_rarg2, r0, c_rarg3);
 598     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 599 
 600     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 601     // instruction here because the flags register is live.
 602     __ eor(c_rarg2, c_rarg2, c_rarg3);
 603     __ cbnz(c_rarg2, error);
 604 
 605     // make sure klass is 'reasonable', i.e. not zero.
 606     __ load_klass(r0, r0);  // get klass
 607     __ cbz(r0, error);      // if klass is NULL it is broken
 608 
 609     // return if everything seems ok
 610     __ bind(exit);
 611 
 612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 613     __ ret(lr);
 614 
 615     // handle errors
 616     __ bind(error);
 617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 618 
 619     __ push(RegSet::range(r0, r29), sp);
 620     // debug(char* msg, int64_t pc, int64_t regs[])
 621     __ mov(c_rarg0, rscratch1);      // pass address of error message
 622     __ mov(c_rarg1, lr);             // pass return address
 623     __ mov(c_rarg2, sp);             // pass address of regs on stack
 624 #ifndef PRODUCT
 625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 626 #endif
 627     BLOCK_COMMENT("call MacroAssembler::debug");
 628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 629     __ blr(rscratch1);
 630     __ hlt(0);
 631 
 632     return start;
 633   }
 634 
 635   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 636 
 637   // Generate indices for iota vector.
 638   address generate_iota_indices(const char *stub_name) {
 639     __ align(CodeEntryAlignment);
 640     StubCodeMark mark(this, "StubRoutines", stub_name);
 641     address start = __ pc();
 642     __ emit_data64(0x0706050403020100, relocInfo::none);
 643     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 644     return start;
 645   }
 646 
 647   // The inner part of zero_words().  This is the bulk operation,
 648   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 649   // caller is responsible for zeroing the last few words.
 650   //
 651   // Inputs:
 652   // r10: the HeapWord-aligned base address of an array to zero.
 653   // r11: the count in HeapWords, r11 > 0.
 654   //
 655   // Returns r10 and r11, adjusted for the caller to clear.
 656   // r10: the base address of the tail of words left to clear.
 657   // r11: the number of words in the tail.
 658   //      r11 < MacroAssembler::zero_words_block_size.
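       // Worked example, for illustration (ignoring the DC ZVA fast path,
       // e.g. with UseBlockZeroing off, and assuming zero_words_block_size
       // is 8): entering with r11 == 19 zeroes 16 words in the unrolled stp
       // loop and returns with r10 advanced by 16 words and r11 == 3,
       // leaving the caller to clear the last 3 words.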
 659 
 660   address generate_zero_blocks() {
 661     Label done;
 662     Label base_aligned;
 663 
 664     Register base = r10, cnt = r11;
 665 
 666     __ align(CodeEntryAlignment);
 667     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 668     address start = __ pc();
 669 
 670     if (UseBlockZeroing) {
 671       int zva_length = VM_Version::zva_length();
 672 
 673       // Ensure ZVA length can be divided by 16. This is required by
 674       // the subsequent operations.
 675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 676 
 677       __ tbz(base, 3, base_aligned);
 678       __ str(zr, Address(__ post(base, 8)));
 679       __ sub(cnt, cnt, 1);
 680       __ bind(base_aligned);
 681 
 682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 683       // alignment.
 684       Label small;
 685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
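           // low_limit is in bytes while cnt is in words, hence the >> 3
           // (log2(wordSize)) in the comparison below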
 686       __ subs(rscratch1, cnt, low_limit >> 3);
 687       __ br(Assembler::LT, small);
 688       __ zero_dcache_blocks(base, cnt);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Number of stp instructions we'll unroll
 694       const int unroll =
 695         MacroAssembler::zero_words_block_size / 2;
 696       // Clear the remaining blocks.
 697       Label loop;
 698       __ subs(cnt, cnt, unroll * 2);
 699       __ br(Assembler::LT, done);
 700       __ bind(loop);
 701       for (int i = 0; i < unroll; i++)
 702         __ stp(zr, zr, __ post(base, 16));
 703       __ subs(cnt, cnt, unroll * 2);
 704       __ br(Assembler::GE, loop);
 705       __ bind(done);
 706       __ add(cnt, cnt, unroll * 2);
 707     }
 708 
 709     __ ret(lr);
 710 
 711     return start;
 712   }
 713 
 714 
 715   typedef enum {
 716     copy_forwards = 1,
 717     copy_backwards = -1
 718   } copy_direction;
 719 
 720   // Bulk copy of blocks of 8 words.
 721   //
 722   // count is a count of words.
 723   //
 724   // Precondition: count >= 8
 725   //
 726   // Postconditions:
 727   //
 728   // The least significant bit of count contains the remaining count
 729   // of words to copy.  The rest of count is trash.
 730   //
 731   // s and d are adjusted to point to the remaining words to copy
 732   //
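       // Worked example, for illustration: with count == 13 the code below
       // loads the first 8 words, takes the drain path (13 - 16 borrows),
       // stores them, then bit 2 of the updated count selects one extra
       // 4-word block; 12 words are copied here and the low bit of count
       // reports the 1 word left for the caller.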
 733   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 734                            copy_direction direction) {
 735     int unit = wordSize * direction;
 736     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 737 
 738     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 739       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 740     const Register stride = r13;
 741 
 742     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 743     assert_different_registers(s, d, count, rscratch1);
 744 
 745     Label again, drain;
 746     const char *stub_name;
 747     if (direction == copy_forwards)
 748       stub_name = "forward_copy_longs";
 749     else
 750       stub_name = "backward_copy_longs";
 751 
 752     __ align(CodeEntryAlignment);
 753 
 754     StubCodeMark mark(this, "StubRoutines", stub_name);
 755 
 756     __ bind(start);
 757 
 758     Label unaligned_copy_long;
 759     if (AvoidUnalignedAccesses) {
 760       __ tbnz(d, 3, unaligned_copy_long);
 761     }
 762 
 763     if (direction == copy_forwards) {
 764       __ sub(s, s, bias);
 765       __ sub(d, d, bias);
 766     }
 767 
 768 #ifdef ASSERT
 769     // Make sure we are never given < 8 words
 770     {
 771       Label L;
 772       __ cmp(count, (u1)8);
 773       __ br(Assembler::GE, L);
 774       __ stop("generate_copy_longs called with < 8 words");
 775       __ bind(L);
 776     }
 777 #endif
 778 
 779     // Fill 8 registers
 780     if (UseSIMDForMemoryOps) {
 781       __ ldpq(v0, v1, Address(s, 4 * unit));
 782       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 783     } else {
 784       __ ldp(t0, t1, Address(s, 2 * unit));
 785       __ ldp(t2, t3, Address(s, 4 * unit));
 786       __ ldp(t4, t5, Address(s, 6 * unit));
 787       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 788     }
 789 
 790     __ subs(count, count, 16);
 791     __ br(Assembler::LO, drain);
 792 
 793     int prefetch = PrefetchCopyIntervalInBytes;
 794     bool use_stride = false;
 795     if (direction == copy_backwards) {
 796        use_stride = prefetch > 256;
 797        prefetch = -prefetch;
 798        if (use_stride) __ mov(stride, prefetch);
 799     }
 800 
 801     __ bind(again);
 802 
 803     if (PrefetchCopyIntervalInBytes > 0)
 804       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 805 
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ ldpq(v0, v1, Address(s, 4 * unit));
 809       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 810       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ ldp(t0, t1, Address(s, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ ldp(t2, t3, Address(s, 4 * unit));
 816       __ stp(t4, t5, Address(d, 6 * unit));
 817       __ ldp(t4, t5, Address(s, 6 * unit));
 818       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 8);
 823     __ br(Assembler::HS, again);
 824 
 825     // Drain
 826     __ bind(drain);
 827     if (UseSIMDForMemoryOps) {
 828       __ stpq(v0, v1, Address(d, 4 * unit));
 829       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 830     } else {
 831       __ stp(t0, t1, Address(d, 2 * unit));
 832       __ stp(t2, t3, Address(d, 4 * unit));
 833       __ stp(t4, t5, Address(d, 6 * unit));
 834       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 835     }
 836 
 837     {
 838       Label L1, L2;
 839       __ tbz(count, exact_log2(4), L1);
 840       if (UseSIMDForMemoryOps) {
 841         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 842         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 843       } else {
 844         __ ldp(t0, t1, Address(s, 2 * unit));
 845         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 846         __ stp(t0, t1, Address(d, 2 * unit));
 847         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 848       }
 849       __ bind(L1);
 850 
 851       if (direction == copy_forwards) {
 852         __ add(s, s, bias);
 853         __ add(d, d, bias);
 854       }
 855 
 856       __ tbz(count, 1, L2);
 857       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 858       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 859       __ bind(L2);
 860     }
 861 
 862     __ ret(lr);
 863 
 864     if (AvoidUnalignedAccesses) {
 865       Label drain, again;
 866       // Register order for storing. Order is different for backward copy.
 867 
 868       __ bind(unaligned_copy_long);
 869 
 870       // source address is even (16-byte) aligned, target is odd (8-byte but not 16-byte) aligned
 871       //
 872       // when forward copying word pairs we read long pairs at offsets
 873       // {0, 2, 4, 6} (in long words). when backwards copying we read
 874       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 875       // address by -2 in the forwards case so we can compute the
 876       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 877       // or -1.
 878       //
 879       // when forward copying we need to store 1 word, 3 pairs and
 880       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 881       // zero offset we adjust the destination by -1, which means we
 882       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 883       //
 884       // When backwards copying we need to store 1 word, 3 pairs and
 885       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 886       // offsets {1, 3, 5, 7, 8} * unit.
 887 
 888       if (direction == copy_forwards) {
 889         __ sub(s, s, 16);
 890         __ sub(d, d, 8);
 891       }
 892 
 893       // Fill 8 registers
 894       //
 895       // for forwards copy s was offset by -16 from the original input
 896       // value of s so the register contents are at these offsets
 897       // relative to the 64 byte block addressed by that original input
 898       // and so on for each successive 64 byte block when s is updated
 899       //
 900       // t0 at offset 0,  t1 at offset 8
 901       // t2 at offset 16, t3 at offset 24
 902       // t4 at offset 32, t5 at offset 40
 903       // t6 at offset 48, t7 at offset 56
 904 
 905       // for backwards copy s was not offset so the register contents
 906       // are at these offsets into the preceding 64 byte block
 907       // relative to that original input and so on for each successive
 908       // preceding 64 byte block when s is updated. this explains the
 909       // slightly counter-intuitive looking pattern of register usage
 910       // in the stp instructions for backwards copy.
 911       //
 912       // t0 at offset -16, t1 at offset -8
 913       // t2 at offset -32, t3 at offset -24
 914       // t4 at offset -48, t5 at offset -40
 915       // t6 at offset -64, t7 at offset -56
 916 
 917       __ ldp(t0, t1, Address(s, 2 * unit));
 918       __ ldp(t2, t3, Address(s, 4 * unit));
 919       __ ldp(t4, t5, Address(s, 6 * unit));
 920       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 921 
 922       __ subs(count, count, 16);
 923       __ br(Assembler::LO, drain);
 924 
 925       int prefetch = PrefetchCopyIntervalInBytes;
 926       bool use_stride = false;
 927       if (direction == copy_backwards) {
 928          use_stride = prefetch > 256;
 929          prefetch = -prefetch;
 930          if (use_stride) __ mov(stride, prefetch);
 931       }
 932 
 933       __ bind(again);
 934 
 935       if (PrefetchCopyIntervalInBytes > 0)
 936         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 937 
 938       if (direction == copy_forwards) {
 939        // allowing for the offset of -8 the store instructions place
 940        // registers into the target 64 byte block at the following
 941        // offsets
 942        //
 943        // t0 at offset 0
 944        // t1 at offset 8,  t2 at offset 16
 945        // t3 at offset 24, t4 at offset 32
 946        // t5 at offset 40, t6 at offset 48
 947        // t7 at offset 56
 948 
 949         __ str(t0, Address(d, 1 * unit));
 950         __ stp(t1, t2, Address(d, 2 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t3, t4, Address(d, 4 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t5, t6, Address(d, 6 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t7, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       } else {
 959        // d was not offset when we started so the registers are
 960        // written into the 64 byte block preceding d with the following
 961        // offsets
 962        //
 963        // t1 at offset -8
 964        // t3 at offset -24, t0 at offset -16
 965        // t5 at offset -40, t2 at offset -32
 966        // t7 at offset -56, t4 at offset -48
 967        //                   t6 at offset -64
 968        //
 969        // note that this matches the offsets previously noted for the
 970        // loads
 971 
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ ldp(t0, t1, Address(s, 2 * unit));
 975         __ stp(t5, t2, Address(d, 5 * unit));
 976         __ ldp(t2, t3, Address(s, 4 * unit));
 977         __ stp(t7, t4, Address(d, 7 * unit));
 978         __ ldp(t4, t5, Address(s, 6 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 981       }
 982 
 983       __ subs(count, count, 8);
 984       __ br(Assembler::HS, again);
 985 
 986       // Drain
 987       //
 988       // this uses the same pattern of offsets and register arguments
 989       // as above
 990       __ bind(drain);
 991       if (direction == copy_forwards) {
 992         __ str(t0, Address(d, 1 * unit));
 993         __ stp(t1, t2, Address(d, 2 * unit));
 994         __ stp(t3, t4, Address(d, 4 * unit));
 995         __ stp(t5, t6, Address(d, 6 * unit));
 996         __ str(t7, Address(__ pre(d, 8 * unit)));
 997       } else {
 998         __ str(t1, Address(d, 1 * unit));
 999         __ stp(t3, t0, Address(d, 3 * unit));
1000         __ stp(t5, t2, Address(d, 5 * unit));
1001         __ stp(t7, t4, Address(d, 7 * unit));
1002         __ str(t6, Address(__ pre(d, 8 * unit)));
1003       }
1004       // now we need to copy any remaining part block which may
1005       // include a 4 word subblock and/or a 2 word subblock.
1006       // bits 2 and 1 in the count are the tell-tale for whether we
1007       // have each such subblock
1008       {
1009         Label L1, L2;
1010         __ tbz(count, exact_log2(4), L1);
1011        // this is the same as above but copying only 4 longs hence
1012        // with only one intervening stp between the str instructions
1013        // but note that the offsets and registers still follow the
1014        // same pattern
1015         __ ldp(t0, t1, Address(s, 2 * unit));
1016         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1017         if (direction == copy_forwards) {
1018           __ str(t0, Address(d, 1 * unit));
1019           __ stp(t1, t2, Address(d, 2 * unit));
1020           __ str(t3, Address(__ pre(d, 4 * unit)));
1021         } else {
1022           __ str(t1, Address(d, 1 * unit));
1023           __ stp(t3, t0, Address(d, 3 * unit));
1024           __ str(t2, Address(__ pre(d, 4 * unit)));
1025         }
1026         __ bind(L1);
1027 
1028         __ tbz(count, 1, L2);
1029        // this is the same as above but copying only 2 longs hence
1030        // there is no intervening stp between the str instructions
1031        // but note that the offset and register patterns are still
1032        // the same
1033         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1034         if (direction == copy_forwards) {
1035           __ str(t0, Address(d, 1 * unit));
1036           __ str(t1, Address(__ pre(d, 2 * unit)));
1037         } else {
1038           __ str(t1, Address(d, 1 * unit));
1039           __ str(t0, Address(__ pre(d, 2 * unit)));
1040         }
1041         __ bind(L2);
1042 
1043        // for forwards copy we need to re-adjust the offsets we
1044        // applied so that s and d follow the last words written
1045 
1046        if (direction == copy_forwards) {
1047          __ add(s, s, 16);
1048          __ add(d, d, 8);
1049        }
1050 
1051       }
1052 
1053       __ ret(lr);
1054       }
1055   }
1056 
1057   // Small copy: less than 16 bytes.
1058   //
1059   // NB: Ignores all of the bits of count which represent more than 15
1060   // bytes, so a caller doesn't have to mask them.
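       // Worked example, for illustration: a byte copy (step == 1) with
       // count == 11 (0b1011) moves 8 bytes for bit 3, skips the 4-byte
       // move because bit 2 is clear, then moves 2 bytes for bit 1 and the
       // final byte for bit 0: 8 + 2 + 1 == 11.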
1061 
1062   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1063     bool is_backwards = step < 0;
1064     size_t granularity = uabs(step);
1065     int direction = is_backwards ? -1 : 1;
1066     int unit = wordSize * direction;
1067 
1068     Label Lword, Lint, Lshort, Lbyte;
1069 
1070     assert(granularity
1071            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1072 
1073     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1074 
1075     // ??? I don't know if this bit-test-and-branch is the right thing
1076     // to do.  It does a lot of jumping, resulting in several
1077     // mispredicted branches.  It might make more sense to do this
1078     // with something like Duff's device with a single computed branch.
1079 
1080     __ tbz(count, 3 - exact_log2(granularity), Lword);
1081     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1082     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1083     __ bind(Lword);
1084 
1085     if (granularity <= sizeof (jint)) {
1086       __ tbz(count, 2 - exact_log2(granularity), Lint);
1087       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1088       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1089       __ bind(Lint);
1090     }
1091 
1092     if (granularity <= sizeof (jshort)) {
1093       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1094       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1095       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1096       __ bind(Lshort);
1097     }
1098 
1099     if (granularity <= sizeof (jbyte)) {
1100       __ tbz(count, 0, Lbyte);
1101       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1102       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1103       __ bind(Lbyte);
1104     }
1105   }
1106 
1107   Label copy_f, copy_b;
1108 
1109   // All-singing all-dancing memory copy.
1110   //
1111   // Copy count units of memory from s to d.  The size of a unit is
1112   // step, which can be positive or negative depending on the direction
1113   // of copy.  If is_aligned is false, we align the source address.
1114   //
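       // Worked example, for illustration: a 50-byte jbyte copy (granularity
       // 1) is under the 80/96-byte inline limit and lands in the 33..64-byte
       // case below, which loads the first 32 and last 32 bytes of the source
       // before storing them to the corresponding ends of the destination;
       // the overlap in the middle is benign because all loads happen before
       // any store.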
1115 
1116   void copy_memory(bool is_aligned, Register s, Register d,
1117                    Register count, Register tmp, int step) {
1118     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1119     bool is_backwards = step < 0;
1120     unsigned int granularity = uabs(step);
1121     const Register t0 = r3, t1 = r4;
1122 
1123     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
1124     // load all the data before writing anything
1125     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1126     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1127     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1128     const Register send = r17, dend = r16;
1129 
1130     if (PrefetchCopyIntervalInBytes > 0)
1131       __ prfm(Address(s, 0), PLDL1KEEP);
1132     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1133     __ br(Assembler::HI, copy_big);
1134 
1135     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1136     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1137 
1138     __ cmp(count, u1(16/granularity));
1139     __ br(Assembler::LS, copy16);
1140 
1141     __ cmp(count, u1(64/granularity));
1142     __ br(Assembler::HI, copy80);
1143 
1144     __ cmp(count, u1(32/granularity));
1145     __ br(Assembler::LS, copy32);
1146 
1147     // 33..64 bytes
1148     if (UseSIMDForMemoryOps) {
1149       __ ldpq(v0, v1, Address(s, 0));
1150       __ ldpq(v2, v3, Address(send, -32));
1151       __ stpq(v0, v1, Address(d, 0));
1152       __ stpq(v2, v3, Address(dend, -32));
1153     } else {
1154       __ ldp(t0, t1, Address(s, 0));
1155       __ ldp(t2, t3, Address(s, 16));
1156       __ ldp(t4, t5, Address(send, -32));
1157       __ ldp(t6, t7, Address(send, -16));
1158 
1159       __ stp(t0, t1, Address(d, 0));
1160       __ stp(t2, t3, Address(d, 16));
1161       __ stp(t4, t5, Address(dend, -32));
1162       __ stp(t6, t7, Address(dend, -16));
1163     }
1164     __ b(finish);
1165 
1166     // 17..32 bytes
1167     __ bind(copy32);
1168     __ ldp(t0, t1, Address(s, 0));
1169     __ ldp(t2, t3, Address(send, -16));
1170     __ stp(t0, t1, Address(d, 0));
1171     __ stp(t2, t3, Address(dend, -16));
1172     __ b(finish);
1173 
1174     // 65..80/96 bytes
1175     // (96 bytes if SIMD because we do 32 bytes per instruction)
1176     __ bind(copy80);
1177     if (UseSIMDForMemoryOps) {
1178       __ ldpq(v0, v1, Address(s, 0));
1179       __ ldpq(v2, v3, Address(s, 32));
1180       // Unaligned pointers can be an issue for copying.
1181       // The issue is more likely when the granularity of the data is
1182       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1183       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1184       // The biggest performance drop has been seen for the range 65-80 bytes.
1185       // For such cases using the pair of ldp/stp instead of the third pair of
1186       // ldpq/stpq fixes the performance issue.
1187       if (granularity < sizeof (jint)) {
1188         Label copy96;
1189         __ cmp(count, u1(80/granularity));
1190         __ br(Assembler::HI, copy96);
1191         __ ldp(t0, t1, Address(send, -16));
1192 
1193         __ stpq(v0, v1, Address(d, 0));
1194         __ stpq(v2, v3, Address(d, 32));
1195         __ stp(t0, t1, Address(dend, -16));
1196         __ b(finish);
1197 
1198         __ bind(copy96);
1199       }
1200       __ ldpq(v4, v5, Address(send, -32));
1201 
1202       __ stpq(v0, v1, Address(d, 0));
1203       __ stpq(v2, v3, Address(d, 32));
1204       __ stpq(v4, v5, Address(dend, -32));
1205     } else {
1206       __ ldp(t0, t1, Address(s, 0));
1207       __ ldp(t2, t3, Address(s, 16));
1208       __ ldp(t4, t5, Address(s, 32));
1209       __ ldp(t6, t7, Address(s, 48));
1210       __ ldp(t8, t9, Address(send, -16));
1211 
1212       __ stp(t0, t1, Address(d, 0));
1213       __ stp(t2, t3, Address(d, 16));
1214       __ stp(t4, t5, Address(d, 32));
1215       __ stp(t6, t7, Address(d, 48));
1216       __ stp(t8, t9, Address(dend, -16));
1217     }
1218     __ b(finish);
1219 
1220     // 0..16 bytes
1221     __ bind(copy16);
1222     __ cmp(count, u1(8/granularity));
1223     __ br(Assembler::LO, copy8);
1224 
1225     // 8..16 bytes
1226     __ ldr(t0, Address(s, 0));
1227     __ ldr(t1, Address(send, -8));
1228     __ str(t0, Address(d, 0));
1229     __ str(t1, Address(dend, -8));
1230     __ b(finish);
1231 
1232     if (granularity < 8) {
1233       // 4..7 bytes
1234       __ bind(copy8);
1235       __ tbz(count, 2 - exact_log2(granularity), copy4);
1236       __ ldrw(t0, Address(s, 0));
1237       __ ldrw(t1, Address(send, -4));
1238       __ strw(t0, Address(d, 0));
1239       __ strw(t1, Address(dend, -4));
1240       __ b(finish);
1241       if (granularity < 4) {
1242         // 0..3 bytes
1243         __ bind(copy4);
1244         __ cbz(count, finish); // get rid of 0 case
1245         if (granularity == 2) {
1246           __ ldrh(t0, Address(s, 0));
1247           __ strh(t0, Address(d, 0));
1248         } else { // granularity == 1
1249           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1250           // the first and last byte.
1251           // Handle the 3 byte case by loading and storing base + count/2
1252           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1253           // This does mean that in the 1 byte case we load/store the same
1254           // byte 3 times.
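               // Worked example, for illustration: count == 3 copies s[0],
               // s[2] and, via count >> 1, s[1]; count == 1 copies s[0] to
               // d[0] three times over.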
1255           __ lsr(count, count, 1);
1256           __ ldrb(t0, Address(s, 0));
1257           __ ldrb(t1, Address(send, -1));
1258           __ ldrb(t2, Address(s, count));
1259           __ strb(t0, Address(d, 0));
1260           __ strb(t1, Address(dend, -1));
1261           __ strb(t2, Address(d, count));
1262         }
1263         __ b(finish);
1264       }
1265     }
1266 
1267     __ bind(copy_big);
1268     if (is_backwards) {
1269       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1270       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1271     }
1272 
1273     // Now that we've got the small case out of the way, we can align the
1274     // source address on a 2-word boundary.
1275 
1276     Label aligned;
1277 
1278     if (is_aligned) {
1279       // We may have to adjust by 1 word to get s 2-word-aligned.
1280       __ tbz(s, exact_log2(wordSize), aligned);
1281       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1282       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1283       __ sub(count, count, wordSize/granularity);
1284     } else {
1285       if (is_backwards) {
1286         __ andr(rscratch2, s, 2 * wordSize - 1);
1287       } else {
1288         __ neg(rscratch2, s);
1289         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1290       }
1291       // rscratch2 is the byte adjustment needed to align s.
1292       __ cbz(rscratch2, aligned);
1293       int shift = exact_log2(granularity);
1294       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1295       __ sub(count, count, rscratch2);
1296 
1297 #if 0
1298       // ?? This code is only correct for a disjoint copy.  It may or
1299       // may not make sense to use it in that case.
1300 
1301       // Copy the first pair; s and d may not be aligned.
1302       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1303       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1304 
1305       // Align s and d, adjust count
1306       if (is_backwards) {
1307         __ sub(s, s, rscratch2);
1308         __ sub(d, d, rscratch2);
1309       } else {
1310         __ add(s, s, rscratch2);
1311         __ add(d, d, rscratch2);
1312       }
1313 #else
1314       copy_memory_small(s, d, rscratch2, rscratch1, step);
1315 #endif
1316     }
1317 
1318     __ bind(aligned);
1319 
1320     // s is now 2-word-aligned.
1321 
1322     // We have a count of units and some trailing bytes.  Adjust the
1323     // count and do a bulk copy of words.
1324     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1325     if (direction == copy_forwards)
1326       __ bl(copy_f);
1327     else
1328       __ bl(copy_b);
1329 
1330     // And the tail.
1331     copy_memory_small(s, d, count, tmp, step);
1332 
1333     if (granularity >= 8) __ bind(copy8);
1334     if (granularity >= 4) __ bind(copy4);
1335     __ bind(finish);
1336   }
1337 
1338 
1339   void clobber_registers() {
1340 #ifdef ASSERT
1341     RegSet clobbered
1342       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1343     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1344     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1345     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1346       __ mov(*it, rscratch1);
1347     }
1348 #endif
1349 
1350   }
1351 
1352   // Scan over array at a for count oops, verifying each one.
1353   // Preserves a and count, clobbers rscratch1 and rscratch2.
1354   void verify_oop_array (int size, Register a, Register count, Register temp) {
1355     Label loop, end;
1356     __ mov(rscratch1, a);
1357     __ mov(rscratch2, zr);
1358     __ bind(loop);
1359     __ cmp(rscratch2, count);
1360     __ br(Assembler::HS, end);
1361     if (size == wordSize) {
1362       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1363       __ verify_oop(temp);
1364     } else {
1365       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1366       __ decode_heap_oop(temp); // calls verify_oop
1367     }
1368     __ add(rscratch2, rscratch2, 1);
1369     __ b(loop);
1370     __ bind(end);
1371   }
1372 
1373   // Arguments:
1374   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1375   //             ignored
1376   //   is_oop  - true => oop array, so generate store check code
1377   //   name    - stub name string
1378   //
1379   // Inputs:
1380   //   c_rarg0   - source array address
1381   //   c_rarg1   - destination array address
1382   //   c_rarg2   - element count, treated as ssize_t, can be zero
1383   //
1384   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1385   // the hardware handle it.  The two dwords within qwords that span
1386   // cache line boundaries will still be loaded and stored atomically.
1387   //
1388   // Side Effects:
1389   //   disjoint_int_copy_entry is set to the no-overlap entry point
1390   //   used by generate_conjoint_int_oop_copy().
1391   //
1392   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1393                                   const char *name, bool dest_uninitialized = false) {
1394     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1395     RegSet saved_reg = RegSet::of(s, d, count);
1396     __ align(CodeEntryAlignment);
1397     StubCodeMark mark(this, "StubRoutines", name);
1398     address start = __ pc();
1399     __ enter();
1400 
1401     if (entry != NULL) {
1402       *entry = __ pc();
1403       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1404       BLOCK_COMMENT("Entry:");
1405     }
1406 
1407     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1408     if (dest_uninitialized) {
1409       decorators |= IS_DEST_UNINITIALIZED;
1410     }
1411     if (aligned) {
1412       decorators |= ARRAYCOPY_ALIGNED;
1413     }
1414 
1415     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1416     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1417 
1418     if (is_oop) {
1419       // save regs before copy_memory
1420       __ push(RegSet::of(d, count), sp);
1421     }
1422     {
1423       // UnsafeCopyMemory page error: continue after ucm
1424       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1425       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1426       copy_memory(aligned, s, d, count, rscratch1, size);
1427     }
1428 
1429     if (is_oop) {
1430       __ pop(RegSet::of(d, count), sp);
1431       if (VerifyOops)
1432         verify_oop_array(size, d, count, r16);
1433     }
1434 
1435     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1436 
1437     __ leave();
1438     __ mov(r0, zr); // return 0
1439     __ ret(lr);
1440     return start;
1441   }
1442 
1443   // Arguments:
1444   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1445   //             ignored
1446   //   is_oop  - true => oop array, so generate store check code
1447   //   name    - stub name string
1448   //
1449   // Inputs:
1450   //   c_rarg0   - source array address
1451   //   c_rarg1   - destination array address
1452   //   c_rarg2   - element count, treated as ssize_t, can be zero
1453   //
1454   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1455   // the hardware handle it.  The two dwords within qwords that span
1456   // cache line boundaries will still be loaded and stored atomically.
1457   //
1458   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1459                                  address *entry, const char *name,
1460                                  bool dest_uninitialized = false) {
1461     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1462     RegSet saved_regs = RegSet::of(s, d, count);
1463     StubCodeMark mark(this, "StubRoutines", name);
1464     address start = __ pc();
1465     __ enter();
1466 
1467     if (entry != NULL) {
1468       *entry = __ pc();
1469       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1470       BLOCK_COMMENT("Entry:");
1471     }
1472 
1473     // use fwd copy when (d-s) above_equal (count*size)
1474     __ sub(rscratch1, d, s);
1475     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1476     __ br(Assembler::HS, nooverlap_target);
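         // If the unsigned distance (d - s) is at least count*size bytes, the two
         // regions are disjoint and the forward copy is safe. When d < s the
         // subtraction wraps to a large unsigned value, so the forward path is
         // also taken; that is safe as well, because copying from low to high
         // addresses never overwrites source bytes that have not yet been read.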
1477 
1478     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1479     if (dest_uninitialized) {
1480       decorators |= IS_DEST_UNINITIALIZED;
1481     }
1482     if (aligned) {
1483       decorators |= ARRAYCOPY_ALIGNED;
1484     }
1485 
1486     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1487     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1488 
1489     if (is_oop) {
1490       // save regs before copy_memory
1491       __ push(RegSet::of(d, count), sp);
1492     }
1493     {
1494       // UnsafeCopyMemory page error: continue after ucm
1495       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1496       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
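           // The negative element size passed below makes copy_memory copy from
           // high to low addresses, which is what an overlapping (conjoint) copy
           // requires.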
1497       copy_memory(aligned, s, d, count, rscratch1, -size);
1498     }
1499     if (is_oop) {
1500       __ pop(RegSet::of(d, count), sp);
1501       if (VerifyOops)
1502         verify_oop_array(size, d, count, r16);
1503     }
1504     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1505     __ leave();
1506     __ mov(r0, zr); // return 0
1507     __ ret(lr);
1508     return start;
1509   }
1510 
1511   // Arguments:
1512   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1513   //             ignored
1514   //   name    - stub name string
1515   //
1516   // Inputs:
1517   //   c_rarg0   - source array address
1518   //   c_rarg1   - destination array address
1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
1520   //
1521   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1522   // we let the hardware handle it.  The one to eight bytes within words,
1523   // dwords or qwords that span cache line boundaries will still be loaded
1524   // and stored atomically.
1525   //
1533   // Side Effects:
1534   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1535   //   used by generate_conjoint_byte_copy().
1536   //
1537   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1538     const bool not_oop = false;
1539     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1540   }
1541 
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   name    - stub name string
1546   //
1547   // Inputs:
1548   //   c_rarg0   - source array address
1549   //   c_rarg1   - destination array address
1550   //   c_rarg2   - element count, treated as ssize_t, can be zero
1551   //
1552   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1553   // we let the hardware handle it.  The one to eight bytes within words,
1554   // dwords or qwords that span cache line boundaries will still be loaded
1555   // and stored atomically.
1556   //
1557   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1558                                       address* entry, const char *name) {
1559     const bool not_oop = false;
1560     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1561   }
1562 
1563   // Arguments:
1564   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1565   //             ignored
1566   //   name    - stub name string
1567   //
1568   // Inputs:
1569   //   c_rarg0   - source array address
1570   //   c_rarg1   - destination array address
1571   //   c_rarg2   - element count, treated as ssize_t, can be zero
1572   //
1573   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1574   // let the hardware handle it.  The two or four words within dwords
1575   // or qwords that span cache line boundaries will still be loaded
1576   // and stored atomically.
1577   //
1578   // Side Effects:
1579   //   disjoint_short_copy_entry is set to the no-overlap entry point
1580   //   used by generate_conjoint_short_copy().
1581   //
1582   address generate_disjoint_short_copy(bool aligned,
1583                                        address* entry, const char *name) {
1584     const bool not_oop = false;
1585     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1586   }
1587 
1588   // Arguments:
1589   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1590   //             ignored
1591   //   name    - stub name string
1592   //
1593   // Inputs:
1594   //   c_rarg0   - source array address
1595   //   c_rarg1   - destination array address
1596   //   c_rarg2   - element count, treated as ssize_t, can be zero
1597   //
1598   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1599   // let the hardware handle it.  The two or four words within dwords
1600   // or qwords that span cache line boundaries will still be loaded
1601   // and stored atomically.
1602   //
1603   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1604                                        address *entry, const char *name) {
1605     const bool not_oop = false;
1606     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as ssize_t, can be zero
1618   //
1619   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1620   // the hardware handle it.  The two dwords within qwords that span
1621   // cache line boundaries will still be loaded and stored atomically.
1622   //
1623   // Side Effects:
1624   //   disjoint_int_copy_entry is set to the no-overlap entry point
1625   //   used by generate_conjoint_int_oop_copy().
1626   //
1627   address generate_disjoint_int_copy(bool aligned, address *entry,
1628                                          const char *name, bool dest_uninitialized = false) {
1629     const bool not_oop = false;
1630     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1631   }
1632 
1633   // Arguments:
1634   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1635   //             ignored
1636   //   name    - stub name string
1637   //
1638   // Inputs:
1639   //   c_rarg0   - source array address
1640   //   c_rarg1   - destination array address
1641   //   c_rarg2   - element count, treated as ssize_t, can be zero
1642   //
1643   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1644   // the hardware handle it.  The two dwords within qwords that span
1645   // cache line boundaries will still be loaded and stored atomically.
1646   //
1647   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1648                                      address *entry, const char *name,
1649                                      bool dest_uninitialized = false) {
1650     const bool not_oop = false;
1651     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1652   }
1653 
1654 
1655   // Arguments:
1656   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1657   //             ignored
1658   //   name    - stub name string
1659   //
1660   // Inputs:
1661   //   c_rarg0   - source array address
1662   //   c_rarg1   - destination array address
1663   //   c_rarg2   - element count, treated as size_t, can be zero
1664   //
1665   // Side Effects:
1666   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1667   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1668   //
1669   address generate_disjoint_long_copy(bool aligned, address *entry,
1670                                           const char *name, bool dest_uninitialized = false) {
1671     const bool not_oop = false;
1672     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1673   }
1674 
1675   // Arguments:
1676   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1677   //             ignored
1678   //   name    - stub name string
1679   //
1680   // Inputs:
1681   //   c_rarg0   - source array address
1682   //   c_rarg1   - destination array address
1683   //   c_rarg2   - element count, treated as size_t, can be zero
1684   //
1685   address generate_conjoint_long_copy(bool aligned,
1686                                       address nooverlap_target, address *entry,
1687                                       const char *name, bool dest_uninitialized = false) {
1688     const bool not_oop = false;
1689     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1690   }
1691 
1692   // Arguments:
1693   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1694   //             ignored
1695   //   name    - stub name string
1696   //
1697   // Inputs:
1698   //   c_rarg0   - source array address
1699   //   c_rarg1   - destination array address
1700   //   c_rarg2   - element count, treated as size_t, can be zero
1701   //
1702   // Side Effects:
1703   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1704   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1705   //
1706   address generate_disjoint_oop_copy(bool aligned, address *entry,
1707                                      const char *name, bool dest_uninitialized) {
1708     const bool is_oop = true;
1709     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1710     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1711   }
1712 
1713   // Arguments:
1714   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1715   //             ignored
1716   //   name    - stub name string
1717   //
1718   // Inputs:
1719   //   c_rarg0   - source array address
1720   //   c_rarg1   - destination array address
1721   //   c_rarg2   - element count, treated as size_t, can be zero
1722   //
1723   address generate_conjoint_oop_copy(bool aligned,
1724                                      address nooverlap_target, address *entry,
1725                                      const char *name, bool dest_uninitialized) {
1726     const bool is_oop = true;
1727     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1728     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1729                                   name, dest_uninitialized);
1730   }
1731 
1732 
1733   // Helper for generating a dynamic type check.
1734   // Smashes rscratch1, rscratch2.
1735   void generate_type_check(Register sub_klass,
1736                            Register super_check_offset,
1737                            Register super_klass,
1738                            Label& L_success) {
1739     assert_different_registers(sub_klass, super_check_offset, super_klass);
1740 
1741     BLOCK_COMMENT("type_check:");
1742 
1743     Label L_miss;
1744 
1745     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1746                                      super_check_offset);
1747     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
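         // The fast path compares the word at sub_klass + super_check_offset with
         // super_klass; the slow path scans sub_klass's secondary supers array.
         // Both branch to L_success on a match.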
1748 
1749     // Fall through on failure!
1750     __ BIND(L_miss);
1751   }
1752 
1753   //
1754   //  Generate checkcasting array copy stub
1755   //
1756   //  Input:
1757   //    c_rarg0   - source array address
1758   //    c_rarg1   - destination array address
1759   //    c_rarg2   - element count, treated as ssize_t, can be zero
1760   //    c_rarg3   - size_t ckoff (super_check_offset)
1761   //    c_rarg4   - oop ckval (super_klass)
1762   //
1763   //  Output:
1764   //    r0 ==  0  -  success
1765   //    r0 == -1^K - failure, where K is partial transfer count
1766   //
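       //  For example, if a type check fails after 5 elements have been stored,
       //  K = 5 and r0 = ~5 = -6; the caller recovers K by inverting r0 again.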
1767   address generate_checkcast_copy(const char *name, address *entry,
1768                                   bool dest_uninitialized = false) {
1769 
1770     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1771 
1772     // Input registers (after setup_arg_regs)
1773     const Register from        = c_rarg0;   // source array address
1774     const Register to          = c_rarg1;   // destination array address
1775     const Register count       = c_rarg2;   // elements count
1776     const Register ckoff       = c_rarg3;   // super_check_offset
1777     const Register ckval       = c_rarg4;   // super_klass
1778 
1779     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1780     RegSet wb_post_saved_regs = RegSet::of(count);
1781 
1782     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1783     const Register copied_oop  = r22;       // actual oop copied
1784     const Register count_save  = r21;       // orig elements count
1785     const Register start_to    = r20;       // destination array start address
1786     const Register r19_klass   = r19;       // oop._klass
1787 
1788     //---------------------------------------------------------------
1789     // Assembler stub will be used for this call to arraycopy
1790     // if the two arrays are subtypes of Object[] but the
1791     // destination array type is not equal to or a supertype
1792     // of the source type.  Each element must be separately
1793     // checked.
1794 
1795     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1796                                copied_oop, r19_klass, count_save);
1797 
1798     __ align(CodeEntryAlignment);
1799     StubCodeMark mark(this, "StubRoutines", name);
1800     address start = __ pc();
1801 
1802     __ enter(); // required for proper stackwalking of RuntimeStub frame
1803 
1804 #ifdef ASSERT
1805     // caller guarantees that the arrays really are different
1806     // otherwise, we would have to make conjoint checks
1807     { Label L;
1808       array_overlap_test(L, TIMES_OOP);
1809       __ stop("checkcast_copy within a single array");
1810       __ bind(L);
1811     }
1812 #endif //ASSERT
1813 
1814     // Caller of this entry point must set up the argument registers.
1815     if (entry != NULL) {
1816       *entry = __ pc();
1817       BLOCK_COMMENT("Entry:");
1818     }
1819 
1820     // Empty array:  Nothing to do.
1821     __ cbz(count, L_done);
1822     __ push(RegSet::of(r19, r20, r21, r22), sp);
1823 
1824 #ifdef ASSERT
1825     BLOCK_COMMENT("assert consistent ckoff/ckval");
1826     // The ckoff and ckval must be mutually consistent,
1827     // even though caller generates both.
1828     { Label L;
1829       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1830       __ ldrw(start_to, Address(ckval, sco_offset));
1831       __ cmpw(ckoff, start_to);
1832       __ br(Assembler::EQ, L);
1833       __ stop("super_check_offset inconsistent");
1834       __ bind(L);
1835     }
1836 #endif //ASSERT
1837 
1838     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1839     bool is_oop = true;
1840     if (dest_uninitialized) {
1841       decorators |= IS_DEST_UNINITIALIZED;
1842     }
1843 
1844     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1845     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1846 
1847     // save the original count
1848     __ mov(count_save, count);
1849 
1850     // Copy from low to high addresses
1851     __ mov(start_to, to);              // Save destination array start address
1852     __ b(L_load_element);
1853 
1854     // ======== begin loop ========
1855     // (Loop is rotated; its entry is L_load_element.)
1856     // Loop control:
1857     //   for (; count != 0; count--) {
1858     //     copied_oop = load_heap_oop(from++);
1859     //     ... generate_type_check ...;
1860     //     store_heap_oop(to++, copied_oop);
1861     //   }
1862     __ align(OptoLoopAlignment);
1863 
1864     __ BIND(L_store_element);
1865     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1866     __ sub(count, count, 1);
1867     __ cbz(count, L_do_card_marks);
1868 
1869     // ======== loop entry is here ========
1870     __ BIND(L_load_element);
1871     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1872     __ cbz(copied_oop, L_store_element);
1873 
1874     __ load_klass(r19_klass, copied_oop);// query the object klass
1875     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1876     // ======== end loop ========
1877 
1878     // It was a real error; we must depend on the caller to finish the job.
1879     // Register count = remaining oops, count_orig = total oops.
1880     // Emit GC store barriers for the oops we have copied and report
1881     // their number to the caller.
1882 
1883     __ subs(count, count_save, count);     // K = partially copied oop count
1884     __ eon(count, count, zr);              // report (-1^K) to caller
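         // eon with zr is a bitwise NOT, so count now holds ~K == -1^K, matching
         // the failure return convention documented above.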
1885     __ br(Assembler::EQ, L_done_pop);
1886 
1887     __ BIND(L_do_card_marks);
1888     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1889 
1890     __ bind(L_done_pop);
1891     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1892     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1893 
1894     __ bind(L_done);
1895     __ mov(r0, count);
1896     __ leave();
1897     __ ret(lr);
1898 
1899     return start;
1900   }
1901 
1902   // Perform range checks on the proposed arraycopy.
1903   // Kills temp, but nothing else.
1904   // Also, clean the sign bits of src_pos and dst_pos.
1905   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1906                               Register src_pos, // source position (c_rarg1)
1907                               Register dst,     // destination array oop (c_rarg2)
1908                               Register dst_pos, // destination position (c_rarg3)
1909                               Register length,
1910                               Register temp,
1911                               Label& L_failed) {
1912     BLOCK_COMMENT("arraycopy_range_checks:");
1913 
1914     assert_different_registers(rscratch1, temp);
1915 
1916     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1917     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1918     __ addw(temp, length, src_pos);
1919     __ cmpw(temp, rscratch1);
1920     __ br(Assembler::HI, L_failed);
1921 
1922     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1923     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1924     __ addw(temp, length, dst_pos);
1925     __ cmpw(temp, rscratch1);
1926     __ br(Assembler::HI, L_failed);
1927 
1928     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1929     __ movw(src_pos, src_pos);
1930     __ movw(dst_pos, dst_pos);
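         // Writing a W register zero-extends into the upper half of the X register,
         // so these two movw instructions clear any stale high bits.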
1931 
1932     BLOCK_COMMENT("arraycopy_range_checks done");
1933   }
1934 
1935   // This stub is currently only called from a simple test routine.
1936   // It will be implemented properly once it is called from code that
1937   // actually performs an array copy.
1938   static void fake_arraycopy_stub(address src, address dst, int count) {
1939     assert(count == 0, "huh?");
1940   }
1941 
1942 
1943   //
1944   //  Generate 'unsafe' array copy stub
1945   //  Though just as safe as the other stubs, it takes an unscaled
1946   //  size_t argument instead of an element count.
1947   //
1948   //  Input:
1949   //    c_rarg0   - source array address
1950   //    c_rarg1   - destination array address
1951   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1952   //
1953   // Examines the alignment of the operands and dispatches
1954   // to a long, int, short, or byte copy loop.
1955   //
1956   address generate_unsafe_copy(const char *name,
1957                                address byte_copy_entry,
1958                                address short_copy_entry,
1959                                address int_copy_entry,
1960                                address long_copy_entry) {
1961     Label L_long_aligned, L_int_aligned, L_short_aligned;
1962     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1963 
1964     __ align(CodeEntryAlignment);
1965     StubCodeMark mark(this, "StubRoutines", name);
1966     address start = __ pc();
1967     __ enter(); // required for proper stackwalking of RuntimeStub frame
1968 
1969     // bump this on entry, not on exit:
1970     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1971 
1972     __ orr(rscratch1, s, d);
1973     __ orr(rscratch1, rscratch1, count);
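         // OR-ing source, destination and byte count means a low bit is clear only
         // if it is clear in all three, so the tests below check the alignment that
         // all of them share.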
1974 
1975     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1976     __ cbz(rscratch1, L_long_aligned);
1977     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1978     __ cbz(rscratch1, L_int_aligned);
1979     __ tbz(rscratch1, 0, L_short_aligned);
1980     __ b(RuntimeAddress(byte_copy_entry));
1981 
1982     __ BIND(L_short_aligned);
1983     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1984     __ b(RuntimeAddress(short_copy_entry));
1985     __ BIND(L_int_aligned);
1986     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1987     __ b(RuntimeAddress(int_copy_entry));
1988     __ BIND(L_long_aligned);
1989     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1990     __ b(RuntimeAddress(long_copy_entry));
1991 
1992     return start;
1993   }
1994 
1995   //
1996   //  Generate generic array copy stubs
1997   //
1998   //  Input:
1999   //    c_rarg0    -  src oop
2000   //    c_rarg1    -  src_pos (32-bits)
2001   //    c_rarg2    -  dst oop
2002   //    c_rarg3    -  dst_pos (32-bits)
2003   //    c_rarg4    -  element count (32-bits)
2004   //
2005   //  Output:
2006   //    r0 ==  0  -  success
2007   //    r0 == -1^K - failure, where K is partial transfer count
2008   //
2009   address generate_generic_copy(const char *name,
2010                                 address byte_copy_entry, address short_copy_entry,
2011                                 address int_copy_entry, address oop_copy_entry,
2012                                 address long_copy_entry, address checkcast_copy_entry) {
2013 
2014     Label L_failed, L_objArray;
2015     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2016 
2017     // Input registers
2018     const Register src        = c_rarg0;  // source array oop
2019     const Register src_pos    = c_rarg1;  // source position
2020     const Register dst        = c_rarg2;  // destination array oop
2021     const Register dst_pos    = c_rarg3;  // destination position
2022     const Register length     = c_rarg4;
2023 
2024 
2025     // Registers used as temps
2026     const Register dst_klass  = c_rarg5;
2027 
2028     __ align(CodeEntryAlignment);
2029 
2030     StubCodeMark mark(this, "StubRoutines", name);
2031 
2032     address start = __ pc();
2033 
2034     __ enter(); // required for proper stackwalking of RuntimeStub frame
2035 
2036     // bump this on entry, not on exit:
2037     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2038 
2039     //-----------------------------------------------------------------------
2040     // Assembler stub will be used for this call to arraycopy
2041     // if the following conditions are met:
2042     //
2043     // (1) src and dst must not be null.
2044     // (2) src_pos must not be negative.
2045     // (3) dst_pos must not be negative.
2046     // (4) length  must not be negative.
2047     // (5) src klass and dst klass should be the same and not NULL.
2048     // (6) src and dst should be arrays.
2049     // (7) src_pos + length must not exceed length of src.
2050     // (8) dst_pos + length must not exceed length of dst.
2051     //
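         // Any check that fails branches to L_failed, which returns -1; the caller
         // then falls back to the slower runtime arraycopy path.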
2052 
2053     //  if (src == NULL) return -1;
2054     __ cbz(src, L_failed);
2055 
2056     //  if (src_pos < 0) return -1;
2057     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2058 
2059     //  if (dst == NULL) return -1;
2060     __ cbz(dst, L_failed);
2061 
2062     //  if (dst_pos < 0) return -1;
2063     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2064 
2065     // registers used as temp
2066     const Register scratch_length    = r16; // elements count to copy
2067     const Register scratch_src_klass = r17; // array klass
2068     const Register lh                = r15; // layout helper
2069 
2070     //  if (length < 0) return -1;
2071     __ movw(scratch_length, length);        // length (element count, 32-bit value)
2072     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2073 
2074     __ load_klass(scratch_src_klass, src);
2075 #ifdef ASSERT
2076     //  assert(src->klass() != NULL);
2077     {
2078       BLOCK_COMMENT("assert klasses not null {");
2079       Label L1, L2;
2080       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2081       __ bind(L1);
2082       __ stop("broken null klass");
2083       __ bind(L2);
2084       __ load_klass(rscratch1, dst);
2085       __ cbz(rscratch1, L1);     // this would be broken also
2086       BLOCK_COMMENT("} assert klasses not null done");
2087     }
2088 #endif
2089 
2090     // Load layout helper (32-bits)
2091     //
2092     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2093     // 32        30    24            16              8     2                 0
2094     //
2095     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2096     //
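         // All array tags have the top bit set, so an array klass always has a
         // negative layout helper; the is_Array test further down simply checks
         // the sign bit.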
2097 
2098     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2099 
2100     // Handle objArrays completely differently...
2101     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2102     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2103     __ movw(rscratch1, objArray_lh);
2104     __ eorw(rscratch2, lh, rscratch1);
2105     __ cbzw(rscratch2, L_objArray);
2106 
2107     //  if (src->klass() != dst->klass()) return -1;
2108     __ load_klass(rscratch2, dst);
2109     __ eor(rscratch2, rscratch2, scratch_src_klass);
2110     __ cbnz(rscratch2, L_failed);
2111 
2112     // Check for flat inline type array -> return -1
2113     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2114     __ br(Assembler::NE, L_failed);
2115 
2116     // Check for null-free (non-flat) inline type array -> handle as object array
2117     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2118     __ br(Assembler::NE, L_failed);
2119 
2120     //  if (!src->is_Array()) return -1;
2121     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2122 
2123     // At this point, it is known to be a typeArray (array_tag 0x3).
2124 #ifdef ASSERT
2125     {
2126       BLOCK_COMMENT("assert primitive array {");
2127       Label L;
2128       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2129       __ cmpw(lh, rscratch2);
2130       __ br(Assembler::GE, L);
2131       __ stop("must be a primitive array");
2132       __ bind(L);
2133       BLOCK_COMMENT("} assert primitive array done");
2134     }
2135 #endif
2136 
2137     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2138                            rscratch2, L_failed);
2139 
2140     // TypeArrayKlass
2141     //
2142     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2143     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2144     //
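         // array_header_in_bytes() is not treated as a constant here; it is
         // extracted from the layout helper by the ubfx below.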
2145 
2146     const Register rscratch1_offset = rscratch1;    // array offset
2147     const Register r15_elsize = lh; // element size
2148 
2149     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2150            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2151     __ add(src, src, rscratch1_offset);           // src array offset
2152     __ add(dst, dst, rscratch1_offset);           // dst array offset
2153     BLOCK_COMMENT("choose copy loop based on element size");
2154 
2155     // next registers should be set before the jump to corresponding stub
2156     const Register from     = c_rarg0;  // source array address
2157     const Register to       = c_rarg1;  // destination array address
2158     const Register count    = c_rarg2;  // elements count
2159 
2160     // 'from', 'to' and 'count' must be set in this order, since they alias
2161     // 'src', 'src_pos' and 'dst' and would otherwise clobber values still in use.
2162 
2163     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2164 
2165     // The possible values of elsize are 0-3, i.e. exact_log2(element
2166     // size in bytes).  We do a simple bitwise binary search.
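         // elsize bits (1,0): 00 -> byte copy, 01 -> short copy, 10 -> int copy,
         // 11 -> long copy.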
2167   __ BIND(L_copy_bytes);
2168     __ tbnz(r15_elsize, 1, L_copy_ints);
2169     __ tbnz(r15_elsize, 0, L_copy_shorts);
2170     __ lea(from, Address(src, src_pos));// src_addr
2171     __ lea(to,   Address(dst, dst_pos));// dst_addr
2172     __ movw(count, scratch_length); // length
2173     __ b(RuntimeAddress(byte_copy_entry));
2174 
2175   __ BIND(L_copy_shorts);
2176     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2177     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2178     __ movw(count, scratch_length); // length
2179     __ b(RuntimeAddress(short_copy_entry));
2180 
2181   __ BIND(L_copy_ints);
2182     __ tbnz(r15_elsize, 0, L_copy_longs);
2183     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2184     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2185     __ movw(count, scratch_length); // length
2186     __ b(RuntimeAddress(int_copy_entry));
2187 
2188   __ BIND(L_copy_longs);
2189 #ifdef ASSERT
2190     {
2191       BLOCK_COMMENT("assert long copy {");
2192       Label L;
2193       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2194       __ cmpw(r15_elsize, LogBytesPerLong);
2195       __ br(Assembler::EQ, L);
2196       __ stop("must be long copy, but elsize is wrong");
2197       __ bind(L);
2198       BLOCK_COMMENT("} assert long copy done");
2199     }
2200 #endif
2201     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2202     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2203     __ movw(count, scratch_length); // length
2204     __ b(RuntimeAddress(long_copy_entry));
2205 
2206     // ObjArrayKlass
2207   __ BIND(L_objArray);
2208     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2209 
2210     Label L_plain_copy, L_checkcast_copy;
2211     //  test array classes for subtyping
2212     __ load_klass(r15, dst);
2213     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2214     __ br(Assembler::NE, L_checkcast_copy);
2215 
2216     // Identically typed arrays can be copied without element-wise checks.
2217     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2218                            rscratch2, L_failed);
2219 
2220     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2221     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2222     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2223     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2224     __ movw(count, scratch_length); // length
2225   __ BIND(L_plain_copy);
2226     __ b(RuntimeAddress(oop_copy_entry));
2227 
2228   __ BIND(L_checkcast_copy);
2229     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2230     {
2231       // Before looking at dst.length, make sure dst is also an objArray.
2232       __ ldrw(rscratch1, Address(r15, lh_offset));
2233       __ movw(rscratch2, objArray_lh);
2234       __ eorw(rscratch1, rscratch1, rscratch2);
2235       __ cbnzw(rscratch1, L_failed);
2236 
2237       // It is safe to examine both src.length and dst.length.
2238       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2239                              r15, L_failed);
2240 
2241       __ load_klass(dst_klass, dst); // reload
2242 
2243       // Marshal the base address arguments now, freeing registers.
2244       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2245       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2246       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2247       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2248       __ movw(count, length);           // length (reloaded)
2249       Register sco_temp = c_rarg3;      // this register is free now
2250       assert_different_registers(from, to, count, sco_temp,
2251                                  dst_klass, scratch_src_klass);
2252       // assert_clean_int(count, sco_temp);
2253 
2254       // Generate the type check.
2255       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2256       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2257 
2258       // Smashes rscratch1, rscratch2
2259       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2260 
2261       // Fetch destination element klass from the ObjArrayKlass header.
2262       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2263       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2264       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2265 
2266       // the checkcast_copy loop needs two extra arguments:
2267       assert(c_rarg3 == sco_temp, "#3 already in place");
2268       // Set up arguments for checkcast_copy_entry.
2269       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2270       __ b(RuntimeAddress(checkcast_copy_entry));
2271     }
2272 
2273   __ BIND(L_failed);
2274     __ mov(r0, -1);
2275     __ leave();   // required for proper stackwalking of RuntimeStub frame
2276     __ ret(lr);
2277 
2278     return start;
2279   }
2280 
2281   //
2282   // Generate stub for array fill. If "aligned" is true, the
2283   // "to" address is assumed to be heapword aligned.
2284   //
2285   // Arguments for generated stub:
2286   //   to:    c_rarg0
2287   //   value: c_rarg1
2288   //   count: c_rarg2 treated as signed
2289   //
2290   address generate_fill(BasicType t, bool aligned, const char *name) {
2291     __ align(CodeEntryAlignment);
2292     StubCodeMark mark(this, "StubRoutines", name);
2293     address start = __ pc();
2294 
2295     BLOCK_COMMENT("Entry:");
2296 
2297     const Register to        = c_rarg0;  // destination array address
2298     const Register value     = c_rarg1;  // value
2299     const Register count     = c_rarg2;  // elements count
2300 
2301     const Register bz_base = r10;        // base for block_zero routine
2302     const Register cnt_words = r11;      // temp register
2303 
2304     __ enter();
2305 
2306     Label L_fill_elements, L_exit1;
2307 
2308     int shift = -1;
2309     switch (t) {
2310       case T_BYTE:
2311         shift = 0;
2312         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2313         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2314         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2315         __ br(Assembler::LO, L_fill_elements);
2316         break;
2317       case T_SHORT:
2318         shift = 1;
2319         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2320         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2321         __ br(Assembler::LO, L_fill_elements);
2322         break;
2323       case T_INT:
2324         shift = 2;
2325         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2326         __ br(Assembler::LO, L_fill_elements);
2327         break;
2328       default: ShouldNotReachHere();
2329     }
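         // The bfi instructions above replicate the fill value across the low
         // 32 bits (a byte becomes four identical bytes, a short two identical
         // shorts); the remaining 32 -> 64 bit replication happens just before
         // the bulk fill below.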
2330 
2331     // Align the destination address on an 8-byte boundary.
2332     Label L_skip_align1, L_skip_align2, L_skip_align4;
2333     if (!aligned) {
2334       switch (t) {
2335         case T_BYTE:
2336           // A one-byte misalignment can happen only for byte arrays.
2337           __ tbz(to, 0, L_skip_align1);
2338           __ strb(value, Address(__ post(to, 1)));
2339           __ subw(count, count, 1);
2340           __ bind(L_skip_align1);
2341           // Fallthrough
2342         case T_SHORT:
2343           // A two-byte misalignment can happen only for byte and short (char) arrays.
2344           __ tbz(to, 1, L_skip_align2);
2345           __ strh(value, Address(__ post(to, 2)));
2346           __ subw(count, count, 2 >> shift);
2347           __ bind(L_skip_align2);
2348           // Fallthrough
2349         case T_INT:
2350           // Align to 8 bytes; we know we are 4-byte aligned at this point.
2351           __ tbz(to, 2, L_skip_align4);
2352           __ strw(value, Address(__ post(to, 4)));
2353           __ subw(count, count, 4 >> shift);
2354           __ bind(L_skip_align4);
2355           break;
2356         default: ShouldNotReachHere();
2357       }
2358     }
2359 
2360     //
2361     //  Fill large chunks
2362     //
2363     __ lsrw(cnt_words, count, 3 - shift); // number of words
2364     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2365     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
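         // cnt_words is the element count converted to 64-bit words (shift by
         // 3 - shift); count is left holding only the elements that do not fill
         // a complete word.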
2366     if (UseBlockZeroing) {
2367       Label non_block_zeroing, rest;
2368       // If the fill value is zero we can use the fast zero_words().
2369       __ cbnz(value, non_block_zeroing);
2370       __ mov(bz_base, to);
2371       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2372       __ zero_words(bz_base, cnt_words);
2373       __ b(rest);
2374       __ bind(non_block_zeroing);
2375       __ fill_words(to, cnt_words, value);
2376       __ bind(rest);
2377     } else {
2378       __ fill_words(to, cnt_words, value);
2379     }
2380 
2381     // Remaining count is less than 8 bytes. Fill it by a single store.
2382     // Note that the total length is no less than 8 bytes.
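         // Backing up 8 bytes from the end and issuing one 64-bit store of the
         // replicated value covers the tail; it rewrites a few already-filled
         // bytes, but with the same value.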
2383     if (t == T_BYTE || t == T_SHORT) {
2384       Label L_exit1;
2385       __ cbzw(count, L_exit1);
2386       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2387       __ str(value, Address(to, -8));    // overwrite some elements
2388       __ bind(L_exit1);
2389       __ leave();
2390       __ ret(lr);
2391     }
2392 
2393     // Handle fills of less than 8 bytes, element by element.
2394     Label L_fill_2, L_fill_4, L_exit2;
2395     __ bind(L_fill_elements);
2396     switch (t) {
2397       case T_BYTE:
2398         __ tbz(count, 0, L_fill_2);
2399         __ strb(value, Address(__ post(to, 1)));
2400         __ bind(L_fill_2);
2401         __ tbz(count, 1, L_fill_4);
2402         __ strh(value, Address(__ post(to, 2)));
2403         __ bind(L_fill_4);
2404         __ tbz(count, 2, L_exit2);
2405         __ strw(value, Address(to));
2406         break;
2407       case T_SHORT:
2408         __ tbz(count, 0, L_fill_4);
2409         __ strh(value, Address(__ post(to, 2)));
2410         __ bind(L_fill_4);
2411         __ tbz(count, 1, L_exit2);
2412         __ strw(value, Address(to));
2413         break;
2414       case T_INT:
2415         __ cbzw(count, L_exit2);
2416         __ strw(value, Address(to));
2417         break;
2418       default: ShouldNotReachHere();
2419     }
2420     __ bind(L_exit2);
2421     __ leave();
2422     __ ret(lr);
2423     return start;
2424   }
2425 
2426   address generate_data_cache_writeback() {
2427     const Register line        = c_rarg0;  // address of line to write back
2428 
2429     __ align(CodeEntryAlignment);
2430 
2431     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2432 
2433     address start = __ pc();
2434     __ enter();
2435     __ cache_wb(Address(line, 0));
2436     __ leave();
2437     __ ret(lr);
2438 
2439     return start;
2440   }
2441 
2442   address generate_data_cache_writeback_sync() {
2443     const Register is_pre     = c_rarg0;  // pre or post sync
2444 
2445     __ align(CodeEntryAlignment);
2446 
2447     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2448 
2449     // pre wbsync is a no-op
2450     // post wbsync translates to a memory barrier
2451 
2452     Label skip;
2453     address start = __ pc();
2454     __ enter();
2455     __ cbnz(is_pre, skip);
2456     __ cache_wbsync(false);
2457     __ bind(skip);
2458     __ leave();
2459     __ ret(lr);
2460 
2461     return start;
2462   }
2463 
2464   void generate_arraycopy_stubs() {
2465     address entry;
2466     address entry_jbyte_arraycopy;
2467     address entry_jshort_arraycopy;
2468     address entry_jint_arraycopy;
2469     address entry_oop_arraycopy;
2470     address entry_jlong_arraycopy;
2471     address entry_checkcast_arraycopy;
2472 
2473     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2474     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2475 
2476     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2477 
2478     //*** jbyte
2479     // Always need aligned and unaligned versions
2480     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2481                                                                                   "jbyte_disjoint_arraycopy");
2482     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2483                                                                                   &entry_jbyte_arraycopy,
2484                                                                                   "jbyte_arraycopy");
2485     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2486                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2487     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2488                                                                                   "arrayof_jbyte_arraycopy");
2489 
2490     //*** jshort
2491     // Always need aligned and unaligned versions
2492     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2493                                                                                     "jshort_disjoint_arraycopy");
2494     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2495                                                                                     &entry_jshort_arraycopy,
2496                                                                                     "jshort_arraycopy");
2497     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2498                                                                                     "arrayof_jshort_disjoint_arraycopy");
2499     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2500                                                                                     "arrayof_jshort_arraycopy");
2501 
2502     //*** jint
2503     // Aligned versions
2504     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2505                                                                                 "arrayof_jint_disjoint_arraycopy");
2506     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2507                                                                                 "arrayof_jint_arraycopy");
2508     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2509     // entry_jint_arraycopy always points to the unaligned version
2510     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2511                                                                                 "jint_disjoint_arraycopy");
2512     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2513                                                                                 &entry_jint_arraycopy,
2514                                                                                 "jint_arraycopy");
2515 
2516     //*** jlong
2517     // It is always aligned
2518     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2519                                                                                   "arrayof_jlong_disjoint_arraycopy");
2520     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2521                                                                                   "arrayof_jlong_arraycopy");
2522     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2523     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2524 
2525     //*** oops
2526     {
2527       // With compressed oops we need unaligned versions; notice that
2528       // we overwrite entry_oop_arraycopy.
2529       bool aligned = !UseCompressedOops;
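           // (With compressed oops the 4-byte elements, and possibly the array
           // base itself, are only guaranteed 4-byte alignment, so even the
           // 'arrayof' entries cannot claim 8-byte alignment.)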
2530 
2531       StubRoutines::_arrayof_oop_disjoint_arraycopy
2532         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2533                                      /*dest_uninitialized*/false);
2534       StubRoutines::_arrayof_oop_arraycopy
2535         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2536                                      /*dest_uninitialized*/false);
2537       // Aligned versions without pre-barriers
2538       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2539         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2540                                      /*dest_uninitialized*/true);
2541       StubRoutines::_arrayof_oop_arraycopy_uninit
2542         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2543                                      /*dest_uninitialized*/true);
2544     }
2545 
2546     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2547     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2548     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2549     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2550 
2551     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2552     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2553                                                                         /*dest_uninitialized*/true);
2554 
2555     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2556                                                               entry_jbyte_arraycopy,
2557                                                               entry_jshort_arraycopy,
2558                                                               entry_jint_arraycopy,
2559                                                               entry_jlong_arraycopy);
2560 
2561     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2562                                                                entry_jbyte_arraycopy,
2563                                                                entry_jshort_arraycopy,
2564                                                                entry_jint_arraycopy,
2565                                                                entry_oop_arraycopy,
2566                                                                entry_jlong_arraycopy,
2567                                                                entry_checkcast_arraycopy);
2568 
2569     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2570     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2571     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2572     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2573     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2574     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2575   }
2576 
2577   void generate_math_stubs() { Unimplemented(); }
2578 
2579   // Arguments:
2580   //
2581   // Inputs:
2582   //   c_rarg0   - source byte array address
2583   //   c_rarg1   - destination byte array address
2584   //   c_rarg2   - K (key) in little endian int array
2585   //
2586   address generate_aescrypt_encryptBlock() {
2587     __ align(CodeEntryAlignment);
2588     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2589 
2590     const Register from        = c_rarg0;  // source array address
2591     const Register to          = c_rarg1;  // destination array address
2592     const Register key         = c_rarg2;  // key array address
2593     const Register keylen      = rscratch1;
2594 
2595     address start = __ pc();
2596     __ enter();
2597 
2598     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2599 
2600     __ aesenc_loadkeys(key, keylen);
2601     __ aesecb_encrypt(from, to, keylen);
2602 
2603     __ mov(r0, 0);
2604 
2605     __ leave();
2606     __ ret(lr);
2607 
2608     return start;
2609   }
2610 
2611   // Arguments:
2612   //
2613   // Inputs:
2614   //   c_rarg0   - source byte array address
2615   //   c_rarg1   - destination byte array address
2616   //   c_rarg2   - K (key) in little endian int array
2617   //
2618   address generate_aescrypt_decryptBlock() {
2619     assert(UseAES, "need AES cryptographic extension support");
2620     __ align(CodeEntryAlignment);
2621     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2622     Label L_doLast;
2623 
2624     const Register from        = c_rarg0;  // source array address
2625     const Register to          = c_rarg1;  // destination array address
2626     const Register key         = c_rarg2;  // key array address
2627     const Register keylen      = rscratch1;
2628 
2629     address start = __ pc();
2630     __ enter(); // required for proper stackwalking of RuntimeStub frame
2631 
2632     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2633 
2634     __ aesecb_decrypt(from, to, key, keylen);
2635 
2636     __ mov(r0, 0);
2637 
2638     __ leave();
2639     __ ret(lr);
2640 
2641     return start;
2642   }
2643 
2644   // Arguments:
2645   //
2646   // Inputs:
2647   //   c_rarg0   - source byte array address
2648   //   c_rarg1   - destination byte array address
2649   //   c_rarg2   - K (key) in little endian int array
2650   //   c_rarg3   - r vector byte array address
2651   //   c_rarg4   - input length
2652   //
2653   // Output:
2654   //   x0        - input length
2655   //
2656   address generate_cipherBlockChaining_encryptAESCrypt() {
2657     assert(UseAES, "need AES cryptographic extension support");
2658     __ align(CodeEntryAlignment);
2659     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2660 
2661     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2662 
2663     const Register from        = c_rarg0;  // source array address
2664     const Register to          = c_rarg1;  // destination array address
2665     const Register key         = c_rarg2;  // key array address
2666     const Register rvec        = c_rarg3;  // r vector byte array address; holds the initialization
2667                                            // vector on entry and the last encrypted block on exit
2668     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2669     const Register keylen      = rscratch1;
2670 
2671     address start = __ pc();
2672 
2673       __ enter();
2674 
2675       __ movw(rscratch2, len_reg);
2676 
2677       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
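           // keylen is the length of the expanded key int[]: 44, 52 or 60 words
           // for AES-128, AES-192 and AES-256 respectively, which is what the
           // comparison against 52 below distinguishes.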
2678 
2679       __ ld1(v0, __ T16B, rvec);
2680 
2681       __ cmpw(keylen, 52);
2682       __ br(Assembler::CC, L_loadkeys_44);
2683       __ br(Assembler::EQ, L_loadkeys_52);
2684 
2685       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2686       __ rev32(v17, __ T16B, v17);
2687       __ rev32(v18, __ T16B, v18);
2688     __ BIND(L_loadkeys_52);
2689       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2690       __ rev32(v19, __ T16B, v19);
2691       __ rev32(v20, __ T16B, v20);
2692     __ BIND(L_loadkeys_44);
2693       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2694       __ rev32(v21, __ T16B, v21);
2695       __ rev32(v22, __ T16B, v22);
2696       __ rev32(v23, __ T16B, v23);
2697       __ rev32(v24, __ T16B, v24);
2698       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2699       __ rev32(v25, __ T16B, v25);
2700       __ rev32(v26, __ T16B, v26);
2701       __ rev32(v27, __ T16B, v27);
2702       __ rev32(v28, __ T16B, v28);
2703       __ ld1(v29, v30, v31, __ T16B, key);
2704       __ rev32(v29, __ T16B, v29);
2705       __ rev32(v30, __ T16B, v30);
2706       __ rev32(v31, __ T16B, v31);
2707 
2708     __ BIND(L_aes_loop);
2709       __ ld1(v1, __ T16B, __ post(from, 16));
2710       __ eor(v0, __ T16B, v0, v1);
2711 
2712       __ br(Assembler::CC, L_rounds_44);
2713       __ br(Assembler::EQ, L_rounds_52);
2714 
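           // Round structure: AES-256 executes every aese/aesmc pair (v17..v29),
           // AES-192 enters at L_rounds_52 (v19..v29) and AES-128 at L_rounds_44
           // (v21..v29); the last round (v30) omits aesmc, and the final round key
           // in v31 is applied with a plain eor.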
2715       __ aese(v0, v17); __ aesmc(v0, v0);
2716       __ aese(v0, v18); __ aesmc(v0, v0);
2717     __ BIND(L_rounds_52);
2718       __ aese(v0, v19); __ aesmc(v0, v0);
2719       __ aese(v0, v20); __ aesmc(v0, v0);
2720     __ BIND(L_rounds_44);
2721       __ aese(v0, v21); __ aesmc(v0, v0);
2722       __ aese(v0, v22); __ aesmc(v0, v0);
2723       __ aese(v0, v23); __ aesmc(v0, v0);
2724       __ aese(v0, v24); __ aesmc(v0, v0);
2725       __ aese(v0, v25); __ aesmc(v0, v0);
2726       __ aese(v0, v26); __ aesmc(v0, v0);
2727       __ aese(v0, v27); __ aesmc(v0, v0);
2728       __ aese(v0, v28); __ aesmc(v0, v0);
2729       __ aese(v0, v29); __ aesmc(v0, v0);
2730       __ aese(v0, v30);
2731       __ eor(v0, __ T16B, v0, v31);
2732 
2733       __ st1(v0, __ T16B, __ post(to, 16));
2734 
2735       __ subw(len_reg, len_reg, 16);
2736       __ cbnzw(len_reg, L_aes_loop);
2737 
2738       __ st1(v0, __ T16B, rvec);
2739 
2740       __ mov(r0, rscratch2);
2741 
2742       __ leave();
2743       __ ret(lr);
2744 
2745       return start;
2746   }
2747 
2748   // Arguments:
2749   //
2750   // Inputs:
2751   //   c_rarg0   - source byte array address
2752   //   c_rarg1   - destination byte array address
2753   //   c_rarg2   - K (key) in little endian int array
2754   //   c_rarg3   - r vector byte array address
2755   //   c_rarg4   - input length
2756   //
2757   // Output:
2758   //   r0        - input length
2759   //
2760   address generate_cipherBlockChaining_decryptAESCrypt() {
2761     assert(UseAES, "need AES cryptographic extension support");
2762     __ align(CodeEntryAlignment);
2763     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2764 
2765     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2766 
2767     const Register from        = c_rarg0;  // source array address
2768     const Register to          = c_rarg1;  // destination array address
2769     const Register key         = c_rarg2;  // key array address
2770     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2771                                            // and left with the results of the last encryption block
2772     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2773     const Register keylen      = rscratch1;
2774 
2775     address start = __ pc();
2776 
2777       __ enter();
2778 
2779       __ movw(rscratch2, len_reg);
2780 
2781       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2782 
2783       __ ld1(v2, __ T16B, rvec);
2784 
2785       __ ld1(v31, __ T16B, __ post(key, 16));
2786       __ rev32(v31, __ T16B, v31);
2787 
2788       __ cmpw(keylen, 52);
2789       __ br(Assembler::CC, L_loadkeys_44);
2790       __ br(Assembler::EQ, L_loadkeys_52);
2791 
2792       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2793       __ rev32(v17, __ T16B, v17);
2794       __ rev32(v18, __ T16B, v18);
2795     __ BIND(L_loadkeys_52);
2796       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2797       __ rev32(v19, __ T16B, v19);
2798       __ rev32(v20, __ T16B, v20);
2799     __ BIND(L_loadkeys_44);
2800       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2801       __ rev32(v21, __ T16B, v21);
2802       __ rev32(v22, __ T16B, v22);
2803       __ rev32(v23, __ T16B, v23);
2804       __ rev32(v24, __ T16B, v24);
2805       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2806       __ rev32(v25, __ T16B, v25);
2807       __ rev32(v26, __ T16B, v26);
2808       __ rev32(v27, __ T16B, v27);
2809       __ rev32(v28, __ T16B, v28);
2810       __ ld1(v29, v30, __ T16B, key);
2811       __ rev32(v29, __ T16B, v29);
2812       __ rev32(v30, __ T16B, v30);
2813 
2814     __ BIND(L_aes_loop);
2815       __ ld1(v0, __ T16B, __ post(from, 16));
2816       __ orr(v1, __ T16B, v0, v0);
2817 
2818       __ br(Assembler::CC, L_rounds_44);
2819       __ br(Assembler::EQ, L_rounds_52);
2820 
2821       __ aesd(v0, v17); __ aesimc(v0, v0);
2822       __ aesd(v0, v18); __ aesimc(v0, v0);
2823     __ BIND(L_rounds_52);
2824       __ aesd(v0, v19); __ aesimc(v0, v0);
2825       __ aesd(v0, v20); __ aesimc(v0, v0);
2826     __ BIND(L_rounds_44);
2827       __ aesd(v0, v21); __ aesimc(v0, v0);
2828       __ aesd(v0, v22); __ aesimc(v0, v0);
2829       __ aesd(v0, v23); __ aesimc(v0, v0);
2830       __ aesd(v0, v24); __ aesimc(v0, v0);
2831       __ aesd(v0, v25); __ aesimc(v0, v0);
2832       __ aesd(v0, v26); __ aesimc(v0, v0);
2833       __ aesd(v0, v27); __ aesimc(v0, v0);
2834       __ aesd(v0, v28); __ aesimc(v0, v0);
2835       __ aesd(v0, v29); __ aesimc(v0, v0);
2836       __ aesd(v0, v30);
2837       __ eor(v0, __ T16B, v0, v31);
2838       __ eor(v0, __ T16B, v0, v2);
2839 
2840       __ st1(v0, __ T16B, __ post(to, 16));
2841       __ orr(v2, __ T16B, v1, v1);
2842 
2843       __ subw(len_reg, len_reg, 16);
2844       __ cbnzw(len_reg, L_aes_loop);
2845 
2846       __ st1(v2, __ T16B, rvec);
2847 
2848       __ mov(r0, rscratch2);
2849 
2850       __ leave();
2851       __ ret(lr);
2852 
2853     return start;
2854   }
2855 
2856   // CTR AES crypt.
2857   // Arguments:
2858   //
2859   // Inputs:
2860   //   c_rarg0   - source byte array address
2861   //   c_rarg1   - destination byte array address
2862   //   c_rarg2   - K (key) in little endian int array
2863   //   c_rarg3   - counter vector byte array address
2864   //   c_rarg4   - input length
2865   //   c_rarg5   - saved encryptedCounter start
2866   //   c_rarg6   - saved used length
2867   //
2868   // Output:
2869   //   r0       - input length
2870   //
2871   address generate_counterMode_AESCrypt() {
2872     const Register in = c_rarg0;
2873     const Register out = c_rarg1;
2874     const Register key = c_rarg2;
2875     const Register counter = c_rarg3;
2876     const Register saved_len = c_rarg4, len = r10;
2877     const Register saved_encrypted_ctr = c_rarg5;
2878     const Register used_ptr = c_rarg6, used = r12;
2879 
2880     const Register offset = r7;
2881     const Register keylen = r11;
2882 
2883     const unsigned char block_size = 16;
2884     const int bulk_width = 4;
2885     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2886     // performance with larger data sizes, but it also means that the
2887     // fast path isn't used until you have at least 8 blocks, and up
2888     // to 127 bytes of data will be processed on the slow path. For
2889     // that reason, and also so as not to blow away too much icache, 4
2890     // blocks seems like a sensible compromise.
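         // For example, with bulk_width == 4 a 100-byte request is handled as
         // one 64-byte pass through CTR_large_block followed by 36 bytes on the
         // block/byte path below; with bulk_width == 8 the whole 100 bytes would
         // take the slower path.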
2891 
2892     // Algorithm:
2893     //
2894     //    if (len == 0) {
2895     //        goto DONE;
2896     //    }
2897     //    int result = len;
2898     //    do {
2899     //        if (used >= blockSize) {
2900     //            if (len >= bulk_width * blockSize) {
2901     //                CTR_large_block();
2902     //                if (len == 0)
2903     //                    goto DONE;
2904     //            }
2905     //            for (;;) {
2906     //                16ByteVector v0 = counter;
2907     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2908     //                used = 0;
2909     //                if (len < blockSize)
2910     //                    break;    /* goto NEXT */
2911     //                16ByteVector v1 = load16Bytes(in, offset);
2912     //                v1 = v1 ^ encryptedCounter;
2913     //                store16Bytes(v1, out, offset);
2914     //                used = blockSize;
2915     //                offset += blockSize;
2916     //                len -= blockSize;
2917     //                if (len == 0)
2918     //                    goto DONE;
2919     //            }
2920     //        }
2921     //      NEXT:
2922     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2923     //        len--;
2924     //    } while (len != 0);
2925     //  DONE:
2926     //    return result;
2927     //
2928     // CTR_large_block()
2929     //    Wide bulk encryption of whole blocks.
2930 
2931     __ align(CodeEntryAlignment);
2932     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2933     const address start = __ pc();
2934     __ enter();
2935 
2936     Label DONE, CTR_large_block, large_block_return;
2937     __ ldrw(used, Address(used_ptr));
2938     __ cbzw(saved_len, DONE);
2939 
2940     __ mov(len, saved_len);
2941     __ mov(offset, 0);
2942 
2943     // Compute #rounds for AES based on the length of the key array
2944     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2945 
2946     __ aesenc_loadkeys(key, keylen);
2947 
2948     {
2949       Label L_CTR_loop, NEXT;
2950 
2951       __ bind(L_CTR_loop);
2952 
2953       __ cmp(used, block_size);
2954       __ br(__ LO, NEXT);
2955 
2956       // Maybe we have a lot of data
2957       __ subsw(rscratch1, len, bulk_width * block_size);
2958       __ br(__ HS, CTR_large_block);
2959       __ BIND(large_block_return);
2960       __ cbzw(len, DONE);
2961 
2962       // Setup the counter
2963       __ movi(v4, __ T4S, 0);
2964       __ movi(v5, __ T4S, 1);
2965       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2966 
2967       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2968       __ rev32(v16, __ T16B, v0);
2969       __ addv(v16, __ T4S, v16, v4);
2970       __ rev32(v16, __ T16B, v16);
2971       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2972 
2973       {
2974         // We have fewer than bulk_width blocks of data left. Encrypt
2975         // them one by one until there is less than a full block
2976         // remaining, being careful to save both the encrypted counter
2977         // and the counter.
2978 
2979         Label inner_loop;
2980         __ bind(inner_loop);
2981         // Counter to encrypt is in v0
2982         __ aesecb_encrypt(noreg, noreg, keylen);
2983         __ st1(v0, __ T16B, saved_encrypted_ctr);
2984 
2985         // Do we have a remaining full block?
2986 
2987         __ mov(used, 0);
2988         __ cmp(len, block_size);
2989         __ br(__ LO, NEXT);
2990 
2991         // Yes, we have a full block
2992         __ ldrq(v1, Address(in, offset));
2993         __ eor(v1, __ T16B, v1, v0);
2994         __ strq(v1, Address(out, offset));
2995         __ mov(used, block_size);
2996         __ add(offset, offset, block_size);
2997 
2998         __ subw(len, len, block_size);
2999         __ cbzw(len, DONE);
3000 
3001         // Increment the counter, store it back
3002         __ orr(v0, __ T16B, v16, v16);
3003         __ rev32(v16, __ T16B, v16);
3004         __ addv(v16, __ T4S, v16, v4);
3005         __ rev32(v16, __ T16B, v16);
3006         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3007 
3008         __ b(inner_loop);
3009       }
3010 
3011       __ BIND(NEXT);
3012 
3013       // Encrypt a single byte, and loop.
3014       // We expect this to be a rare event.
3015       __ ldrb(rscratch1, Address(in, offset));
3016       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3017       __ eor(rscratch1, rscratch1, rscratch2);
3018       __ strb(rscratch1, Address(out, offset));
3019       __ add(offset, offset, 1);
3020       __ add(used, used, 1);
3021       __ subw(len, len, 1);
3022       __ cbnzw(len, L_CTR_loop);
3023     }
3024 
3025     __ bind(DONE);
3026     __ strw(used, Address(used_ptr));
3027     __ mov(r0, saved_len);
3028 
3029     __ leave(); // required for proper stackwalking of RuntimeStub frame
3030     __ ret(lr);
3031 
3032     // Bulk encryption
3033 
3034     __ BIND (CTR_large_block);
3035     assert(bulk_width == 4 || bulk_width == 8, "must be");
3036 
3037     if (bulk_width == 8) {
3038       __ sub(sp, sp, 4 * 16);
3039       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3040     }
3041     __ sub(sp, sp, 4 * 16);
3042     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3043     RegSet saved_regs = (RegSet::of(in, out, offset)
3044                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3045     __ push(saved_regs, sp);
3046     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3047     __ add(in, in, offset);
3048     __ add(out, out, offset);
3049 
3050     // Keys should already be loaded into the correct registers
3051 
3052     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3053     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3054 
3055     // AES/CTR loop
3056     {
3057       Label L_CTR_loop;
3058       __ BIND(L_CTR_loop);
3059 
3060       // Setup the counters
3061       __ movi(v8, __ T4S, 0);
3062       __ movi(v9, __ T4S, 1);
3063       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3064 
3065       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3066         __ rev32(f, __ T16B, v16);
3067         __ addv(v16, __ T4S, v16, v8);
3068       }
3069 
3070       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3071 
3072       // Encrypt the counters
3073       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3074 
3075       if (bulk_width == 8) {
3076         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3077       }
3078 
3079       // XOR the encrypted counters with the inputs
3080       for (int i = 0; i < bulk_width; i++) {
3081         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3082       }
3083 
3084       // Write the encrypted data
3085       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3086       if (bulk_width == 8) {
3087         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3088       }
3089 
3090       __ subw(len, len, 16 * bulk_width);
3091       __ cbnzw(len, L_CTR_loop);
3092     }
3093 
3094     // Save the counter back where it goes
3095     __ rev32(v16, __ T16B, v16);
3096     __ st1(v16, __ T16B, counter);
3097 
3098     __ pop(saved_regs, sp);
3099 
3100     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3101     if (bulk_width == 8) {
3102       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3103     }
3104 
3105     __ andr(rscratch1, len, -16 * bulk_width);
3106     __ sub(len, len, rscratch1);
3107     __ add(offset, offset, rscratch1);
3108     __ mov(used, 16);
3109     __ strw(used, Address(used_ptr));
3110     __ b(large_block_return);
3111 
3112     return start;
3113   }
3114 
3115   // Vector AES Galois Counter Mode implementation. Parameters:
3116   //
3117   // in = c_rarg0
3118   // len = c_rarg1
3119   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3120   // out = c_rarg3
3121   // key = c_rarg4
3122   // state = c_rarg5 - GHASH.state
3123   // subkeyHtbl = c_rarg6 - powers of H
3124   // counter = c_rarg7 - 16 bytes of CTR
3125   // return - number of processed bytes
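       //
       // Note: only whole 8-block (128-byte) chunks are processed here. len is
       // rounded down to a multiple of 128, those bytes are encrypted with CTR
       // eight blocks at a time, GHASH.state is then folded over the ciphertext
       // (ct) four blocks at a time, and the number of bytes actually processed
       // is returned, leaving any remaining tail to the caller.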
3126   address generate_galoisCounterMode_AESCrypt() {
3127     address ghash_polynomial = __ pc();
3128     __ emit_int64(0x87);  // The low-order bits of the field
3129                           // polynomial (i.e. p = z^7+z^2+z+1)
3130                           // repeated in the low and high parts of a
3131                           // 128-bit vector
3132     __ emit_int64(0x87);
3133 
3134     __ align(CodeEntryAlignment);
3135     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3136     address start = __ pc();
3137     __ enter();
3138 
3139     const Register in = c_rarg0;
3140     const Register len = c_rarg1;
3141     const Register ct = c_rarg2;
3142     const Register out = c_rarg3;
3143     // (the counter block in c_rarg7 is updated with the incremented counter on exit)
3144 
3145     const Register key = c_rarg4;
3146     const Register state = c_rarg5;
3147 
3148     const Register subkeyHtbl = c_rarg6;
3149 
3150     const Register counter = c_rarg7;
3151 
3152     const Register keylen = r10;
3153     // Save state before entering routine
3154     __ sub(sp, sp, 4 * 16);
3155     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3156     __ sub(sp, sp, 4 * 16);
3157     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3158 
3159     // __ andr(len, len, -512);
3160     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3161     __ str(len, __ pre(sp, -2 * wordSize));
3162 
3163     Label DONE;
3164     __ cbz(len, DONE);
3165 
3166     // Compute #rounds for AES based on the length of the key array
3167     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3168 
3169     __ aesenc_loadkeys(key, keylen);
3170     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3171     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3172 
3173     // AES/CTR loop
3174     {
3175       Label L_CTR_loop;
3176       __ BIND(L_CTR_loop);
3177 
3178       // Setup the counters
3179       __ movi(v8, __ T4S, 0);
3180       __ movi(v9, __ T4S, 1);
3181       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3182       for (FloatRegister f = v0; f < v8; f++) {
3183         __ rev32(f, __ T16B, v16);
3184         __ addv(v16, __ T4S, v16, v8);
3185       }
3186 
3187       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3188 
3189       // Encrypt the counters
3190       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3191 
3192       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3193 
3194       // XOR the encrypted counters with the inputs
3195       for (int i = 0; i < 8; i++) {
3196         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3197       }
3198       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3199       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3200 
3201       __ subw(len, len, 16 * 8);
3202       __ cbnzw(len, L_CTR_loop);
3203     }
3204 
3205     __ rev32(v16, __ T16B, v16);
3206     __ st1(v16, __ T16B, counter);
3207 
3208     __ ldr(len, Address(sp));
3209     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3210 
3211     // GHASH/CTR loop
3212     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3213                                 len, /*unrolls*/4);
3214 
3215 #ifdef ASSERT
3216     { Label L;
3217       __ cmp(len, (unsigned char)0);
3218       __ br(Assembler::EQ, L);
3219       __ stop("stubGenerator: abort");
3220       __ bind(L);
3221     }
3222 #endif
3223 
3224     __ bind(DONE);
3225     // Return the number of bytes processed
3226     __ ldr(r0, __ post(sp, 2 * wordSize));
3227 
3228     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3229     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3230 
3231     __ leave(); // required for proper stackwalking of RuntimeStub frame
3232     __ ret(lr);
3233     return start;
3234   }
3235 
3236   // Arguments:
3237   //
3238   // Inputs:
3239   //   c_rarg0   - byte[]  source+offset
3240   //   c_rarg1   - int[]   MD5.state
3241   //   c_rarg2   - int     offset
3242   //   c_rarg3   - int     limit
3243   //
3244   address generate_md5_implCompress(bool multi_block, const char *name) {
3245     __ align(CodeEntryAlignment);
3246     StubCodeMark mark(this, "StubRoutines", name);
3247     address start = __ pc();
3248 
3249     Register buf       = c_rarg0;
3250     Register state     = c_rarg1;
3251     Register ofs       = c_rarg2;
3252     Register limit     = c_rarg3;
3253     Register a         = r4;
3254     Register b         = r5;
3255     Register c         = r6;
3256     Register d         = r7;
3257     Register rscratch3 = r10;
3258     Register rscratch4 = r11;
3259 
3260     Label keys;
3261     Label md5_loop;
3262 
3263     __ BIND(md5_loop);
3264 
3265     // Save hash values for addition after rounds
3266     __ ldrw(a, Address(state,  0));
3267     __ ldrw(b, Address(state,  4));
3268     __ ldrw(c, Address(state,  8));
3269     __ ldrw(d, Address(state, 12));
3270 
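         // Each FF/GG/HH/II macro below is one MD5 round step. As a reference
         // sketch of the standard definition (not the generated code):
         //
         //   F(b,c,d) = (b & c) | (~b & d)        // rounds  1..16 (FF)
         //   G(b,c,d) = (b & d) | (c & ~d)        // rounds 17..32 (GG)
         //   H(b,c,d) = b ^ c ^ d                 // rounds 33..48 (HH)
         //   I(b,c,d) = c ^ (b | ~d)              // rounds 49..64 (II)
         //
         //   a = b + rotl32(a + f(b,c,d) + M[k] + t, s);
         //
         // where M[k] is the k-th 32-bit word of the 64-byte block and t, s are
         // the per-step constant and rotation passed to the macros.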
3271 #define FF(r1, r2, r3, r4, k, s, t)              \
3272     __ eorw(rscratch3, r3, r4);                  \
3273     __ movw(rscratch2, t);                       \
3274     __ andw(rscratch3, rscratch3, r2);           \
3275     __ addw(rscratch4, r1, rscratch2);           \
3276     __ ldrw(rscratch1, Address(buf, k*4));       \
3277     __ eorw(rscratch3, rscratch3, r4);           \
3278     __ addw(rscratch3, rscratch3, rscratch1);    \
3279     __ addw(rscratch3, rscratch3, rscratch4);    \
3280     __ rorw(rscratch2, rscratch3, 32 - s);       \
3281     __ addw(r1, rscratch2, r2);
3282 
3283 #define GG(r1, r2, r3, r4, k, s, t)              \
3284     __ eorw(rscratch2, r2, r3);                  \
3285     __ ldrw(rscratch1, Address(buf, k*4));       \
3286     __ andw(rscratch3, rscratch2, r4);           \
3287     __ movw(rscratch2, t);                       \
3288     __ eorw(rscratch3, rscratch3, r3);           \
3289     __ addw(rscratch4, r1, rscratch2);           \
3290     __ addw(rscratch3, rscratch3, rscratch1);    \
3291     __ addw(rscratch3, rscratch3, rscratch4);    \
3292     __ rorw(rscratch2, rscratch3, 32 - s);       \
3293     __ addw(r1, rscratch2, r2);
3294 
3295 #define HH(r1, r2, r3, r4, k, s, t)              \
3296     __ eorw(rscratch3, r3, r4);                  \
3297     __ movw(rscratch2, t);                       \
3298     __ addw(rscratch4, r1, rscratch2);           \
3299     __ ldrw(rscratch1, Address(buf, k*4));       \
3300     __ eorw(rscratch3, rscratch3, r2);           \
3301     __ addw(rscratch3, rscratch3, rscratch1);    \
3302     __ addw(rscratch3, rscratch3, rscratch4);    \
3303     __ rorw(rscratch2, rscratch3, 32 - s);       \
3304     __ addw(r1, rscratch2, r2);
3305 
3306 #define II(r1, r2, r3, r4, k, s, t)              \
3307     __ movw(rscratch3, t);                       \
3308     __ ornw(rscratch2, r2, r4);                  \
3309     __ addw(rscratch4, r1, rscratch3);           \
3310     __ ldrw(rscratch1, Address(buf, k*4));       \
3311     __ eorw(rscratch3, rscratch2, r3);           \
3312     __ addw(rscratch3, rscratch3, rscratch1);    \
3313     __ addw(rscratch3, rscratch3, rscratch4);    \
3314     __ rorw(rscratch2, rscratch3, 32 - s);       \
3315     __ addw(r1, rscratch2, r2);
3316 
3317     // Round 1
3318     FF(a, b, c, d,  0,  7, 0xd76aa478)
3319     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3320     FF(c, d, a, b,  2, 17, 0x242070db)
3321     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3322     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3323     FF(d, a, b, c,  5, 12, 0x4787c62a)
3324     FF(c, d, a, b,  6, 17, 0xa8304613)
3325     FF(b, c, d, a,  7, 22, 0xfd469501)
3326     FF(a, b, c, d,  8,  7, 0x698098d8)
3327     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3328     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3329     FF(b, c, d, a, 11, 22, 0x895cd7be)
3330     FF(a, b, c, d, 12,  7, 0x6b901122)
3331     FF(d, a, b, c, 13, 12, 0xfd987193)
3332     FF(c, d, a, b, 14, 17, 0xa679438e)
3333     FF(b, c, d, a, 15, 22, 0x49b40821)
3334 
3335     // Round 2
3336     GG(a, b, c, d,  1,  5, 0xf61e2562)
3337     GG(d, a, b, c,  6,  9, 0xc040b340)
3338     GG(c, d, a, b, 11, 14, 0x265e5a51)
3339     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3340     GG(a, b, c, d,  5,  5, 0xd62f105d)
3341     GG(d, a, b, c, 10,  9, 0x02441453)
3342     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3343     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3344     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3345     GG(d, a, b, c, 14,  9, 0xc33707d6)
3346     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3347     GG(b, c, d, a,  8, 20, 0x455a14ed)
3348     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3349     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3350     GG(c, d, a, b,  7, 14, 0x676f02d9)
3351     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3352 
3353     // Round 3
3354     HH(a, b, c, d,  5,  4, 0xfffa3942)
3355     HH(d, a, b, c,  8, 11, 0x8771f681)
3356     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3357     HH(b, c, d, a, 14, 23, 0xfde5380c)
3358     HH(a, b, c, d,  1,  4, 0xa4beea44)
3359     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3360     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3361     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3362     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3363     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3364     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3365     HH(b, c, d, a,  6, 23, 0x04881d05)
3366     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3367     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3368     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3369     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3370 
3371     // Round 4
3372     II(a, b, c, d,  0,  6, 0xf4292244)
3373     II(d, a, b, c,  7, 10, 0x432aff97)
3374     II(c, d, a, b, 14, 15, 0xab9423a7)
3375     II(b, c, d, a,  5, 21, 0xfc93a039)
3376     II(a, b, c, d, 12,  6, 0x655b59c3)
3377     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3378     II(c, d, a, b, 10, 15, 0xffeff47d)
3379     II(b, c, d, a,  1, 21, 0x85845dd1)
3380     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3381     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3382     II(c, d, a, b,  6, 15, 0xa3014314)
3383     II(b, c, d, a, 13, 21, 0x4e0811a1)
3384     II(a, b, c, d,  4,  6, 0xf7537e82)
3385     II(d, a, b, c, 11, 10, 0xbd3af235)
3386     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3387     II(b, c, d, a,  9, 21, 0xeb86d391)
3388 
3389 #undef FF
3390 #undef GG
3391 #undef HH
3392 #undef II
3393 
3394     // write hash values back in the correct order
3395     __ ldrw(rscratch1, Address(state,  0));
3396     __ addw(rscratch1, rscratch1, a);
3397     __ strw(rscratch1, Address(state,  0));
3398 
3399     __ ldrw(rscratch2, Address(state,  4));
3400     __ addw(rscratch2, rscratch2, b);
3401     __ strw(rscratch2, Address(state,  4));
3402 
3403     __ ldrw(rscratch3, Address(state,  8));
3404     __ addw(rscratch3, rscratch3, c);
3405     __ strw(rscratch3, Address(state,  8));
3406 
3407     __ ldrw(rscratch4, Address(state, 12));
3408     __ addw(rscratch4, rscratch4, d);
3409     __ strw(rscratch4, Address(state, 12));
3410 
3411     if (multi_block) {
3412       __ add(buf, buf, 64);
3413       __ add(ofs, ofs, 64);
3414       __ cmp(ofs, limit);
3415       __ br(Assembler::LE, md5_loop);
3416       __ mov(c_rarg0, ofs); // return ofs
3417     }
3418 
3419     __ ret(lr);
3420 
3421     return start;
3422   }
3423 
3424   // Arguments:
3425   //
3426   // Inputs:
3427   //   c_rarg0   - byte[]  source+offset
3428   //   c_rarg1   - int[]   SHA.state
3429   //   c_rarg2   - int     offset
3430   //   c_rarg3   - int     limit
3431   //
3432   address generate_sha1_implCompress(bool multi_block, const char *name) {
3433     __ align(CodeEntryAlignment);
3434     StubCodeMark mark(this, "StubRoutines", name);
3435     address start = __ pc();
3436 
3437     Register buf   = c_rarg0;
3438     Register state = c_rarg1;
3439     Register ofs   = c_rarg2;
3440     Register limit = c_rarg3;
3441 
3442     Label keys;
3443     Label sha1_loop;
3444 
3445     // load the keys into v0..v3
3446     __ adr(rscratch1, keys);
3447     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3448     // load the 5-word state into v6, v7
3449     __ ldrq(v6, Address(state, 0));
3450     __ ldrs(v7, Address(state, 16));
3451 
3452 
3453     __ BIND(sha1_loop);
3454     // load 64 bytes of data into v16..v19
3455     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3456     __ rev32(v16, __ T16B, v16);
3457     __ rev32(v17, __ T16B, v17);
3458     __ rev32(v18, __ T16B, v18);
3459     __ rev32(v19, __ T16B, v19);
3460 
3461     // do the sha1
3462     __ addv(v4, __ T4S, v16, v0);
3463     __ orr(v20, __ T16B, v6, v6);
3464 
3465     FloatRegister d0 = v16;
3466     FloatRegister d1 = v17;
3467     FloatRegister d2 = v18;
3468     FloatRegister d3 = v19;
3469 
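         // Each iteration of this loop issues one sha1c/sha1p/sha1m instruction,
         // covering four of SHA-1's 80 scalar rounds. For reference, the scalar
         // round functions being selected are (standard definitions, not code):
         //
         //   Ch(b,c,d)     = (b & c) | (~b & d)          // rounds  0..19 -> sha1c
         //   Parity(b,c,d) = b ^ c ^ d                   // rounds 20..39, 60..79 -> sha1p
         //   Maj(b,c,d)    = (b & c) | (b & d) | (c & d) // rounds 40..59 -> sha1m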
3470     for (int round = 0; round < 20; round++) {
3471       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3472       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3473       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3474       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3475       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3476 
3477       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3478       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3479       __ sha1h(tmp2, __ T4S, v20);
3480       if (round < 5)
3481         __ sha1c(v20, __ T4S, tmp3, tmp4);
3482       else if (round < 10 || round >= 15)
3483         __ sha1p(v20, __ T4S, tmp3, tmp4);
3484       else
3485         __ sha1m(v20, __ T4S, tmp3, tmp4);
3486       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3487 
3488       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3489     }
3490 
3491     __ addv(v7, __ T2S, v7, v21);
3492     __ addv(v6, __ T4S, v6, v20);
3493 
3494     if (multi_block) {
3495       __ add(ofs, ofs, 64);
3496       __ cmp(ofs, limit);
3497       __ br(Assembler::LE, sha1_loop);
3498       __ mov(c_rarg0, ofs); // return ofs
3499     }
3500 
3501     __ strq(v6, Address(state, 0));
3502     __ strs(v7, Address(state, 16));
3503 
3504     __ ret(lr);
3505 
3506     __ bind(keys);
3507     __ emit_int32(0x5a827999);
3508     __ emit_int32(0x6ed9eba1);
3509     __ emit_int32(0x8f1bbcdc);
3510     __ emit_int32(0xca62c1d6);
3511 
3512     return start;
3513   }
3514 
3515 
3516   // Arguments:
3517   //
3518   // Inputs:
3519   //   c_rarg0   - byte[]  source+offset
3520   //   c_rarg1   - int[]   SHA.state
3521   //   c_rarg2   - int     offset
3522   //   c_rarg3   - int     limit
3523   //
3524   address generate_sha256_implCompress(bool multi_block, const char *name) {
3525     static const uint32_t round_consts[64] = {
3526       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3527       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3528       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3529       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3530       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3531       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3532       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3533       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3534       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3535       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3536       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3537       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3538       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3539       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3540       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3541       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3542     };
3543     __ align(CodeEntryAlignment);
3544     StubCodeMark mark(this, "StubRoutines", name);
3545     address start = __ pc();
3546 
3547     Register buf   = c_rarg0;
3548     Register state = c_rarg1;
3549     Register ofs   = c_rarg2;
3550     Register limit = c_rarg3;
3551 
3552     Label sha1_loop;
3553 
3554     __ stpd(v8, v9, __ pre(sp, -32));
3555     __ stpd(v10, v11, Address(sp, 16));
3556 
3557 // dga == v0
3558 // dgb == v1
3559 // dg0 == v2
3560 // dg1 == v3
3561 // dg2 == v4
3562 // t0 == v6
3563 // t1 == v7
3564 
3565     // load 16 keys to v16..v31
3566     __ lea(rscratch1, ExternalAddress((address)round_consts));
3567     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3568     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3569     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3570     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3571 
3572     // load 8 words (256 bits) state
3573     __ ldpq(v0, v1, state);
3574 
3575     __ BIND(sha1_loop);
3576     // load 64 bytes of data into v8..v11
3577     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3578     __ rev32(v8, __ T16B, v8);
3579     __ rev32(v9, __ T16B, v9);
3580     __ rev32(v10, __ T16B, v10);
3581     __ rev32(v11, __ T16B, v11);
3582 
3583     __ addv(v6, __ T4S, v8, v16);
3584     __ orr(v2, __ T16B, v0, v0);
3585     __ orr(v3, __ T16B, v1, v1);
3586 
3587     FloatRegister d0 = v8;
3588     FloatRegister d1 = v9;
3589     FloatRegister d2 = v10;
3590     FloatRegister d3 = v11;
3591 
3592 
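         // Each iteration below performs four of SHA-256's 64 rounds via
         // sha256h/sha256h2. For reference (a sketch of the standard scalar
         // definition, not the generated code), the compression step is:
         //
         //   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
         //   T2 = Sigma0(a) + Maj(a,b,c)
         //   Ch(e,f,g)  = (e & f) ^ (~e & g)
         //   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
         //   Sigma0(a)  = ror(a,2) ^ ror(a,13) ^ ror(a,22)
         //   Sigma1(e)  = ror(e,6) ^ ror(e,11) ^ ror(e,25)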
3593     for (int round = 0; round < 16; round++) {
3594       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3595       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3596       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3597       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3598 
3599       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3600       __ orr(v4, __ T16B, v2, v2);
3601       if (round < 15)
3602         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3603       __ sha256h(v2, __ T4S, v3, tmp2);
3604       __ sha256h2(v3, __ T4S, v4, tmp2);
3605       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3606 
3607       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3608     }
3609 
3610     __ addv(v0, __ T4S, v0, v2);
3611     __ addv(v1, __ T4S, v1, v3);
3612 
3613     if (multi_block) {
3614       __ add(ofs, ofs, 64);
3615       __ cmp(ofs, limit);
3616       __ br(Assembler::LE, sha1_loop);
3617       __ mov(c_rarg0, ofs); // return ofs
3618     }
3619 
3620     __ ldpd(v10, v11, Address(sp, 16));
3621     __ ldpd(v8, v9, __ post(sp, 32));
3622 
3623     __ stpq(v0, v1, state);
3624 
3625     __ ret(lr);
3626 
3627     return start;
3628   }
3629 
3630   // Arguments:
3631   //
3632   // Inputs:
3633   //   c_rarg0   - byte[]  source+offset
3634   //   c_rarg1   - int[]   SHA.state
3635   //   c_rarg2   - int     offset
3636   //   c_rarg3   - int     limit
3637   //
3638   address generate_sha512_implCompress(bool multi_block, const char *name) {
3639     static const uint64_t round_consts[80] = {
3640       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3641       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3642       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3643       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3644       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3645       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3646       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3647       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3648       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3649       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3650       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3651       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3652       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3653       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3654       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3655       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3656       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3657       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3658       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3659       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3660       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3661       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3662       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3663       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3664       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3665       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3666       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3667     };
3668 
3669     // Double rounds for sha512.
3670     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3671       if (dr < 36)                                                                   \
3672         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3673       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3674       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3675       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3676       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3677       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3678       if (dr < 32) {                                                                 \
3679         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3680         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3681       }                                                                              \
3682       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3683       if (dr < 32)                                                                   \
3684         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3685       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3686       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3687 
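         // Each sha512_dround invocation performs two of SHA-512's 80 rounds, so
         // the 40 invocations below cover one full compression. While dr < 32 the
         // message schedule is extended with sha512su0/sha512su1, and while
         // dr < 36 the next pair of round constants is prefetched into v##rc1.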
3688     __ align(CodeEntryAlignment);
3689     StubCodeMark mark(this, "StubRoutines", name);
3690     address start = __ pc();
3691 
3692     Register buf   = c_rarg0;
3693     Register state = c_rarg1;
3694     Register ofs   = c_rarg2;
3695     Register limit = c_rarg3;
3696 
3697     __ stpd(v8, v9, __ pre(sp, -64));
3698     __ stpd(v10, v11, Address(sp, 16));
3699     __ stpd(v12, v13, Address(sp, 32));
3700     __ stpd(v14, v15, Address(sp, 48));
3701 
3702     Label sha512_loop;
3703 
3704     // load state
3705     __ ld1(v8, v9, v10, v11, __ T2D, state);
3706 
3707     // load first 4 round constants
3708     __ lea(rscratch1, ExternalAddress((address)round_consts));
3709     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3710 
3711     __ BIND(sha512_loop);
3712     // load 128B of data into v12..v19
3713     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3714     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3715     __ rev64(v12, __ T16B, v12);
3716     __ rev64(v13, __ T16B, v13);
3717     __ rev64(v14, __ T16B, v14);
3718     __ rev64(v15, __ T16B, v15);
3719     __ rev64(v16, __ T16B, v16);
3720     __ rev64(v17, __ T16B, v17);
3721     __ rev64(v18, __ T16B, v18);
3722     __ rev64(v19, __ T16B, v19);
3723 
3724     __ mov(rscratch2, rscratch1);
3725 
3726     __ mov(v0, __ T16B, v8);
3727     __ mov(v1, __ T16B, v9);
3728     __ mov(v2, __ T16B, v10);
3729     __ mov(v3, __ T16B, v11);
3730 
3731     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3732     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3733     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3734     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3735     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3736     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3737     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3738     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3739     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3740     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3741     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3742     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3743     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3744     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3745     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3746     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3747     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3748     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3749     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3750     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3751     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3752     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3753     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3754     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3755     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3756     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3757     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3758     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3759     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3760     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3761     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3762     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3763     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3764     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3765     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3766     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3767     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3768     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3769     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3770     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3771 
3772     __ addv(v8, __ T2D, v8, v0);
3773     __ addv(v9, __ T2D, v9, v1);
3774     __ addv(v10, __ T2D, v10, v2);
3775     __ addv(v11, __ T2D, v11, v3);
3776 
3777     if (multi_block) {
3778       __ add(ofs, ofs, 128);
3779       __ cmp(ofs, limit);
3780       __ br(Assembler::LE, sha512_loop);
3781       __ mov(c_rarg0, ofs); // return ofs
3782     }
3783 
3784     __ st1(v8, v9, v10, v11, __ T2D, state);
3785 
3786     __ ldpd(v14, v15, Address(sp, 48));
3787     __ ldpd(v12, v13, Address(sp, 32));
3788     __ ldpd(v10, v11, Address(sp, 16));
3789     __ ldpd(v8, v9, __ post(sp, 64));
3790 
3791     __ ret(lr);
3792 
3793     return start;
3794   }
3795 
3796   // Arguments:
3797   //
3798   // Inputs:
3799   //   c_rarg0   - byte[]  source+offset
3800   //   c_rarg1   - byte[]   SHA.state
3801   //   c_rarg2   - int     digest_length
3802   //   c_rarg3   - int     offset
3803   //   c_rarg4   - int     limit
3804   //
3805   address generate_sha3_implCompress(bool multi_block, const char *name) {
3806     static const uint64_t round_consts[24] = {
3807       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3808       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3809       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3810       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3811       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3812       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3813       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3814       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3815     };
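         // The number of bytes absorbed per permutation is the Keccak rate,
         // 200 - 2 * digest_length: 72 for SHA3-512, 104 for SHA3-384, 136 for
         // SHA3-256 and 144 for SHA3-224. The digest_length bit tests below
         // select how much of the input block is XORed into the state.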
3816 
3817     __ align(CodeEntryAlignment);
3818     StubCodeMark mark(this, "StubRoutines", name);
3819     address start = __ pc();
3820 
3821     Register buf           = c_rarg0;
3822     Register state         = c_rarg1;
3823     Register digest_length = c_rarg2;
3824     Register ofs           = c_rarg3;
3825     Register limit         = c_rarg4;
3826 
3827     Label sha3_loop, rounds24_loop;
3828     Label sha3_512, sha3_384_or_224, sha3_256;
3829 
3830     __ stpd(v8, v9, __ pre(sp, -64));
3831     __ stpd(v10, v11, Address(sp, 16));
3832     __ stpd(v12, v13, Address(sp, 32));
3833     __ stpd(v14, v15, Address(sp, 48));
3834 
3835     // load state
3836     __ add(rscratch1, state, 32);
3837     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3838     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3839     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3840     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3841     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3842     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3843     __ ld1(v24, __ T1D, rscratch1);
3844 
3845     __ BIND(sha3_loop);
3846 
3847     // 24 keccak rounds
3848     __ movw(rscratch2, 24);
3849 
3850     // load round_constants base
3851     __ lea(rscratch1, ExternalAddress((address) round_consts));
3852 
3853     // load input
3854     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3855     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3856     __ eor(v0, __ T8B, v0, v25);
3857     __ eor(v1, __ T8B, v1, v26);
3858     __ eor(v2, __ T8B, v2, v27);
3859     __ eor(v3, __ T8B, v3, v28);
3860     __ eor(v4, __ T8B, v4, v29);
3861     __ eor(v5, __ T8B, v5, v30);
3862     __ eor(v6, __ T8B, v6, v31);
3863 
3864     // digest_length == 64, SHA3-512
3865     __ tbnz(digest_length, 6, sha3_512);
3866 
3867     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3868     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3869     __ eor(v7, __ T8B, v7, v25);
3870     __ eor(v8, __ T8B, v8, v26);
3871     __ eor(v9, __ T8B, v9, v27);
3872     __ eor(v10, __ T8B, v10, v28);
3873     __ eor(v11, __ T8B, v11, v29);
3874     __ eor(v12, __ T8B, v12, v30);
3875 
3876     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3877     __ tbnz(digest_length, 4, sha3_384_or_224);
3878 
3879     // SHA3-256
3880     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3881     __ eor(v13, __ T8B, v13, v25);
3882     __ eor(v14, __ T8B, v14, v26);
3883     __ eor(v15, __ T8B, v15, v27);
3884     __ eor(v16, __ T8B, v16, v28);
3885     __ b(rounds24_loop);
3886 
3887     __ BIND(sha3_384_or_224);
3888     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3889 
3890     // SHA3-224
3891     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3892     __ ld1(v29, __ T8B, __ post(buf, 8));
3893     __ eor(v13, __ T8B, v13, v25);
3894     __ eor(v14, __ T8B, v14, v26);
3895     __ eor(v15, __ T8B, v15, v27);
3896     __ eor(v16, __ T8B, v16, v28);
3897     __ eor(v17, __ T8B, v17, v29);
3898     __ b(rounds24_loop);
3899 
3900     __ BIND(sha3_512);
3901     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3902     __ eor(v7, __ T8B, v7, v25);
3903     __ eor(v8, __ T8B, v8, v26);
3904 
3905     __ BIND(rounds24_loop);
3906     __ subw(rscratch2, rscratch2, 1);
3907 
3908     __ eor3(v29, __ T16B, v4, v9, v14);
3909     __ eor3(v26, __ T16B, v1, v6, v11);
3910     __ eor3(v28, __ T16B, v3, v8, v13);
3911     __ eor3(v25, __ T16B, v0, v5, v10);
3912     __ eor3(v27, __ T16B, v2, v7, v12);
3913     __ eor3(v29, __ T16B, v29, v19, v24);
3914     __ eor3(v26, __ T16B, v26, v16, v21);
3915     __ eor3(v28, __ T16B, v28, v18, v23);
3916     __ eor3(v25, __ T16B, v25, v15, v20);
3917     __ eor3(v27, __ T16B, v27, v17, v22);
3918 
3919     __ rax1(v30, __ T2D, v29, v26);
3920     __ rax1(v26, __ T2D, v26, v28);
3921     __ rax1(v28, __ T2D, v28, v25);
3922     __ rax1(v25, __ T2D, v25, v27);
3923     __ rax1(v27, __ T2D, v27, v29);
3924 
3925     __ eor(v0, __ T16B, v0, v30);
3926     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3927     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3928     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3929     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3930     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3931     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3932     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3933     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3934     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3935     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3936     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3937     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3938     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3939     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3940     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3941     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3942     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3943     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3944     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3945     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3946     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3947     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3948     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3949     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3950 
3951     __ bcax(v20, __ T16B, v31, v22, v8);
3952     __ bcax(v21, __ T16B, v8,  v23, v22);
3953     __ bcax(v22, __ T16B, v22, v24, v23);
3954     __ bcax(v23, __ T16B, v23, v31, v24);
3955     __ bcax(v24, __ T16B, v24, v8,  v31);
3956 
3957     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3958 
3959     __ bcax(v17, __ T16B, v25, v19, v3);
3960     __ bcax(v18, __ T16B, v3,  v15, v19);
3961     __ bcax(v19, __ T16B, v19, v16, v15);
3962     __ bcax(v15, __ T16B, v15, v25, v16);
3963     __ bcax(v16, __ T16B, v16, v3,  v25);
3964 
3965     __ bcax(v10, __ T16B, v29, v12, v26);
3966     __ bcax(v11, __ T16B, v26, v13, v12);
3967     __ bcax(v12, __ T16B, v12, v14, v13);
3968     __ bcax(v13, __ T16B, v13, v29, v14);
3969     __ bcax(v14, __ T16B, v14, v26, v29);
3970 
3971     __ bcax(v7, __ T16B, v30, v9,  v4);
3972     __ bcax(v8, __ T16B, v4,  v5,  v9);
3973     __ bcax(v9, __ T16B, v9,  v6,  v5);
3974     __ bcax(v5, __ T16B, v5,  v30, v6);
3975     __ bcax(v6, __ T16B, v6,  v4,  v30);
3976 
3977     __ bcax(v3, __ T16B, v27, v0,  v28);
3978     __ bcax(v4, __ T16B, v28, v1,  v0);
3979     __ bcax(v0, __ T16B, v0,  v2,  v1);
3980     __ bcax(v1, __ T16B, v1,  v27, v2);
3981     __ bcax(v2, __ T16B, v2,  v28, v27);
3982 
3983     __ eor(v0, __ T16B, v0, v31);
3984 
3985     __ cbnzw(rscratch2, rounds24_loop);
3986 
3987     if (multi_block) {
3988       // block_size =  200 - 2 * digest_length, ofs += block_size
3989       __ add(ofs, ofs, 200);
3990       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3991 
3992       __ cmp(ofs, limit);
3993       __ br(Assembler::LE, sha3_loop);
3994       __ mov(c_rarg0, ofs); // return ofs
3995     }
3996 
3997     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3998     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3999     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4000     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4001     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4002     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4003     __ st1(v24, __ T1D, state);
4004 
4005     __ ldpd(v14, v15, Address(sp, 48));
4006     __ ldpd(v12, v13, Address(sp, 32));
4007     __ ldpd(v10, v11, Address(sp, 16));
4008     __ ldpd(v8, v9, __ post(sp, 64));
4009 
4010     __ ret(lr);
4011 
4012     return start;
4013   }
4014 
4015   // Safefetch stubs.
4016   void generate_safefetch(const char* name, int size, address* entry,
4017                           address* fault_pc, address* continuation_pc) {
4018     // safefetch signatures:
4019     //   int      SafeFetch32(int*      adr, int      errValue);
4020     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
4021     //
4022     // arguments:
4023     //   c_rarg0 = adr
4024     //   c_rarg1 = errValue
4025     //
4026     // result:
4027     //   r0       = *adr or errValue
4028 
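         // A typical call site looks like:
         //   int v = SafeFetch32((int*)addr, -1);   // yields -1 if the load faults
         // The signal handler redirects a load that faults at fault_pc to
         // continuation_pc, where errValue (still in c_rarg1) becomes the result.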
4029     StubCodeMark mark(this, "StubRoutines", name);
4030 
4031     // Entry point, pc or function descriptor.
4032     *entry = __ pc();
4033 
4034     // Load *adr into c_rarg1, may fault.
4035     *fault_pc = __ pc();
4036     switch (size) {
4037       case 4:
4038         // int32_t
4039         __ ldrw(c_rarg1, Address(c_rarg0, 0));
4040         break;
4041       case 8:
4042         // int64_t
4043         __ ldr(c_rarg1, Address(c_rarg0, 0));
4044         break;
4045       default:
4046         ShouldNotReachHere();
4047     }
4048 
4049     // return errValue or *adr
4050     *continuation_pc = __ pc();
4051     __ mov(r0, c_rarg1);
4052     __ ret(lr);
4053   }
4054 
4055   /**
4056    *  Arguments:
4057    *
4058    * Inputs:
4059    *   c_rarg0   - int crc
4060    *   c_rarg1   - byte* buf
4061    *   c_rarg2   - int length
4062    *
4063    * Output:
4064    *       r0   - int crc result
4065    */
4066   address generate_updateBytesCRC32() {
4067     assert(UseCRC32Intrinsics, "what are we doing here?");
4068 
4069     __ align(CodeEntryAlignment);
4070     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4071 
4072     address start = __ pc();
4073 
4074     const Register crc   = c_rarg0;  // crc
4075     const Register buf   = c_rarg1;  // source java byte array address
4076     const Register len   = c_rarg2;  // length
4077     const Register table0 = c_rarg3; // crc_table address
4078     const Register table1 = c_rarg4;
4079     const Register table2 = c_rarg5;
4080     const Register table3 = c_rarg6;
4081     const Register tmp3 = c_rarg7;
4082 
4083     BLOCK_COMMENT("Entry:");
4084     __ enter(); // required for proper stackwalking of RuntimeStub frame
4085 
4086     __ kernel_crc32(crc, buf, len,
4087               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4088 
4089     __ leave(); // required for proper stackwalking of RuntimeStub frame
4090     __ ret(lr);
4091 
4092     return start;
4093   }
4094 
4095   /**
4096    *  Arguments:
4097    *
4098    * Inputs:
4099    *   c_rarg0   - int crc
4100    *   c_rarg1   - byte* buf
4101    *   c_rarg2   - int length
4102    *   c_rarg3   - int* table
4103    *
4104    * Output:
4105    *       r0   - int crc result
4106    */
4107   address generate_updateBytesCRC32C() {
4108     assert(UseCRC32CIntrinsics, "what are we doing here?");
4109 
4110     __ align(CodeEntryAlignment);
4111     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4112 
4113     address start = __ pc();
4114 
4115     const Register crc   = c_rarg0;  // crc
4116     const Register buf   = c_rarg1;  // source java byte array address
4117     const Register len   = c_rarg2;  // length
4118     const Register table0 = c_rarg3; // crc_table address
4119     const Register table1 = c_rarg4;
4120     const Register table2 = c_rarg5;
4121     const Register table3 = c_rarg6;
4122     const Register tmp3 = c_rarg7;
4123 
4124     BLOCK_COMMENT("Entry:");
4125     __ enter(); // required for proper stackwalking of RuntimeStub frame
4126 
4127     __ kernel_crc32c(crc, buf, len,
4128               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4129 
4130     __ leave(); // required for proper stackwalking of RuntimeStub frame
4131     __ ret(lr);
4132 
4133     return start;
4134   }
4135 
4136   /**
4137    *  Arguments:
4138    *
4139    *  Inputs:
4140    *   c_rarg0   - int   adler
4141    *   c_rarg1   - byte* buff
4142    *   c_rarg2   - int   len
4143    *
4144    * Output:
4145    *   c_rarg0   - int adler result
4146    */
4147   address generate_updateBytesAdler32() {
4148     __ align(CodeEntryAlignment);
4149     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4150     address start = __ pc();
4151 
4152     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4153 
4154     // Aliases
4155     Register adler  = c_rarg0;
4156     Register s1     = c_rarg0;
4157     Register s2     = c_rarg3;
4158     Register buff   = c_rarg1;
4159     Register len    = c_rarg2;
4160     Register nmax  = r4;
4161     Register base  = r5;
4162     Register count = r6;
4163     Register temp0 = rscratch1;
4164     Register temp1 = rscratch2;
4165     FloatRegister vbytes = v0;
4166     FloatRegister vs1acc = v1;
4167     FloatRegister vs2acc = v2;
4168     FloatRegister vtable = v3;
4169 
4170     // Max number of bytes we can process before having to take the mod
4171     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4172     uint64_t BASE = 0xfff1;
4173     uint64_t NMAX = 0x15B0;
4174 
4175     __ mov(base, BASE);
4176     __ mov(nmax, NMAX);
4177 
4178     // Load accumulation coefficients for the upper 16 bits
4179     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4180     __ ld1(vtable, __ T16B, Address(temp0));
4181 
4182     // s1 is initialized to the lower 16 bits of adler
4183     // s2 is initialized to the upper 16 bits of adler
4184     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4185     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4186 
4187     // The pipelined loop needs at least 16 elements for one iteration.
4188     // It checks this itself, but it is cheaper to branch straight to the cleanup loop when len < 16.
4189     __ cmp(len, (u1)16);
4190     __ br(Assembler::HS, L_nmax);
4191     __ cbz(len, L_combine);
4192 
4193     __ bind(L_simple_by1_loop);
4194     __ ldrb(temp0, Address(__ post(buff, 1)));
4195     __ add(s1, s1, temp0);
4196     __ add(s2, s2, s1);
4197     __ subs(len, len, 1);
4198     __ br(Assembler::HI, L_simple_by1_loop);
4199 
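         // The modular reductions below rely on 2^16 == 15 (mod BASE), so a value
         // x can be shrunk with x -> (x >> 16) * 15 + (x & 0xffff), computed here
         // as (x >> 16) * 16 - (x >> 16) plus the zero-extended low half. Zero,
         // one or two such folding steps (depending on how large the accumulator
         // can have grown), followed by a conditional subtract of BASE, give the
         // exact remainder.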
4200     // s1 = s1 % BASE
4201     __ subs(temp0, s1, base);
4202     __ csel(s1, temp0, s1, Assembler::HS);
4203 
4204     // s2 = s2 % BASE
4205     __ lsr(temp0, s2, 16);
4206     __ lsl(temp1, temp0, 4);
4207     __ sub(temp1, temp1, temp0);
4208     __ add(s2, temp1, s2, ext::uxth);
4209 
4210     __ subs(temp0, s2, base);
4211     __ csel(s2, temp0, s2, Assembler::HS);
4212 
4213     __ b(L_combine);
4214 
4215     __ bind(L_nmax);
4216     __ subs(len, len, nmax);
4217     __ sub(count, nmax, 16);
4218     __ br(Assembler::LO, L_by16);
4219 
4220     __ bind(L_nmax_loop);
4221 
4222     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4223                                       vbytes, vs1acc, vs2acc, vtable);
4224 
4225     __ subs(count, count, 16);
4226     __ br(Assembler::HS, L_nmax_loop);
4227 
4228     // s1 = s1 % BASE
4229     __ lsr(temp0, s1, 16);
4230     __ lsl(temp1, temp0, 4);
4231     __ sub(temp1, temp1, temp0);
4232     __ add(temp1, temp1, s1, ext::uxth);
4233 
4234     __ lsr(temp0, temp1, 16);
4235     __ lsl(s1, temp0, 4);
4236     __ sub(s1, s1, temp0);
4237     __ add(s1, s1, temp1, ext::uxth);
4238 
4239     __ subs(temp0, s1, base);
4240     __ csel(s1, temp0, s1, Assembler::HS);
4241 
4242     // s2 = s2 % BASE
4243     __ lsr(temp0, s2, 16);
4244     __ lsl(temp1, temp0, 4);
4245     __ sub(temp1, temp1, temp0);
4246     __ add(temp1, temp1, s2, ext::uxth);
4247 
4248     __ lsr(temp0, temp1, 16);
4249     __ lsl(s2, temp0, 4);
4250     __ sub(s2, s2, temp0);
4251     __ add(s2, s2, temp1, ext::uxth);
4252 
4253     __ subs(temp0, s2, base);
4254     __ csel(s2, temp0, s2, Assembler::HS);
4255 
4256     __ subs(len, len, nmax);
4257     __ sub(count, nmax, 16);
4258     __ br(Assembler::HS, L_nmax_loop);
4259 
4260     __ bind(L_by16);
4261     __ adds(len, len, count);
4262     __ br(Assembler::LO, L_by1);
4263 
4264     __ bind(L_by16_loop);
4265 
4266     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4267                                       vbytes, vs1acc, vs2acc, vtable);
4268 
4269     __ subs(len, len, 16);
4270     __ br(Assembler::HS, L_by16_loop);
4271 
4272     __ bind(L_by1);
4273     __ adds(len, len, 15);
4274     __ br(Assembler::LO, L_do_mod);
4275 
4276     __ bind(L_by1_loop);
4277     __ ldrb(temp0, Address(__ post(buff, 1)));
4278     __ add(s1, temp0, s1);
4279     __ add(s2, s2, s1);
4280     __ subs(len, len, 1);
4281     __ br(Assembler::HS, L_by1_loop);
4282 
4283     __ bind(L_do_mod);
4284     // s1 = s1 % BASE
4285     __ lsr(temp0, s1, 16);
4286     __ lsl(temp1, temp0, 4);
4287     __ sub(temp1, temp1, temp0);
4288     __ add(temp1, temp1, s1, ext::uxth);
4289 
4290     __ lsr(temp0, temp1, 16);
4291     __ lsl(s1, temp0, 4);
4292     __ sub(s1, s1, temp0);
4293     __ add(s1, s1, temp1, ext::uxth);
4294 
4295     __ subs(temp0, s1, base);
4296     __ csel(s1, temp0, s1, Assembler::HS);
4297 
4298     // s2 = s2 % BASE
4299     __ lsr(temp0, s2, 16);
4300     __ lsl(temp1, temp0, 4);
4301     __ sub(temp1, temp1, temp0);
4302     __ add(temp1, temp1, s2, ext::uxth);
4303 
4304     __ lsr(temp0, temp1, 16);
4305     __ lsl(s2, temp0, 4);
4306     __ sub(s2, s2, temp0);
4307     __ add(s2, s2, temp1, ext::uxth);
4308 
4309     __ subs(temp0, s2, base);
4310     __ csel(s2, temp0, s2, Assembler::HS);
4311 
4312     // Combine lower bits and higher bits
4313     __ bind(L_combine);
4314     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4315 
4316     __ ret(lr);
4317 
4318     return start;
4319   }
4320 
4321   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4322           Register temp0, Register temp1, FloatRegister vbytes,
4323           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4324     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4325     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4326     // In non-vectorized code, we update s1 and s2 as:
4327     //   s1 <- s1 + b1
4328     //   s2 <- s2 + s1
4329     //   s1 <- s1 + b2
4330     //   s2 <- s2 + s1
4331     //   ...
4332     //   s1 <- s1 + b16
4333     //   s2 <- s2 + s1
4334     // Putting above assignments together, we have:
4335     //   s1_new = s1 + b1 + b2 + ... + b16
4336     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4337     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4338     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
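         // A hedged scalar equivalent of one 16-byte step (illustrative only):
         //   uint32_t sum = 0, dot = 0;
         //   for (int i = 1; i <= 16; i++) { sum += b[i]; dot += (17 - i) * b[i]; }
         //   s2 += 16 * s1 + dot;
         //   s1 += sum;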
4339     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4340 
4341     // s2 = s2 + s1 * 16
4342     __ add(s2, s2, s1, Assembler::LSL, 4);
4343 
4344     // vs1acc = b1 + b2 + b3 + ... + b16
4345     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4346     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4347     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4348     __ uaddlv(vs1acc, __ T16B, vbytes);
4349     __ uaddlv(vs2acc, __ T8H, vs2acc);
4350 
4351     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4352     __ fmovd(temp0, vs1acc);
4353     __ fmovd(temp1, vs2acc);
4354     __ add(s1, s1, temp0);
4355     __ add(s2, s2, temp1);
4356   }
4357 
4358   /**
4359    *  Arguments:
4360    *
4361    *  Input:
4362    *    c_rarg0   - x address
4363    *    c_rarg1   - x length
4364    *    c_rarg2   - y address
4365    *    c_rarg3   - y length
4366    *    c_rarg4   - z address
4367    *    c_rarg5   - z length
4368    */
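       // A hedged summary (assumption, not taken from this file): this is the intrinsic
       // for BigInteger.multiplyToLen, i.e. it computes the full product of the two
       // 32-bit-word magnitude arrays, z = x * y (zlen is typically xlen + ylen).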
4369   address generate_multiplyToLen() {
4370     __ align(CodeEntryAlignment);
4371     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4372 
4373     address start = __ pc();
4374     const Register x     = r0;
4375     const Register xlen  = r1;
4376     const Register y     = r2;
4377     const Register ylen  = r3;
4378     const Register z     = r4;
4379     const Register zlen  = r5;
4380 
4381     const Register tmp1  = r10;
4382     const Register tmp2  = r11;
4383     const Register tmp3  = r12;
4384     const Register tmp4  = r13;
4385     const Register tmp5  = r14;
4386     const Register tmp6  = r15;
4387     const Register tmp7  = r16;
4388 
4389     BLOCK_COMMENT("Entry:");
4390     __ enter(); // required for proper stackwalking of RuntimeStub frame
4391     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4392     __ leave(); // required for proper stackwalking of RuntimeStub frame
4393     __ ret(lr);
4394 
4395     return start;
4396   }
4397 
4398   address generate_squareToLen() {
4399     // The squareToLen algorithm for sizes 1..127 described in the Java code
4400     // is faster than multiply_to_len on some CPUs and slower on others, but
4401     // multiply_to_len shows slightly better results overall.
4402     __ align(CodeEntryAlignment);
4403     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4404     address start = __ pc();
4405 
4406     const Register x     = r0;
4407     const Register xlen  = r1;
4408     const Register z     = r2;
4409     const Register zlen  = r3;
4410     const Register y     = r4; // == x
4411     const Register ylen  = r5; // == xlen
4412 
4413     const Register tmp1  = r10;
4414     const Register tmp2  = r11;
4415     const Register tmp3  = r12;
4416     const Register tmp4  = r13;
4417     const Register tmp5  = r14;
4418     const Register tmp6  = r15;
4419     const Register tmp7  = r16;
4420 
4421     RegSet spilled_regs = RegSet::of(y, ylen);
4422     BLOCK_COMMENT("Entry:");
4423     __ enter();
4424     __ push(spilled_regs, sp);
4425     __ mov(y, x);
4426     __ mov(ylen, xlen);
4427     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4428     __ pop(spilled_regs, sp);
4429     __ leave();
4430     __ ret(lr);
4431     return start;
4432   }
4433 
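       // A hedged summary (assumption, not taken from this file): mul_add performs a
       // word-by-word multiply-accumulate with carry over 32-bit limbs, in the spirit of
       //   uint64_t carry = 0;
       //   for (each 32-bit limb, least significant first) {
       //     uint64_t p = (uint64_t)in_limb * (uint32_t)k + out_limb + carry;
       //     out_limb   = (uint32_t)p;
       //     carry      = p >> 32;
       //   }
       //   return (uint32_t)carry;   // returned in r0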
4434   address generate_mulAdd() {
4435     __ align(CodeEntryAlignment);
4436     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4437 
4438     address start = __ pc();
4439 
4440     const Register out     = r0;
4441     const Register in      = r1;
4442     const Register offset  = r2;
4443     const Register len     = r3;
4444     const Register k       = r4;
4445 
4446     BLOCK_COMMENT("Entry:");
4447     __ enter();
4448     __ mul_add(out, in, offset, len, k);
4449     __ leave();
4450     __ ret(lr);
4451 
4452     return start;
4453   }
4454 
4455   // Arguments:
4456   //
4457   // Input:
4458   //   c_rarg0   - newArr address
4459   //   c_rarg1   - oldArr address
4460   //   c_rarg2   - newIdx
4461   //   c_rarg3   - shiftCount
4462   //   c_rarg4   - numIter
4463   //
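       // A hedged scalar sketch of what this stub computes (illustrative only,
       // assuming 0 < shiftCount < 32; it mirrors the Java worker it intrinsifies):
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount));
       //   }
       // The SIMD loop below walks the arrays from the high indices downwards, 4 ints at a time.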
4464   address generate_bigIntegerRightShift() {
4465     __ align(CodeEntryAlignment);
4466     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4467     address start = __ pc();
4468 
4469     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4470 
4471     Register newArr        = c_rarg0;
4472     Register oldArr        = c_rarg1;
4473     Register newIdx        = c_rarg2;
4474     Register shiftCount    = c_rarg3;
4475     Register numIter       = c_rarg4;
4476     Register idx           = numIter;
4477 
4478     Register newArrCur     = rscratch1;
4479     Register shiftRevCount = rscratch2;
4480     Register oldArrCur     = r13;
4481     Register oldArrNext    = r14;
4482 
4483     FloatRegister oldElem0        = v0;
4484     FloatRegister oldElem1        = v1;
4485     FloatRegister newElem         = v2;
4486     FloatRegister shiftVCount     = v3;
4487     FloatRegister shiftVRevCount  = v4;
4488 
4489     __ cbz(idx, Exit);
4490 
4491     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4492 
4493     // left shift count
4494     __ movw(shiftRevCount, 32);
4495     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4496 
4497     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code below
4498     __ cmp(numIter, (u1)4);
4499     __ br(Assembler::LT, ShiftThree);
4500 
4501     __ dup(shiftVCount,    __ T4S, shiftCount);
4502     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4503     __ negr(shiftVCount,   __ T4S, shiftVCount);
4504 
4505     __ BIND(ShiftSIMDLoop);
4506 
4507     // Calculate the load addresses
4508     __ sub(idx, idx, 4);
4509     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4510     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4511     __ add(oldArrCur,  oldArrNext, 4);
4512 
4513     // Load 4 words and process
4514     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4515     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4516     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4517     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4518     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4519     __ st1(newElem,   __ T4S,  Address(newArrCur));
4520 
4521     __ cmp(idx, (u1)4);
4522     __ br(Assembler::LT, ShiftTwoLoop);
4523     __ b(ShiftSIMDLoop);
4524 
4525     __ BIND(ShiftTwoLoop);
4526     __ cbz(idx, Exit);
4527     __ cmp(idx, (u1)1);
4528     __ br(Assembler::EQ, ShiftOne);
4529 
4530     // Calculate the load addresses
4531     __ sub(idx, idx, 2);
4532     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4533     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4534     __ add(oldArrCur,  oldArrNext, 4);
4535 
4536     // Load 2 words and process
4537     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4538     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4539     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4540     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4541     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4542     __ st1(newElem,   __ T2S, Address(newArrCur));
4543     __ b(ShiftTwoLoop);
4544 
4545     __ BIND(ShiftThree);
4546     __ tbz(idx, 1, ShiftOne);
4547     __ tbz(idx, 0, ShiftTwo);
4548     __ ldrw(r10,  Address(oldArr, 12));
4549     __ ldrw(r11,  Address(oldArr, 8));
4550     __ lsrvw(r10, r10, shiftCount);
4551     __ lslvw(r11, r11, shiftRevCount);
4552     __ orrw(r12,  r10, r11);
4553     __ strw(r12,  Address(newArr, 8));
4554 
4555     __ BIND(ShiftTwo);
4556     __ ldrw(r10,  Address(oldArr, 8));
4557     __ ldrw(r11,  Address(oldArr, 4));
4558     __ lsrvw(r10, r10, shiftCount);
4559     __ lslvw(r11, r11, shiftRevCount);
4560     __ orrw(r12,  r10, r11);
4561     __ strw(r12,  Address(newArr, 4));
4562 
4563     __ BIND(ShiftOne);
4564     __ ldrw(r10,  Address(oldArr, 4));
4565     __ ldrw(r11,  Address(oldArr));
4566     __ lsrvw(r10, r10, shiftCount);
4567     __ lslvw(r11, r11, shiftRevCount);
4568     __ orrw(r12,  r10, r11);
4569     __ strw(r12,  Address(newArr));
4570 
4571     __ BIND(Exit);
4572     __ ret(lr);
4573 
4574     return start;
4575   }
4576 
4577   // Arguments:
4578   //
4579   // Input:
4580   //   c_rarg0   - newArr address
4581   //   c_rarg1   - oldArr address
4582   //   c_rarg2   - newIdx
4583   //   c_rarg3   - shiftCount
4584   //   c_rarg4   - numIter
4585   //
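       // A hedged scalar sketch (illustrative only, assuming 0 < shiftCount < 32):
       // the mirror image of the right-shift stub above, walking from low indices upwards:
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount));
       //   }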
4586   address generate_bigIntegerLeftShift() {
4587     __ align(CodeEntryAlignment);
4588     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4589     address start = __ pc();
4590 
4591     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4592 
4593     Register newArr        = c_rarg0;
4594     Register oldArr        = c_rarg1;
4595     Register newIdx        = c_rarg2;
4596     Register shiftCount    = c_rarg3;
4597     Register numIter       = c_rarg4;
4598 
4599     Register shiftRevCount = rscratch1;
4600     Register oldArrNext    = rscratch2;
4601 
4602     FloatRegister oldElem0        = v0;
4603     FloatRegister oldElem1        = v1;
4604     FloatRegister newElem         = v2;
4605     FloatRegister shiftVCount     = v3;
4606     FloatRegister shiftVRevCount  = v4;
4607 
4608     __ cbz(numIter, Exit);
4609 
4610     __ add(oldArrNext, oldArr, 4);
4611     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4612 
4613     // right shift count
4614     __ movw(shiftRevCount, 32);
4615     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4616 
4617     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code below
4618     __ cmp(numIter, (u1)4);
4619     __ br(Assembler::LT, ShiftThree);
4620 
4621     __ dup(shiftVCount,     __ T4S, shiftCount);
4622     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4623     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4624 
4625     __ BIND(ShiftSIMDLoop);
4626 
4627     // load 4 words and process
4628     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4629     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4630     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4631     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4632     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4633     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4634     __ sub(numIter,   numIter, 4);
4635 
4636     __ cmp(numIter, (u1)4);
4637     __ br(Assembler::LT, ShiftTwoLoop);
4638     __ b(ShiftSIMDLoop);
4639 
4640     __ BIND(ShiftTwoLoop);
4641     __ cbz(numIter, Exit);
4642     __ cmp(numIter, (u1)1);
4643     __ br(Assembler::EQ, ShiftOne);
4644 
4645     // load 2 words and process
4646     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4647     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4648     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4649     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4650     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4651     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4652     __ sub(numIter,   numIter, 2);
4653     __ b(ShiftTwoLoop);
4654 
4655     __ BIND(ShiftThree);
4656     __ ldrw(r10,  __ post(oldArr, 4));
4657     __ ldrw(r11,  __ post(oldArrNext, 4));
4658     __ lslvw(r10, r10, shiftCount);
4659     __ lsrvw(r11, r11, shiftRevCount);
4660     __ orrw(r12,  r10, r11);
4661     __ strw(r12,  __ post(newArr, 4));
4662     __ tbz(numIter, 1, Exit);
4663     __ tbz(numIter, 0, ShiftOne);
4664 
4665     __ BIND(ShiftTwo);
4666     __ ldrw(r10,  __ post(oldArr, 4));
4667     __ ldrw(r11,  __ post(oldArrNext, 4));
4668     __ lslvw(r10, r10, shiftCount);
4669     __ lsrvw(r11, r11, shiftRevCount);
4670     __ orrw(r12,  r10, r11);
4671     __ strw(r12,  __ post(newArr, 4));
4672 
4673     __ BIND(ShiftOne);
4674     __ ldrw(r10,  Address(oldArr));
4675     __ ldrw(r11,  Address(oldArrNext));
4676     __ lslvw(r10, r10, shiftCount);
4677     __ lsrvw(r11, r11, shiftRevCount);
4678     __ orrw(r12,  r10, r11);
4679     __ strw(r12,  Address(newArr));
4680 
4681     __ BIND(Exit);
4682     __ ret(lr);
4683 
4684     return start;
4685   }
4686 
4687   address generate_count_positives(address &count_positives_long) {
4688     const u1 large_loop_size = 64;
4689     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4690     int dcache_line = VM_Version::dcache_line_size();
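         // A hedged note (illustrative only): the mask above flags negative bytes, i.e.
         // for an 8-byte word w, (w & UPPER_BIT_MASK) != 0 iff at least one byte of w
         // has its sign bit set:
         //   bool has_negative_byte(uint64_t w) { return (w & 0x8080808080808080ULL) != 0; }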
4691 
4692     Register ary1 = r1, len = r2, result = r0;
4693 
4694     __ align(CodeEntryAlignment);
4695 
4696     StubCodeMark mark(this, "StubRoutines", "count_positives");
4697 
4698     address entry = __ pc();
4699 
4700     __ enter();
4701     // precondition: a copy of len is already in result
4702     // __ mov(result, len);
4703 
4704   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4705         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4706 
4707   __ cmp(len, (u1)15);
4708   __ br(Assembler::GT, LEN_OVER_15);
4709   // The only case in which execution reaches this code is when the pointer is near
4710   // the end of a memory page and we must avoid reading from the next page.
4711   __ add(ary1, ary1, len);
4712   __ subs(len, len, 8);
4713   __ br(Assembler::GT, LEN_OVER_8);
4714   __ ldr(rscratch2, Address(ary1, -8));
4715   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4716   __ lsrv(rscratch2, rscratch2, rscratch1);
4717   __ tst(rscratch2, UPPER_BIT_MASK);
4718   __ csel(result, zr, result, Assembler::NE);
4719   __ leave();
4720   __ ret(lr);
4721   __ bind(LEN_OVER_8);
4722   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4723   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4724   __ tst(rscratch2, UPPER_BIT_MASK);
4725   __ br(Assembler::NE, RET_NO_POP);
4726   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4727   __ lsrv(rscratch1, rscratch1, rscratch2);
4728   __ tst(rscratch1, UPPER_BIT_MASK);
4729   __ bind(RET_NO_POP);
4730   __ csel(result, zr, result, Assembler::NE);
4731   __ leave();
4732   __ ret(lr);
4733 
4734   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4735   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4736 
4737   count_positives_long = __ pc(); // 2nd entry point
4738 
4739   __ enter();
4740 
4741   __ bind(LEN_OVER_15);
4742     __ push(spilled_regs, sp);
4743     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4744     __ cbz(rscratch2, ALIGNED);
4745     __ ldp(tmp6, tmp1, Address(ary1));
4746     __ mov(tmp5, 16);
4747     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4748     __ add(ary1, ary1, rscratch1);
4749     __ orr(tmp6, tmp6, tmp1);
4750     __ tst(tmp6, UPPER_BIT_MASK);
4751     __ br(Assembler::NE, RET_ADJUST);
4752     __ sub(len, len, rscratch1);
4753 
4754   __ bind(ALIGNED);
4755     __ cmp(len, large_loop_size);
4756     __ br(Assembler::LT, CHECK_16);
4757     // Perform a 16-byte load as an early-return check in the pre-loop. This handles
4758     // the case where an initially aligned large array has negative values in its
4759     // first bytes; without it, LARGE_LOOP would do up to 4 reads instead of 1 before
4760     // returning, which is slower. Cases with negative bytes further ahead are barely
4761     // affected; in fact they become faster thanks to the early loads and the fewer
4762     // instructions and branches in LARGE_LOOP.
4763     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4764     __ sub(len, len, 16);
4765     __ orr(tmp6, tmp6, tmp1);
4766     __ tst(tmp6, UPPER_BIT_MASK);
4767     __ br(Assembler::NE, RET_ADJUST_16);
4768     __ cmp(len, large_loop_size);
4769     __ br(Assembler::LT, CHECK_16);
4770 
4771     if (SoftwarePrefetchHintDistance >= 0
4772         && SoftwarePrefetchHintDistance >= dcache_line) {
4773       // initial prefetch
4774       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4775     }
4776   __ bind(LARGE_LOOP);
4777     if (SoftwarePrefetchHintDistance >= 0) {
4778       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4779     }
4780     // Issue the load instructions first, since this can save a few CPU/memory cycles.
4781     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
4782     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
4783     // 3 instructions and has fewer branches. The downside is that early return is
4784     // disabled, so all 64 bytes are loaded and checked every time.
4785     __ ldp(tmp2, tmp3, Address(ary1));
4786     __ ldp(tmp4, tmp5, Address(ary1, 16));
4787     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4788     __ ldp(tmp6, tmp1, Address(ary1, 48));
4789     __ add(ary1, ary1, large_loop_size);
4790     __ sub(len, len, large_loop_size);
4791     __ orr(tmp2, tmp2, tmp3);
4792     __ orr(tmp4, tmp4, tmp5);
4793     __ orr(rscratch1, rscratch1, rscratch2);
4794     __ orr(tmp6, tmp6, tmp1);
4795     __ orr(tmp2, tmp2, tmp4);
4796     __ orr(rscratch1, rscratch1, tmp6);
4797     __ orr(tmp2, tmp2, rscratch1);
4798     __ tst(tmp2, UPPER_BIT_MASK);
4799     __ br(Assembler::NE, RET_ADJUST_LONG);
4800     __ cmp(len, large_loop_size);
4801     __ br(Assembler::GE, LARGE_LOOP);
4802 
4803   __ bind(CHECK_16); // small 16-byte load pre-loop
4804     __ cmp(len, (u1)16);
4805     __ br(Assembler::LT, POST_LOOP16);
4806 
4807   __ bind(LOOP16); // small 16-byte load loop
4808     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4809     __ sub(len, len, 16);
4810     __ orr(tmp2, tmp2, tmp3);
4811     __ tst(tmp2, UPPER_BIT_MASK);
4812     __ br(Assembler::NE, RET_ADJUST_16);
4813     __ cmp(len, (u1)16);
4814     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4815 
4816   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4817     __ cmp(len, (u1)8);
4818     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4819     __ ldr(tmp3, Address(__ post(ary1, 8)));
4820     __ tst(tmp3, UPPER_BIT_MASK);
4821     __ br(Assembler::NE, RET_ADJUST);
4822     __ sub(len, len, 8);
4823 
4824   __ bind(POST_LOOP16_LOAD_TAIL);
4825     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4826     __ ldr(tmp1, Address(ary1));
4827     __ mov(tmp2, 64);
4828     __ sub(tmp4, tmp2, len, __ LSL, 3);
4829     __ lslv(tmp1, tmp1, tmp4);
4830     __ tst(tmp1, UPPER_BIT_MASK);
4831     __ br(Assembler::NE, RET_ADJUST);
4832     // Fallthrough
4833 
4834   __ bind(RET_LEN);
4835     __ pop(spilled_regs, sp);
4836     __ leave();
4837     __ ret(lr);
4838 
4839     // The difference result - len is the count of bytes guaranteed to be
4840     // positive (i.e. checked before a negative byte was encountered)
4841 
4842   __ bind(RET_ADJUST_LONG);
4843     __ add(len, len, (u1)(large_loop_size - 16));
4844   __ bind(RET_ADJUST_16);
4845     __ add(len, len, 16);
4846   __ bind(RET_ADJUST);
4847     __ pop(spilled_regs, sp);
4848     __ leave();
4849     __ sub(result, result, len);
4850     __ ret(lr);
4851 
4852     return entry;
4853   }
4854 
4855   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4856         bool usePrefetch, Label &NOT_EQUAL) {
4857     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4858         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4859         tmp7 = r12, tmp8 = r13;
4860     Label LOOP;
4861 
4862     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4863     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4864     __ bind(LOOP);
4865     if (usePrefetch) {
4866       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4867       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4868     }
4869     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4870     __ eor(tmp1, tmp1, tmp2);
4871     __ eor(tmp3, tmp3, tmp4);
4872     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4873     __ orr(tmp1, tmp1, tmp3);
4874     __ cbnz(tmp1, NOT_EQUAL);
4875     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4876     __ eor(tmp5, tmp5, tmp6);
4877     __ eor(tmp7, tmp7, tmp8);
4878     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4879     __ orr(tmp5, tmp5, tmp7);
4880     __ cbnz(tmp5, NOT_EQUAL);
4881     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4882     __ eor(tmp1, tmp1, tmp2);
4883     __ eor(tmp3, tmp3, tmp4);
4884     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4885     __ orr(tmp1, tmp1, tmp3);
4886     __ cbnz(tmp1, NOT_EQUAL);
4887     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4888     __ eor(tmp5, tmp5, tmp6);
4889     __ sub(cnt1, cnt1, 8 * wordSize);
4890     __ eor(tmp7, tmp7, tmp8);
4891     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4892     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4893     // cmp) because subs allows an unlimited range of immediate operands.
4894     __ subs(tmp6, cnt1, loopThreshold);
4895     __ orr(tmp5, tmp5, tmp7);
4896     __ cbnz(tmp5, NOT_EQUAL);
4897     __ br(__ GE, LOOP);
4898     // post-loop
4899     __ eor(tmp1, tmp1, tmp2);
4900     __ eor(tmp3, tmp3, tmp4);
4901     __ orr(tmp1, tmp1, tmp3);
4902     __ sub(cnt1, cnt1, 2 * wordSize);
4903     __ cbnz(tmp1, NOT_EQUAL);
4904   }
4905 
4906   void generate_large_array_equals_loop_simd(int loopThreshold,
4907         bool usePrefetch, Label &NOT_EQUAL) {
4908     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4909         tmp2 = rscratch2;
4910     Label LOOP;
4911 
4912     __ bind(LOOP);
4913     if (usePrefetch) {
4914       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4915       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4916     }
4917     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4918     __ sub(cnt1, cnt1, 8 * wordSize);
4919     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4920     __ subs(tmp1, cnt1, loopThreshold);
4921     __ eor(v0, __ T16B, v0, v4);
4922     __ eor(v1, __ T16B, v1, v5);
4923     __ eor(v2, __ T16B, v2, v6);
4924     __ eor(v3, __ T16B, v3, v7);
4925     __ orr(v0, __ T16B, v0, v1);
4926     __ orr(v1, __ T16B, v2, v3);
4927     __ orr(v0, __ T16B, v0, v1);
4928     __ umov(tmp1, v0, __ D, 0);
4929     __ umov(tmp2, v0, __ D, 1);
4930     __ orr(tmp1, tmp1, tmp2);
4931     __ cbnz(tmp1, NOT_EQUAL);
4932     __ br(__ GE, LOOP);
4933   }
4934 
4935   // a1 = r1 - array1 address
4936   // a2 = r2 - array2 address
4937   // result = r0 - return value. Already contains "false"
4938   // cnt1 = r10 - number of elements left to check, reduced by wordSize
4939   // r3-r5 are reserved temporary registers
4940   address generate_large_array_equals() {
4941     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4942         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4943         tmp7 = r12, tmp8 = r13;
4944     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4945         SMALL_LOOP, POST_LOOP;
4946     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4947     // threshold chosen so that at least 32 of the prefetched bytes are actually used
4948     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4949     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4950     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4951     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4952         tmp5, tmp6, tmp7, tmp8);
4953 
4954     __ align(CodeEntryAlignment);
4955 
4956     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4957 
4958     address entry = __ pc();
4959     __ enter();
4960     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4961     // also advance pointers to use post-increment instead of pre-increment
4962     __ add(a1, a1, wordSize);
4963     __ add(a2, a2, wordSize);
4964     if (AvoidUnalignedAccesses) {
4965       // Both implementations (SIMD and non-SIMD) use relatively wide load
4966       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
4967       // time) on some CPUs when the address is not at least 16-byte aligned.
4968       // Arrays are currently 8-byte aligned, so, if needed, we do one extra
4969       // 8-byte load for the first array to make its address 16-byte aligned.
4970       Label ALIGNED16;
4971       __ tbz(a1, 3, ALIGNED16);
4972       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4973       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4974       __ sub(cnt1, cnt1, wordSize);
4975       __ eor(tmp1, tmp1, tmp2);
4976       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4977       __ bind(ALIGNED16);
4978     }
4979     if (UseSIMDForArrayEquals) {
4980       if (SoftwarePrefetchHintDistance >= 0) {
4981         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4982         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4983         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4984             /* prfm = */ true, NOT_EQUAL);
4985         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4986         __ br(__ LT, TAIL);
4987       }
4988       __ bind(NO_PREFETCH_LARGE_LOOP);
4989       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4990           /* prfm = */ false, NOT_EQUAL);
4991     } else {
4992       __ push(spilled_regs, sp);
4993       if (SoftwarePrefetchHintDistance >= 0) {
4994         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4995         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4996         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4997             /* prfm = */ true, NOT_EQUAL);
4998         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4999         __ br(__ LT, TAIL);
5000       }
5001       __ bind(NO_PREFETCH_LARGE_LOOP);
5002       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5003           /* prfm = */ false, NOT_EQUAL);
5004     }
5005     __ bind(TAIL);
5006       __ cbz(cnt1, EQUAL);
5007       __ subs(cnt1, cnt1, wordSize);
5008       __ br(__ LE, POST_LOOP);
5009     __ bind(SMALL_LOOP);
5010       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5011       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5012       __ subs(cnt1, cnt1, wordSize);
5013       __ eor(tmp1, tmp1, tmp2);
5014       __ cbnz(tmp1, NOT_EQUAL);
5015       __ br(__ GT, SMALL_LOOP);
5016     __ bind(POST_LOOP);
5017       __ ldr(tmp1, Address(a1, cnt1));
5018       __ ldr(tmp2, Address(a2, cnt1));
5019       __ eor(tmp1, tmp1, tmp2);
5020       __ cbnz(tmp1, NOT_EQUAL);
5021     __ bind(EQUAL);
5022       __ mov(result, true);
5023     __ bind(NOT_EQUAL);
5024       if (!UseSIMDForArrayEquals) {
5025         __ pop(spilled_regs, sp);
5026       }
5027     __ bind(NOT_EQUAL_NO_POP);
5028     __ leave();
5029     __ ret(lr);
5030     return entry;
5031   }
5032 
5033   address generate_dsin_dcos(bool isCos) {
5034     __ align(CodeEntryAlignment);
5035     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5036     address start = __ pc();
5037     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5038         (address)StubRoutines::aarch64::_two_over_pi,
5039         (address)StubRoutines::aarch64::_pio2,
5040         (address)StubRoutines::aarch64::_dsin_coef,
5041         (address)StubRoutines::aarch64::_dcos_coef);
5042     return start;
5043   }
5044 
5045   address generate_dlog() {
5046     __ align(CodeEntryAlignment);
5047     StubCodeMark mark(this, "StubRoutines", "dlog");
5048     address entry = __ pc();
5049     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5050         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5051     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5052     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5053         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5054     return entry;
5055   }
5056 
5057 
5058   // code for comparing 16 characters of a Latin1-encoded string with a UTF-16-encoded string
5059   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5060       Label &DIFF2) {
5061     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5062     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5063 
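         // A hedged note (illustrative only): zip1/zip2 with the zero vector vtmpZ widen
         // the 16 Latin1 bytes to 16 UTF-16 chars. For bytes b0..b15:
         //   zip1 -> { b0,0, b1,0, ..., b7,0 }    zip2 -> { b8,0, ..., b15,0 }
         // which, read as little-endian 16-bit values, are the chars b0..b7 and b8..b15.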
5064     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5065     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5066     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5067     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5068 
5069     __ fmovd(tmpL, vtmp3);
5070     __ eor(rscratch2, tmp3, tmpL);
5071     __ cbnz(rscratch2, DIFF2);
5072 
5073     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5074     __ umov(tmpL, vtmp3, __ D, 1);
5075     __ eor(rscratch2, tmpU, tmpL);
5076     __ cbnz(rscratch2, DIFF1);
5077 
5078     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5079     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5080     __ fmovd(tmpL, vtmp);
5081     __ eor(rscratch2, tmp3, tmpL);
5082     __ cbnz(rscratch2, DIFF2);
5083 
5084     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5085     __ umov(tmpL, vtmp, __ D, 1);
5086     __ eor(rscratch2, tmpU, tmpL);
5087     __ cbnz(rscratch2, DIFF1);
5088   }
5089 
5090   // r0  = result
5091   // r1  = str1
5092   // r2  = cnt1
5093   // r3  = str2
5094   // r4  = cnt2
5095   // r10 = tmp1
5096   // r11 = tmp2
5097   address generate_compare_long_string_different_encoding(bool isLU) {
5098     __ align(CodeEntryAlignment);
5099     StubCodeMark mark(this, "StubRoutines", isLU
5100         ? "compare_long_string_different_encoding LU"
5101         : "compare_long_string_different_encoding UL");
5102     address entry = __ pc();
5103     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5104         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5105         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5106     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5107         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5108     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5109     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5110 
5111     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5112 
5113     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5114     // cnt2 == number of characters left to compare
5115     // Check the already loaded first 4 symbols (in vtmp and tmp2 (LU) / tmp1 (UL))
5116     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5117     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5118     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5119     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5120     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5121     __ eor(rscratch2, tmp1, tmp2);
5122     __ mov(rscratch1, tmp2);
5123     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5124     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5125              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5126     __ push(spilled_regs, sp);
5127     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5128     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5129 
5130     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5131 
5132     if (SoftwarePrefetchHintDistance >= 0) {
5133       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5134       __ br(__ LT, NO_PREFETCH);
5135       __ bind(LARGE_LOOP_PREFETCH);
5136         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5137         __ mov(tmp4, 2);
5138         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5139         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5140           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5141           __ subs(tmp4, tmp4, 1);
5142           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5143           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5144           __ mov(tmp4, 2);
5145         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5146           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5147           __ subs(tmp4, tmp4, 1);
5148           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5149           __ sub(cnt2, cnt2, 64);
5150           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5151           __ br(__ GE, LARGE_LOOP_PREFETCH);
5152     }
5153     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5154     __ bind(NO_PREFETCH);
5155     __ subs(cnt2, cnt2, 16);
5156     __ br(__ LT, TAIL);
5157     __ align(OptoLoopAlignment);
5158     __ bind(SMALL_LOOP); // smaller loop
5159       __ subs(cnt2, cnt2, 16);
5160       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5161       __ br(__ GE, SMALL_LOOP);
5162       __ cmn(cnt2, (u1)16);
5163       __ br(__ EQ, LOAD_LAST);
5164     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5165       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5166       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5167       __ ldr(tmp3, Address(cnt1, -8));
5168       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5169       __ b(LOAD_LAST);
5170     __ bind(DIFF2);
5171       __ mov(tmpU, tmp3);
5172     __ bind(DIFF1);
5173       __ pop(spilled_regs, sp);
5174       __ b(CALCULATE_DIFFERENCE);
5175     __ bind(LOAD_LAST);
5176       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
5177       // so there is no need to load them again.
5178       __ mov(tmpU, tmp3);
5179       __ pop(spilled_regs, sp);
5180 
5181       // tmp2 points to the address of the last 4 Latin1 characters right now
5182       __ ldrs(vtmp, Address(tmp2));
5183       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5184       __ fmovd(tmpL, vtmp);
5185 
5186       __ eor(rscratch2, tmpU, tmpL);
5187       __ cbz(rscratch2, DONE);
5188 
5189     // Find the first different characters in the longwords and
5190     // compute their difference.
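         // A hedged sketch of the extraction below (illustrative only): rscratch2 holds the
         // XOR of the two longwords; byte-reversing it makes clz count from the character
         // stored at the lowest address, so
         //   k = clz(byte_reverse(a ^ b)) & ~15;   // bit offset of the first differing char
         //   result = (uint16_t)(a >> k) - (uint16_t)(b >> k);
         // where a and b are the longwords held in tmp1 and rscratch1.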
5191     __ bind(CALCULATE_DIFFERENCE);
5192       __ rev(rscratch2, rscratch2);
5193       __ clz(rscratch2, rscratch2);
5194       __ andr(rscratch2, rscratch2, -16);
5195       __ lsrv(tmp1, tmp1, rscratch2);
5196       __ uxthw(tmp1, tmp1);
5197       __ lsrv(rscratch1, rscratch1, rscratch2);
5198       __ uxthw(rscratch1, rscratch1);
5199       __ subw(result, tmp1, rscratch1);
5200     __ bind(DONE);
5201       __ ret(lr);
5202     return entry;
5203   }
5204 
5205   address generate_method_entry_barrier() {
5206     __ align(CodeEntryAlignment);
5207     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5208 
5209     Label deoptimize_label;
5210 
5211     address start = __ pc();
5212 
5213     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5214 
5215     __ enter();
5216     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5217 
5218     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5219 
5220     __ push_call_clobbered_registers();
5221 
5222     __ mov(c_rarg0, rscratch2);
5223     __ call_VM_leaf
5224          (CAST_FROM_FN_PTR
5225           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5226 
5227     __ reset_last_Java_frame(true);
5228 
5229     __ mov(rscratch1, r0);
5230 
5231     __ pop_call_clobbered_registers();
5232 
5233     __ cbnz(rscratch1, deoptimize_label);
5234 
5235     __ leave();
5236     __ ret(lr);
5237 
5238     __ BIND(deoptimize_label);
5239 
5240     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5241     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5242 
5243     __ mov(sp, rscratch1);
5244     __ br(rscratch2);
5245 
5246     return start;
5247   }
5248 
5249   // r0  = result
5250   // r1  = str1
5251   // r2  = cnt1
5252   // r3  = str2
5253   // r4  = cnt2
5254   // r10 = tmp1
5255   // r11 = tmp2
5256   address generate_compare_long_string_same_encoding(bool isLL) {
5257     __ align(CodeEntryAlignment);
5258     StubCodeMark mark(this, "StubRoutines", isLL
5259         ? "compare_long_string_same_encoding LL"
5260         : "compare_long_string_same_encoding UU");
5261     address entry = __ pc();
5262     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5263         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5264 
5265     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5266 
5267     // exit from the large loop when fewer than 64 bytes are left to read or when
5268     // we are about to prefetch memory past the end of the array
5269     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5270 
5271     // the first 8 bytes are already pre-loaded before jumping to the stub, so compare them directly
5272     __ eor(rscratch2, tmp1, tmp2);
5273     __ cbnz(rscratch2, CAL_DIFFERENCE);
5274 
5275     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5276     // update pointers, because of previous read
5277     __ add(str1, str1, wordSize);
5278     __ add(str2, str2, wordSize);
5279     if (SoftwarePrefetchHintDistance >= 0) {
5280       __ align(OptoLoopAlignment);
5281       __ bind(LARGE_LOOP_PREFETCH);
5282         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5283         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5284 
5285         for (int i = 0; i < 4; i++) {
5286           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5287           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5288           __ cmp(tmp1, tmp2);
5289           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5290           __ br(Assembler::NE, DIFF);
5291         }
5292         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5293         __ add(str1, str1, 64);
5294         __ add(str2, str2, 64);
5295         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5296         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5297         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5298     }
5299 
5300     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5301     __ br(Assembler::LE, LESS16);
5302     __ align(OptoLoopAlignment);
5303     __ bind(LOOP_COMPARE16);
5304       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5305       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5306       __ cmp(tmp1, tmp2);
5307       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5308       __ br(Assembler::NE, DIFF);
5309       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5310       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5311       __ br(Assembler::LT, LESS16);
5312 
5313       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5314       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5315       __ cmp(tmp1, tmp2);
5316       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5317       __ br(Assembler::NE, DIFF);
5318       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5319       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5320       __ br(Assembler::GE, LOOP_COMPARE16);
5321       __ cbz(cnt2, LENGTH_DIFF);
5322 
5323     __ bind(LESS16);
5324       // compare 8 bytes at a time
5325       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5326       __ br(Assembler::LE, LESS8);
5327       __ ldr(tmp1, Address(__ post(str1, 8)));
5328       __ ldr(tmp2, Address(__ post(str2, 8)));
5329       __ eor(rscratch2, tmp1, tmp2);
5330       __ cbnz(rscratch2, CAL_DIFFERENCE);
5331       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5332 
5333     __ bind(LESS8); // directly load last 8 bytes
5334       if (!isLL) {
5335         __ add(cnt2, cnt2, cnt2);
5336       }
5337       __ ldr(tmp1, Address(str1, cnt2));
5338       __ ldr(tmp2, Address(str2, cnt2));
5339       __ eor(rscratch2, tmp1, tmp2);
5340       __ cbz(rscratch2, LENGTH_DIFF);
5341       __ b(CAL_DIFFERENCE);
5342 
5343     __ bind(DIFF);
5344       __ cmp(tmp1, tmp2);
5345       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5346       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5347       // reuse rscratch2 register for the result of eor instruction
5348       __ eor(rscratch2, tmp1, tmp2);
5349 
5350     __ bind(CAL_DIFFERENCE);
5351       __ rev(rscratch2, rscratch2);
5352       __ clz(rscratch2, rscratch2);
5353       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5354       __ lsrv(tmp1, tmp1, rscratch2);
5355       __ lsrv(tmp2, tmp2, rscratch2);
5356       if (isLL) {
5357         __ uxtbw(tmp1, tmp1);
5358         __ uxtbw(tmp2, tmp2);
5359       } else {
5360         __ uxthw(tmp1, tmp1);
5361         __ uxthw(tmp2, tmp2);
5362       }
5363       __ subw(result, tmp1, tmp2);
5364 
5365     __ bind(LENGTH_DIFF);
5366       __ ret(lr);
5367     return entry;
5368   }
5369 
5370   void generate_compare_long_strings() {
5371       StubRoutines::aarch64::_compare_long_string_LL
5372           = generate_compare_long_string_same_encoding(true);
5373       StubRoutines::aarch64::_compare_long_string_UU
5374           = generate_compare_long_string_same_encoding(false);
5375       StubRoutines::aarch64::_compare_long_string_LU
5376           = generate_compare_long_string_different_encoding(true);
5377       StubRoutines::aarch64::_compare_long_string_UL
5378           = generate_compare_long_string_different_encoding(false);
5379   }
5380 
5381   // R0 = result
5382   // R1 = str2
5383   // R2 = cnt1
5384   // R3 = str1
5385   // R4 = cnt2
5386   // This generic linear code uses a few additional ideas that make it faster:
5387   // 1) the first register of the pattern can safely be kept live (since length >= 8),
5388   //    which skips the initial load (helps on systems with a single load pipeline)
5389   // 2) a "fast" SWAR algorithm finds the first occurrence of a single character
5390   //    with fewer branches (one branch per loaded register instead of one per
5391   //    symbol); this is where constants like 0x0101...01, 0x00010001...0001,
5392   //    0x7f7f...7f and 0x7fff7fff...7fff come from (see the sketch below)
5393   // 3) after loading and analyzing the first register of the source string, it
5394   //    can be reused to search for every occurrence of the first character,
5395   //    saving a few loads compared with a simpler-but-slower implementation
5396   // 4) to avoid many push/pop operations, the code below heavily reuses,
5397   //    re-initializes and compresses register values, which makes the code
5398   //    larger and a bit less readable; however, most of the extra operations
5399   //    are issued during loads or branches, so the penalty is minimal
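       // A hedged sketch of the SWAR trick referenced in point 2 above (illustrative only,
       // shown for the byte (LL) case; the UTF-16 variant uses 0x0001... and 0x7fff... instead):
       //   uint64_t pattern = first_byte * 0x0101010101010101ULL;  // first character replicated
       //   uint64_t x = chunk ^ pattern;        // has a zero byte exactly where chunk matches
       //   uint64_t t = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       // t != 0 iff chunk contains first_byte; the least significant set top bit marks the
       // first occurrence (higher bits may contain false positives, which the compare loops
       // below reject by re-checking the characters).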
5400   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5401     const char* stubName = str1_isL
5402         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5403         : "indexof_linear_uu";
5404     __ align(CodeEntryAlignment);
5405     StubCodeMark mark(this, "StubRoutines", stubName);
5406     address entry = __ pc();
5407 
5408     int str1_chr_size = str1_isL ? 1 : 2;
5409     int str2_chr_size = str2_isL ? 1 : 2;
5410     int str1_chr_shift = str1_isL ? 0 : 1;
5411     int str2_chr_shift = str2_isL ? 0 : 1;
5412     bool isL = str1_isL && str2_isL;
5413    // parameters
5414     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5415     // temporary registers
5416     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5417     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5418     // redefinitions
5419     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5420 
5421     __ push(spilled_regs, sp);
5422     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5423         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5424         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5425         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5426         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5427         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5428     // Read whole register from str1. It is safe, because length >=8 here
5429     __ ldr(ch1, Address(str1));
5430     // Read whole register from str2. It is safe, because length >=8 here
5431     __ ldr(ch2, Address(str2));
5432     __ sub(cnt2, cnt2, cnt1);
5433     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5434     if (str1_isL != str2_isL) {
5435       __ eor(v0, __ T16B, v0, v0);
5436     }
5437     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5438     __ mul(first, first, tmp1);
5439     // check if we have less than 1 register to check
5440     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5441     if (str1_isL != str2_isL) {
5442       __ fmovd(v1, ch1);
5443     }
5444     __ br(__ LE, L_SMALL);
5445     __ eor(ch2, first, ch2);
5446     if (str1_isL != str2_isL) {
5447       __ zip1(v1, __ T16B, v1, v0);
5448     }
5449     __ sub(tmp2, ch2, tmp1);
5450     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5451     __ bics(tmp2, tmp2, ch2);
5452     if (str1_isL != str2_isL) {
5453       __ fmovd(ch1, v1);
5454     }
5455     __ br(__ NE, L_HAS_ZERO);
5456     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5457     __ add(result, result, wordSize/str2_chr_size);
5458     __ add(str2, str2, wordSize);
5459     __ br(__ LT, L_POST_LOOP);
5460     __ BIND(L_LOOP);
5461       __ ldr(ch2, Address(str2));
5462       __ eor(ch2, first, ch2);
5463       __ sub(tmp2, ch2, tmp1);
5464       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5465       __ bics(tmp2, tmp2, ch2);
5466       __ br(__ NE, L_HAS_ZERO);
5467     __ BIND(L_LOOP_PROCEED);
5468       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5469       __ add(str2, str2, wordSize);
5470       __ add(result, result, wordSize/str2_chr_size);
5471       __ br(__ GE, L_LOOP);
5472     __ BIND(L_POST_LOOP);
5473       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5474       __ br(__ LE, NOMATCH);
5475       __ ldr(ch2, Address(str2));
5476       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5477       __ eor(ch2, first, ch2);
5478       __ sub(tmp2, ch2, tmp1);
5479       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5480       __ mov(tmp4, -1); // all bits set
5481       __ b(L_SMALL_PROCEED);
5482     __ align(OptoLoopAlignment);
5483     __ BIND(L_SMALL);
5484       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5485       __ eor(ch2, first, ch2);
5486       if (str1_isL != str2_isL) {
5487         __ zip1(v1, __ T16B, v1, v0);
5488       }
5489       __ sub(tmp2, ch2, tmp1);
5490       __ mov(tmp4, -1); // all bits set
5491       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5492       if (str1_isL != str2_isL) {
5493         __ fmovd(ch1, v1); // move converted 4 symbols
5494       }
5495     __ BIND(L_SMALL_PROCEED);
5496       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5497       __ bic(tmp2, tmp2, ch2);
5498       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5499       __ rbit(tmp2, tmp2);
5500       __ br(__ EQ, NOMATCH);
5501     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5502       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5503       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5504       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5505       if (str2_isL) { // LL
5506         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5507         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5508         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5509         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5510         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5511       } else {
5512         __ mov(ch2, 0xE); // all bits in byte set except last one
5513         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5514         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5515         __ lslv(tmp2, tmp2, tmp4);
5516         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5517         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5518         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5519         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5520       }
5521       __ cmp(ch1, ch2);
5522       __ mov(tmp4, wordSize/str2_chr_size);
5523       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5524     __ BIND(L_SMALL_CMP_LOOP);
5525       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5526                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5527       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5528                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5529       __ add(tmp4, tmp4, 1);
5530       __ cmp(tmp4, cnt1);
5531       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5532       __ cmp(first, ch2);
5533       __ br(__ EQ, L_SMALL_CMP_LOOP);
5534     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5535       __ cbz(tmp2, NOMATCH); // no more matches. exit
5536       __ clz(tmp4, tmp2);
5537       __ add(result, result, 1); // advance index
5538       __ add(str2, str2, str2_chr_size); // advance pointer
5539       __ b(L_SMALL_HAS_ZERO_LOOP);
5540     __ align(OptoLoopAlignment);
5541     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5542       __ cmp(first, ch2);
5543       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5544       __ b(DONE);
5545     __ align(OptoLoopAlignment);
5546     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5547       if (str2_isL) { // LL
5548         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5549         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5550         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5551         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5552         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5553       } else {
5554         __ mov(ch2, 0xE); // all bits in byte set except last one
5555         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5556         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5557         __ lslv(tmp2, tmp2, tmp4);
5558         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5559         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5560         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5561         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5562       }
5563       __ cmp(ch1, ch2);
5564       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5565       __ b(DONE);
5566     __ align(OptoLoopAlignment);
5567     __ BIND(L_HAS_ZERO);
5568       __ rbit(tmp2, tmp2);
5569       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5570       // Now compress both counters (cnt1 and cnt2) into one register:
5571       // cnt2 := (cnt1 << 32) | cnt2. This is fine because both counters are 32-bit
5572       // and are not changed in this loop; they are restored on exit, which frees cnt1 for reuse here.
5573       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5574       __ sub(result, result, 1);
5575     __ BIND(L_HAS_ZERO_LOOP);
5576       __ mov(cnt1, wordSize/str2_chr_size);
5577       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5578       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5579       if (str2_isL) {
5580         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5581         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5582         __ lslv(tmp2, tmp2, tmp4);
5583         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5584         __ add(tmp4, tmp4, 1);
5585         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5586         __ lsl(tmp2, tmp2, 1);
5587         __ mov(tmp4, wordSize/str2_chr_size);
5588       } else {
5589         __ mov(ch2, 0xE);
5590         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5591         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5592         __ lslv(tmp2, tmp2, tmp4);
5593         __ add(tmp4, tmp4, 1);
5594         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5595         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5596         __ lsl(tmp2, tmp2, 1);
5597         __ mov(tmp4, wordSize/str2_chr_size);
5598         __ sub(str2, str2, str2_chr_size);
5599       }
5600       __ cmp(ch1, ch2);
5601       __ mov(tmp4, wordSize/str2_chr_size);
5602       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5603     __ BIND(L_CMP_LOOP);
5604       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5605                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5606       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5607                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5608       __ add(tmp4, tmp4, 1);
5609       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5610       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5611       __ cmp(cnt1, ch2);
5612       __ br(__ EQ, L_CMP_LOOP);
5613     __ BIND(L_CMP_LOOP_NOMATCH);
5614       // here we're not matched
5615       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5616       __ clz(tmp4, tmp2);
5617       __ add(str2, str2, str2_chr_size); // advance pointer
5618       __ b(L_HAS_ZERO_LOOP);
5619     __ align(OptoLoopAlignment);
5620     __ BIND(L_CMP_LOOP_LAST_CMP);
5621       __ cmp(cnt1, ch2);
5622       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5623       __ b(DONE);
5624     __ align(OptoLoopAlignment);
5625     __ BIND(L_CMP_LOOP_LAST_CMP2);
5626       if (str2_isL) {
5627         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5628         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5629         __ lslv(tmp2, tmp2, tmp4);
5630         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5631         __ add(tmp4, tmp4, 1);
5632         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5633         __ lsl(tmp2, tmp2, 1);
5634       } else {
5635         __ mov(ch2, 0xE);
5636         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5637         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5638         __ lslv(tmp2, tmp2, tmp4);
5639         __ add(tmp4, tmp4, 1);
5640         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5641         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5642         __ lsl(tmp2, tmp2, 1);
5643         __ sub(str2, str2, str2_chr_size);
5644       }
5645       __ cmp(ch1, ch2);
5646       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5647       __ b(DONE);
5648     __ align(OptoLoopAlignment);
5649     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5650       // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
5651       // multiple of wordSize/str2_chr_size. The byte octet was analyzed in
5652       // L_HAS_ZERO_LOOP, so result was increased by at most wordSize/str2_chr_size - 1
5653       // and the respective higher bits were not changed. L_LOOP_PROCEED will increase
5654       // result by the number of analyzed characters, so we can simply reset the lower
5655       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
5656       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5657       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index of
5658       // the last analyzed substring inside the current octet, so str2 is at the
5659       // corresponding start address; advance it to the next octet.
5660       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5661       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5662       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5663       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5664       __ movw(cnt2, cnt2);
5665       __ b(L_LOOP_PROCEED);
5666     __ align(OptoLoopAlignment);
5667     __ BIND(NOMATCH);
5668       __ mov(result, -1);
5669     __ BIND(DONE);
5670       __ pop(spilled_regs, sp);
5671       __ ret(lr);
5672     return entry;
5673   }
5674 
5675   void generate_string_indexof_stubs() {
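         // ll: both strings Latin-1, uu: both UTF-16, ul: the mixed
         // Latin-1/UTF-16 case (see the str1_isL/str2_isL arguments).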
5676     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5677     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5678     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5679   }
5680 
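       // Widens two 16-byte vectors of Latin-1 bytes (src1, src2) into 64 bytes of
       // UTF-16 by interleaving each byte with a zero byte taken from v0 (expected
       // to hold zero, per the register contract of the inflate stub below), then
       // stores the result at r1 with post-increment. Optionally issues a prefetch
       // for the store stream.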
5681   void inflate_and_store_2_fp_registers(bool generatePrfm,
5682       FloatRegister src1, FloatRegister src2) {
5683     Register dst = r1;
5684     __ zip1(v1, __ T16B, src1, v0);
5685     __ zip2(v2, __ T16B, src1, v0);
5686     if (generatePrfm) {
5687       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5688     }
5689     __ zip1(v3, __ T16B, src2, v0);
5690     __ zip2(v4, __ T16B, src2, v0);
5691     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5692   }
5693 
5694   // R0 = src
5695   // R1 = dst
5696   // R2 = len
5697   // R3 = len >> 3
5698   // V0 = 0
5699   // v1 = loaded 8 bytes
5700   address generate_large_byte_array_inflate() {
5701     __ align(CodeEntryAlignment);
5702     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5703     address entry = __ pc();
5704     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5705     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5706     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
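         // roughly the smallest number of remaining 8-byte octets for which the
         // software-prefetching loop variant pays off, given SoftwarePrefetchHintDistance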
5707 
5708     // do one more 8-byte read so that the address is 16-byte aligned in most
5709     // cases; this also lets the two inflated vectors be written with a single store
5710     __ ldrd(v2, __ post(src, 8));
5711     __ sub(octetCounter, octetCounter, 2);
5712     __ zip1(v1, __ T16B, v1, v0);
5713     __ zip1(v2, __ T16B, v2, v0);
5714     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5715     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5716     __ subs(rscratch1, octetCounter, large_loop_threshold);
5717     __ br(__ LE, LOOP_START);
5718     __ b(LOOP_PRFM_START);
5719     __ bind(LOOP_PRFM);
5720       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5721     __ bind(LOOP_PRFM_START);
5722       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5723       __ sub(octetCounter, octetCounter, 8);
5724       __ subs(rscratch1, octetCounter, large_loop_threshold);
5725       inflate_and_store_2_fp_registers(true, v3, v4);
5726       inflate_and_store_2_fp_registers(true, v5, v6);
5727       __ br(__ GT, LOOP_PRFM);
5728       __ cmp(octetCounter, (u1)8);
5729       __ br(__ LT, DONE);
5730     __ bind(LOOP);
5731       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5732       __ bind(LOOP_START);
5733       __ sub(octetCounter, octetCounter, 8);
5734       __ cmp(octetCounter, (u1)8);
5735       inflate_and_store_2_fp_registers(false, v3, v4);
5736       inflate_and_store_2_fp_registers(false, v5, v6);
5737       __ br(__ GE, LOOP);
5738     __ bind(DONE);
5739       __ ret(lr);
5740     return entry;
5741   }
5742 
5743   /**
5744    *  Arguments:
5745    *
5746    *  Input:
5747    *  c_rarg0   - current state address
5748    *  c_rarg1   - H key address
5749    *  c_rarg2   - data address
5750    *  c_rarg3   - number of blocks
5751    *
5752    *  Output:
5753    *  Updated state at c_rarg0
5754    */
5755   address generate_ghash_processBlocks() {
5756     // Bafflingly, GCM uses little-endian for the byte order, but
5757     // big-endian for the bit order.  For example, the polynomial 1 is
5758     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5759     //
5760     // So, we must either reverse the bytes in each word and do
5761     // everything big-endian or reverse the bits in each byte and do
5762     // it little-endian.  On AArch64 it's more idiomatic to reverse
5763     // the bits in each byte (we have an instruction, RBIT, to do
5764     // that) and keep the data in little-endian bit order throughout the
5765     // calculation, bit-reversing the inputs and outputs.
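         //
         // Roughly, with every value bit-reflected, each iteration of the loop
         // below computes, in GF(2^128):
         //
         //   state = ((state ^ data_block) * subkeyH) mod (x^128 + x^7 + x^2 + x + 1)
         //
         // ghash_multiply forms the 256-bit product (using the Karatsuba term
         // subkeyH_hi ^ subkeyH_lo prepared in v4) and ghash_reduce folds it back
         // into 128 bits using the 0x87 constant emitted just below.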
5766 
5767     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5768     __ align(wordSize * 2);
5769     address p = __ pc();
5770     __ emit_int64(0x87);  // The low-order bits of the field
5771                           // polynomial (i.e. p = z^7+z^2+z+1)
5772                           // repeated in the low and high parts of a
5773                           // 128-bit vector
5774     __ emit_int64(0x87);
5775 
5776     __ align(CodeEntryAlignment);
5777     address start = __ pc();
5778 
5779     Register state   = c_rarg0;
5780     Register subkeyH = c_rarg1;
5781     Register data    = c_rarg2;
5782     Register blocks  = c_rarg3;
5783 
5784     FloatRegister vzr = v30;
5785     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5786 
5787     __ ldrq(v24, p);    // The field polynomial
5788 
5789     __ ldrq(v0, Address(state));
5790     __ ldrq(v1, Address(subkeyH));
5791 
5792     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5793     __ rbit(v0, __ T16B, v0);
5794     __ rev64(v1, __ T16B, v1);
5795     __ rbit(v1, __ T16B, v1);
5796 
5797     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5798     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5799 
5800     {
5801       Label L_ghash_loop;
5802       __ bind(L_ghash_loop);
5803 
5804       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5805                                                  // reversing each byte
5806       __ rbit(v2, __ T16B, v2);
5807       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5808 
5809       // Multiply state in v2 by subkey in v1
5810       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5811                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5812                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5813       // Reduce v7:v5 by the field polynomial
5814       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5815 
5816       __ sub(blocks, blocks, 1);
5817       __ cbnz(blocks, L_ghash_loop);
5818     }
5819 
5820     // The bit-reversed result is at this point in v0
5821     __ rev64(v0, __ T16B, v0);
5822     __ rbit(v0, __ T16B, v0);
5823 
5824     __ st1(v0, __ T16B, state);
5825     __ ret(lr);
5826 
5827     return start;
5828   }
5829 
5830   address generate_ghash_processBlocks_wide() {
5831     address small = generate_ghash_processBlocks();
5832 
5833     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5834     __ align(wordSize * 2);
5835     address p = __ pc();
5836     __ emit_int64(0x87);  // The low-order bits of the field
5837                           // polynomial (i.e. p = z^7+z^2+z+1)
5838                           // repeated in the low and high parts of a
5839                           // 128-bit vector
5840     __ emit_int64(0x87);
5841 
5842     __ align(CodeEntryAlignment);
5843     address start = __ pc();
5844 
5845     Register state   = c_rarg0;
5846     Register subkeyH = c_rarg1;
5847     Register data    = c_rarg2;
5848     Register blocks  = c_rarg3;
5849 
5850     const int unroll = 4;
5851 
5852     __ cmp(blocks, (unsigned char)(unroll * 2));
5853     __ br(__ LT, small);
5854 
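         // The AAPCS64 requires v8..v15 to be preserved across calls (strictly only
         // their low 64 bits), and the wide GHASH code below uses them as full
         // 128-bit vectors, so save and restore them in full here.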
5855     if (unroll > 1) {
5856       // Save state before entering routine
5857       __ sub(sp, sp, 4 * 16);
5858       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5859       __ sub(sp, sp, 4 * 16);
5860       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5861     }
5862 
5863     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5864 
5865     if (unroll > 1) {
5866       // And restore state
5867       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5868       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5869     }
5870 
5871     __ cmp(blocks, (unsigned char)0);
5872     __ br(__ GT, small);
5873 
5874     __ ret(lr);
5875 
5876     return start;
5877   }
5878 
5879   void generate_base64_encode_simdround(Register src, Register dst,
5880         FloatRegister codec, u8 size) {
5881 
5882     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5883     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5884     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5885 
5886     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5887 
5888     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5889 
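         // Each group of three input bytes aaaaaaaa bbbbbbbb cccccccc is split into
         // four 6-bit indices aaaaaa, aabbbb, bbbbcc, cccccc. The shifts below build
         // ind0..ind3 per byte lane, and tbl then maps each index through the 64-byte
         // codec table held in the four vector registers starting at 'codec'.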
5890     __ ushr(ind0, arrangement, in0,  2);
5891 
5892     __ ushr(ind1, arrangement, in1,  2);
5893     __ shl(in0,   arrangement, in0,  6);
5894     __ orr(ind1,  arrangement, ind1, in0);
5895     __ ushr(ind1, arrangement, ind1, 2);
5896 
5897     __ ushr(ind2, arrangement, in2,  4);
5898     __ shl(in1,   arrangement, in1,  4);
5899     __ orr(ind2,  arrangement, in1,  ind2);
5900     __ ushr(ind2, arrangement, ind2, 2);
5901 
5902     __ shl(ind3,  arrangement, in2,  2);
5903     __ ushr(ind3, arrangement, ind3, 2);
5904 
5905     __ tbl(out0,  arrangement, codec,  4, ind0);
5906     __ tbl(out1,  arrangement, codec,  4, ind1);
5907     __ tbl(out2,  arrangement, codec,  4, ind2);
5908     __ tbl(out3,  arrangement, codec,  4, ind3);
5909 
5910     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5911   }
5912 
5913    /**
5914    *  Arguments:
5915    *
5916    *  Input:
5917    *  c_rarg0   - src_start
5918    *  c_rarg1   - src_offset
5919    *  c_rarg2   - src_length
5920    *  c_rarg3   - dest_start
5921    *  c_rarg4   - dest_offset
5922    *  c_rarg5   - isURL
5923    *
5924    */
5925   address generate_base64_encodeBlock() {
5926 
5927     static const char toBase64[64] = {
5928       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5929       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5930       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5931       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5932       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5933     };
5934 
5935     static const char toBase64URL[64] = {
5936       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5937       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5938       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5939       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5940       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5941     };
5942 
5943     __ align(CodeEntryAlignment);
5944     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5945     address start = __ pc();
5946 
5947     Register src   = c_rarg0;  // source array
5948     Register soff  = c_rarg1;  // source start offset
5949     Register send  = c_rarg2;  // source end offset
5950     Register dst   = c_rarg3;  // dest array
5951     Register doff  = c_rarg4;  // position for writing to dest array
5952     Register isURL = c_rarg5;  // Base64 or URL character set
5953 
5954     // c_rarg6 and c_rarg7 are free to use as temps
5955     Register codec  = c_rarg6;
5956     Register length = c_rarg7;
5957 
5958     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5959 
5960     __ add(src, src, soff);
5961     __ add(dst, dst, doff);
5962     __ sub(length, send, soff);
5963 
5964     // load the codec base address
5965     __ lea(codec, ExternalAddress((address) toBase64));
5966     __ cbz(isURL, ProcessData);
5967     __ lea(codec, ExternalAddress((address) toBase64URL));
5968 
5969     __ BIND(ProcessData);
5970 
5971     // too short to form a SIMD loop; fall back to the scalar 3-byte loop
5972     __ cmp(length, (u1)24);
5973     __ br(Assembler::LT, Process3B);
5974 
5975     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5976 
5977     __ BIND(Process48B);
5978     __ cmp(length, (u1)48);
5979     __ br(Assembler::LT, Process24B);
5980     generate_base64_encode_simdround(src, dst, v0, 16);
5981     __ sub(length, length, 48);
5982     __ b(Process48B);
5983 
5984     __ BIND(Process24B);
5985     __ cmp(length, (u1)24);
5986     __ br(Assembler::LT, SIMDExit);
5987     generate_base64_encode_simdround(src, dst, v0, 8);
5988     __ sub(length, length, 24);
5989 
5990     __ BIND(SIMDExit);
5991     __ cbz(length, Exit);
5992 
5993     __ BIND(Process3B);
5994     //  3 src bytes, 24 bits
5995     __ ldrb(r10, __ post(src, 1));
5996     __ ldrb(r11, __ post(src, 1));
5997     __ ldrb(r12, __ post(src, 1));
5998     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5999     __ orrw(r12, r12, r11, Assembler::LSL, 8);
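         // r12 now holds the 24-bit group (byte0 << 16) | (byte1 << 8) | byte2;
         // the extracts below pick out its four 6-bit fields, high to low.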
6000     // codec index
6001     __ ubfmw(r15, r12, 18, 23);
6002     __ ubfmw(r14, r12, 12, 17);
6003     __ ubfmw(r13, r12, 6,  11);
6004     __ andw(r12,  r12, 63);
6005     // get the code based on the codec
6006     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6007     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6008     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6009     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6010     __ strb(r15, __ post(dst, 1));
6011     __ strb(r14, __ post(dst, 1));
6012     __ strb(r13, __ post(dst, 1));
6013     __ strb(r12, __ post(dst, 1));
6014     __ sub(length, length, 3);
6015     __ cbnz(length, Process3B);
6016 
6017     __ BIND(Exit);
6018     __ ret(lr);
6019 
6020     return start;
6021   }
6022 
6023   void generate_base64_decode_simdround(Register src, Register dst,
6024         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6025 
6026     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6027     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6028 
6029     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6030     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6031 
6032     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6033 
6034     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6035 
6036     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6037 
6038     // we need an unsigned saturating subtract to make sure all input values
6039     // in range [0, 63] produce a 0U index for the higher-half lookup
6040     __ uqsubv(decH0, __ T16B, in0, v27);
6041     __ uqsubv(decH1, __ T16B, in1, v27);
6042     __ uqsubv(decH2, __ T16B, in2, v27);
6043     __ uqsubv(decH3, __ T16B, in3, v27);
6044 
6045     // lower half lookup
6046     __ tbl(decL0, arrangement, codecL, 4, in0);
6047     __ tbl(decL1, arrangement, codecL, 4, in1);
6048     __ tbl(decL2, arrangement, codecL, 4, in2);
6049     __ tbl(decL3, arrangement, codecL, 4, in3);
6050 
6051     // higher half lookup
6052     __ tbx(decH0, arrangement, codecH, 4, decH0);
6053     __ tbx(decH1, arrangement, codecH, 4, decH1);
6054     __ tbx(decH2, arrangement, codecH, 4, decH2);
6055     __ tbx(decH3, arrangement, codecH, 4, decH3);
6056 
6057     // combine lower and higher
6058     __ orr(decL0, arrangement, decL0, decH0);
6059     __ orr(decL1, arrangement, decL1, decH1);
6060     __ orr(decL2, arrangement, decL2, decH2);
6061     __ orr(decL3, arrangement, decL3, decH3);
6062 
6063     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6064     __ cmhi(decH0, arrangement, decL0, v27);
6065     __ cmhi(decH1, arrangement, decL1, v27);
6066     __ cmhi(decH2, arrangement, decL2, v27);
6067     __ cmhi(decH3, arrangement, decL3, v27);
6068     __ orr(in0, arrangement, decH0, decH1);
6069     __ orr(in1, arrangement, decH2, decH3);
6070     __ orr(in2, arrangement, in0,   in1);
6071     __ umaxv(in3, arrangement, in2);
6072     __ umov(rscratch2, in3, __ B, 0);
6073 
6074     // get the data to output
6075     __ shl(out0,  arrangement, decL0, 2);
6076     __ ushr(out1, arrangement, decL1, 4);
6077     __ orr(out0,  arrangement, out0,  out1);
6078     __ shl(out1,  arrangement, decL1, 4);
6079     __ ushr(out2, arrangement, decL2, 2);
6080     __ orr(out1,  arrangement, out1,  out2);
6081     __ shl(out2,  arrangement, decL2, 6);
6082     __ orr(out2,  arrangement, out2,  decL3);
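         // out0..out2 now hold the three decoded bytes of each 4-character group:
         //   out0 = (decL0 << 2) | (decL1 >> 4)
         //   out1 = (decL1 << 4) | (decL2 >> 2)
         //   out2 = (decL2 << 6) |  decL3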
6083 
6084     __ cbz(rscratch2, NoIllegalData);
6085 
6086     // handle illegal input
6087     __ umov(r10, in2, __ D, 0);
6088     if (size == 16) {
6089       __ cbnz(r10, ErrorInLowerHalf);
6090 
6091       // illegal input is in higher half, store the lower half now.
6092       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6093 
6094       __ umov(r10, in2,  __ D, 1);
6095       __ umov(r11, out0, __ D, 1);
6096       __ umov(r12, out1, __ D, 1);
6097       __ umov(r13, out2, __ D, 1);
6098       __ b(StoreLegalData);
6099 
6100       __ BIND(ErrorInLowerHalf);
6101     }
6102     __ umov(r11, out0, __ D, 0);
6103     __ umov(r12, out1, __ D, 0);
6104     __ umov(r13, out2, __ D, 0);
6105 
6106     __ BIND(StoreLegalData);
6107     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6108     __ strb(r11, __ post(dst, 1));
6109     __ strb(r12, __ post(dst, 1));
6110     __ strb(r13, __ post(dst, 1));
6111     __ lsr(r10, r10, 8);
6112     __ lsr(r11, r11, 8);
6113     __ lsr(r12, r12, 8);
6114     __ lsr(r13, r13, 8);
6115     __ b(StoreLegalData);
6116 
6117     __ BIND(NoIllegalData);
6118     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6119   }
6120 
6121 
6122    /**
6123    *  Arguments:
6124    *
6125    *  Input:
6126    *  c_rarg0   - src_start
6127    *  c_rarg1   - src_offset
6128    *  c_rarg2   - src_length
6129    *  c_rarg3   - dest_start
6130    *  c_rarg4   - dest_offset
6131    *  c_rarg5   - isURL
6132    *  c_rarg6   - isMIME
6133    *
6134    */
6135   address generate_base64_decodeBlock() {
6136 
6137     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6138     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6139     // titled "Base64 decoding".
6140 
6141     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
6142     // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
6143     // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6144     static const uint8_t fromBase64ForNoSIMD[256] = {
6145       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6148        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6149       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6150        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6151       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6152        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6156       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6157       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6158       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6159       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6160       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6161     };
6162 
6163     static const uint8_t fromBase64URLForNoSIMD[256] = {
6164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6167        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6168       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6169        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6170       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6171        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6176       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6177       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6178       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6179       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6180     };
6181 
6182     // A legal base64 code value is in the range [0, 127].  We need two lookups
6183     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
6184     // lookup uses tbl: out-of-range indices are set to 0 in the destination. The
6185     // 2nd table vector lookup uses tbx: out-of-range indices leave the destination
6186     // unchanged. Input [64..126] is mapped to index [65, 127] in the second lookup.
6187     // The value at index 64 is set to 0, so that we know we already got the
6188     // decoded data with the 1st lookup.
6189     static const uint8_t fromBase64ForSIMD[128] = {
6190       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6191       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6192       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6193        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6194         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6195        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6196       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6197        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6198     };
6199 
6200     static const uint8_t fromBase64URLForSIMD[128] = {
6201       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6202       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6203       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6204        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6205         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6206        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6207        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6208        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6209     };
6210 
6211     __ align(CodeEntryAlignment);
6212     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6213     address start = __ pc();
6214 
6215     Register src    = c_rarg0;  // source array
6216     Register soff   = c_rarg1;  // source start offset
6217     Register send   = c_rarg2;  // source end offset
6218     Register dst    = c_rarg3;  // dest array
6219     Register doff   = c_rarg4;  // position for writing to dest array
6220     Register isURL  = c_rarg5;  // Base64 or URL character set
6221     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6222 
6223     Register length = send;    // reuse send as length of source data to process
6224 
6225     Register simd_codec   = c_rarg6;
6226     Register nosimd_codec = c_rarg7;
6227 
6228     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6229 
6230     __ enter();
6231 
6232     __ add(src, src, soff);
6233     __ add(dst, dst, doff);
6234 
6235     __ mov(doff, dst);
6236 
6237     __ sub(length, send, soff);
6238     __ bfm(length, zr, 0, 1);
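         // round length down to a multiple of 4 (clear its two low bits); the
         // decoder only consumes whole 4-character groups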
6239 
6240     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6241     __ cbz(isURL, ProcessData);
6242     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6243 
6244     __ BIND(ProcessData);
6245     __ mov(rscratch1, length);
6246     __ cmp(length, (u1)144); // 144 = 80 + 64
6247     __ br(Assembler::LT, Process4B);
6248 
6249     // In the MIME case, the line length cannot be more than 76
6250     // bytes (see RFC 2045). This is too short a block for SIMD
6251     // to be worthwhile, so we use non-SIMD here.
6252     __ movw(rscratch1, 79);
6253 
6254     __ BIND(Process4B);
6255     __ ldrw(r14, __ post(src, 4));
6256     __ ubfxw(r10, r14, 0,  8);
6257     __ ubfxw(r11, r14, 8,  8);
6258     __ ubfxw(r12, r14, 16, 8);
6259     __ ubfxw(r13, r14, 24, 8);
6260     // look up the decoded 6-bit values
6261     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6262     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6263     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6264     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6265     // error detection, 255u indicates an illegal input
6266     __ orrw(r14, r10, r11);
6267     __ orrw(r15, r12, r13);
6268     __ orrw(r14, r14, r15);
6269     __ tbnz(r14, 7, Exit);
6270     // recover the data
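         // r10..r13 hold the four 6-bit values a, b, c, d of the group; the next five
         // instructions rebuild the three output bytes (a<<2 | b>>4), (b<<4 | c>>2),
         // (c<<6 | d): the first two land in r14 (byte-swapped for the little-endian
         // strh) and the third in r13.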
6271     __ lslw(r14, r10, 10);
6272     __ bfiw(r14, r11, 4, 6);
6273     __ bfmw(r14, r12, 2, 5);
6274     __ rev16w(r14, r14);
6275     __ bfiw(r13, r12, 6, 2);
6276     __ strh(r14, __ post(dst, 2));
6277     __ strb(r13, __ post(dst, 1));
6278     // non-simd loop
6279     __ subsw(rscratch1, rscratch1, 4);
6280     __ br(Assembler::GT, Process4B);
6281 
6282     // if exiting from the 80-byte pre-processing pass above (rscratch1 was seeded
6283     // with 79), rscratch1 == -1; otherwise, rscratch1 == 0.
6284     __ cbzw(rscratch1, Exit);
6285     __ sub(length, length, 80);
6286 
6287     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6288     __ cbz(isURL, SIMDEnter);
6289     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6290 
6291     __ BIND(SIMDEnter);
6292     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6293     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6294     __ mov(rscratch1, 63);
6295     __ dup(v27, __ T16B, rscratch1);
6296 
6297     __ BIND(Process64B);
6298     __ cmp(length, (u1)64);
6299     __ br(Assembler::LT, Process32B);
6300     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6301     __ sub(length, length, 64);
6302     __ b(Process64B);
6303 
6304     __ BIND(Process32B);
6305     __ cmp(length, (u1)32);
6306     __ br(Assembler::LT, SIMDExit);
6307     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6308     __ sub(length, length, 32);
6309     __ b(Process32B);
6310 
6311     __ BIND(SIMDExit);
6312     __ cbz(length, Exit);
6313     __ movw(rscratch1, length);
6314     __ b(Process4B);
6315 
6316     __ BIND(Exit);
6317     __ sub(c_rarg0, dst, doff);
6318 
6319     __ leave();
6320     __ ret(lr);
6321 
6322     return start;
6323   }
6324 
6325   // Support for spin waits.
6326   address generate_spin_wait() {
6327     __ align(CodeEntryAlignment);
6328     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6329     address start = __ pc();
6330 
6331     __ spin_wait();
6332     __ ret(lr);
6333 
6334     return start;
6335   }
6336 
6337 #ifdef LINUX
6338 
6339   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6340   //
6341   // If LSE is in use, generate LSE versions of all the stubs. The
6342   // non-LSE versions are in atomic_aarch64.S.
6343 
6344   // class AtomicStubMark records the entry point of a stub and the
6345   // stub pointer which will point to it. The stub pointer is set to
6346   // the entry point when ~AtomicStubMark() is called, which must be
6347   // after ICache::invalidate_range. This ensures safe publication of
6348   // the generated code.
6349   class AtomicStubMark {
6350     address _entry_point;
6351     aarch64_atomic_stub_t *_stub;
6352     MacroAssembler *_masm;
6353   public:
6354     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6355       _masm = masm;
6356       __ align(32);
6357       _entry_point = __ pc();
6358       _stub = stub;
6359     }
6360     ~AtomicStubMark() {
6361       *_stub = (aarch64_atomic_stub_t)_entry_point;
6362     }
6363   };
6364 
6365   // NB: For memory_order_conservative we need a trailing membar after
6366   // LSE atomic operations but not a leading membar.
6367   //
6368   // We don't need a leading membar because a clause in the Arm ARM
6369   // says:
6370   //
6371   //   Barrier-ordered-before
6372   //
6373   //   Barrier instructions order prior Memory effects before subsequent
6374   //   Memory effects generated by the same Observer. A read or a write
6375   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
6376   //   Observer if and only if RW1 appears in program order before RW 2
6377   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
6378   //   instruction with both Acquire and Release semantics.
6379   //
6380   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6381   // and Release semantics, therefore we don't need a leading
6382   // barrier. However, there is no corresponding Barrier-ordered-after
6383   // relationship, therefore we need a trailing membar to prevent a
6384   // later store or load from being reordered with the store in an
6385   // atomic instruction.
6386   //
6387   // This was checked by using the herd7 consistency model simulator
6388   // (http://diy.inria.fr/) with this test case:
6389   //
6390   // AArch64 LseCas
6391   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6392   // P0 | P1;
6393   // LDR W4, [X2] | MOV W3, #0;
6394   // DMB LD       | MOV W4, #1;
6395   // LDR W3, [X1] | CASAL W3, W4, [X1];
6396   //              | DMB ISH;
6397   //              | STR W4, [X2];
6398   // exists
6399   // (0:X3=0 /\ 0:X4=1)
6400   //
6401   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6402   // with the store to x in P1. Without the DMB in P1 this may happen.
6403   //
6404   // At the time of writing we don't know of any AArch64 hardware that
6405   // reorders stores in this way, but the Reference Manual permits it.
6406 
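       // Generates a stub with the aarch64_atomic_cmpxchg_* calling convention:
       // c_rarg0 = pointer, c_rarg1 = compare value, c_rarg2 = exchange value;
       // the previous memory value is returned in r0 (w0 for sub-word sizes).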
6407   void gen_cas_entry(Assembler::operand_size size,
6408                      atomic_memory_order order) {
6409     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6410       exchange_val = c_rarg2;
6411     bool acquire, release;
6412     switch (order) {
6413       case memory_order_relaxed:
6414         acquire = false;
6415         release = false;
6416         break;
6417       case memory_order_release:
6418         acquire = false;
6419         release = true;
6420         break;
6421       default:
6422         acquire = true;
6423         release = true;
6424         break;
6425     }
6426     __ mov(prev, compare_val);
6427     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6428     if (order == memory_order_conservative) {
6429       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6430     }
6431     if (size == Assembler::xword) {
6432       __ mov(r0, prev);
6433     } else {
6434       __ movw(r0, prev);
6435     }
6436     __ ret(lr);
6437   }
6438 
6439   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6440     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6441     // If not relaxed, then default to conservative.  Relaxed is the only
6442     // case we use enough to be worth specializing.
6443     if (order == memory_order_relaxed) {
6444       __ ldadd(size, incr, prev, addr);
6445     } else {
6446       __ ldaddal(size, incr, prev, addr);
6447       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6448     }
6449     if (size == Assembler::xword) {
6450       __ mov(r0, prev);
6451     } else {
6452       __ movw(r0, prev);
6453     }
6454     __ ret(lr);
6455   }
6456 
6457   void gen_swpal_entry(Assembler::operand_size size) {
6458     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6459     __ swpal(size, incr, prev, addr);
6460     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6461     if (size == Assembler::xword) {
6462       __ mov(r0, prev);
6463     } else {
6464       __ movw(r0, prev);
6465     }
6466     __ ret(lr);
6467   }
6468 
6469   void generate_atomic_entry_points() {
6470     if (! UseLSE) {
6471       return;
6472     }
6473 
6474     __ align(CodeEntryAlignment);
6475     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6476     address first_entry = __ pc();
6477 
6478     // ADD, memory_order_conservative
6479     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6480     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6481     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6482     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6483 
6484     // ADD, memory_order_relaxed
6485     AtomicStubMark mark_fetch_add_4_relaxed
6486       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6487     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6488     AtomicStubMark mark_fetch_add_8_relaxed
6489       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6490     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6491 
6492     // XCHG, memory_order_conservative
6493     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6494     gen_swpal_entry(Assembler::word);
6495     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6496     gen_swpal_entry(Assembler::xword);
6497 
6498     // CAS, memory_order_conservative
6499     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6500     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6501     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6502     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6503     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6504     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6505 
6506     // CAS, memory_order_relaxed
6507     AtomicStubMark mark_cmpxchg_1_relaxed
6508       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6509     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6510     AtomicStubMark mark_cmpxchg_4_relaxed
6511       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6512     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6513     AtomicStubMark mark_cmpxchg_8_relaxed
6514       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6515     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6516 
6517     AtomicStubMark mark_cmpxchg_4_release
6518       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6519     gen_cas_entry(MacroAssembler::word, memory_order_release);
6520     AtomicStubMark mark_cmpxchg_8_release
6521       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6522     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6523 
6524     AtomicStubMark mark_cmpxchg_4_seq_cst
6525       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6526     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6527     AtomicStubMark mark_cmpxchg_8_seq_cst
6528       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6529     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6530 
6531     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6532   }
6533 #endif // LINUX
6534 
6535   // Continuation point for throwing of implicit exceptions that are
6536   // not handled in the current activation. Fabricates an exception
6537   // oop and initiates normal exception dispatching in this
6538   // frame. Since we need to preserve callee-saved values (currently
6539   // only for C2, but done for C1 as well) we need a callee-saved oop
6540   // map and therefore have to make these stubs into RuntimeStubs
6541   // rather than BufferBlobs.  If the compiler needs all registers to
6542   // be preserved between the fault point and the exception handler
6543   // then it must assume responsibility for that in
6544   // AbstractCompiler::continuation_for_implicit_null_exception or
6545   // continuation_for_implicit_division_by_zero_exception. All other
6546   // implicit exceptions (e.g., NullPointerException or
6547   // AbstractMethodError on entry) are either at call sites or
6548   // otherwise assume that stack unwinding will be initiated, so
6549   // caller saved registers were assumed volatile in the compiler.
6550 
6551 #undef __
6552 #define __ masm->
6553 
6554   address generate_throw_exception(const char* name,
6555                                    address runtime_entry,
6556                                    Register arg1 = noreg,
6557                                    Register arg2 = noreg) {
6558     // Information about frame layout at time of blocking runtime call.
6559     // Note that we only have to preserve callee-saved registers since
6560     // the compilers are responsible for supplying a continuation point
6561     // if they expect all registers to be preserved.
6562     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6563     enum layout {
6564       rfp_off = 0,
6565       rfp_off2,
6566       return_off,
6567       return_off2,
6568       framesize // inclusive of return address
6569     };
6570 
6571     int insts_size = 512;
6572     int locs_size  = 64;
6573 
6574     CodeBuffer code(name, insts_size, locs_size);
6575     OopMapSet* oop_maps  = new OopMapSet();
6576     MacroAssembler* masm = new MacroAssembler(&code);
6577 
6578     address start = __ pc();
6579 
6580     // This is an inlined and slightly modified version of call_VM
6581     // which has the ability to fetch the return PC out of
6582     // thread-local storage and also sets up last_Java_sp slightly
6583     // differently than the real call_VM
6584 
6585     __ enter(); // Save FP and LR before call
6586 
6587     assert(is_even(framesize/2), "sp not 16-byte aligned");
6588 
6589     // lr and fp are already in place
6590     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6591 
6592     int frame_complete = __ pc() - start;
6593 
6594     // Set up last_Java_sp and last_Java_fp
6595     address the_pc = __ pc();
6596     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6597 
6598     // Call runtime
6599     if (arg1 != noreg) {
6600       assert(arg2 != c_rarg1, "clobbered");
6601       __ mov(c_rarg1, arg1);
6602     }
6603     if (arg2 != noreg) {
6604       __ mov(c_rarg2, arg2);
6605     }
6606     __ mov(c_rarg0, rthread);
6607     BLOCK_COMMENT("call runtime_entry");
6608     __ mov(rscratch1, runtime_entry);
6609     __ blr(rscratch1);
6610 
6611     // Generate oop map
6612     OopMap* map = new OopMap(framesize, 0);
6613 
6614     oop_maps->add_gc_map(the_pc - start, map);
6615 
6616     __ reset_last_Java_frame(true);
6617 
6618     // Reinitialize the ptrue predicate register, in case the external runtime
6619     // call clobbers ptrue reg, as we may return to SVE compiled code.
6620     __ reinitialize_ptrue();
6621 
6622     __ leave();
6623 
6624     // check for pending exceptions
6625 #ifdef ASSERT
6626     Label L;
6627     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6628     __ cbnz(rscratch1, L);
6629     __ should_not_reach_here();
6630     __ bind(L);
6631 #endif // ASSERT
6632     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6633 
6634 
6635     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6636     RuntimeStub* stub =
6637       RuntimeStub::new_runtime_stub(name,
6638                                     &code,
6639                                     frame_complete,
6640                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6641                                     oop_maps, false);
6642     return stub->entry_point();
6643   }
6644 
6645   class MontgomeryMultiplyGenerator : public MacroAssembler {
6646 
6647     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6648       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6649 
6650     RegSet _toSave;
6651     bool _squaring;
6652 
6653   public:
6654     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6655       : MacroAssembler(as->code()), _squaring(squaring) {
6656 
6657       // Register allocation
6658 
6659       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6660       Pa_base = *regs;       // Argument registers
6661       if (squaring)
6662         Pb_base = Pa_base;
6663       else
6664         Pb_base = *++regs;
6665       Pn_base = *++regs;
6666       Rlen = *++regs;
6667       inv = *++regs;
6668       Pm_base = *++regs;
6669 
6670                           // Working registers:
6671       Ra =  *++regs;        // The current digit of a, b, n, and m.
6672       Rb =  *++regs;
6673       Rm =  *++regs;
6674       Rn =  *++regs;
6675 
6676       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6677       Pb =  *++regs;
6678       Pm =  *++regs;
6679       Pn =  *++regs;
6680 
6681       t0 =  *++regs;        // Three registers which form a
6682       t1 =  *++regs;        // triple-precision accumulator.
6683       t2 =  *++regs;
6684 
6685       Ri =  *++regs;        // Inner and outer loop indexes.
6686       Rj =  *++regs;
6687 
6688       Rhi_ab = *++regs;     // Product registers: low and high parts
6689       Rlo_ab = *++regs;     // of a*b and m*n.
6690       Rhi_mn = *++regs;
6691       Rlo_mn = *++regs;
6692 
6693       // r19 and up are callee-saved.
6694       _toSave = RegSet::range(r19, *regs) + Pm_base;
6695     }
6696 
6697   private:
6698     void save_regs() {
6699       push(_toSave, sp);
6700     }
6701 
6702     void restore_regs() {
6703       pop(_toSave, sp);
6704     }
6705 
6706     template <typename T>
6707     void unroll_2(Register count, T block) {
6708       Label loop, end, odd;
6709       tbnz(count, 0, odd);
6710       cbz(count, end);
6711       align(16);
6712       bind(loop);
6713       (this->*block)();
6714       bind(odd);
6715       (this->*block)();
6716       subs(count, count, 2);
6717       br(Assembler::GT, loop);
6718       bind(end);
6719     }
6720 
6721     template <typename T>
6722     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6723       Label loop, end, odd;
6724       tbnz(count, 0, odd);
6725       cbz(count, end);
6726       align(16);
6727       bind(loop);
6728       (this->*block)(d, s, tmp);
6729       bind(odd);
6730       (this->*block)(d, s, tmp);
6731       subs(count, count, 2);
6732       br(Assembler::GT, loop);
6733       bind(end);
6734     }
6735 
6736     void pre1(RegisterOrConstant i) {
6737       block_comment("pre1");
6738       // Pa = Pa_base;
6739       // Pb = Pb_base + i;
6740       // Pm = Pm_base;
6741       // Pn = Pn_base + i;
6742       // Ra = *Pa;
6743       // Rb = *Pb;
6744       // Rm = *Pm;
6745       // Rn = *Pn;
6746       ldr(Ra, Address(Pa_base));
6747       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6748       ldr(Rm, Address(Pm_base));
6749       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6750       lea(Pa, Address(Pa_base));
6751       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6752       lea(Pm, Address(Pm_base));
6753       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6754 
6755       // Zero the m*n result.
6756       mov(Rhi_mn, zr);
6757       mov(Rlo_mn, zr);
6758     }
6759 
6760     // The core multiply-accumulate step of a Montgomery
6761     // multiplication.  The idea is to schedule operations as a
6762     // pipeline so that instructions with long latencies (loads and
6763     // multiplies) have time to complete before their results are
6764     // used.  This most benefits in-order implementations of the
6765     // architecture but out-of-order ones also benefit.
6766     void step() {
6767       block_comment("step");
6768       // MACC(Ra, Rb, t0, t1, t2);
6769       // Ra = *++Pa;
6770       // Rb = *--Pb;
6771       umulh(Rhi_ab, Ra, Rb);
6772       mul(Rlo_ab, Ra, Rb);
6773       ldr(Ra, pre(Pa, wordSize));
6774       ldr(Rb, pre(Pb, -wordSize));
6775       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6776                                        // previous iteration.
6777       // MACC(Rm, Rn, t0, t1, t2);
6778       // Rm = *++Pm;
6779       // Rn = *--Pn;
6780       umulh(Rhi_mn, Rm, Rn);
6781       mul(Rlo_mn, Rm, Rn);
6782       ldr(Rm, pre(Pm, wordSize));
6783       ldr(Rn, pre(Pn, -wordSize));
6784       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6785     }
6786 
6787     void post1() {
6788       block_comment("post1");
6789 
6790       // MACC(Ra, Rb, t0, t1, t2);
6791       // Ra = *++Pa;
6792       // Rb = *--Pb;
6793       umulh(Rhi_ab, Ra, Rb);
6794       mul(Rlo_ab, Ra, Rb);
6795       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6796       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6797 
6798       // *Pm = Rm = t0 * inv;
6799       mul(Rm, t0, inv);
6800       str(Rm, Address(Pm));
6801 
6802       // MACC(Rm, Rn, t0, t1, t2);
6803       // t0 = t1; t1 = t2; t2 = 0;
6804       umulh(Rhi_mn, Rm, Rn);
6805 
6806 #ifndef PRODUCT
6807       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6808       {
6809         mul(Rlo_mn, Rm, Rn);
6810         add(Rlo_mn, t0, Rlo_mn);
6811         Label ok;
6812         cbz(Rlo_mn, ok); {
6813           stop("broken Montgomery multiply");
6814         } bind(ok);
6815       }
6816 #endif
6817       // We have very carefully set things up so that
6818       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6819       // the lower half of Rm * Rn because we know the result already:
6820       // it must be -t0.  t0 + (-t0) must generate a carry iff
6821       // t0 != 0.  So, rather than do a mul and an adds we just set
6822       // the carry flag iff t0 is nonzero.
6823       //
6824       // mul(Rlo_mn, Rm, Rn);
6825       // adds(zr, t0, Rlo_mn);
6826       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6827       adcs(t0, t1, Rhi_mn);
6828       adc(t1, t2, zr);
6829       mov(t2, zr);
6830     }
6831 
6832     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6833       block_comment("pre2");
6834       // Pa = Pa_base + i-len;
6835       // Pb = Pb_base + len;
6836       // Pm = Pm_base + i-len;
6837       // Pn = Pn_base + len;
6838 
6839       if (i.is_register()) {
6840         sub(Rj, i.as_register(), len);
6841       } else {
6842         mov(Rj, i.as_constant());
6843         sub(Rj, Rj, len);
6844       }
6845       // Rj == i-len
6846 
6847       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6848       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6849       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6850       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6851 
6852       // Ra = *++Pa;
6853       // Rb = *--Pb;
6854       // Rm = *++Pm;
6855       // Rn = *--Pn;
6856       ldr(Ra, pre(Pa, wordSize));
6857       ldr(Rb, pre(Pb, -wordSize));
6858       ldr(Rm, pre(Pm, wordSize));
6859       ldr(Rn, pre(Pn, -wordSize));
6860 
6861       mov(Rhi_mn, zr);
6862       mov(Rlo_mn, zr);
6863     }
6864 
6865     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6866       block_comment("post2");
6867       if (i.is_constant()) {
6868         mov(Rj, i.as_constant()-len.as_constant());
6869       } else {
6870         sub(Rj, i.as_register(), len);
6871       }
6872 
6873       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6874 
6875       // As soon as we know the least significant digit of our result,
6876       // store it.
6877       // Pm_base[i-len] = t0;
6878       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6879 
6880       // t0 = t1; t1 = t2; t2 = 0;
6881       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6882       adc(t1, t2, zr);
6883       mov(t2, zr);
6884     }
6885 
6886     // A carry in t0 after Montgomery multiplication means that we
6887     // should subtract multiples of n from our result in m.  We'll
6888     // keep doing that until there is no carry.
6889     void normalize(RegisterOrConstant len) {
6890       block_comment("normalize");
6891       // while (t0)
6892       //   t0 = sub(Pm_base, Pn_base, t0, len);
6893       Label loop, post, again;
6894       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6895       cbz(t0, post); {
6896         bind(again); {
6897           mov(i, zr);
6898           mov(cnt, len);
6899           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6900           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6901           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6902           align(16);
6903           bind(loop); {
6904             sbcs(Rm, Rm, Rn);
6905             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6906             add(i, i, 1);
6907             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6908             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6909             sub(cnt, cnt, 1);
6910           } cbnz(cnt, loop);
6911           sbc(t0, t0, zr);
6912         } cbnz(t0, again);
6913       } bind(post);
6914     }
6915 
6916     // Move memory at s to d, reversing words.
6917     //    Increments d to end of copied memory
6918     //    Destroys tmp1, tmp2
6919     //    Preserves len
6920     //    Leaves s pointing to the address which was in d at start
6921     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6922       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6923 
6924       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6925       mov(tmp1, len);
6926       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6927       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6928     }
6929     // where
6930     void reverse1(Register d, Register s, Register tmp) {
6931       ldr(tmp, pre(s, -wordSize));
6932       ror(tmp, tmp, 32);
6933       str(tmp, post(d, wordSize));
6934     }
6935 
6936     void step_squaring() {
6937       // An extra ACC
6938       step();
6939       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6940     }
6941 
6942     void last_squaring(RegisterOrConstant i) {
6943       Label dont;
6944       // if ((i & 1) == 0) {
6945       tbnz(i.as_register(), 0, dont); {
6946         // MACC(Ra, Rb, t0, t1, t2);
6947         // Ra = *++Pa;
6948         // Rb = *--Pb;
6949         umulh(Rhi_ab, Ra, Rb);
6950         mul(Rlo_ab, Ra, Rb);
6951         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6952       } bind(dont);
6953     }
6954 
6955     void extra_step_squaring() {
6956       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6957 
6958       // MACC(Rm, Rn, t0, t1, t2);
6959       // Rm = *++Pm;
6960       // Rn = *--Pn;
6961       umulh(Rhi_mn, Rm, Rn);
6962       mul(Rlo_mn, Rm, Rn);
6963       ldr(Rm, pre(Pm, wordSize));
6964       ldr(Rn, pre(Pn, -wordSize));
6965     }
6966 
6967     void post1_squaring() {
6968       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6969 
6970       // *Pm = Rm = t0 * inv;
6971       mul(Rm, t0, inv);
6972       str(Rm, Address(Pm));
6973 
6974       // MACC(Rm, Rn, t0, t1, t2);
6975       // t0 = t1; t1 = t2; t2 = 0;
6976       umulh(Rhi_mn, Rm, Rn);
6977 
6978 #ifndef PRODUCT
6979       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6980       {
6981         mul(Rlo_mn, Rm, Rn);
6982         add(Rlo_mn, t0, Rlo_mn);
6983         Label ok;
6984         cbz(Rlo_mn, ok); {
6985           stop("broken Montgomery multiply");
6986         } bind(ok);
6987       }
6988 #endif
6989       // We have very carefully set things up so that
6990       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6991       // the lower half of Rm * Rn because we know the result already:
6992       // it must be -t0.  t0 + (-t0) must generate a carry iff
6993       // t0 != 0.  So, rather than do a mul and an adds we just set
6994       // the carry flag iff t0 is nonzero.
6995       //
6996       // mul(Rlo_mn, Rm, Rn);
6997       // adds(zr, t0, Rlo_mn);
6998       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6999       adcs(t0, t1, Rhi_mn);
7000       adc(t1, t2, zr);
7001       mov(t2, zr);
7002     }
7003 
7004     void acc(Register Rhi, Register Rlo,
7005              Register t0, Register t1, Register t2) {
7006       adds(t0, t0, Rlo);
7007       adcs(t1, t1, Rhi);
7008       adc(t2, t2, zr);
7009     }
7010 
7011   public:
7012     /**
7013      * Fast Montgomery multiplication.  The derivation of the
7014      * algorithm is in A Cryptographic Library for the Motorola
7015      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7016      *
7017      * Arguments:
7018      *
7019      * Inputs for multiplication:
7020      *   c_rarg0   - int array elements a
7021      *   c_rarg1   - int array elements b
7022      *   c_rarg2   - int array elements n (the modulus)
7023      *   c_rarg3   - int length
7024      *   c_rarg4   - int inv
7025      *   c_rarg5   - int array elements m (the result)
7026      *
7027      * Inputs for squaring:
7028      *   c_rarg0   - int array elements a
7029      *   c_rarg1   - int array elements n (the modulus)
7030      *   c_rarg2   - int length
7031      *   c_rarg3   - int inv
7032      *   c_rarg4   - int array elements m (the result)
7033      *
7034      */
7035     address generate_multiply() {
7036       Label argh, nothing;
7037       bind(argh);
7038       stop("MontgomeryMultiply total_allocation must be <= 8192");
7039 
7040       align(CodeEntryAlignment);
7041       address entry = pc();
7042 
7043       cbzw(Rlen, nothing);
7044 
7045       enter();
7046 
7047       // Make room.
7048       cmpw(Rlen, 512);
7049       br(Assembler::HI, argh);
7050       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7051       andr(sp, Ra, -2 * wordSize);
7052 
7053       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7054 
7055       {
7056         // Copy input args, reversing as we go.  We use Ra as a
7057         // temporary variable.
7058         reverse(Ra, Pa_base, Rlen, t0, t1);
7059         if (!_squaring)
7060           reverse(Ra, Pb_base, Rlen, t0, t1);
7061         reverse(Ra, Pn_base, Rlen, t0, t1);
7062       }
7063 
7064       // Push all call-saved registers and also Pm_base which we'll need
7065       // at the end.
7066       save_regs();
7067 
7068 #ifndef PRODUCT
7069       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7070       {
7071         ldr(Rn, Address(Pn_base, 0));
7072         mul(Rlo_mn, Rn, inv);
7073         subs(zr, Rlo_mn, -1);
7074         Label ok;
7075         br(EQ, ok); {
7076           stop("broken inverse in Montgomery multiply");
7077         } bind(ok);
7078       }
7079 #endif
7080 
7081       mov(Pm_base, Ra);
7082 
7083       mov(t0, zr);
7084       mov(t1, zr);
7085       mov(t2, zr);
7086 
7087       block_comment("for (int i = 0; i < len; i++) {");
7088       mov(Ri, zr); {
7089         Label loop, end;
7090         cmpw(Ri, Rlen);
7091         br(Assembler::GE, end);
7092 
7093         bind(loop);
7094         pre1(Ri);
7095 
7096         block_comment("  for (j = i; j; j--) {"); {
7097           movw(Rj, Ri);
7098           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7099         } block_comment("  } // j");
7100 
7101         post1();
7102         addw(Ri, Ri, 1);
7103         cmpw(Ri, Rlen);
7104         br(Assembler::LT, loop);
7105         bind(end);
7106         block_comment("} // i");
7107       }
7108 
7109       block_comment("for (int i = len; i < 2*len; i++) {");
7110       mov(Ri, Rlen); {
7111         Label loop, end;
7112         cmpw(Ri, Rlen, Assembler::LSL, 1);
7113         br(Assembler::GE, end);
7114 
7115         bind(loop);
7116         pre2(Ri, Rlen);
7117 
7118         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7119           lslw(Rj, Rlen, 1);
7120           subw(Rj, Rj, Ri);
7121           subw(Rj, Rj, 1);
7122           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7123         } block_comment("  } // j");
7124 
7125         post2(Ri, Rlen);
7126         addw(Ri, Ri, 1);
7127         cmpw(Ri, Rlen, Assembler::LSL, 1);
7128         br(Assembler::LT, loop);
7129         bind(end);
7130       }
7131       block_comment("} // i");
7132 
7133       normalize(Rlen);
7134 
7135       mov(Ra, Pm_base);  // Save Pm_base in Ra
7136       restore_regs();  // Restore caller's Pm_base
7137 
7138       // Copy our result into caller's Pm_base
7139       reverse(Pm_base, Ra, Rlen, t0, t1);
7140 
7141       leave();
7142       bind(nothing);
7143       ret(lr);
7144 
7145       return entry;
7146     }
7147     // In C, approximately:
7148 
7149     // void
7150     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7151     //                     julong Pn_base[], julong Pm_base[],
7152     //                     julong inv, int len) {
7153     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7154     //   julong *Pa, *Pb, *Pn, *Pm;
7155     //   julong Ra, Rb, Rn, Rm;
7156 
7157     //   int i;
7158 
7159     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7160 
7161     //   for (i = 0; i < len; i++) {
7162     //     int j;
7163 
7164     //     Pa = Pa_base;
7165     //     Pb = Pb_base + i;
7166     //     Pm = Pm_base;
7167     //     Pn = Pn_base + i;
7168 
7169     //     Ra = *Pa;
7170     //     Rb = *Pb;
7171     //     Rm = *Pm;
7172     //     Rn = *Pn;
7173 
7174     //     int iters = i;
7175     //     for (j = 0; iters--; j++) {
7176     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7177     //       MACC(Ra, Rb, t0, t1, t2);
7178     //       Ra = *++Pa;
7179     //       Rb = *--Pb;
7180     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7181     //       MACC(Rm, Rn, t0, t1, t2);
7182     //       Rm = *++Pm;
7183     //       Rn = *--Pn;
7184     //     }
7185 
7186     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7187     //     MACC(Ra, Rb, t0, t1, t2);
7188     //     *Pm = Rm = t0 * inv;
7189     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7190     //     MACC(Rm, Rn, t0, t1, t2);
7191 
7192     //     assert(t0 == 0, "broken Montgomery multiply");
7193 
7194     //     t0 = t1; t1 = t2; t2 = 0;
7195     //   }
7196 
7197     //   for (i = len; i < 2*len; i++) {
7198     //     int j;
7199 
7200     //     Pa = Pa_base + i-len;
7201     //     Pb = Pb_base + len;
7202     //     Pm = Pm_base + i-len;
7203     //     Pn = Pn_base + len;
7204 
7205     //     Ra = *++Pa;
7206     //     Rb = *--Pb;
7207     //     Rm = *++Pm;
7208     //     Rn = *--Pn;
7209 
7210     //     int iters = len*2-i-1;
7211     //     for (j = i-len+1; iters--; j++) {
7212     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7213     //       MACC(Ra, Rb, t0, t1, t2);
7214     //       Ra = *++Pa;
7215     //       Rb = *--Pb;
7216     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7217     //       MACC(Rm, Rn, t0, t1, t2);
7218     //       Rm = *++Pm;
7219     //       Rn = *--Pn;
7220     //     }
7221 
7222     //     Pm_base[i-len] = t0;
7223     //     t0 = t1; t1 = t2; t2 = 0;
7224     //   }
7225 
7226     //   while (t0)
7227     //     t0 = sub(Pm_base, Pn_base, t0, len);
7228     // }
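
    // MACC and MACC2 are not defined in this file; they are assumed to
    // be multiply-accumulate helpers for the triple-word accumulator
    // t2:t1:t0, roughly as follows (using a compiler-provided
    // unsigned __int128):
    //
    //   static void MACC(julong A, julong B,
    //                    julong &T0, julong &T1, julong &T2) {
    //     unsigned __int128 p = (unsigned __int128)A * B;  // 128-bit product
    //     unsigned __int128 s = (unsigned __int128)T0 + (julong)p;
    //     T0 = (julong)s;                                  // low word
    //     s = (s >> 64) + T1 + (julong)(p >> 64);
    //     T1 = (julong)s;                                  // middle word
    //     T2 += (julong)(s >> 64);                         // carry into top
    //   }
    //
    //   // MACC2 accumulates the product twice, i.e. adds 2*A*B.
    //   static void MACC2(julong A, julong B,
    //                     julong &T0, julong &T1, julong &T2) {
    //     MACC(A, B, T0, T1, T2);
    //     MACC(A, B, T0, T1, T2);
    //   }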
7229 
7230     /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication: each a[i]*a[j] cross
     * product is computed once and accumulated twice (MACC2), halving
     * those multiplies, while the reduction multiplies are unchanged,
     * so it should be up to 25% faster.  However, its loop control is
     * more complex and it may actually run slower on some machines.
7235      *
7236      * Arguments:
7237      *
7238      * Inputs:
7239      *   c_rarg0   - int array elements a
7240      *   c_rarg1   - int array elements n (the modulus)
7241      *   c_rarg2   - int length
7242      *   c_rarg3   - int inv
7243      *   c_rarg4   - int array elements m (the result)
7244      *
7245      */
7246     address generate_square() {
7247       Label argh;
7248       bind(argh);
7249       stop("MontgomeryMultiply total_allocation must be <= 8192");
7250 
7251       align(CodeEntryAlignment);
7252       address entry = pc();
7253 
7254       enter();
7255 
7256       // Make room.
7257       cmpw(Rlen, 512);
7258       br(Assembler::HI, argh);
7259       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7260       andr(sp, Ra, -2 * wordSize);
7261 
7262       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7263 
7264       {
7265         // Copy input args, reversing as we go.  We use Ra as a
7266         // temporary variable.
7267         reverse(Ra, Pa_base, Rlen, t0, t1);
7268         reverse(Ra, Pn_base, Rlen, t0, t1);
7269       }
7270 
7271       // Push all call-saved registers and also Pm_base which we'll need
7272       // at the end.
7273       save_regs();
7274 
7275       mov(Pm_base, Ra);
7276 
7277       mov(t0, zr);
7278       mov(t1, zr);
7279       mov(t2, zr);
7280 
7281       block_comment("for (int i = 0; i < len; i++) {");
7282       mov(Ri, zr); {
7283         Label loop, end;
7284         bind(loop);
7285         cmp(Ri, Rlen);
7286         br(Assembler::GE, end);
7287 
7288         pre1(Ri);
7289 
7290         block_comment("for (j = (i+1)/2; j; j--) {"); {
7291           add(Rj, Ri, 1);
7292           lsr(Rj, Rj, 1);
7293           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7294         } block_comment("  } // j");
7295 
7296         last_squaring(Ri);
7297 
7298         block_comment("  for (j = i/2; j; j--) {"); {
7299           lsr(Rj, Ri, 1);
7300           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7301         } block_comment("  } // j");
7302 
7303         post1_squaring();
7304         add(Ri, Ri, 1);
7305         cmp(Ri, Rlen);
7306         br(Assembler::LT, loop);
7307 
7308         bind(end);
7309         block_comment("} // i");
7310       }
7311 
7312       block_comment("for (int i = len; i < 2*len; i++) {");
7313       mov(Ri, Rlen); {
7314         Label loop, end;
7315         bind(loop);
7316         cmp(Ri, Rlen, Assembler::LSL, 1);
7317         br(Assembler::GE, end);
7318 
7319         pre2(Ri, Rlen);
7320 
7321         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7322           lsl(Rj, Rlen, 1);
7323           sub(Rj, Rj, Ri);
7324           sub(Rj, Rj, 1);
7325           lsr(Rj, Rj, 1);
7326           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7327         } block_comment("  } // j");
7328 
7329         last_squaring(Ri);
7330 
7331         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7332           lsl(Rj, Rlen, 1);
7333           sub(Rj, Rj, Ri);
7334           lsr(Rj, Rj, 1);
7335           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7336         } block_comment("  } // j");
7337 
7338         post2(Ri, Rlen);
7339         add(Ri, Ri, 1);
7340         cmp(Ri, Rlen, Assembler::LSL, 1);
7341 
7342         br(Assembler::LT, loop);
7343         bind(end);
7344         block_comment("} // i");
7345       }
7346 
7347       normalize(Rlen);
7348 
7349       mov(Ra, Pm_base);  // Save Pm_base in Ra
7350       restore_regs();  // Restore caller's Pm_base
7351 
7352       // Copy our result into caller's Pm_base
7353       reverse(Pm_base, Ra, Rlen, t0, t1);
7354 
7355       leave();
7356       ret(lr);
7357 
7358       return entry;
7359     }
7360     // In C, approximately:
7361 
7362     // void
7363     // montgomery_square(julong Pa_base[], julong Pn_base[],
7364     //                   julong Pm_base[], julong inv, int len) {
7365     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7366     //   julong *Pa, *Pb, *Pn, *Pm;
7367     //   julong Ra, Rb, Rn, Rm;
7368 
7369     //   int i;
7370 
7371     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7372 
7373     //   for (i = 0; i < len; i++) {
7374     //     int j;
7375 
7376     //     Pa = Pa_base;
7377     //     Pb = Pa_base + i;
7378     //     Pm = Pm_base;
7379     //     Pn = Pn_base + i;
7380 
7381     //     Ra = *Pa;
7382     //     Rb = *Pb;
7383     //     Rm = *Pm;
7384     //     Rn = *Pn;
7385 
7386     //     int iters = (i+1)/2;
7387     //     for (j = 0; iters--; j++) {
7388     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7389     //       MACC2(Ra, Rb, t0, t1, t2);
7390     //       Ra = *++Pa;
7391     //       Rb = *--Pb;
7392     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7393     //       MACC(Rm, Rn, t0, t1, t2);
7394     //       Rm = *++Pm;
7395     //       Rn = *--Pn;
7396     //     }
7397     //     if ((i & 1) == 0) {
7398     //       assert(Ra == Pa_base[j], "must be");
7399     //       MACC(Ra, Ra, t0, t1, t2);
7400     //     }
7401     //     iters = i/2;
7402     //     assert(iters == i-j, "must be");
7403     //     for (; iters--; j++) {
7404     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7405     //       MACC(Rm, Rn, t0, t1, t2);
7406     //       Rm = *++Pm;
7407     //       Rn = *--Pn;
7408     //     }
7409 
7410     //     *Pm = Rm = t0 * inv;
7411     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7412     //     MACC(Rm, Rn, t0, t1, t2);
7413 
7414     //     assert(t0 == 0, "broken Montgomery multiply");
7415 
7416     //     t0 = t1; t1 = t2; t2 = 0;
7417     //   }
7418 
7419     //   for (i = len; i < 2*len; i++) {
7420     //     int start = i-len+1;
7421     //     int end = start + (len - start)/2;
7422     //     int j;
7423 
7424     //     Pa = Pa_base + i-len;
7425     //     Pb = Pa_base + len;
7426     //     Pm = Pm_base + i-len;
7427     //     Pn = Pn_base + len;
7428 
7429     //     Ra = *++Pa;
7430     //     Rb = *--Pb;
7431     //     Rm = *++Pm;
7432     //     Rn = *--Pn;
7433 
7434     //     int iters = (2*len-i-1)/2;
7435     //     assert(iters == end-start, "must be");
7436     //     for (j = start; iters--; j++) {
7437     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7438     //       MACC2(Ra, Rb, t0, t1, t2);
7439     //       Ra = *++Pa;
7440     //       Rb = *--Pb;
7441     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7442     //       MACC(Rm, Rn, t0, t1, t2);
7443     //       Rm = *++Pm;
7444     //       Rn = *--Pn;
7445     //     }
7446     //     if ((i & 1) == 0) {
7447     //       assert(Ra == Pa_base[j], "must be");
7448     //       MACC(Ra, Ra, t0, t1, t2);
7449     //     }
7450     //     iters =  (2*len-i)/2;
7451     //     assert(iters == len-j, "must be");
7452     //     for (; iters--; j++) {
7453     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7454     //       MACC(Rm, Rn, t0, t1, t2);
7455     //       Rm = *++Pm;
7456     //       Rn = *--Pn;
7457     //     }
7458     //     Pm_base[i-len] = t0;
7459     //     t0 = t1; t1 = t2; t2 = 0;
7460     //   }
7461 
7462     //   while (t0)
7463     //     t0 = sub(Pm_base, Pn_base, t0, len);
7464     // }
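
    // sub() is likewise not defined here; judging from the normalize()
    // loop it is assumed to behave roughly like a multi-word subtract
    // of the modulus, folding the final borrow into the returned carry
    // word:
    //
    //   static julong sub(julong Pm_base[], julong Pn_base[],
    //                     julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong m = Pm_base[i], n = Pn_base[i];
    //       julong d = m - n - borrow;
    //       borrow = (m < n) || (m == n && borrow);  // did this word borrow?
    //       Pm_base[i] = d;
    //     }
    //     return t0 - borrow;
    //   }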
7465   };
7466 
7467 
7468   // Call here from the interpreter or compiled code to either load
7469   // multiple returned values from the inline type instance being
7470   // returned to registers or to store returned values to a newly
7471   // allocated inline type instance.
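  //
  // In rough pseudocode (a sketch of the assembly below, not an exact
  // specification):
  //
  //   save all Java argument registers (integer and floating-point);
  //   set up last_Java_frame;
  //   destination(current thread, r0);            // runtime call
  //   restore the saved argument registers;
  //   if (thread has a pending exception)
  //     jump to StubRoutines::forward_exception_entry();
  //   if (has_res)
  //     r0 = thread->vm_result;                   // returned oop, if any
  //   return;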
7472   address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime call can read or update those registers. This needs to
7475     // be in sync with SharedRuntime::java_return_convention().
7476     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7477     enum layout {
7478       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
7479       j_rarg6_off, j_rarg6_2,
7480       j_rarg5_off, j_rarg5_2,
7481       j_rarg4_off, j_rarg4_2,
7482       j_rarg3_off, j_rarg3_2,
7483       j_rarg2_off, j_rarg2_2,
7484       j_rarg1_off, j_rarg1_2,
7485       j_rarg0_off, j_rarg0_2,
7486 
7487       j_farg7_off, j_farg7_2,
7488       j_farg6_off, j_farg6_2,
7489       j_farg5_off, j_farg5_2,
7490       j_farg4_off, j_farg4_2,
7491       j_farg3_off, j_farg3_2,
7492       j_farg2_off, j_farg2_2,
7493       j_farg1_off, j_farg1_2,
7494       j_farg0_off, j_farg0_2,
7495 
7496       rfp_off, rfp_off2,
7497       return_off, return_off2,
7498 
7499       framesize // inclusive of return address
7500     };
7501 
7502     CodeBuffer code(name, 512, 64);
7503     MacroAssembler* masm = new MacroAssembler(&code);
7504 
7505     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7506     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7507     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7508     int frame_size_in_words = frame_size_in_bytes / wordSize;
7509 
7510     OopMapSet* oop_maps = new OopMapSet();
7511     OopMap* map = new OopMap(frame_size_in_slots, 0);
7512 
7513     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
7514     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
7515     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7516     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7517     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7518     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7519     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7520     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7521 
7522     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7523     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7524     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7525     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7526     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7527     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7528     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7529     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7530 
7531     address start = __ pc();
7532 
7533     __ enter(); // Save FP and LR before call
7534 
7535     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
7536     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
7537     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
7538     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
7539 
7540     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
7541     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
7542     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
7543     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
7544 
7545     int frame_complete = __ offset();
7546 
7547     // Set up last_Java_sp and last_Java_fp
7548     address the_pc = __ pc();
7549     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7550 
7551     // Call runtime
7552     __ mov(c_rarg1, r0);
7553     __ mov(c_rarg0, rthread);
7554 
7555     __ mov(rscratch1, destination);
7556     __ blr(rscratch1);
7557 
7558     oop_maps->add_gc_map(the_pc - start, map);
7559 
7560     __ reset_last_Java_frame(false);
7561 
7562     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
7563     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
7564     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
7565     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
7566 
7567     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
7568     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
7569     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
7570     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
7571 
7572     __ leave();
7573 
7574     // check for pending exceptions
7575     Label pending;
7576     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
7577     __ cbnz(rscratch1, pending);
7578 
7579     if (has_res) {
7580       __ get_vm_result(r0, rthread);
7581     }
7582 
7583     __ ret(lr);
7584 
7585     __ bind(pending);
7586     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7587 
7588     // -------------
7589     // make sure all code is generated
7590     masm->flush();
7591 
7592     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
7593     return stub->entry_point();
7594   }
7595 
7596   // Initialization
7597   void generate_initial() {
    // Generate initial stubs and initialize the entry points
7599 
    // Entry points that exist on all platforms.  Note: This is code
7601     // that could be shared among different platforms - however the
7602     // benefit seems to be smaller than the disadvantage of having a
7603     // much more complicated generator structure. See also comment in
7604     // stubRoutines.hpp.
7605 
7606     StubRoutines::_forward_exception_entry = generate_forward_exception();
7607 
7608     StubRoutines::_call_stub_entry =
7609       generate_call_stub(StubRoutines::_call_stub_return_address);
7610 
7611     // is referenced by megamorphic call
7612     StubRoutines::_catch_exception_entry = generate_catch_exception();
7613 
7614     // Build this early so it's available for the interpreter.
7615     StubRoutines::_throw_StackOverflowError_entry =
7616       generate_throw_exception("StackOverflowError throw_exception",
7617                                CAST_FROM_FN_PTR(address,
7618                                                 SharedRuntime::throw_StackOverflowError));
7619     StubRoutines::_throw_delayed_StackOverflowError_entry =
7620       generate_throw_exception("delayed StackOverflowError throw_exception",
7621                                CAST_FROM_FN_PTR(address,
7622                                                 SharedRuntime::throw_delayed_StackOverflowError));
7623     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
7625       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7626       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7627     }
7628 
7629     if (UseCRC32CIntrinsics) {
7630       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7631     }
7632 
7633     // Disabled until JDK-8210858 is fixed
7634     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7635     //   StubRoutines::_dlog = generate_dlog();
7636     // }
7637 
7638     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7639       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7640     }
7641 
7642     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7643       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7644     }
7645 
7646     if (InlineTypeReturnedAsFields) {
7647       StubRoutines::_load_inline_type_fields_in_regs =
7648          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7649       StubRoutines::_store_inline_type_fields_to_buf =
7650          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7651     }
7652 
7653     // Safefetch stubs.
7654     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7655                                                        &StubRoutines::_safefetch32_fault_pc,
7656                                                        &StubRoutines::_safefetch32_continuation_pc);
7657     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7658                                                        &StubRoutines::_safefetchN_fault_pc,
7659                                                        &StubRoutines::_safefetchN_continuation_pc);
7660   }
7661 
7662   void generate_all() {
7663     // support for verify_oop (must happen after universe_init)
7664     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7665     StubRoutines::_throw_AbstractMethodError_entry =
7666       generate_throw_exception("AbstractMethodError throw_exception",
7667                                CAST_FROM_FN_PTR(address,
7668                                                 SharedRuntime::
7669                                                 throw_AbstractMethodError));
7670 
7671     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7672       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7673                                CAST_FROM_FN_PTR(address,
7674                                                 SharedRuntime::
7675                                                 throw_IncompatibleClassChangeError));
7676 
7677     StubRoutines::_throw_NullPointerException_at_call_entry =
7678       generate_throw_exception("NullPointerException at call throw_exception",
7679                                CAST_FROM_FN_PTR(address,
7680                                                 SharedRuntime::
7681                                                 throw_NullPointerException_at_call));
7682 
7683     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7684 
7685     // arraycopy stubs used by compilers
7686     generate_arraycopy_stubs();
7687 
7688     // countPositives stub for large arrays.
7689     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7690 
7691     // array equals stub for large arrays.
7692     if (!UseSimpleArrayEquals) {
7693       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7694     }
7695 
7696     generate_compare_long_strings();
7697 
7698     generate_string_indexof_stubs();
7699 
7700     // byte_array_inflate stub for large arrays.
7701     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7702 
7703     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7704     if (bs_nm != NULL) {
7705       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7706     }
7707 #ifdef COMPILER2
7708     if (UseMultiplyToLenIntrinsic) {
7709       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7710     }
7711 
7712     if (UseSquareToLenIntrinsic) {
7713       StubRoutines::_squareToLen = generate_squareToLen();
7714     }
7715 
7716     if (UseMulAddIntrinsic) {
7717       StubRoutines::_mulAdd = generate_mulAdd();
7718     }
7719 
7720     if (UseSIMDForBigIntegerShiftIntrinsics) {
7721       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7722       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7723     }
7724 
7725     if (UseMontgomeryMultiplyIntrinsic) {
7726       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7727       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7728       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7729     }
7730 
7731     if (UseMontgomerySquareIntrinsic) {
7732       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7733       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7734       // We use generate_multiply() rather than generate_square()
7735       // because it's faster for the sizes of modulus we care about.
7736       StubRoutines::_montgomerySquare = g.generate_multiply();
7737     }
7738 #endif // COMPILER2
7739 
7740     if (UseBASE64Intrinsics) {
7741         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7742         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7743     }
7744 
7745     // data cache line writeback
7746     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7747     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7748 
7749     if (UseAESIntrinsics) {
7750       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7751       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7752       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7753       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7754       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7755     }
7756     if (UseGHASHIntrinsics) {
7757       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7758       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7759     }
7760     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7761       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7762     }
7763 
7764     if (UseMD5Intrinsics) {
7765       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7766       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7767     }
7768     if (UseSHA1Intrinsics) {
7769       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7770       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7771     }
7772     if (UseSHA256Intrinsics) {
7773       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7774       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7775     }
7776     if (UseSHA512Intrinsics) {
7777       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7778       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7779     }
7780     if (UseSHA3Intrinsics) {
7781       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7782       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7783     }
7784 
7785     // generate Adler32 intrinsics code
7786     if (UseAdler32Intrinsics) {
7787       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7788     }
7789 
7790     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7791 
7792 #ifdef LINUX
7793 
7794     generate_atomic_entry_points();
7795 
7796 #endif // LINUX
7797 
7798     StubRoutines::aarch64::set_completed();
7799   }
7800 
7801  public:
7802   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7803     if (all) {
7804       generate_all();
7805     } else {
7806       generate_initial();
7807     }
7808   }
7809 }; // end class declaration
7810 
7811 #define UCM_TABLE_MAX_ENTRIES 8
7812 void StubGenerator_generate(CodeBuffer* code, bool all) {
7813   if (UnsafeCopyMemory::_table == NULL) {
7814     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7815   }
7816   StubGenerator g(code, all);
7817 }
7818 
7819 
7820 #ifdef LINUX
7821 
7822 // Define pointers to atomic stubs and initialize them to point to the
7823 // code in atomic_aarch64.S.
7824 
7825 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7826   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7827     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7828   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7829     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
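
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;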
7830 
7831 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7832 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7833 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7834 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7835 DEFAULT_ATOMIC_OP(xchg, 4, )
7836 DEFAULT_ATOMIC_OP(xchg, 8, )
7837 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7838 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7839 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7840 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7841 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7842 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7843 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7844 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7845 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7846 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7847 
7848 #undef DEFAULT_ATOMIC_OP
7849 
7850 #endif // LINUX