1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
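     // i.e. a sign-extended index scaled by the in-heap oop size:
     // sxtw #2 with compressed oops (4 byte oops), sxtw #3 otherwise (8 byte oops).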
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  77 void fill_continuation_entry(MacroAssembler* masm);
  78 void continuation_enter_cleanup(MacroAssembler* masm);
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(int& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -27 [ argument word 1      ]
 145   // -26 [ saved v15            ] <--- sp_after_call
 146   // -25 [ saved v14            ]
 147   // -24 [ saved v13            ]
 148   // -23 [ saved v12            ]
 149   // -22 [ saved v11            ]
 150   // -21 [ saved v10            ]
 151   // -20 [ saved v9             ]
 152   // -19 [ saved v8             ]
 153   // -18 [ saved r28            ]
 154   // -17 [ saved r27            ]
 155   // -16 [ saved r26            ]
 156   // -15 [ saved r25            ]
 157   // -14 [ saved r24            ]
 158   // -13 [ saved r23            ]
 159   // -12 [ saved r22            ]
 160   // -11 [ saved r21            ]
 161   // -10 [ saved r20            ]
 162   //  -9 [ saved r19            ]
 163   //  -8 [ call wrapper    (r0) ]
 164   //  -7 [ result          (r1) ]
 165   //  -6 [ result type     (r2) ]
 166   //  -5 [ method          (r3) ]
 167   //  -4 [ entry point     (r4) ]
 168   //  -3 [ parameters      (r5) ]
 169   //  -2 [ parameter size  (r6) ]
 170   //  -1 [ thread (r7)          ]
 171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 172   //   1 [ saved lr       (r30) ]
 173 
 174   // Call stub stack layout word offsets from fp
 175   enum call_stub_layout {
 176     sp_after_call_off = -26,
 177 
 178     d15_off            = -26,
 179     d13_off            = -24,
 180     d11_off            = -22,
 181     d9_off             = -20,
 182 
 183     r28_off            = -18,
 184     r26_off            = -16,
 185     r24_off            = -14,
 186     r22_off            = -12,
 187     r20_off            = -10,
 188     call_wrapper_off   =  -8,
 189     result_off         =  -7,
 190     result_type_off    =  -6,
 191     method_off         =  -5,
 192     entry_point_off    =  -4,
 193     parameter_size_off =  -2,
 194     thread_off         =  -1,
 195     fp_f               =   0,
 196     retaddr_off        =   1,
 197   };
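       // For illustration only (nothing below relies on it): with wordSize == 8
       // these word offsets translate into byte displacements from rfp, e.g.
       //
       //   rfp + sp_after_call_off * wordSize  ==  rfp - 208   (start of save area)
       //   rfp + call_wrapper_off  * wordSize  ==  rfp -  64
       //   rfp + thread_off        * wordSize  ==  rfp -   8
       //   rfp + retaddr_off       * wordSize  ==  rfp +   8   (saved lr)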
 198 
 199   address generate_call_stub(address& return_address) {
 200     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 201            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 202            "adjust this code");
 203 
 204     StubCodeMark mark(this, "StubRoutines", "call_stub");
 205     address start = __ pc();
 206 
 207     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 208 
 209     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 210     const Address result        (rfp, result_off         * wordSize);
 211     const Address result_type   (rfp, result_type_off    * wordSize);
 212     const Address method        (rfp, method_off         * wordSize);
 213     const Address entry_point   (rfp, entry_point_off    * wordSize);
 214     const Address parameter_size(rfp, parameter_size_off * wordSize);
 215 
 216     const Address thread        (rfp, thread_off         * wordSize);
 217 
 218     const Address d15_save      (rfp, d15_off * wordSize);
 219     const Address d13_save      (rfp, d13_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d9_save       (rfp, d9_off * wordSize);
 222 
 223     const Address r28_save      (rfp, r28_off * wordSize);
 224     const Address r26_save      (rfp, r26_off * wordSize);
 225     const Address r24_save      (rfp, r24_off * wordSize);
 226     const Address r22_save      (rfp, r22_off * wordSize);
 227     const Address r20_save      (rfp, r20_off * wordSize);
 228 
 229     // stub code
 230 
 231     address aarch64_entry = __ pc();
 232 
 233     // set up frame and move sp to end of save area
 234     __ enter();
 235     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 236 
 237     // save register parameters and Java scratch/global registers
 238     // n.b. we save thread even though it gets installed in
 239     // rthread because we want to sanity check rthread later
 240     __ str(c_rarg7,  thread);
 241     __ strw(c_rarg6, parameter_size);
 242     __ stp(c_rarg4, c_rarg5,  entry_point);
 243     __ stp(c_rarg2, c_rarg3,  result_type);
 244     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 245 
 246     __ stp(r20, r19,   r20_save);
 247     __ stp(r22, r21,   r22_save);
 248     __ stp(r24, r23,   r24_save);
 249     __ stp(r26, r25,   r26_save);
 250     __ stp(r28, r27,   r28_save);
 251 
 252     __ stpd(v9,  v8,   d9_save);
 253     __ stpd(v11, v10,  d11_save);
 254     __ stpd(v13, v12,  d13_save);
 255     __ stpd(v15, v14,  d15_save);
 256 
 257     // install Java thread in global register now we have saved
 258     // whatever value it held
 259     __ mov(rthread, c_rarg7);
 260     // And method
 261     __ mov(rmethod, c_rarg3);
 262 
 263     // set up the heapbase register
 264     __ reinit_heapbase();
 265 
 266 #ifdef ASSERT
 267     // make sure we have no pending exceptions
 268     {
 269       Label L;
 270       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 271       __ cmp(rscratch1, (u1)NULL_WORD);
 272       __ br(Assembler::EQ, L);
 273       __ stop("StubRoutines::call_stub: entered with pending exception");
 274       __ BIND(L);
 275     }
 276 #endif
 277     // pass parameters if any
 278     __ mov(esp, sp);
 279     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 280     __ andr(sp, rscratch1, -2 * wordSize);
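         // (this also rounds sp down to a 16-byte boundary, keeping it aligned
         //  as AArch64 requires)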
 281 
 282     BLOCK_COMMENT("pass parameters if any");
 283     Label parameters_done;
 284     // parameter count is still in c_rarg6
 285     // and parameter pointer identifying param 1 is in c_rarg5
 286     __ cbzw(c_rarg6, parameters_done);
 287 
 288     address loop = __ pc();
 289     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 290     __ subsw(c_rarg6, c_rarg6, 1);
 291     __ push(rscratch1);
 292     __ br(Assembler::GT, loop);
 293 
 294     __ BIND(parameters_done);
 295 
 296     // call Java entry -- passing Method* and current sp
 297     //      rmethod: Method*
 298     //      r13: sender sp
 299     BLOCK_COMMENT("call Java function");
 300     __ mov(r13, sp);
 301     __ blr(c_rarg4);
 302 
 303     // we do this here because the notify will already have been done
 304     // if we get to the next instruction via an exception
 305     //
 306     // n.b. adding this instruction here affects the calculation of
 307     // whether or not a routine returns to the call stub (used when
 308     // doing stack walks) since the normal test is to check the return
 309     // pc against the address saved below. so we may need to allow for
 310     // this extra instruction in the check.
 311 
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, (u1)T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, (u1)T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, (u1)T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     __ pop_cont_fastpath(rthread);
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376     // leave frame and return to caller
 377     __ leave();
 378     __ ret(lr);
 379 
 380     // handle return types different from T_INT
 381 
 382     __ BIND(is_long);
 383     __ str(r0, Address(j_rarg2, 0));
 384     __ br(Assembler::AL, exit);
 385 
 386     __ BIND(is_float);
 387     __ strs(j_farg0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_double);
 391     __ strd(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     return start;
 395   }
 396 
 397   // Return point for a Java call if there's an exception thrown in
 398   // Java code.  The exception is caught and transformed into a
 399   // pending exception stored in JavaThread that can be tested from
 400   // within the VM.
 401   //
 402   // Note: Usually the parameters are removed by the callee. In case
 403   // of an exception crossing an activation frame boundary, that is
 404   // not the case if the callee is compiled code => need to setup the
 405   // rsp.
 406   //
 407   // r0: exception oop
 408 
 409   address generate_catch_exception() {
 410     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 411     address start = __ pc();
 412 
 413     // same as in generate_call_stub():
 414     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 415     const Address thread        (rfp, thread_off         * wordSize);
 416 
 417 #ifdef ASSERT
 418     // verify that threads correspond
 419     {
 420       Label L, S;
 421       __ ldr(rscratch1, thread);
 422       __ cmp(rthread, rscratch1);
 423       __ br(Assembler::NE, S);
 424       __ get_thread(rscratch1);
 425       __ cmp(rthread, rscratch1);
 426       __ br(Assembler::EQ, L);
 427       __ bind(S);
 428       __ stop("StubRoutines::catch_exception: threads must correspond");
 429       __ bind(L);
 430     }
 431 #endif
 432 
 433     // set pending exception
 434     __ verify_oop(r0);
 435 
 436     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 437     __ mov(rscratch1, (address)__FILE__);
 438     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 439     __ movw(rscratch1, (int)__LINE__);
 440     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 441 
 442     // complete return to VM
 443     assert(StubRoutines::_call_stub_return_address != NULL,
 444            "_call_stub_return_address must have been generated before");
 445     __ b(StubRoutines::_call_stub_return_address);
 446 
 447     return start;
 448   }
 449 
 450   // Continuation point for runtime calls returning with a pending
 451   // exception.  The pending exception check happened in the runtime
 452   // or native call stub.  The pending exception in Thread is
 453   // converted into a Java-level exception.
 454   //
 455   // Contract with Java-level exception handlers:
 456   // r0: exception
 457   // r3: throwing pc
 458   //
 459   // NOTE: At entry of this stub, exception-pc must be in LR !!
 460 
 461   // NOTE: this is always used as a jump target within generated code
 462   // so it just needs to be generated code with no prolog
 463 
 464   address generate_forward_exception() {
 465     StubCodeMark mark(this, "StubRoutines", "forward exception");
 466     address start = __ pc();
 467 
 468     // Upon entry, LR points to the return address returning into
 469     // Java (interpreted or compiled) code; i.e., the return address
 470     // becomes the throwing pc.
 471     //
 472     // Arguments pushed before the runtime call are still on the stack
 473     // but the exception handler will reset the stack pointer ->
 474     // ignore them.  A potential result in registers can be ignored as
 475     // well.
 476 
 477 #ifdef ASSERT
 478     // make sure this code is only executed if there is a pending exception
 479     {
 480       Label L;
 481       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 482       __ cbnz(rscratch1, L);
 483       __ stop("StubRoutines::forward exception: no pending exception (1)");
 484       __ bind(L);
 485     }
 486 #endif
 487 
 488     // compute exception handler into r19
 489 
 490     // call the VM to find the handler address associated with the
 491     // caller address. pass thread in r0 and caller pc (ret address)
 492     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 493     // the stack.
 494     __ mov(c_rarg1, lr);
 495     // lr will be trashed by the VM call so we move it to R19
 496     // (callee-saved) because we also need to pass it to the handler
 497     // returned by this call.
 498     __ mov(r19, lr);
 499     BLOCK_COMMENT("call exception_handler_for_return_address");
 500     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 501                          SharedRuntime::exception_handler_for_return_address),
 502                     rthread, c_rarg1);
 503     // Reinitialize the ptrue predicate register, in case the external runtime
 504     // call clobbers ptrue reg, as we may return to SVE compiled code.
 505     __ reinitialize_ptrue();
 506 
 507     // we should not really care that lr is no longer the callee
 508     // address. we saved the value the handler needs in r19 so we can
 509     // just copy it to r3. however, the C2 handler will push its own
 510     // frame and then call into the VM, and the VM code asserts that
 511     // the PC for the frame above the handler belongs to a compiled
 512     // Java method. So, we restore lr here to satisfy that assert.
 513     __ mov(lr, r19);
 514     // setup r0 & r3 & clear pending exception
 515     __ mov(r3, r19);
 516     __ mov(r19, r0);
 517     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 518     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 519 
 520 #ifdef ASSERT
 521     // make sure exception is set
 522     {
 523       Label L;
 524       __ cbnz(r0, L);
 525       __ stop("StubRoutines::forward exception: no pending exception (2)");
 526       __ bind(L);
 527     }
 528 #endif
 529 
 530     // continue at exception handler
 531     // r0: exception
 532     // r3: throwing pc
 533     // r19: exception handler
 534     __ verify_oop(r0);
 535     __ br(r19);
 536 
 537     return start;
 538   }
 539 
 540   // Non-destructive plausibility checks for oops
 541   //
 542   // Arguments:
 543   //    r0: oop to verify
 544   //    rscratch1: error message
 545   //
 546   // Stack after saving c_rarg3:
 547   //    [tos + 0]: saved c_rarg3
 548   //    [tos + 1]: saved c_rarg2
 549   //    [tos + 2]: saved lr
 550   //    [tos + 3]: saved rscratch2
 551   //    [tos + 4]: saved r0
 552   //    [tos + 5]: saved rscratch1
 553   address generate_verify_oop() {
 554 
 555     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 556     address start = __ pc();
 557 
 558     Label exit, error;
 559 
 560     // save c_rarg2 and c_rarg3
 561     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 562 
 563     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 564     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 565     __ ldr(c_rarg3, Address(c_rarg2));
 566     __ add(c_rarg3, c_rarg3, 1);
 567     __ str(c_rarg3, Address(c_rarg2));
 568 
 569     // object is in r0
 570     // make sure object is 'reasonable'
 571     __ cbz(r0, exit); // if obj is NULL it is OK
 572 
 573 #if INCLUDE_ZGC
 574     if (UseZGC) {
 575       // Check if mask is good.
 576       // verifies that ZAddressBadMask & r0 == 0
 577       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 578       __ andr(c_rarg2, r0, c_rarg3);
 579       __ cbnz(c_rarg2, error);
 580     }
 581 #endif
 582 
 583     // Check if the oop is in the right area of memory
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 585     __ andr(c_rarg2, r0, c_rarg3);
 586     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 587 
 588     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 589     // instruction here because the flags register is live.
 590     __ eor(c_rarg2, c_rarg2, c_rarg3);
 591     __ cbnz(c_rarg2, error);
 592 
 593     // make sure klass is 'reasonable', i.e. not zero.
 594     __ load_klass(r0, r0);  // get klass
 595     __ cbz(r0, error);      // if klass is NULL it is broken
 596 
 597     // return if everything seems ok
 598     __ bind(exit);
 599 
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601     __ ret(lr);
 602 
 603     // handle errors
 604     __ bind(error);
 605     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 606 
 607     __ push(RegSet::range(r0, r29), sp);
 608     // debug(char* msg, int64_t pc, int64_t regs[])
 609     __ mov(c_rarg0, rscratch1);      // pass address of error message
 610     __ mov(c_rarg1, lr);             // pass return address
 611     __ mov(c_rarg2, sp);             // pass address of regs on stack
 612 #ifndef PRODUCT
 613     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 614 #endif
 615     BLOCK_COMMENT("call MacroAssembler::debug");
 616     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 617     __ blr(rscratch1);
 618     __ hlt(0);
 619 
 620     return start;
 621   }
 622 
 623   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 624 
 625   // Generate indices for iota vector.
 626   address generate_iota_indices(const char *stub_name) {
 627     __ align(CodeEntryAlignment);
 628     StubCodeMark mark(this, "StubRoutines", stub_name);
 629     address start = __ pc();
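         // 16 consecutive byte-lane indices 0x00..0x0f, stored little-endian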
 630     __ emit_data64(0x0706050403020100, relocInfo::none);
 631     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 632     return start;
 633   }
 634 
 635   // The inner part of zero_words().  This is the bulk operation,
 636   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 637   // caller is responsible for zeroing the last few words.
 638   //
 639   // Inputs:
 640   // r10: the HeapWord-aligned base address of an array to zero.
 641   // r11: the count in HeapWords, r11 > 0.
 642   //
 643   // Returns r10 and r11, adjusted for the caller to clear.
 644   // r10: the base address of the tail of words left to clear.
 645   // r11: the number of words in the tail.
 646   //      r11 < MacroAssembler::zero_words_block_size.
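       //
       // For illustration only, a rough C sketch of that contract (an
       // interpretation, not the generated code, which may also use DC ZVA):
       //
       //   while (cnt >= MacroAssembler::zero_words_block_size) {
       //     memset(base, 0, MacroAssembler::zero_words_block_size * HeapWordSize);
       //     base += MacroAssembler::zero_words_block_size;
       //     cnt  -= MacroAssembler::zero_words_block_size;
       //   }
       //   // base/cnt now describe the tail the caller still has to clear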
 647 
 648   address generate_zero_blocks() {
 649     Label done;
 650     Label base_aligned;
 651 
 652     Register base = r10, cnt = r11;
 653 
 654     __ align(CodeEntryAlignment);
 655     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 656     address start = __ pc();
 657 
 658     if (UseBlockZeroing) {
 659       int zva_length = VM_Version::zva_length();
 660 
 661       // Ensure ZVA length can be divided by 16. This is required by
 662       // the subsequent operations.
 663       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 664 
 665       __ tbz(base, 3, base_aligned);
 666       __ str(zr, Address(__ post(base, 8)));
 667       __ sub(cnt, cnt, 1);
 668       __ bind(base_aligned);
 669 
 670       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 671       // alignment.
 672       Label small;
 673       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 674       __ subs(rscratch1, cnt, low_limit >> 3);
 675       __ br(Assembler::LT, small);
 676       __ zero_dcache_blocks(base, cnt);
 677       __ bind(small);
 678     }
 679 
 680     {
 681       // Number of stp instructions we'll unroll
 682       const int unroll =
 683         MacroAssembler::zero_words_block_size / 2;
 684       // Clear the remaining blocks.
 685       Label loop;
 686       __ subs(cnt, cnt, unroll * 2);
 687       __ br(Assembler::LT, done);
 688       __ bind(loop);
 689       for (int i = 0; i < unroll; i++)
 690         __ stp(zr, zr, __ post(base, 16));
 691       __ subs(cnt, cnt, unroll * 2);
 692       __ br(Assembler::GE, loop);
 693       __ bind(done);
 694       __ add(cnt, cnt, unroll * 2);
 695     }
 696 
 697     __ ret(lr);
 698 
 699     return start;
 700   }
 701 
 702 
 703   typedef enum {
 704     copy_forwards = 1,
 705     copy_backwards = -1
 706   } copy_direction;
 707 
 708   // Bulk copy of blocks of 8 words.
 709   //
 710   // count is a count of words.
 711   //
 712   // Precondition: count >= 8
 713   //
 714   // Postconditions:
 715   //
 716   // The least significant bit of count contains the remaining count
 717   // of words to copy.  The rest of count is trash.
 718   //
 719   // s and d are adjusted to point to the remaining words to copy
 720   //
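       // For example (illustration only): count == 13 copies 12 words (an
       // 8 word block plus a 4 word subblock); on exit bit 0 of count is 1
       // and s and d point at the single remaining word.
       //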
 721   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 722                            copy_direction direction) {
 723     int unit = wordSize * direction;
 724     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 725 
 726     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 727       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 728     const Register stride = r13;
 729 
 730     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 731     assert_different_registers(s, d, count, rscratch1);
 732 
 733     Label again, drain;
 734     const char *stub_name;
 735     if (direction == copy_forwards)
 736       stub_name = "forward_copy_longs";
 737     else
 738       stub_name = "backward_copy_longs";
 739 
 740     __ align(CodeEntryAlignment);
 741 
 742     StubCodeMark mark(this, "StubRoutines", stub_name);
 743 
 744     __ bind(start);
 745 
 746     Label unaligned_copy_long;
 747     if (AvoidUnalignedAccesses) {
 748       __ tbnz(d, 3, unaligned_copy_long);
 749     }
 750 
 751     if (direction == copy_forwards) {
 752       __ sub(s, s, bias);
 753       __ sub(d, d, bias);
 754     }
 755 
 756 #ifdef ASSERT
 757     // Make sure we are never given < 8 words
 758     {
 759       Label L;
 760       __ cmp(count, (u1)8);
 761       __ br(Assembler::GE, L);
 762       __ stop("generate_copy_longs called with < 8 words");
 763       __ bind(L);
 764     }
 765 #endif
 766 
 767     // Fill 8 registers
 768     if (UseSIMDForMemoryOps) {
 769       __ ldpq(v0, v1, Address(s, 4 * unit));
 770       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 771     } else {
 772       __ ldp(t0, t1, Address(s, 2 * unit));
 773       __ ldp(t2, t3, Address(s, 4 * unit));
 774       __ ldp(t4, t5, Address(s, 6 * unit));
 775       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 776     }
 777 
 778     __ subs(count, count, 16);
 779     __ br(Assembler::LO, drain);
 780 
 781     int prefetch = PrefetchCopyIntervalInBytes;
 782     bool use_stride = false;
 783     if (direction == copy_backwards) {
 784        use_stride = prefetch > 256;
 785        prefetch = -prefetch;
 786        if (use_stride) __ mov(stride, prefetch);
 787     }
 788 
 789     __ bind(again);
 790 
 791     if (PrefetchCopyIntervalInBytes > 0)
 792       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 793 
 794     if (UseSIMDForMemoryOps) {
 795       __ stpq(v0, v1, Address(d, 4 * unit));
 796       __ ldpq(v0, v1, Address(s, 4 * unit));
 797       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 798       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 799     } else {
 800       __ stp(t0, t1, Address(d, 2 * unit));
 801       __ ldp(t0, t1, Address(s, 2 * unit));
 802       __ stp(t2, t3, Address(d, 4 * unit));
 803       __ ldp(t2, t3, Address(s, 4 * unit));
 804       __ stp(t4, t5, Address(d, 6 * unit));
 805       __ ldp(t4, t5, Address(s, 6 * unit));
 806       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 807       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 808     }
 809 
 810     __ subs(count, count, 8);
 811     __ br(Assembler::HS, again);
 812 
 813     // Drain
 814     __ bind(drain);
 815     if (UseSIMDForMemoryOps) {
 816       __ stpq(v0, v1, Address(d, 4 * unit));
 817       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 818     } else {
 819       __ stp(t0, t1, Address(d, 2 * unit));
 820       __ stp(t2, t3, Address(d, 4 * unit));
 821       __ stp(t4, t5, Address(d, 6 * unit));
 822       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 823     }
 824 
 825     {
 826       Label L1, L2;
 827       __ tbz(count, exact_log2(4), L1);
 828       if (UseSIMDForMemoryOps) {
 829         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 830         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 831       } else {
 832         __ ldp(t0, t1, Address(s, 2 * unit));
 833         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 834         __ stp(t0, t1, Address(d, 2 * unit));
 835         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 836       }
 837       __ bind(L1);
 838 
 839       if (direction == copy_forwards) {
 840         __ add(s, s, bias);
 841         __ add(d, d, bias);
 842       }
 843 
 844       __ tbz(count, 1, L2);
 845       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 846       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 847       __ bind(L2);
 848     }
 849 
 850     __ ret(lr);
 851 
 852     if (AvoidUnalignedAccesses) {
 853       Label drain, again;
 854       // Register order for storing. Order is different for backward copy.
 855 
 856       __ bind(unaligned_copy_long);
 857 
 858       // source address is even aligned, target odd aligned
 859       //
 860       // when forward copying word pairs we read long pairs at offsets
 861       // {0, 2, 4, 6} (in long words). when backwards copying we read
 862       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 863       // address by -2 in the forwards case so we can compute the
 864       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 865       // or -1.
 866       //
 867       // when forward copying we need to store 1 word, 3 pairs and
 868       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 869       // zero offset we adjust the destination by -1, which means we
 870       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 871       //
 872       // When backwards copying we need to store 1 word, 3 pairs and
 873       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 874       // offsets {1, 3, 5, 7, 8} * unit.
 875 
 876       if (direction == copy_forwards) {
 877         __ sub(s, s, 16);
 878         __ sub(d, d, 8);
 879       }
 880 
 881       // Fill 8 registers
 882       //
 883       // for forwards copy s was offset by -16 from the original input
 884       // value of s so the register contents are at these offsets
 885       // relative to the 64 bit block addressed by that original input
 886       // and so on for each successive 64 byte block when s is updated
 887       //
 888       // t0 at offset 0,  t1 at offset 8
 889       // t2 at offset 16, t3 at offset 24
 890       // t4 at offset 32, t5 at offset 40
 891       // t6 at offset 48, t7 at offset 56
 892 
 893       // for backwards copy s was not offset so the register contents
 894       // are at these offsets into the preceding 64 byte block
 895       // relative to that original input and so on for each successive
 896       // preceding 64 byte block when s is updated. this explains the
 897       // slightly counter-intuitive looking pattern of register usage
 898       // in the stp instructions for backwards copy.
 899       //
 900       // t0 at offset -16, t1 at offset -8
 901       // t2 at offset -32, t3 at offset -24
 902       // t4 at offset -48, t5 at offset -40
 903       // t6 at offset -64, t7 at offset -56
 904 
 905       __ ldp(t0, t1, Address(s, 2 * unit));
 906       __ ldp(t2, t3, Address(s, 4 * unit));
 907       __ ldp(t4, t5, Address(s, 6 * unit));
 908       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 909 
 910       __ subs(count, count, 16);
 911       __ br(Assembler::LO, drain);
 912 
 913       int prefetch = PrefetchCopyIntervalInBytes;
 914       bool use_stride = false;
 915       if (direction == copy_backwards) {
 916          use_stride = prefetch > 256;
 917          prefetch = -prefetch;
 918          if (use_stride) __ mov(stride, prefetch);
 919       }
 920 
 921       __ bind(again);
 922 
 923       if (PrefetchCopyIntervalInBytes > 0)
 924         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 925 
 926       if (direction == copy_forwards) {
 927        // allowing for the offset of -8 the store instructions place
 928        // registers into the target 64 bit block at the following
 929        // offsets
 930        //
 931        // t0 at offset 0
 932        // t1 at offset 8,  t2 at offset 16
 933        // t3 at offset 24, t4 at offset 32
 934        // t5 at offset 40, t6 at offset 48
 935        // t7 at offset 56
 936 
 937         __ str(t0, Address(d, 1 * unit));
 938         __ stp(t1, t2, Address(d, 2 * unit));
 939         __ ldp(t0, t1, Address(s, 2 * unit));
 940         __ stp(t3, t4, Address(d, 4 * unit));
 941         __ ldp(t2, t3, Address(s, 4 * unit));
 942         __ stp(t5, t6, Address(d, 6 * unit));
 943         __ ldp(t4, t5, Address(s, 6 * unit));
 944         __ str(t7, Address(__ pre(d, 8 * unit)));
 945         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 946       } else {
 947        // d was not offset when we started so the registers are
 948        // written into the 64 bit block preceding d with the following
 949        // offsets
 950        //
 951        // t1 at offset -8
 952        // t3 at offset -24, t0 at offset -16
 953        // t5 at offset -40, t2 at offset -32
 954        // t7 at offset -56, t4 at offset -48
 955        //                   t6 at offset -64
 956        //
 957        // note that this matches the offsets previously noted for the
 958        // loads
 959 
 960         __ str(t1, Address(d, 1 * unit));
 961         __ stp(t3, t0, Address(d, 3 * unit));
 962         __ ldp(t0, t1, Address(s, 2 * unit));
 963         __ stp(t5, t2, Address(d, 5 * unit));
 964         __ ldp(t2, t3, Address(s, 4 * unit));
 965         __ stp(t7, t4, Address(d, 7 * unit));
 966         __ ldp(t4, t5, Address(s, 6 * unit));
 967         __ str(t6, Address(__ pre(d, 8 * unit)));
 968         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 969       }
 970 
 971       __ subs(count, count, 8);
 972       __ br(Assembler::HS, again);
 973 
 974       // Drain
 975       //
 976       // this uses the same pattern of offsets and register arguments
 977       // as above
 978       __ bind(drain);
 979       if (direction == copy_forwards) {
 980         __ str(t0, Address(d, 1 * unit));
 981         __ stp(t1, t2, Address(d, 2 * unit));
 982         __ stp(t3, t4, Address(d, 4 * unit));
 983         __ stp(t5, t6, Address(d, 6 * unit));
 984         __ str(t7, Address(__ pre(d, 8 * unit)));
 985       } else {
 986         __ str(t1, Address(d, 1 * unit));
 987         __ stp(t3, t0, Address(d, 3 * unit));
 988         __ stp(t5, t2, Address(d, 5 * unit));
 989         __ stp(t7, t4, Address(d, 7 * unit));
 990         __ str(t6, Address(__ pre(d, 8 * unit)));
 991       }
 992       // now we need to copy any remaining part block which may
 993       // include a 4 word subblock and/or a 2 word subblock.
 994       // bits 2 and 1 in the count are the tell-tale for whether we
 995       // have each such subblock
 996       {
 997         Label L1, L2;
 998         __ tbz(count, exact_log2(4), L1);
 999        // this is the same as above but copying only 4 longs hence
1000        // with only one intervening stp between the str instructions
1001        // but note that the offsets and registers still follow the
1002        // same pattern
1003         __ ldp(t0, t1, Address(s, 2 * unit));
1004         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1005         if (direction == copy_forwards) {
1006           __ str(t0, Address(d, 1 * unit));
1007           __ stp(t1, t2, Address(d, 2 * unit));
1008           __ str(t3, Address(__ pre(d, 4 * unit)));
1009         } else {
1010           __ str(t1, Address(d, 1 * unit));
1011           __ stp(t3, t0, Address(d, 3 * unit));
1012           __ str(t2, Address(__ pre(d, 4 * unit)));
1013         }
1014         __ bind(L1);
1015 
1016         __ tbz(count, 1, L2);
1017        // this is the same as above but copying only 2 longs hence
1018        // there is no intervening stp between the str instructions
1019        // but note that the offset and register patterns are still
1020        // the same
1021         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1022         if (direction == copy_forwards) {
1023           __ str(t0, Address(d, 1 * unit));
1024           __ str(t1, Address(__ pre(d, 2 * unit)));
1025         } else {
1026           __ str(t1, Address(d, 1 * unit));
1027           __ str(t0, Address(__ pre(d, 2 * unit)));
1028         }
1029         __ bind(L2);
1030 
1031        // for forwards copy we need to re-adjust the offsets we
1032        // applied so that s and d follow the last words written
1033 
1034        if (direction == copy_forwards) {
1035          __ add(s, s, 16);
1036          __ add(d, d, 8);
1037        }
1038 
1039       }
1040 
1041       __ ret(lr);
1042       }
1043   }
1044 
1045   // Small copy: less than 16 bytes.
1046   //
1047   // NB: Ignores all of the bits of count which represent more than 15
1048   // bytes, so a caller doesn't have to mask them.
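       //
       // For illustration only, a byte-level C sketch of the forward case
       // (the generated code works in units of the copy granularity and also
       // handles the backward direction):
       //
       //   size_t bytes = count * granularity;     // only bytes & 15 matters
       //   if (bytes & 8) { memcpy(d, s, 8); s += 8; d += 8; }
       //   if (bytes & 4) { memcpy(d, s, 4); s += 4; d += 4; }
       //   if (bytes & 2) { memcpy(d, s, 2); s += 2; d += 2; }
       //   if (bytes & 1) { *d = *s; }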
1049 
1050   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1051     bool is_backwards = step < 0;
1052     size_t granularity = uabs(step);
1053     int direction = is_backwards ? -1 : 1;
1054     int unit = wordSize * direction;
1055 
1056     Label Lword, Lint, Lshort, Lbyte;
1057 
1058     assert(granularity
1059            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1060 
1061     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1062 
1063     // ??? I don't know if this bit-test-and-branch is the right thing
1064     // to do.  It does a lot of jumping, resulting in several
1065     // mispredicted branches.  It might make more sense to do this
1066     // with something like Duff's device with a single computed branch.
1067 
1068     __ tbz(count, 3 - exact_log2(granularity), Lword);
1069     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1070     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1071     __ bind(Lword);
1072 
1073     if (granularity <= sizeof (jint)) {
1074       __ tbz(count, 2 - exact_log2(granularity), Lint);
1075       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1076       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1077       __ bind(Lint);
1078     }
1079 
1080     if (granularity <= sizeof (jshort)) {
1081       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1082       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1083       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1084       __ bind(Lshort);
1085     }
1086 
1087     if (granularity <= sizeof (jbyte)) {
1088       __ tbz(count, 0, Lbyte);
1089       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1090       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1091       __ bind(Lbyte);
1092     }
1093   }
1094 
1095   Label copy_f, copy_b;
1096 
1097   // All-singing all-dancing memory copy.
1098   //
1099   // Copy count units of memory from s to d.  The size of a unit is
1100   // step, which can be positive or negative depending on the direction
1101   // of copy.  If is_aligned is false, we align the source address.
1102   //
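       // In effect this is a memmove over count elements of |step| bytes: a
       // negative step selects the backward (high to low) direction so that
       // overlapping conjoint copies remain correct.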
1103 
1104   void copy_memory(bool is_aligned, Register s, Register d,
1105                    Register count, Register tmp, int step) {
1106     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1107     bool is_backwards = step < 0;
1108     unsigned int granularity = uabs(step);
1109     const Register t0 = r3, t1 = r4;
1110 
1111     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't
1112     // matter because we always load all the data before writing anything.
1113     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1114     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1115     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1116     const Register send = r17, dend = r16;
1117 
1118     if (PrefetchCopyIntervalInBytes > 0)
1119       __ prfm(Address(s, 0), PLDL1KEEP);
1120     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1121     __ br(Assembler::HI, copy_big);
1122 
1123     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1124     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1125 
1126     __ cmp(count, u1(16/granularity));
1127     __ br(Assembler::LS, copy16);
1128 
1129     __ cmp(count, u1(64/granularity));
1130     __ br(Assembler::HI, copy80);
1131 
1132     __ cmp(count, u1(32/granularity));
1133     __ br(Assembler::LS, copy32);
1134 
1135     // 33..64 bytes
1136     if (UseSIMDForMemoryOps) {
1137       __ ldpq(v0, v1, Address(s, 0));
1138       __ ldpq(v2, v3, Address(send, -32));
1139       __ stpq(v0, v1, Address(d, 0));
1140       __ stpq(v2, v3, Address(dend, -32));
1141     } else {
1142       __ ldp(t0, t1, Address(s, 0));
1143       __ ldp(t2, t3, Address(s, 16));
1144       __ ldp(t4, t5, Address(send, -32));
1145       __ ldp(t6, t7, Address(send, -16));
1146 
1147       __ stp(t0, t1, Address(d, 0));
1148       __ stp(t2, t3, Address(d, 16));
1149       __ stp(t4, t5, Address(dend, -32));
1150       __ stp(t6, t7, Address(dend, -16));
1151     }
1152     __ b(finish);
1153 
1154     // 17..32 bytes
1155     __ bind(copy32);
1156     __ ldp(t0, t1, Address(s, 0));
1157     __ ldp(t2, t3, Address(send, -16));
1158     __ stp(t0, t1, Address(d, 0));
1159     __ stp(t2, t3, Address(dend, -16));
1160     __ b(finish);
1161 
1162     // 65..80/96 bytes
1163     // (96 bytes if SIMD because we do 32 bytes per instruction)
1164     __ bind(copy80);
1165     if (UseSIMDForMemoryOps) {
1166       __ ldpq(v0, v1, Address(s, 0));
1167       __ ldpq(v2, v3, Address(s, 32));
1168       // Unaligned pointers can be an issue for copying.
1169       // The issue has more chances to happen when granularity of data is
1170     // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1171       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1172       // The most performance drop has been seen for the range 65-80 bytes.
1173       // For such cases using the pair of ldp/stp instead of the third pair of
1174       // ldpq/stpq fixes the performance issue.
1175       if (granularity < sizeof (jint)) {
1176         Label copy96;
1177         __ cmp(count, u1(80/granularity));
1178         __ br(Assembler::HI, copy96);
1179         __ ldp(t0, t1, Address(send, -16));
1180 
1181         __ stpq(v0, v1, Address(d, 0));
1182         __ stpq(v2, v3, Address(d, 32));
1183         __ stp(t0, t1, Address(dend, -16));
1184         __ b(finish);
1185 
1186         __ bind(copy96);
1187       }
1188       __ ldpq(v4, v5, Address(send, -32));
1189 
1190       __ stpq(v0, v1, Address(d, 0));
1191       __ stpq(v2, v3, Address(d, 32));
1192       __ stpq(v4, v5, Address(dend, -32));
1193     } else {
1194       __ ldp(t0, t1, Address(s, 0));
1195       __ ldp(t2, t3, Address(s, 16));
1196       __ ldp(t4, t5, Address(s, 32));
1197       __ ldp(t6, t7, Address(s, 48));
1198       __ ldp(t8, t9, Address(send, -16));
1199 
1200       __ stp(t0, t1, Address(d, 0));
1201       __ stp(t2, t3, Address(d, 16));
1202       __ stp(t4, t5, Address(d, 32));
1203       __ stp(t6, t7, Address(d, 48));
1204       __ stp(t8, t9, Address(dend, -16));
1205     }
1206     __ b(finish);
1207 
1208     // 0..16 bytes
1209     __ bind(copy16);
1210     __ cmp(count, u1(8/granularity));
1211     __ br(Assembler::LO, copy8);
1212 
1213     // 8..16 bytes
1214     __ ldr(t0, Address(s, 0));
1215     __ ldr(t1, Address(send, -8));
1216     __ str(t0, Address(d, 0));
1217     __ str(t1, Address(dend, -8));
1218     __ b(finish);
1219 
1220     if (granularity < 8) {
1221       // 4..7 bytes
1222       __ bind(copy8);
1223       __ tbz(count, 2 - exact_log2(granularity), copy4);
1224       __ ldrw(t0, Address(s, 0));
1225       __ ldrw(t1, Address(send, -4));
1226       __ strw(t0, Address(d, 0));
1227       __ strw(t1, Address(dend, -4));
1228       __ b(finish);
1229       if (granularity < 4) {
1230         // 0..3 bytes
1231         __ bind(copy4);
1232         __ cbz(count, finish); // get rid of 0 case
1233         if (granularity == 2) {
1234           __ ldrh(t0, Address(s, 0));
1235           __ strh(t0, Address(d, 0));
1236         } else { // granularity == 1
1237           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1238           // the first and last byte.
1239           // Handle the 3 byte case by loading and storing base + count/2
1240           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1241           // This does mean in the 1 byte case we load/store the same
1242           // byte 3 times.
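               // For example (illustration only): count == 3 copies s[0]->d[0],
               // s[2]->d[2] and s[1]->d[1]; count == 1 copies s[0]->d[0] three
               // times.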
1243           __ lsr(count, count, 1);
1244           __ ldrb(t0, Address(s, 0));
1245           __ ldrb(t1, Address(send, -1));
1246           __ ldrb(t2, Address(s, count));
1247           __ strb(t0, Address(d, 0));
1248           __ strb(t1, Address(dend, -1));
1249           __ strb(t2, Address(d, count));
1250         }
1251         __ b(finish);
1252       }
1253     }
1254 
1255     __ bind(copy_big);
1256     if (is_backwards) {
1257       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1258       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1259     }
1260 
1261     // Now we've got the small case out of the way we can align the
1262     // source address on a 2-word boundary.
1263 
1264     Label aligned;
1265 
1266     if (is_aligned) {
1267       // We may have to adjust by 1 word to get s 2-word-aligned.
1268       __ tbz(s, exact_log2(wordSize), aligned);
1269       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1270       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1271       __ sub(count, count, wordSize/granularity);
1272     } else {
1273       if (is_backwards) {
1274         __ andr(rscratch2, s, 2 * wordSize - 1);
1275       } else {
1276         __ neg(rscratch2, s);
1277         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1278       }
1279       // rscratch2 is the byte adjustment needed to align s.
1280       __ cbz(rscratch2, aligned);
1281       int shift = exact_log2(granularity);
1282       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1283       __ sub(count, count, rscratch2);
1284 
1285 #if 0
1286       // ?? This code is only correct for a disjoint copy.  It may or
1287       // may not make sense to use it in that case.
1288 
1289       // Copy the first pair; s and d may not be aligned.
1290       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1291       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1292 
1293       // Align s and d, adjust count
1294       if (is_backwards) {
1295         __ sub(s, s, rscratch2);
1296         __ sub(d, d, rscratch2);
1297       } else {
1298         __ add(s, s, rscratch2);
1299         __ add(d, d, rscratch2);
1300       }
1301 #else
1302       copy_memory_small(s, d, rscratch2, rscratch1, step);
1303 #endif
1304     }
1305 
1306     __ bind(aligned);
1307 
1308     // s is now 2-word-aligned.
1309 
1310     // We have a count of units and some trailing bytes.  Adjust the
1311     // count and do a bulk copy of words.
1312     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1313     if (direction == copy_forwards)
1314       __ bl(copy_f);
1315     else
1316       __ bl(copy_b);
1317 
1318     // And the tail.
1319     copy_memory_small(s, d, count, tmp, step);
1320 
1321     if (granularity >= 8) __ bind(copy8);
1322     if (granularity >= 4) __ bind(copy4);
1323     __ bind(finish);
1324   }
1325 
1326 
1327   void clobber_registers() {
1328 #ifdef ASSERT
1329     RegSet clobbered
1330       = MacroAssembler::call_clobbered_registers() - rscratch1;
1331     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1332     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1333     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1334       __ mov(*it, rscratch1);
1335     }
1336 #endif
1337 
1338   }
1339 
1340   // Scan over array at a for count oops, verifying each one.
1341   // Preserves a and count, clobbers rscratch1 and rscratch2.
1342   void verify_oop_array (int size, Register a, Register count, Register temp) {
1343     Label loop, end;
1344     __ mov(rscratch1, a);
1345     __ mov(rscratch2, zr);
1346     __ bind(loop);
1347     __ cmp(rscratch2, count);
1348     __ br(Assembler::HS, end);
1349     if (size == wordSize) {
1350       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1351       __ verify_oop(temp);
1352     } else {
1353       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1354       __ decode_heap_oop(temp); // calls verify_oop
1355     }
1356     __ add(rscratch2, rscratch2, 1);
1357     __ b(loop);
1358     __ bind(end);
1359   }
1360 
1361   // Arguments:
1362   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1363   //             ignored
1364   //   is_oop  - true => oop array, so generate store check code
1365   //   name    - stub name string
1366   //
1367   // Inputs:
1368   //   c_rarg0   - source array address
1369   //   c_rarg1   - destination array address
1370   //   c_rarg2   - element count, treated as ssize_t, can be zero
1371   //
1372   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1373   // the hardware handle it.  The two dwords within qwords that span
1374   // cache line boundaries will still be loaded and stored atomically.
1375   //
1376   // Side Effects:
1377   //   disjoint_int_copy_entry is set to the no-overlap entry point
1378   //   used by generate_conjoint_int_oop_copy().
1379   //
1380   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1381                                   const char *name, bool dest_uninitialized = false) {
1382     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1383     RegSet saved_reg = RegSet::of(s, d, count);
1384     __ align(CodeEntryAlignment);
1385     StubCodeMark mark(this, "StubRoutines", name);
1386     address start = __ pc();
1387     __ enter();
1388 
1389     if (entry != NULL) {
1390       *entry = __ pc();
1391       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1392       BLOCK_COMMENT("Entry:");
1393     }
1394 
1395     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1396     if (dest_uninitialized) {
1397       decorators |= IS_DEST_UNINITIALIZED;
1398     }
1399     if (aligned) {
1400       decorators |= ARRAYCOPY_ALIGNED;
1401     }
1402 
1403     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1404     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1405 
1406     if (is_oop) {
1407       // save regs before copy_memory
1408       __ push(RegSet::of(d, count), sp);
1409     }
1410     {
1411       // UnsafeCopyMemory page error: continue after ucm
1412       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1413       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1414       copy_memory(aligned, s, d, count, rscratch1, size);
1415     }
1416 
1417     if (is_oop) {
1418       __ pop(RegSet::of(d, count), sp);
1419       if (VerifyOops)
1420         verify_oop_array(size, d, count, r16);
1421     }
1422 
1423     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1424 
1425     __ leave();
1426     __ mov(r0, zr); // return 0
1427     __ ret(lr);
1428     return start;
1429   }
1430 
1431   // Arguments:
1432   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1433   //             ignored
1434   //   is_oop  - true => oop array, so generate store check code
1435   //   name    - stub name string
1436   //
1437   // Inputs:
1438   //   c_rarg0   - source array address
1439   //   c_rarg1   - destination array address
1440   //   c_rarg2   - element count, treated as ssize_t, can be zero
1441   //
1442   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1443   // the hardware handle it.  The two dwords within qwords that span
1444   // cache line boundaries will still be loaded and stored atomically.
1445   //
1446   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1447                                  address *entry, const char *name,
1448                                  bool dest_uninitialized = false) {
1449     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1450     RegSet saved_regs = RegSet::of(s, d, count);
1451     StubCodeMark mark(this, "StubRoutines", name);
1452     address start = __ pc();
1453     __ enter();
1454 
1455     if (entry != NULL) {
1456       *entry = __ pc();
1457       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1458       BLOCK_COMMENT("Entry:");
1459     }
1460 
1461     // use fwd copy when (d-s) above_equal (count*size)
1462     __ sub(rscratch1, d, s);
1463     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1464     __ br(Assembler::HS, nooverlap_target);
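         // (the compare is unsigned, so when d precedes s the difference wraps
         //  to a large value and we also take the forward-copy path, which is
         //  safe in that case)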
1465 
1466     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1467     if (dest_uninitialized) {
1468       decorators |= IS_DEST_UNINITIALIZED;
1469     }
1470     if (aligned) {
1471       decorators |= ARRAYCOPY_ALIGNED;
1472     }
1473 
1474     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1475     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1476 
1477     if (is_oop) {
1478       // save regs before copy_memory
1479       __ push(RegSet::of(d, count), sp);
1480     }
1481     {
1482       // UnsafeCopyMemory page error: continue after ucm
1483       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1484       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1485       copy_memory(aligned, s, d, count, rscratch1, -size);
1486     }
1487     if (is_oop) {
1488       __ pop(RegSet::of(d, count), sp);
1489       if (VerifyOops)
1490         verify_oop_array(size, d, count, r16);
1491     }
1492     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1493     __ leave();
1494     __ mov(r0, zr); // return 0
1495     __ ret(lr);
1496     return start;
1497   }
1498 
1499   // Arguments:
1500   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1501   //             ignored
1502   //   name    - stub name string
1503   //
1504   // Inputs:
1505   //   c_rarg0   - source array address
1506   //   c_rarg1   - destination array address
1507   //   c_rarg2   - element count, treated as ssize_t, can be zero
1508   //
1509   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1510   // we let the hardware handle it.  The one to eight bytes within words,
1511   // dwords or qwords that span cache line boundaries will still be loaded
1512   // and stored atomically.
1513   //
1520   //
1521   // Side Effects:
1522   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1523   //   used by generate_conjoint_byte_copy().
1524   //
1525   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1526     const bool not_oop = false;
1527     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1528   }
1529 
1530   // Arguments:
1531   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1532   //             ignored
1533   //   name    - stub name string
1534   //
1535   // Inputs:
1536   //   c_rarg0   - source array address
1537   //   c_rarg1   - destination array address
1538   //   c_rarg2   - element count, treated as ssize_t, can be zero
1539   //
1540   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1541   // we let the hardware handle it.  The one to eight bytes within words,
1542   // dwords or qwords that span cache line boundaries will still be loaded
1543   // and stored atomically.
1544   //
1545   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1546                                       address* entry, const char *name) {
1547     const bool not_oop = false;
1548     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1549   }
1550 
1551   // Arguments:
1552   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1553   //             ignored
1554   //   name    - stub name string
1555   //
1556   // Inputs:
1557   //   c_rarg0   - source array address
1558   //   c_rarg1   - destination array address
1559   //   c_rarg2   - element count, treated as ssize_t, can be zero
1560   //
1561   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1562   // let the hardware handle it.  The two or four words within dwords
1563   // or qwords that span cache line boundaries will still be loaded
1564   // and stored atomically.
1565   //
1566   // Side Effects:
1567   //   disjoint_short_copy_entry is set to the no-overlap entry point
1568   //   used by generate_conjoint_short_copy().
1569   //
1570   address generate_disjoint_short_copy(bool aligned,
1571                                        address* entry, const char *name) {
1572     const bool not_oop = false;
1573     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1574   }
1575 
1576   // Arguments:
1577   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1578   //             ignored
1579   //   name    - stub name string
1580   //
1581   // Inputs:
1582   //   c_rarg0   - source array address
1583   //   c_rarg1   - destination array address
1584   //   c_rarg2   - element count, treated as ssize_t, can be zero
1585   //
1586   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1587   // let the hardware handle it.  The two or four words within dwords
1588   // or qwords that span cache line boundaries will still be loaded
1589   // and stored atomically.
1590   //
1591   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1592                                        address *entry, const char *name) {
1593     const bool not_oop = false;
1594     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1595   }
1596 
1597   // Arguments:
1598   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1599   //             ignored
1600   //   name    - stub name string
1601   //
1602   // Inputs:
1603   //   c_rarg0   - source array address
1604   //   c_rarg1   - destination array address
1605   //   c_rarg2   - element count, treated as ssize_t, can be zero
1606   //
1607   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1608   // the hardware handle it.  The two dwords within qwords that span
1609   // cache line boundaries will still be loaded and stored atomically.
1610   //
1611   // Side Effects:
1612   //   disjoint_int_copy_entry is set to the no-overlap entry point
1613   //   used by generate_conjoint_int_oop_copy().
1614   //
1615   address generate_disjoint_int_copy(bool aligned, address *entry,
1616                                          const char *name, bool dest_uninitialized = false) {
1617     const bool not_oop = false;
1618     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1619   }
1620 
1621   // Arguments:
1622   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1623   //             ignored
1624   //   name    - stub name string
1625   //
1626   // Inputs:
1627   //   c_rarg0   - source array address
1628   //   c_rarg1   - destination array address
1629   //   c_rarg2   - element count, treated as ssize_t, can be zero
1630   //
1631   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1632   // the hardware handle it.  The two dwords within qwords that span
1633   // cache line boundaries will still be loaded and stored atomically.
1634   //
1635   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1636                                      address *entry, const char *name,
1637                                      bool dest_uninitialized = false) {
1638     const bool not_oop = false;
1639     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1640   }
1641 
1642 
1643   // Arguments:
1644   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1645   //             ignored
1646   //   name    - stub name string
1647   //
1648   // Inputs:
1649   //   c_rarg0   - source array address
1650   //   c_rarg1   - destination array address
1651   //   c_rarg2   - element count, treated as size_t, can be zero
1652   //
1653   // Side Effects:
1654   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1655   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1656   //
1657   address generate_disjoint_long_copy(bool aligned, address *entry,
1658                                           const char *name, bool dest_uninitialized = false) {
1659     const bool not_oop = false;
1660     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1661   }
1662 
1663   // Arguments:
1664   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1665   //             ignored
1666   //   name    - stub name string
1667   //
1668   // Inputs:
1669   //   c_rarg0   - source array address
1670   //   c_rarg1   - destination array address
1671   //   c_rarg2   - element count, treated as size_t, can be zero
1672   //
1673   address generate_conjoint_long_copy(bool aligned,
1674                                       address nooverlap_target, address *entry,
1675                                       const char *name, bool dest_uninitialized = false) {
1676     const bool not_oop = false;
1677     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1678   }
1679 
1680   // Arguments:
1681   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1682   //             ignored
1683   //   name    - stub name string
1684   //
1685   // Inputs:
1686   //   c_rarg0   - source array address
1687   //   c_rarg1   - destination array address
1688   //   c_rarg2   - element count, treated as size_t, can be zero
1689   //
1690   // Side Effects:
1691   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1692   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1693   //
1694   address generate_disjoint_oop_copy(bool aligned, address *entry,
1695                                      const char *name, bool dest_uninitialized) {
1696     const bool is_oop = true;
1697     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
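          // (narrow oops occupy 4 bytes in the heap, uncompressed oops 8 bytes)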
1698     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1699   }
1700 
1701   // Arguments:
1702   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1703   //             ignored
1704   //   name    - stub name string
1705   //
1706   // Inputs:
1707   //   c_rarg0   - source array address
1708   //   c_rarg1   - destination array address
1709   //   c_rarg2   - element count, treated as size_t, can be zero
1710   //
1711   address generate_conjoint_oop_copy(bool aligned,
1712                                      address nooverlap_target, address *entry,
1713                                      const char *name, bool dest_uninitialized) {
1714     const bool is_oop = true;
1715     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1716     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1717                                   name, dest_uninitialized);
1718   }
1719 
1720 
1721   // Helper for generating a dynamic type check.
1722   // Smashes rscratch1, rscratch2.
1723   void generate_type_check(Register sub_klass,
1724                            Register super_check_offset,
1725                            Register super_klass,
1726                            Label& L_success) {
1727     assert_different_registers(sub_klass, super_check_offset, super_klass);
1728 
1729     BLOCK_COMMENT("type_check:");
1730 
1731     Label L_miss;
1732 
1733     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1734                                      super_check_offset);
1735     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1736 
1737     // Fall through on failure!
1738     __ BIND(L_miss);
1739   }
1740 
1741   //
1742   //  Generate checkcasting array copy stub
1743   //
1744   //  Input:
1745   //    c_rarg0   - source array address
1746   //    c_rarg1   - destination array address
1747   //    c_rarg2   - element count, treated as ssize_t, can be zero
1748   //    c_rarg3   - size_t ckoff (super_check_offset)
1749   //    c_rarg4   - oop ckval (super_klass)
1750   //
1751   //  Output:
1752   //    r0 ==  0  -  success
1753   //    r0 == -1^K - failure, where K is partial transfer count
1754   //
1755   address generate_checkcast_copy(const char *name, address *entry,
1756                                   bool dest_uninitialized = false) {
1757 
1758     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1759 
1760     // Input registers (after setup_arg_regs)
1761     const Register from        = c_rarg0;   // source array address
1762     const Register to          = c_rarg1;   // destination array address
1763     const Register count       = c_rarg2;   // elements count
1764     const Register ckoff       = c_rarg3;   // super_check_offset
1765     const Register ckval       = c_rarg4;   // super_klass
1766 
1767     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1768     RegSet wb_post_saved_regs = RegSet::of(count);
1769 
1770     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1771     const Register copied_oop  = r22;       // actual oop copied
1772     const Register count_save  = r21;       // orig elements count
1773     const Register start_to    = r20;       // destination array start address
1774     const Register r19_klass   = r19;       // oop._klass
1775 
1776     //---------------------------------------------------------------
1777     // Assembler stub will be used for this call to arraycopy
1778     // if the two arrays are subtypes of Object[] but the
1779     // destination array type is not equal to or a supertype
1780     // of the source type.  Each element must be separately
1781     // checked.
1782 
1783     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1784                                copied_oop, r19_klass, count_save);
1785 
1786     __ align(CodeEntryAlignment);
1787     StubCodeMark mark(this, "StubRoutines", name);
1788     address start = __ pc();
1789 
1790     __ enter(); // required for proper stackwalking of RuntimeStub frame
1791 
1792 #ifdef ASSERT
1793     // caller guarantees that the arrays really are different
1794     // otherwise, we would have to make conjoint checks
1795     { Label L;
1796       array_overlap_test(L, TIMES_OOP);
1797       __ stop("checkcast_copy within a single array");
1798       __ bind(L);
1799     }
1800 #endif //ASSERT
1801 
1802     // Caller of this entry point must set up the argument registers.
1803     if (entry != NULL) {
1804       *entry = __ pc();
1805       BLOCK_COMMENT("Entry:");
1806     }
1807 
1808     // Empty array: nothing to do.
1809     __ cbz(count, L_done);
1810     __ push(RegSet::of(r19, r20, r21, r22), sp);
1811 
1812 #ifdef ASSERT
1813     BLOCK_COMMENT("assert consistent ckoff/ckval");
1814     // The ckoff and ckval must be mutually consistent,
1815     // even though caller generates both.
1816     { Label L;
1817       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1818       __ ldrw(start_to, Address(ckval, sco_offset));
1819       __ cmpw(ckoff, start_to);
1820       __ br(Assembler::EQ, L);
1821       __ stop("super_check_offset inconsistent");
1822       __ bind(L);
1823     }
1824 #endif //ASSERT
1825 
1826     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1827     bool is_oop = true;
1828     if (dest_uninitialized) {
1829       decorators |= IS_DEST_UNINITIALIZED;
1830     }
1831 
1832     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1833     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1834 
1835     // save the original count
1836     __ mov(count_save, count);
1837 
1838     // Copy from low to high addresses
1839     __ mov(start_to, to);              // Save destination array start address
1840     __ b(L_load_element);
1841 
1842     // ======== begin loop ========
1843     // (Loop is rotated; its entry is L_load_element.)
1844     // Loop control:
1845     //   for (; count != 0; count--) {
1846     //     copied_oop = load_heap_oop(from++);
1847     //     ... generate_type_check ...;
1848     //     store_heap_oop(to++, copied_oop);
1849     //   }
1850     __ align(OptoLoopAlignment);
1851 
1852     __ BIND(L_store_element);
1853     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1854     __ sub(count, count, 1);
1855     __ cbz(count, L_do_card_marks);
1856 
1857     // ======== loop entry is here ========
1858     __ BIND(L_load_element);
1859     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1860     __ cbz(copied_oop, L_store_element);
1861 
1862     __ load_klass(r19_klass, copied_oop);// query the object klass
1863     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1864     // ======== end loop ========
1865 
1866     // It was a real error; we must depend on the caller to finish the job.
1867     // Register count = remaining oops, count_save = total oops.
1868     // Emit GC store barriers for the oops we have copied and report
1869     // their number to the caller.
1870 
1871     __ subs(count, count_save, count);     // K = partially copied oop count
1872     __ eon(count, count, zr);                   // report (-1^K) to caller
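          // (eon with zr is a bitwise NOT, so count now holds ~K, which equals -1 ^ K)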
1873     __ br(Assembler::EQ, L_done_pop);
1874 
1875     __ BIND(L_do_card_marks);
1876     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1877 
1878     __ bind(L_done_pop);
1879     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1880     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1881 
1882     __ bind(L_done);
1883     __ mov(r0, count);
1884     __ leave();
1885     __ ret(lr);
1886 
1887     return start;
1888   }
1889 
1890   // Perform range checks on the proposed arraycopy.
1891   // Kills temp, but nothing else.
1892   // Also, clean the sign bits of src_pos and dst_pos.
1893   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1894                               Register src_pos, // source position (c_rarg1)
1895                               Register dst,     // destination array oop (c_rarg2)
1896                               Register dst_pos, // destination position (c_rarg3)
1897                               Register length,
1898                               Register temp,
1899                               Label& L_failed) {
1900     BLOCK_COMMENT("arraycopy_range_checks:");
1901 
1902     assert_different_registers(rscratch1, temp);
1903 
1904     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1905     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1906     __ addw(temp, length, src_pos);
1907     __ cmpw(temp, rscratch1);
1908     __ br(Assembler::HI, L_failed);
1909 
1910     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1911     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1912     __ addw(temp, length, dst_pos);
1913     __ cmpw(temp, rscratch1);
1914     __ br(Assembler::HI, L_failed);
1915 
1916     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1917     __ movw(src_pos, src_pos);
1918     __ movw(dst_pos, dst_pos);
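          // (the 32-bit register writes above zero-extend, clearing bits 63:32)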
1919 
1920     BLOCK_COMMENT("arraycopy_range_checks done");
1921   }
1922 
1923   // These stubs get called from some dumb test routine.
1924   // I'll write them properly when they're called from
1925   // something that's actually doing something.
1926   static void fake_arraycopy_stub(address src, address dst, int count) {
1927     assert(count == 0, "huh?");
1928   }
1929 
1930 
1931   //
1932   //  Generate 'unsafe' array copy stub
1933   //  Though just as safe as the other stubs, it takes an unscaled
1934   //  size_t argument instead of an element count.
1935   //
1936   //  Input:
1937   //    c_rarg0   - source array address
1938   //    c_rarg1   - destination array address
1939   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1940   //
1941   // Examines the alignment of the operands and dispatches
1942   // to a long, int, short, or byte copy loop.
1943   //
1944   address generate_unsafe_copy(const char *name,
1945                                address byte_copy_entry,
1946                                address short_copy_entry,
1947                                address int_copy_entry,
1948                                address long_copy_entry) {
1949     Label L_long_aligned, L_int_aligned, L_short_aligned;
1950     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1951 
1952     __ align(CodeEntryAlignment);
1953     StubCodeMark mark(this, "StubRoutines", name);
1954     address start = __ pc();
1955     __ enter(); // required for proper stackwalking of RuntimeStub frame
1956 
1957     // bump this on entry, not on exit:
1958     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1959 
1960     __ orr(rscratch1, s, d);
1961     __ orr(rscratch1, rscratch1, count);
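          // rscratch1 now has a bit set wherever any of s, d or count has one;
          // its low bits therefore give the coarsest alignment shared by all three.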
1962 
1963     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1964     __ cbz(rscratch1, L_long_aligned);
1965     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1966     __ cbz(rscratch1, L_int_aligned);
1967     __ tbz(rscratch1, 0, L_short_aligned);
1968     __ b(RuntimeAddress(byte_copy_entry));
1969 
1970     __ BIND(L_short_aligned);
1971     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1972     __ b(RuntimeAddress(short_copy_entry));
1973     __ BIND(L_int_aligned);
1974     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1975     __ b(RuntimeAddress(int_copy_entry));
1976     __ BIND(L_long_aligned);
1977     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1978     __ b(RuntimeAddress(long_copy_entry));
1979 
1980     return start;
1981   }
1982 
1983   //
1984   //  Generate generic array copy stubs
1985   //
1986   //  Input:
1987   //    c_rarg0    -  src oop
1988   //    c_rarg1    -  src_pos (32-bits)
1989   //    c_rarg2    -  dst oop
1990   //    c_rarg3    -  dst_pos (32-bits)
1991   //    c_rarg4    -  element count (32-bits)
1992   //
1993   //  Output:
1994   //    r0 ==  0  -  success
1995   //    r0 == -1^K - failure, where K is partial transfer count
1996   //
1997   address generate_generic_copy(const char *name,
1998                                 address byte_copy_entry, address short_copy_entry,
1999                                 address int_copy_entry, address oop_copy_entry,
2000                                 address long_copy_entry, address checkcast_copy_entry) {
2001 
2002     Label L_failed, L_objArray;
2003     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2004 
2005     // Input registers
2006     const Register src        = c_rarg0;  // source array oop
2007     const Register src_pos    = c_rarg1;  // source position
2008     const Register dst        = c_rarg2;  // destination array oop
2009     const Register dst_pos    = c_rarg3;  // destination position
2010     const Register length     = c_rarg4;
2011 
2012 
2013     // Registers used as temps
2014     const Register dst_klass  = c_rarg5;
2015 
2016     __ align(CodeEntryAlignment);
2017 
2018     StubCodeMark mark(this, "StubRoutines", name);
2019 
2020     address start = __ pc();
2021 
2022     __ enter(); // required for proper stackwalking of RuntimeStub frame
2023 
2024     // bump this on entry, not on exit:
2025     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2026 
2027     //-----------------------------------------------------------------------
2028     // Assembler stub will be used for this call to arraycopy
2029     // if the following conditions are met:
2030     //
2031     // (1) src and dst must not be null.
2032     // (2) src_pos must not be negative.
2033     // (3) dst_pos must not be negative.
2034     // (4) length  must not be negative.
2035     // (5) src klass and dst klass should be the same and not NULL.
2036     // (6) src and dst should be arrays.
2037     // (7) src_pos + length must not exceed length of src.
2038     // (8) dst_pos + length must not exceed length of dst.
2039     //
2040 
2041     //  if (src == NULL) return -1;
2042     __ cbz(src, L_failed);
2043 
2044     //  if (src_pos < 0) return -1;
2045     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2046 
2047     //  if (dst == NULL) return -1;
2048     __ cbz(dst, L_failed);
2049 
2050     //  if (dst_pos < 0) return -1;
2051     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2052 
2053     // registers used as temp
2054     const Register scratch_length    = r16; // elements count to copy
2055     const Register scratch_src_klass = r17; // array klass
2056     const Register lh                = r15; // layout helper
2057 
2058     //  if (length < 0) return -1;
2059     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2060     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2061 
2062     __ load_klass(scratch_src_klass, src);
2063 #ifdef ASSERT
2064     //  assert(src->klass() != NULL);
2065     {
2066       BLOCK_COMMENT("assert klasses not null {");
2067       Label L1, L2;
2068       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2069       __ bind(L1);
2070       __ stop("broken null klass");
2071       __ bind(L2);
2072       __ load_klass(rscratch1, dst);
2073       __ cbz(rscratch1, L1);     // this would be broken also
2074       BLOCK_COMMENT("} assert klasses not null done");
2075     }
2076 #endif
2077 
2078     // Load layout helper (32-bits)
2079     //
2080     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2081     // 32        30    24            16              8     2                 0
2082     //
2083     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2084     //
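          //   For example, an int[] has array_tag 0x3 (typeArray), element_type T_INT
          //   and log2_element_size 2, while an Object[] has array_tag 0x2 (objArray).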
2085 
2086     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2087 
2088     // Handle objArrays completely differently...
2089     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2090     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2091     __ movw(rscratch1, objArray_lh);
2092     __ eorw(rscratch2, lh, rscratch1);
2093     __ cbzw(rscratch2, L_objArray);
2094 
2095     //  if (src->klass() != dst->klass()) return -1;
2096     __ load_klass(rscratch2, dst);
2097     __ eor(rscratch2, rscratch2, scratch_src_klass);
2098     __ cbnz(rscratch2, L_failed);
2099 
2100     //  if (!src->is_Array()) return -1;
2101     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
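          // (array layout helpers are negative because the array tag occupies the
          //  top bits, so a non-negative lh means "not an array")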
2102 
2103     // At this point, it is known to be a typeArray (array_tag 0x3).
2104 #ifdef ASSERT
2105     {
2106       BLOCK_COMMENT("assert primitive array {");
2107       Label L;
2108       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2109       __ cmpw(lh, rscratch2);
2110       __ br(Assembler::GE, L);
2111       __ stop("must be a primitive array");
2112       __ bind(L);
2113       BLOCK_COMMENT("} assert primitive array done");
2114     }
2115 #endif
2116 
2117     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2118                            rscratch2, L_failed);
2119 
2120     // TypeArrayKlass
2121     //
2122     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2123     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2124     //
2125 
2126     const Register rscratch1_offset = rscratch1;    // array offset
2127     const Register r15_elsize = lh; // element size
2128 
2129     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2130            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2131     __ add(src, src, rscratch1_offset);           // src array offset
2132     __ add(dst, dst, rscratch1_offset);           // dst array offset
2133     BLOCK_COMMENT("choose copy loop based on element size");
2134 
2135     // next registers should be set before the jump to corresponding stub
2136     const Register from     = c_rarg0;  // source array address
2137     const Register to       = c_rarg1;  // destination array address
2138     const Register count    = c_rarg2;  // elements count
2139 
2140     // 'from', 'to' and 'count' must be set in this order: they occupy the same
2141     // registers as 'src', 'src_pos' and 'dst', which are still needed as inputs.
2142 
2143     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2144 
2145     // The possible values of elsize are 0-3, i.e. exact_log2(element
2146     // size in bytes).  We do a simple bitwise binary search.
2147   __ BIND(L_copy_bytes);
2148     __ tbnz(r15_elsize, 1, L_copy_ints);
2149     __ tbnz(r15_elsize, 0, L_copy_shorts);
2150     __ lea(from, Address(src, src_pos));// src_addr
2151     __ lea(to,   Address(dst, dst_pos));// dst_addr
2152     __ movw(count, scratch_length); // length
2153     __ b(RuntimeAddress(byte_copy_entry));
2154 
2155   __ BIND(L_copy_shorts);
2156     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(short_copy_entry));
2160 
2161   __ BIND(L_copy_ints);
2162     __ tbnz(r15_elsize, 0, L_copy_longs);
2163     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2164     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2165     __ movw(count, scratch_length); // length
2166     __ b(RuntimeAddress(int_copy_entry));
2167 
2168   __ BIND(L_copy_longs);
2169 #ifdef ASSERT
2170     {
2171       BLOCK_COMMENT("assert long copy {");
2172       Label L;
2173       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2174       __ cmpw(r15_elsize, LogBytesPerLong);
2175       __ br(Assembler::EQ, L);
2176       __ stop("must be long copy, but elsize is wrong");
2177       __ bind(L);
2178       BLOCK_COMMENT("} assert long copy done");
2179     }
2180 #endif
2181     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2182     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2183     __ movw(count, scratch_length); // length
2184     __ b(RuntimeAddress(long_copy_entry));
2185 
2186     // ObjArrayKlass
2187   __ BIND(L_objArray);
2188     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2189 
2190     Label L_plain_copy, L_checkcast_copy;
2191     //  test array classes for subtyping
2192     __ load_klass(r15, dst);
2193     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2194     __ br(Assembler::NE, L_checkcast_copy);
2195 
2196     // Identically typed arrays can be copied without element-wise checks.
2197     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2198                            rscratch2, L_failed);
2199 
2200     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2201     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2202     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2203     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2204     __ movw(count, scratch_length); // length
2205   __ BIND(L_plain_copy);
2206     __ b(RuntimeAddress(oop_copy_entry));
2207 
2208   __ BIND(L_checkcast_copy);
2209     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2210     {
2211       // Before looking at dst.length, make sure dst is also an objArray.
2212       __ ldrw(rscratch1, Address(r15, lh_offset));
2213       __ movw(rscratch2, objArray_lh);
2214       __ eorw(rscratch1, rscratch1, rscratch2);
2215       __ cbnzw(rscratch1, L_failed);
2216 
2217       // It is safe to examine both src.length and dst.length.
2218       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2219                              r15, L_failed);
2220 
2221       __ load_klass(dst_klass, dst); // reload
2222 
2223       // Marshal the base address arguments now, freeing registers.
2224       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2225       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2226       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2227       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2228       __ movw(count, length);           // length (reloaded)
2229       Register sco_temp = c_rarg3;      // this register is free now
2230       assert_different_registers(from, to, count, sco_temp,
2231                                  dst_klass, scratch_src_klass);
2232       // assert_clean_int(count, sco_temp);
2233 
2234       // Generate the type check.
2235       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2236       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2237 
2238       // Smashes rscratch1, rscratch2
2239       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2240 
2241       // Fetch destination element klass from the ObjArrayKlass header.
2242       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2243       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2244       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2245 
2246       // the checkcast_copy loop needs two extra arguments:
2247       assert(c_rarg3 == sco_temp, "#3 already in place");
2248       // Set up arguments for checkcast_copy_entry.
2249       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2250       __ b(RuntimeAddress(checkcast_copy_entry));
2251     }
2252 
2253   __ BIND(L_failed);
2254     __ mov(r0, -1);
2255     __ leave();   // required for proper stackwalking of RuntimeStub frame
2256     __ ret(lr);
2257 
2258     return start;
2259   }
2260 
2261   //
2262   // Generate stub for array fill. If "aligned" is true, the
2263   // "to" address is assumed to be heapword aligned.
2264   //
2265   // Arguments for generated stub:
2266   //   to:    c_rarg0
2267   //   value: c_rarg1
2268   //   count: c_rarg2 treated as signed
2269   //
2270   address generate_fill(BasicType t, bool aligned, const char *name) {
2271     __ align(CodeEntryAlignment);
2272     StubCodeMark mark(this, "StubRoutines", name);
2273     address start = __ pc();
2274 
2275     BLOCK_COMMENT("Entry:");
2276 
2277     const Register to        = c_rarg0;  // destination array address
2278     const Register value     = c_rarg1;  // value
2279     const Register count     = c_rarg2;  // elements count
2280 
2281     const Register bz_base = r10;        // base for block_zero routine
2282     const Register cnt_words = r11;      // temp register
2283 
2284     __ enter();
2285 
2286     Label L_fill_elements, L_exit1;
2287 
2288     int shift = -1;
2289     switch (t) {
2290       case T_BYTE:
2291         shift = 0;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2294         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2295         __ br(Assembler::LO, L_fill_elements);
2296         break;
2297       case T_SHORT:
2298         shift = 1;
2299         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2300         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2301         __ br(Assembler::LO, L_fill_elements);
2302         break;
2303       case T_INT:
2304         shift = 2;
2305         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2306         __ br(Assembler::LO, L_fill_elements);
2307         break;
2308       default: ShouldNotReachHere();
2309     }
2310 
2311     // Align the destination address to an 8-byte boundary.
2312     Label L_skip_align1, L_skip_align2, L_skip_align4;
2313     if (!aligned) {
2314       switch (t) {
2315         case T_BYTE:
2316           // One byte misalignment happens only for byte arrays.
2317           __ tbz(to, 0, L_skip_align1);
2318           __ strb(value, Address(__ post(to, 1)));
2319           __ subw(count, count, 1);
2320           __ bind(L_skip_align1);
2321           // Fallthrough
2322         case T_SHORT:
2323           // Two bytes misalignment happens only for byte and short (char) arrays.
2324           __ tbz(to, 1, L_skip_align2);
2325           __ strh(value, Address(__ post(to, 2)));
2326           __ subw(count, count, 2 >> shift);
2327           __ bind(L_skip_align2);
2328           // Fallthrough
2329         case T_INT:
2330           // Align to 8 bytes, we know we are 4 byte aligned to start.
2331           __ tbz(to, 2, L_skip_align4);
2332           __ strw(value, Address(__ post(to, 4)));
2333           __ subw(count, count, 4 >> shift);
2334           __ bind(L_skip_align4);
2335           break;
2336         default: ShouldNotReachHere();
2337       }
2338     }
2339 
2340     //
2341     //  Fill large chunks
2342     //
2343     __ lsrw(cnt_words, count, 3 - shift); // number of words
2344     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
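          // Worked example: a byte fill value of 0xAB was widened above to 0xABAB and
          // then 0xABABABAB; the bfi above completes it to 0xABABABABABABABAB so every
          // 8-byte store writes eight copies of the byte.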
2345     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2346     if (UseBlockZeroing) {
2347       Label non_block_zeroing, rest;
2348       // If the fill value is zero we can use the fast zero_words().
2349       __ cbnz(value, non_block_zeroing);
2350       __ mov(bz_base, to);
2351       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2352       __ zero_words(bz_base, cnt_words);
2353       __ b(rest);
2354       __ bind(non_block_zeroing);
2355       __ fill_words(to, cnt_words, value);
2356       __ bind(rest);
2357     } else {
2358       __ fill_words(to, cnt_words, value);
2359     }
2360 
2361     // Remaining count is less than 8 bytes. Fill it by a single store.
2362     // Note that the total length is no less than 8 bytes.
2363     if (t == T_BYTE || t == T_SHORT) {
2364       Label L_exit1;
2365       __ cbzw(count, L_exit1);
2366       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2367       __ str(value, Address(to, -8));    // overwrite some elements
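            // e.g. with three trailing bytes this 8-byte store starts five bytes back
            // into already-filled memory and rewrites those bytes with the same value.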
2368       __ bind(L_exit1);
2369       __ leave();
2370       __ ret(lr);
2371     }
2372 
2373     // Handle fills of less than 8 bytes.
2374     Label L_fill_2, L_fill_4, L_exit2;
2375     __ bind(L_fill_elements);
2376     switch (t) {
2377       case T_BYTE:
2378         __ tbz(count, 0, L_fill_2);
2379         __ strb(value, Address(__ post(to, 1)));
2380         __ bind(L_fill_2);
2381         __ tbz(count, 1, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 2, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_SHORT:
2388         __ tbz(count, 0, L_fill_4);
2389         __ strh(value, Address(__ post(to, 2)));
2390         __ bind(L_fill_4);
2391         __ tbz(count, 1, L_exit2);
2392         __ strw(value, Address(to));
2393         break;
2394       case T_INT:
2395         __ cbzw(count, L_exit2);
2396         __ strw(value, Address(to));
2397         break;
2398       default: ShouldNotReachHere();
2399     }
2400     __ bind(L_exit2);
2401     __ leave();
2402     __ ret(lr);
2403     return start;
2404   }
2405 
2406   address generate_data_cache_writeback() {
2407     const Register line        = c_rarg0;  // address of line to write back
2408 
2409     __ align(CodeEntryAlignment);
2410 
2411     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2412 
2413     address start = __ pc();
2414     __ enter();
2415     __ cache_wb(Address(line, 0));
2416     __ leave();
2417     __ ret(lr);
2418 
2419     return start;
2420   }
2421 
2422   address generate_data_cache_writeback_sync() {
2423     const Register is_pre     = c_rarg0;  // pre or post sync
2424 
2425     __ align(CodeEntryAlignment);
2426 
2427     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2428 
2429     // pre wbsync is a no-op
2430     // post wbsync emits a memory barrier, ordering the preceding cache write-backs
2431 
2432     Label skip;
2433     address start = __ pc();
2434     __ enter();
2435     __ cbnz(is_pre, skip);
2436     __ cache_wbsync(false);
2437     __ bind(skip);
2438     __ leave();
2439     __ ret(lr);
2440 
2441     return start;
2442   }
2443 
2444   void generate_arraycopy_stubs() {
2445     address entry;
2446     address entry_jbyte_arraycopy;
2447     address entry_jshort_arraycopy;
2448     address entry_jint_arraycopy;
2449     address entry_oop_arraycopy;
2450     address entry_jlong_arraycopy;
2451     address entry_checkcast_arraycopy;
2452 
2453     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2454     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2455 
2456     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2457 
2458     //*** jbyte
2459     // Always need aligned and unaligned versions
2460     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2461                                                                                   "jbyte_disjoint_arraycopy");
2462     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2463                                                                                   &entry_jbyte_arraycopy,
2464                                                                                   "jbyte_arraycopy");
2465     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2466                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2467     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2468                                                                                   "arrayof_jbyte_arraycopy");
2469 
2470     //*** jshort
2471     // Always need aligned and unaligned versions
2472     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2473                                                                                     "jshort_disjoint_arraycopy");
2474     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2475                                                                                     &entry_jshort_arraycopy,
2476                                                                                     "jshort_arraycopy");
2477     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2478                                                                                     "arrayof_jshort_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2480                                                                                     "arrayof_jshort_arraycopy");
2481 
2482     //*** jint
2483     // Aligned versions
2484     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2485                                                                                 "arrayof_jint_disjoint_arraycopy");
2486     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2487                                                                                 "arrayof_jint_arraycopy");
2488     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2489     // entry_jint_arraycopy always points to the unaligned version
2490     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2491                                                                                 "jint_disjoint_arraycopy");
2492     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2493                                                                                 &entry_jint_arraycopy,
2494                                                                                 "jint_arraycopy");
2495 
2496     //*** jlong
2497     // It is always aligned
2498     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2499                                                                                   "arrayof_jlong_disjoint_arraycopy");
2500     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2501                                                                                   "arrayof_jlong_arraycopy");
2502     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2503     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2504 
2505     //*** oops
2506     {
2507       // With compressed oops we need unaligned versions; notice that
2508       // we overwrite entry_oop_arraycopy.
2509       bool aligned = !UseCompressedOops;
2510 
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2513                                      /*dest_uninitialized*/false);
2514       StubRoutines::_arrayof_oop_arraycopy
2515         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2516                                      /*dest_uninitialized*/false);
2517       // Aligned versions without pre-barriers
2518       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2519         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2520                                      /*dest_uninitialized*/true);
2521       StubRoutines::_arrayof_oop_arraycopy_uninit
2522         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2523                                      /*dest_uninitialized*/true);
2524     }
2525 
2526     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2527     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2528     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2529     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2530 
2531     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2532     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2533                                                                         /*dest_uninitialized*/true);
2534 
2535     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2536                                                               entry_jbyte_arraycopy,
2537                                                               entry_jshort_arraycopy,
2538                                                               entry_jint_arraycopy,
2539                                                               entry_jlong_arraycopy);
2540 
2541     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2542                                                                entry_jbyte_arraycopy,
2543                                                                entry_jshort_arraycopy,
2544                                                                entry_jint_arraycopy,
2545                                                                entry_oop_arraycopy,
2546                                                                entry_jlong_arraycopy,
2547                                                                entry_checkcast_arraycopy);
2548 
2549     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2550     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2551     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2552     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2553     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2554     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2555   }
2556 
2557   void generate_math_stubs() { Unimplemented(); }
2558 
2559   // Arguments:
2560   //
2561   // Inputs:
2562   //   c_rarg0   - source byte array address
2563   //   c_rarg1   - destination byte array address
2564   //   c_rarg2   - K (key) in little endian int array
2565   //
2566   address generate_aescrypt_encryptBlock() {
2567     __ align(CodeEntryAlignment);
2568     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2569 
2570     const Register from        = c_rarg0;  // source array address
2571     const Register to          = c_rarg1;  // destination array address
2572     const Register key         = c_rarg2;  // key array address
2573     const Register keylen      = rscratch1;
2574 
2575     address start = __ pc();
2576     __ enter();
2577 
2578     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2579 
2580     __ aesenc_loadkeys(key, keylen);
2581     __ aesecb_encrypt(from, to, keylen);
2582 
2583     __ mov(r0, 0);
2584 
2585     __ leave();
2586     __ ret(lr);
2587 
2588     return start;
2589   }
2590 
2591   // Arguments:
2592   //
2593   // Inputs:
2594   //   c_rarg0   - source byte array address
2595   //   c_rarg1   - destination byte array address
2596   //   c_rarg2   - K (key) in little endian int array
2597   //
2598   address generate_aescrypt_decryptBlock() {
2599     assert(UseAES, "need AES cryptographic extension support");
2600     __ align(CodeEntryAlignment);
2601     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2602     Label L_doLast;
2603 
2604     const Register from        = c_rarg0;  // source array address
2605     const Register to          = c_rarg1;  // destination array address
2606     const Register key         = c_rarg2;  // key array address
2607     const Register keylen      = rscratch1;
2608 
2609     address start = __ pc();
2610     __ enter(); // required for proper stackwalking of RuntimeStub frame
2611 
2612     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2613 
2614     __ aesecb_decrypt(from, to, key, keylen);
2615 
2616     __ mov(r0, 0);
2617 
2618     __ leave();
2619     __ ret(lr);
2620 
2621     return start;
2622   }
2623 
2624   // Arguments:
2625   //
2626   // Inputs:
2627   //   c_rarg0   - source byte array address
2628   //   c_rarg1   - destination byte array address
2629   //   c_rarg2   - K (key) in little endian int array
2630   //   c_rarg3   - r vector byte array address
2631   //   c_rarg4   - input length
2632   //
2633   // Output:
2634   //   r0        - input length
2635   //
2636   address generate_cipherBlockChaining_encryptAESCrypt() {
2637     assert(UseAES, "need AES cryptographic extension support");
2638     __ align(CodeEntryAlignment);
2639     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2640 
2641     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2642 
2643     const Register from        = c_rarg0;  // source array address
2644     const Register to          = c_rarg1;  // destination array address
2645     const Register key         = c_rarg2;  // key array address
2646     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2647                                            // address and left holding the last encrypted block
2648     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2649     const Register keylen      = rscratch1;
2650 
2651     address start = __ pc();
2652 
2653       __ enter();
2654 
2655       __ movw(rscratch2, len_reg);
2656 
2657       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2658 
2659       __ ld1(v0, __ T16B, rvec);
2660 
2661       __ cmpw(keylen, 52);
2662       __ br(Assembler::CC, L_loadkeys_44);
2663       __ br(Assembler::EQ, L_loadkeys_52);
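            // keylen counts 32-bit round-key words: 44 for AES-128, 52 for AES-192
            // and 60 for AES-256, so the branches above skip loading the round keys
            // that the shorter key schedules do not have.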
2664 
2665       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2666       __ rev32(v17, __ T16B, v17);
2667       __ rev32(v18, __ T16B, v18);
2668     __ BIND(L_loadkeys_52);
2669       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2670       __ rev32(v19, __ T16B, v19);
2671       __ rev32(v20, __ T16B, v20);
2672     __ BIND(L_loadkeys_44);
2673       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2674       __ rev32(v21, __ T16B, v21);
2675       __ rev32(v22, __ T16B, v22);
2676       __ rev32(v23, __ T16B, v23);
2677       __ rev32(v24, __ T16B, v24);
2678       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2679       __ rev32(v25, __ T16B, v25);
2680       __ rev32(v26, __ T16B, v26);
2681       __ rev32(v27, __ T16B, v27);
2682       __ rev32(v28, __ T16B, v28);
2683       __ ld1(v29, v30, v31, __ T16B, key);
2684       __ rev32(v29, __ T16B, v29);
2685       __ rev32(v30, __ T16B, v30);
2686       __ rev32(v31, __ T16B, v31);
2687 
2688     __ BIND(L_aes_loop);
2689       __ ld1(v1, __ T16B, __ post(from, 16));
2690       __ eor(v0, __ T16B, v0, v1);
2691 
2692       __ br(Assembler::CC, L_rounds_44);
2693       __ br(Assembler::EQ, L_rounds_52);
2694 
2695       __ aese(v0, v17); __ aesmc(v0, v0);
2696       __ aese(v0, v18); __ aesmc(v0, v0);
2697     __ BIND(L_rounds_52);
2698       __ aese(v0, v19); __ aesmc(v0, v0);
2699       __ aese(v0, v20); __ aesmc(v0, v0);
2700     __ BIND(L_rounds_44);
2701       __ aese(v0, v21); __ aesmc(v0, v0);
2702       __ aese(v0, v22); __ aesmc(v0, v0);
2703       __ aese(v0, v23); __ aesmc(v0, v0);
2704       __ aese(v0, v24); __ aesmc(v0, v0);
2705       __ aese(v0, v25); __ aesmc(v0, v0);
2706       __ aese(v0, v26); __ aesmc(v0, v0);
2707       __ aese(v0, v27); __ aesmc(v0, v0);
2708       __ aese(v0, v28); __ aesmc(v0, v0);
2709       __ aese(v0, v29); __ aesmc(v0, v0);
2710       __ aese(v0, v30);
2711       __ eor(v0, __ T16B, v0, v31);
2712 
2713       __ st1(v0, __ T16B, __ post(to, 16));
2714 
2715       __ subw(len_reg, len_reg, 16);
2716       __ cbnzw(len_reg, L_aes_loop);
2717 
2718       __ st1(v0, __ T16B, rvec);
2719 
2720       __ mov(r0, rscratch2);
2721 
2722       __ leave();
2723       __ ret(lr);
2724 
2725       return start;
2726   }
2727 
2728   // Arguments:
2729   //
2730   // Inputs:
2731   //   c_rarg0   - source byte array address
2732   //   c_rarg1   - destination byte array address
2733   //   c_rarg2   - K (key) in little endian int array
2734   //   c_rarg3   - r vector byte array address
2735   //   c_rarg4   - input length
2736   //
2737   // Output:
2738   //   r0        - input length
2739   //
2740   address generate_cipherBlockChaining_decryptAESCrypt() {
2741     assert(UseAES, "need AES cryptographic extension support");
2742     __ align(CodeEntryAlignment);
2743     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2744 
2745     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2746 
2747     const Register from        = c_rarg0;  // source array address
2748     const Register to          = c_rarg1;  // destination array address
2749     const Register key         = c_rarg2;  // key array address
2750     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2751                                            // address and left holding the last ciphertext block (the next IV)
2752     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2753     const Register keylen      = rscratch1;
2754 
2755     address start = __ pc();
2756 
2757       __ enter();
2758 
2759       __ movw(rscratch2, len_reg);
2760 
2761       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2762 
2763       __ ld1(v2, __ T16B, rvec);
2764 
2765       __ ld1(v31, __ T16B, __ post(key, 16));
2766       __ rev32(v31, __ T16B, v31);
2767 
2768       __ cmpw(keylen, 52);
2769       __ br(Assembler::CC, L_loadkeys_44);
2770       __ br(Assembler::EQ, L_loadkeys_52);
2771 
2772       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2773       __ rev32(v17, __ T16B, v17);
2774       __ rev32(v18, __ T16B, v18);
2775     __ BIND(L_loadkeys_52);
2776       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2777       __ rev32(v19, __ T16B, v19);
2778       __ rev32(v20, __ T16B, v20);
2779     __ BIND(L_loadkeys_44);
2780       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2781       __ rev32(v21, __ T16B, v21);
2782       __ rev32(v22, __ T16B, v22);
2783       __ rev32(v23, __ T16B, v23);
2784       __ rev32(v24, __ T16B, v24);
2785       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2786       __ rev32(v25, __ T16B, v25);
2787       __ rev32(v26, __ T16B, v26);
2788       __ rev32(v27, __ T16B, v27);
2789       __ rev32(v28, __ T16B, v28);
2790       __ ld1(v29, v30, __ T16B, key);
2791       __ rev32(v29, __ T16B, v29);
2792       __ rev32(v30, __ T16B, v30);
2793 
2794     __ BIND(L_aes_loop);
2795       __ ld1(v0, __ T16B, __ post(from, 16));
2796       __ orr(v1, __ T16B, v0, v0);
2797 
2798       __ br(Assembler::CC, L_rounds_44);
2799       __ br(Assembler::EQ, L_rounds_52);
2800 
2801       __ aesd(v0, v17); __ aesimc(v0, v0);
2802       __ aesd(v0, v18); __ aesimc(v0, v0);
2803     __ BIND(L_rounds_52);
2804       __ aesd(v0, v19); __ aesimc(v0, v0);
2805       __ aesd(v0, v20); __ aesimc(v0, v0);
2806     __ BIND(L_rounds_44);
2807       __ aesd(v0, v21); __ aesimc(v0, v0);
2808       __ aesd(v0, v22); __ aesimc(v0, v0);
2809       __ aesd(v0, v23); __ aesimc(v0, v0);
2810       __ aesd(v0, v24); __ aesimc(v0, v0);
2811       __ aesd(v0, v25); __ aesimc(v0, v0);
2812       __ aesd(v0, v26); __ aesimc(v0, v0);
2813       __ aesd(v0, v27); __ aesimc(v0, v0);
2814       __ aesd(v0, v28); __ aesimc(v0, v0);
2815       __ aesd(v0, v29); __ aesimc(v0, v0);
2816       __ aesd(v0, v30);
2817       __ eor(v0, __ T16B, v0, v31);
2818       __ eor(v0, __ T16B, v0, v2);
2819 
2820       __ st1(v0, __ T16B, __ post(to, 16));
2821       __ orr(v2, __ T16B, v1, v1);
2822 
2823       __ subw(len_reg, len_reg, 16);
2824       __ cbnzw(len_reg, L_aes_loop);
2825 
2826       __ st1(v2, __ T16B, rvec);
2827 
2828       __ mov(r0, rscratch2);
2829 
2830       __ leave();
2831       __ ret(lr);
2832 
2833     return start;
2834   }
2835 
2836   // CTR AES crypt.
2837   // Arguments:
2838   //
2839   // Inputs:
2840   //   c_rarg0   - source byte array address
2841   //   c_rarg1   - destination byte array address
2842   //   c_rarg2   - K (key) in little endian int array
2843   //   c_rarg3   - counter vector byte array address
2844   //   c_rarg4   - input length
2845   //   c_rarg5   - saved encryptedCounter start
2846   //   c_rarg6   - saved used length
2847   //
2848   // Output:
2849   //   r0       - input length
2850   //
2851   address generate_counterMode_AESCrypt() {
2852     const Register in = c_rarg0;
2853     const Register out = c_rarg1;
2854     const Register key = c_rarg2;
2855     const Register counter = c_rarg3;
2856     const Register saved_len = c_rarg4, len = r10;
2857     const Register saved_encrypted_ctr = c_rarg5;
2858     const Register used_ptr = c_rarg6, used = r12;
2859 
2860     const Register offset = r7;
2861     const Register keylen = r11;
2862 
2863     const unsigned char block_size = 16;
2864     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8.  8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't taken until there are at least 8 blocks of
    // input, so up to 127 bytes end up on the byte-at-a-time slow
    // path.  For that reason, and also so as not to blow away too much
    // icache, 4 blocks seems like a sensible compromise.
2871 
2872     // Algorithm:
2873     //
2874     //    if (len == 0) {
2875     //        goto DONE;
2876     //    }
2877     //    int result = len;
2878     //    do {
2879     //        if (used >= blockSize) {
2880     //            if (len >= bulk_width * blockSize) {
2881     //                CTR_large_block();
2882     //                if (len == 0)
2883     //                    goto DONE;
2884     //            }
2885     //            for (;;) {
2886     //                16ByteVector v0 = counter;
2887     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2888     //                used = 0;
2889     //                if (len < blockSize)
2890     //                    break;    /* goto NEXT */
2891     //                16ByteVector v1 = load16Bytes(in, offset);
2892     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
2894     //                used = blockSize;
2895     //                offset += blockSize;
2896     //                len -= blockSize;
2897     //                if (len == 0)
2898     //                    goto DONE;
2899     //            }
2900     //        }
2901     //      NEXT:
2902     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2903     //        len--;
2904     //    } while (len != 0);
2905     //  DONE:
2906     //    return result;
2907     //
2908     // CTR_large_block()
2909     //    Wide bulk encryption of whole blocks.
2910 
2911     __ align(CodeEntryAlignment);
2912     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2913     const address start = __ pc();
2914     __ enter();
2915 
2916     Label DONE, CTR_large_block, large_block_return;
2917     __ ldrw(used, Address(used_ptr));
2918     __ cbzw(saved_len, DONE);
2919 
2920     __ mov(len, saved_len);
2921     __ mov(offset, 0);
2922 
2923     // Compute #rounds for AES based on the length of the key array
2924     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2925 
2926     __ aesenc_loadkeys(key, keylen);
2927 
2928     {
2929       Label L_CTR_loop, NEXT;
2930 
2931       __ bind(L_CTR_loop);
2932 
2933       __ cmp(used, block_size);
2934       __ br(__ LO, NEXT);
2935 
2936       // Maybe we have a lot of data
2937       __ subsw(rscratch1, len, bulk_width * block_size);
2938       __ br(__ HS, CTR_large_block);
2939       __ BIND(large_block_return);
2940       __ cbzw(len, DONE);
2941 
2942       // Setup the counter
2943       __ movi(v4, __ T4S, 0);
2944       __ movi(v5, __ T4S, 1);
2945       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2946 
2947       __ ld1(v0, __ T16B, counter); // Load the counter into v0
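      // The rev32/addv/rev32 sequence below adds 1 to the last 32-bit word
      // of the 16-byte counter block, treating that word as big-endian;
      // carries do not propagate into the upper words.  Roughly:
      //   counter[12..15] = to_be32(from_be32(counter[12..15]) + 1);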
2948       __ rev32(v16, __ T16B, v0);
2949       __ addv(v16, __ T4S, v16, v4);
2950       __ rev32(v16, __ T16B, v16);
2951       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2952 
2953       {
2954         // We have fewer than bulk_width blocks of data left. Encrypt
2955         // them one by one until there is less than a full block
2956         // remaining, being careful to save both the encrypted counter
2957         // and the counter.
2958 
2959         Label inner_loop;
2960         __ bind(inner_loop);
2961         // Counter to encrypt is in v0
2962         __ aesecb_encrypt(noreg, noreg, keylen);
2963         __ st1(v0, __ T16B, saved_encrypted_ctr);
2964 
2965         // Do we have a remaining full block?
2966 
2967         __ mov(used, 0);
2968         __ cmp(len, block_size);
2969         __ br(__ LO, NEXT);
2970 
2971         // Yes, we have a full block
2972         __ ldrq(v1, Address(in, offset));
2973         __ eor(v1, __ T16B, v1, v0);
2974         __ strq(v1, Address(out, offset));
2975         __ mov(used, block_size);
2976         __ add(offset, offset, block_size);
2977 
2978         __ subw(len, len, block_size);
2979         __ cbzw(len, DONE);
2980 
2981         // Increment the counter, store it back
2982         __ orr(v0, __ T16B, v16, v16);
2983         __ rev32(v16, __ T16B, v16);
2984         __ addv(v16, __ T4S, v16, v4);
2985         __ rev32(v16, __ T16B, v16);
2986         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2987 
2988         __ b(inner_loop);
2989       }
2990 
2991       __ BIND(NEXT);
2992 
2993       // Encrypt a single byte, and loop.
2994       // We expect this to be a rare event.
2995       __ ldrb(rscratch1, Address(in, offset));
2996       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2997       __ eor(rscratch1, rscratch1, rscratch2);
2998       __ strb(rscratch1, Address(out, offset));
2999       __ add(offset, offset, 1);
3000       __ add(used, used, 1);
      __ subw(len, len, 1);
3002       __ cbnzw(len, L_CTR_loop);
3003     }
3004 
3005     __ bind(DONE);
3006     __ strw(used, Address(used_ptr));
3007     __ mov(r0, saved_len);
3008 
3009     __ leave(); // required for proper stackwalking of RuntimeStub frame
3010     __ ret(lr);
3011 
3012     // Bulk encryption
3013 
    __ BIND(CTR_large_block);
3015     assert(bulk_width == 4 || bulk_width == 8, "must be");
3016 
3017     if (bulk_width == 8) {
3018       __ sub(sp, sp, 4 * 16);
3019       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3020     }
3021     __ sub(sp, sp, 4 * 16);
3022     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3023     RegSet saved_regs = (RegSet::of(in, out, offset)
3024                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3025     __ push(saved_regs, sp);
3026     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3027     __ add(in, in, offset);
3028     __ add(out, out, offset);
3029 
3030     // Keys should already be loaded into the correct registers
3031 
3032     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3033     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3034 
3035     // AES/CTR loop
3036     {
3037       Label L_CTR_loop;
3038       __ BIND(L_CTR_loop);
3039 
3040       // Setup the counters
3041       __ movi(v8, __ T4S, 0);
3042       __ movi(v9, __ T4S, 1);
3043       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3044 
3045       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3046         __ rev32(f, __ T16B, v16);
3047         __ addv(v16, __ T4S, v16, v8);
3048       }
3049 
3050       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3051 
3052       // Encrypt the counters
3053       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3054 
3055       if (bulk_width == 8) {
3056         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3057       }
3058 
3059       // XOR the encrypted counters with the inputs
3060       for (int i = 0; i < bulk_width; i++) {
3061         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3062       }
3063 
3064       // Write the encrypted data
3065       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3066       if (bulk_width == 8) {
3067         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3068       }
3069 
3070       __ subw(len, len, 16 * bulk_width);
3071       __ cbnzw(len, L_CTR_loop);
3072     }
3073 
3074     // Save the counter back where it goes
3075     __ rev32(v16, __ T16B, v16);
3076     __ st1(v16, __ T16B, counter);
3077 
3078     __ pop(saved_regs, sp);
3079 
3080     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3081     if (bulk_width == 8) {
3082       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3083     }
3084 
3085     __ andr(rscratch1, len, -16 * bulk_width);
3086     __ sub(len, len, rscratch1);
3087     __ add(offset, offset, rscratch1);
3088     __ mov(used, 16);
3089     __ strw(used, Address(used_ptr));
3090     __ b(large_block_return);
3091 
3092     return start;
3093   }
3094 
3095   // Vector AES Galois Counter Mode implementation. Parameters:
3096   //
3097   // in = c_rarg0
3098   // len = c_rarg1
3099   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3100   // out = c_rarg3
3101   // key = c_rarg4
3102   // state = c_rarg5 - GHASH.state
3103   // subkeyHtbl = c_rarg6 - powers of H
3104   // subkeyHtbl_48_entries = c_rarg7 (not used)
3105   // counter = [sp, #0] pointer to 16 bytes of CTR
3106   // return - number of processed bytes
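  //
  // At a high level the stub processes the whole multiple-of-128-bytes
  // prefix of the input roughly as follows (a simplified sketch; tag
  // finalization and any tail bytes are handled by the Java caller):
  //
  //   for each 16-byte block i:
  //     out[i]  = in[i] ^ AES_encrypt(key, counter);         // CTR part
  //     counter = counter with its last big-endian 32-bit word incremented;
  //     state   = (state ^ ct[i]) * H  mod  x^128 + x^7 + x^2 + x + 1;  // GHASH
  //   return the number of bytes processed;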
3107   address generate_galoisCounterMode_AESCrypt() {
3108     address ghash_polynomial = __ pc();
3109     __ emit_int64(0x87);  // The low-order bits of the field
3110                           // polynomial (i.e. p = z^7+z^2+z+1)
3111                           // repeated in the low and high parts of a
3112                           // 128-bit vector
3113     __ emit_int64(0x87);
3114 
3115     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3117     address start = __ pc();
3118     __ enter();
3119 
3120     const Register in = c_rarg0;
3121     const Register len = c_rarg1;
3122     const Register ct = c_rarg2;
3123     const Register out = c_rarg3;
3124     // and updated with the incremented counter in the end
3125 
3126     const Register key = c_rarg4;
3127     const Register state = c_rarg5;
3128 
3129     const Register subkeyHtbl = c_rarg6;
3130 
    // The eighth argument, a pointer to the 16-byte CTR block, is passed on
    // the stack; after enter() it sits just above the saved (fp, lr) pair.
3132     const Address counter_mem(sp, 2 * wordSize);
3133     const Register counter = c_rarg7;
3134     __ ldr(counter, counter_mem);
3135 
3136     const Register keylen = r10;
3137     // Save state before entering routine
3138     __ sub(sp, sp, 4 * 16);
3139     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3140     __ sub(sp, sp, 4 * 16);
3141     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3142 
3144     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3145     __ str(len, __ pre(sp, -2 * wordSize));
3146 
3147     Label DONE;
3148     __ cbz(len, DONE);
3149 
3150     // Compute #rounds for AES based on the length of the key array
3151     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3152 
3153     __ aesenc_loadkeys(key, keylen);
3154     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3155     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3156 
3157     // AES/CTR loop
3158     {
3159       Label L_CTR_loop;
3160       __ BIND(L_CTR_loop);
3161 
3162       // Setup the counters
3163       __ movi(v8, __ T4S, 0);
3164       __ movi(v9, __ T4S, 1);
3165       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3166       for (FloatRegister f = v0; f < v8; f++) {
3167         __ rev32(f, __ T16B, v16);
3168         __ addv(v16, __ T4S, v16, v8);
3169       }
3170 
3171       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3172 
3173       // Encrypt the counters
3174       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3175 
3176       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3177 
3178       // XOR the encrypted counters with the inputs
3179       for (int i = 0; i < 8; i++) {
3180         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3181       }
3182       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3183       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3184 
3185       __ subw(len, len, 16 * 8);
3186       __ cbnzw(len, L_CTR_loop);
3187     }
3188 
3189     __ rev32(v16, __ T16B, v16);
3190     __ st1(v16, __ T16B, counter);
3191 
3192     __ ldr(len, Address(sp));
3193     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3194 
3195     // GHASH/CTR loop
3196     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3197                                 len, /*unrolls*/4);
3198 
3199 #ifdef ASSERT
3200     { Label L;
3201       __ cmp(len, (unsigned char)0);
3202       __ br(Assembler::EQ, L);
3203       __ stop("stubGenerator: abort");
3204       __ bind(L);
3205   }
3206 #endif
3207 
    __ bind(DONE);
3209     // Return the number of bytes processed
3210     __ ldr(r0, __ post(sp, 2 * wordSize));
3211 
3212     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3213     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3214 
3215     __ leave(); // required for proper stackwalking of RuntimeStub frame
3216     __ ret(lr);
    return start;
3218   }
3219 
3220   // Arguments:
3221   //
3222   // Inputs:
3223   //   c_rarg0   - byte[]  source+offset
3224   //   c_rarg1   - int[]   SHA.state
3225   //   c_rarg2   - int     offset
3226   //   c_rarg3   - int     limit
3227   //
3228   address generate_sha1_implCompress(bool multi_block, const char *name) {
3229     __ align(CodeEntryAlignment);
3230     StubCodeMark mark(this, "StubRoutines", name);
3231     address start = __ pc();
3232 
3233     Register buf   = c_rarg0;
3234     Register state = c_rarg1;
3235     Register ofs   = c_rarg2;
3236     Register limit = c_rarg3;
3237 
3238     Label keys;
3239     Label sha1_loop;
3240 
3241     // load the keys into v0..v3
3242     __ adr(rscratch1, keys);
3243     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3244     // load 5 words state into v6, v7
3245     __ ldrq(v6, Address(state, 0));
3246     __ ldrs(v7, Address(state, 16));
3247 
3248 
3249     __ BIND(sha1_loop);
3250     // load 64 bytes of data into v16..v19
3251     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3252     __ rev32(v16, __ T16B, v16);
3253     __ rev32(v17, __ T16B, v17);
3254     __ rev32(v18, __ T16B, v18);
3255     __ rev32(v19, __ T16B, v19);
3256 
3257     // do the sha1
3258     __ addv(v4, __ T4S, v16, v0);
3259     __ orr(v20, __ T16B, v6, v6);
3260 
3261     FloatRegister d0 = v16;
3262     FloatRegister d1 = v17;
3263     FloatRegister d2 = v18;
3264     FloatRegister d3 = v19;
3265 
3266     for (int round = 0; round < 20; round++) {
3267       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3268       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3269       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3270       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3271       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3272 
3273       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3274       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3275       __ sha1h(tmp2, __ T4S, v20);
3276       if (round < 5)
3277         __ sha1c(v20, __ T4S, tmp3, tmp4);
3278       else if (round < 10 || round >= 15)
3279         __ sha1p(v20, __ T4S, tmp3, tmp4);
3280       else
3281         __ sha1m(v20, __ T4S, tmp3, tmp4);
3282       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3283 
3284       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3285     }
3286 
3287     __ addv(v7, __ T2S, v7, v21);
3288     __ addv(v6, __ T4S, v6, v20);
3289 
3290     if (multi_block) {
3291       __ add(ofs, ofs, 64);
3292       __ cmp(ofs, limit);
3293       __ br(Assembler::LE, sha1_loop);
3294       __ mov(c_rarg0, ofs); // return ofs
3295     }
3296 
3297     __ strq(v6, Address(state, 0));
3298     __ strs(v7, Address(state, 16));
3299 
3300     __ ret(lr);
3301 
3302     __ bind(keys);
3303     __ emit_int32(0x5a827999);
3304     __ emit_int32(0x6ed9eba1);
3305     __ emit_int32(0x8f1bbcdc);
3306     __ emit_int32(0xca62c1d6);
3307 
3308     return start;
3309   }
3310 
3311 
3312   // Arguments:
3313   //
3314   // Inputs:
3315   //   c_rarg0   - byte[]  source+offset
3316   //   c_rarg1   - int[]   SHA.state
3317   //   c_rarg2   - int     offset
3318   //   c_rarg3   - int     limit
3319   //
3320   address generate_sha256_implCompress(bool multi_block, const char *name) {
3321     static const uint32_t round_consts[64] = {
3322       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3323       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3324       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3325       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3326       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3327       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3328       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3329       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3330       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3331       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3332       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3333       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3334       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3335       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3336       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3337       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3338     };
3339     __ align(CodeEntryAlignment);
3340     StubCodeMark mark(this, "StubRoutines", name);
3341     address start = __ pc();
3342 
3343     Register buf   = c_rarg0;
3344     Register state = c_rarg1;
3345     Register ofs   = c_rarg2;
3346     Register limit = c_rarg3;
3347 
3348     Label sha1_loop;
3349 
3350     __ stpd(v8, v9, __ pre(sp, -32));
3351     __ stpd(v10, v11, Address(sp, 16));
3352 
3353 // dga == v0
3354 // dgb == v1
3355 // dg0 == v2
3356 // dg1 == v3
3357 // dg2 == v4
3358 // t0 == v6
3359 // t1 == v7
3360 
3361     // load 16 keys to v16..v31
3362     __ lea(rscratch1, ExternalAddress((address)round_consts));
3363     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3364     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3365     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3366     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3367 
3368     // load 8 words (256 bits) state
3369     __ ldpq(v0, v1, state);
3370 
3371     __ BIND(sha1_loop);
3372     // load 64 bytes of data into v8..v11
3373     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3374     __ rev32(v8, __ T16B, v8);
3375     __ rev32(v9, __ T16B, v9);
3376     __ rev32(v10, __ T16B, v10);
3377     __ rev32(v11, __ T16B, v11);
3378 
3379     __ addv(v6, __ T4S, v8, v16);
3380     __ orr(v2, __ T16B, v0, v0);
3381     __ orr(v3, __ T16B, v1, v1);
3382 
3383     FloatRegister d0 = v8;
3384     FloatRegister d1 = v9;
3385     FloatRegister d2 = v10;
3386     FloatRegister d3 = v11;
3387 
3388 
3389     for (int round = 0; round < 16; round++) {
3390       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3391       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3392       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3393       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3394 
3395       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3396        __ orr(v4, __ T16B, v2, v2);
3397       if (round < 15)
3398         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3399       __ sha256h(v2, __ T4S, v3, tmp2);
3400       __ sha256h2(v3, __ T4S, v4, tmp2);
3401       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3402 
3403       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3404     }
3405 
3406     __ addv(v0, __ T4S, v0, v2);
3407     __ addv(v1, __ T4S, v1, v3);
3408 
3409     if (multi_block) {
3410       __ add(ofs, ofs, 64);
3411       __ cmp(ofs, limit);
3412       __ br(Assembler::LE, sha1_loop);
3413       __ mov(c_rarg0, ofs); // return ofs
3414     }
3415 
3416     __ ldpd(v10, v11, Address(sp, 16));
3417     __ ldpd(v8, v9, __ post(sp, 32));
3418 
3419     __ stpq(v0, v1, state);
3420 
3421     __ ret(lr);
3422 
3423     return start;
3424   }
3425 
3426   // Arguments:
3427   //
3428   // Inputs:
3429   //   c_rarg0   - byte[]  source+offset
3430   //   c_rarg1   - int[]   SHA.state
3431   //   c_rarg2   - int     offset
3432   //   c_rarg3   - int     limit
3433   //
3434   address generate_sha512_implCompress(bool multi_block, const char *name) {
3435     static const uint64_t round_consts[80] = {
3436       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3437       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3438       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3439       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3440       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3441       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3442       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3443       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3444       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3445       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3446       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3447       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3448       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3449       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3450       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3451       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3452       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3453       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3454       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3455       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3456       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3457       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3458       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3459       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3460       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3461       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3462       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3463     };
3464 
3465     // Double rounds for sha512.
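    // Each invocation performs two SHA-512 rounds.  The parameters are
    // register numbers: i0..i4 rotate through the working state (v0..v4),
    // rc0/rc1 are the current and the next-to-be-loaded round-constant
    // registers (v20..v31), and in0..in4 rotate through the message
    // schedule (v12..v19).  New round constants are loaded while dr < 36
    // and the schedule is extended while dr < 32; afterwards the
    // corresponding parameters are passed as 0 since they are unused.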
3466     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3467       if (dr < 36)                                                                   \
3468         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3469       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3470       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3471       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3472       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3473       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3474       if (dr < 32) {                                                                 \
3475         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3476         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3477       }                                                                              \
3478       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3479       if (dr < 32)                                                                   \
3480         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3481       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3482       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3483 
3484     __ align(CodeEntryAlignment);
3485     StubCodeMark mark(this, "StubRoutines", name);
3486     address start = __ pc();
3487 
3488     Register buf   = c_rarg0;
3489     Register state = c_rarg1;
3490     Register ofs   = c_rarg2;
3491     Register limit = c_rarg3;
3492 
3493     __ stpd(v8, v9, __ pre(sp, -64));
3494     __ stpd(v10, v11, Address(sp, 16));
3495     __ stpd(v12, v13, Address(sp, 32));
3496     __ stpd(v14, v15, Address(sp, 48));
3497 
3498     Label sha512_loop;
3499 
3500     // load state
3501     __ ld1(v8, v9, v10, v11, __ T2D, state);
3502 
3503     // load first 4 round constants
3504     __ lea(rscratch1, ExternalAddress((address)round_consts));
3505     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3506 
3507     __ BIND(sha512_loop);
3508     // load 128B of data into v12..v19
3509     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3510     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3511     __ rev64(v12, __ T16B, v12);
3512     __ rev64(v13, __ T16B, v13);
3513     __ rev64(v14, __ T16B, v14);
3514     __ rev64(v15, __ T16B, v15);
3515     __ rev64(v16, __ T16B, v16);
3516     __ rev64(v17, __ T16B, v17);
3517     __ rev64(v18, __ T16B, v18);
3518     __ rev64(v19, __ T16B, v19);
3519 
3520     __ mov(rscratch2, rscratch1);
3521 
3522     __ mov(v0, __ T16B, v8);
3523     __ mov(v1, __ T16B, v9);
3524     __ mov(v2, __ T16B, v10);
3525     __ mov(v3, __ T16B, v11);
3526 
3527     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3528     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3529     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3530     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3531     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3532     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3533     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3534     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3535     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3536     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3537     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3538     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3539     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3540     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3541     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3542     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3543     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3544     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3545     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3546     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3547     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3548     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3549     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3550     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3551     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3552     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3553     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3554     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3555     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3556     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3557     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3558     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3559     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3560     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3561     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3562     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3563     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3564     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3565     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3566     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3567 
3568     __ addv(v8, __ T2D, v8, v0);
3569     __ addv(v9, __ T2D, v9, v1);
3570     __ addv(v10, __ T2D, v10, v2);
3571     __ addv(v11, __ T2D, v11, v3);
3572 
3573     if (multi_block) {
3574       __ add(ofs, ofs, 128);
3575       __ cmp(ofs, limit);
3576       __ br(Assembler::LE, sha512_loop);
3577       __ mov(c_rarg0, ofs); // return ofs
3578     }
3579 
3580     __ st1(v8, v9, v10, v11, __ T2D, state);
3581 
3582     __ ldpd(v14, v15, Address(sp, 48));
3583     __ ldpd(v12, v13, Address(sp, 32));
3584     __ ldpd(v10, v11, Address(sp, 16));
3585     __ ldpd(v8, v9, __ post(sp, 64));
3586 
3587     __ ret(lr);
3588 
3589     return start;
3590   }
3591 
3592   // Arguments:
3593   //
3594   // Inputs:
3595   //   c_rarg0   - byte[]  source+offset
3596   //   c_rarg1   - byte[]   SHA.state
3597   //   c_rarg2   - int     digest_length
3598   //   c_rarg3   - int     offset
3599   //   c_rarg4   - int     limit
3600   //
3601   address generate_sha3_implCompress(bool multi_block, const char *name) {
3602     static const uint64_t round_consts[24] = {
3603       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3604       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3605       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3606       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3607       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3608       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3609       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3610       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3611     };
3612 
3613     __ align(CodeEntryAlignment);
3614     StubCodeMark mark(this, "StubRoutines", name);
3615     address start = __ pc();
3616 
3617     Register buf           = c_rarg0;
3618     Register state         = c_rarg1;
3619     Register digest_length = c_rarg2;
3620     Register ofs           = c_rarg3;
3621     Register limit         = c_rarg4;
3622 
3623     Label sha3_loop, rounds24_loop;
3624     Label sha3_512, sha3_384_or_224, sha3_256;
3625 
3626     __ stpd(v8, v9, __ pre(sp, -64));
3627     __ stpd(v10, v11, Address(sp, 16));
3628     __ stpd(v12, v13, Address(sp, 32));
3629     __ stpd(v14, v15, Address(sp, 48));
3630 
3631     // load state
3632     __ add(rscratch1, state, 32);
3633     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3634     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3635     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3636     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3637     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3638     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3639     __ ld1(v24, __ T1D, rscratch1);
3640 
3641     __ BIND(sha3_loop);
3642 
3643     // 24 keccak rounds
3644     __ movw(rscratch2, 24);
3645 
3646     // load round_constants base
3647     __ lea(rscratch1, ExternalAddress((address) round_consts));
3648 
3649     // load input
3650     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3651     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3652     __ eor(v0, __ T8B, v0, v25);
3653     __ eor(v1, __ T8B, v1, v26);
3654     __ eor(v2, __ T8B, v2, v27);
3655     __ eor(v3, __ T8B, v3, v28);
3656     __ eor(v4, __ T8B, v4, v29);
3657     __ eor(v5, __ T8B, v5, v30);
3658     __ eor(v6, __ T8B, v6, v31);
3659 
3660     // digest_length == 64, SHA3-512
3661     __ tbnz(digest_length, 6, sha3_512);
3662 
3663     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3664     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3665     __ eor(v7, __ T8B, v7, v25);
3666     __ eor(v8, __ T8B, v8, v26);
3667     __ eor(v9, __ T8B, v9, v27);
3668     __ eor(v10, __ T8B, v10, v28);
3669     __ eor(v11, __ T8B, v11, v29);
3670     __ eor(v12, __ T8B, v12, v30);
3671 
3672     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3673     __ tbnz(digest_length, 4, sha3_384_or_224);
3674 
3675     // SHA3-256
3676     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3677     __ eor(v13, __ T8B, v13, v25);
3678     __ eor(v14, __ T8B, v14, v26);
3679     __ eor(v15, __ T8B, v15, v27);
3680     __ eor(v16, __ T8B, v16, v28);
3681     __ b(rounds24_loop);
3682 
3683     __ BIND(sha3_384_or_224);
3684     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3685 
3686     // SHA3-224
3687     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3688     __ ld1(v29, __ T8B, __ post(buf, 8));
3689     __ eor(v13, __ T8B, v13, v25);
3690     __ eor(v14, __ T8B, v14, v26);
3691     __ eor(v15, __ T8B, v15, v27);
3692     __ eor(v16, __ T8B, v16, v28);
3693     __ eor(v17, __ T8B, v17, v29);
3694     __ b(rounds24_loop);
3695 
3696     __ BIND(sha3_512);
3697     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3698     __ eor(v7, __ T8B, v7, v25);
3699     __ eor(v8, __ T8B, v8, v26);
3700 
3701     __ BIND(rounds24_loop);
3702     __ subw(rscratch2, rscratch2, 1);
3703 
3704     __ eor3(v29, __ T16B, v4, v9, v14);
3705     __ eor3(v26, __ T16B, v1, v6, v11);
3706     __ eor3(v28, __ T16B, v3, v8, v13);
3707     __ eor3(v25, __ T16B, v0, v5, v10);
3708     __ eor3(v27, __ T16B, v2, v7, v12);
3709     __ eor3(v29, __ T16B, v29, v19, v24);
3710     __ eor3(v26, __ T16B, v26, v16, v21);
3711     __ eor3(v28, __ T16B, v28, v18, v23);
3712     __ eor3(v25, __ T16B, v25, v15, v20);
3713     __ eor3(v27, __ T16B, v27, v17, v22);
3714 
3715     __ rax1(v30, __ T2D, v29, v26);
3716     __ rax1(v26, __ T2D, v26, v28);
3717     __ rax1(v28, __ T2D, v28, v25);
3718     __ rax1(v25, __ T2D, v25, v27);
3719     __ rax1(v27, __ T2D, v27, v29);
3720 
3721     __ eor(v0, __ T16B, v0, v30);
3722     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3723     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3724     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3725     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3726     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3727     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3728     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3729     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3730     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3731     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3732     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3733     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3734     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3735     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3736     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3737     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3738     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3739     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3740     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3741     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3742     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3743     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3744     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3745     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3746 
3747     __ bcax(v20, __ T16B, v31, v22, v8);
3748     __ bcax(v21, __ T16B, v8,  v23, v22);
3749     __ bcax(v22, __ T16B, v22, v24, v23);
3750     __ bcax(v23, __ T16B, v23, v31, v24);
3751     __ bcax(v24, __ T16B, v24, v8,  v31);
3752 
3753     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3754 
3755     __ bcax(v17, __ T16B, v25, v19, v3);
3756     __ bcax(v18, __ T16B, v3,  v15, v19);
3757     __ bcax(v19, __ T16B, v19, v16, v15);
3758     __ bcax(v15, __ T16B, v15, v25, v16);
3759     __ bcax(v16, __ T16B, v16, v3,  v25);
3760 
3761     __ bcax(v10, __ T16B, v29, v12, v26);
3762     __ bcax(v11, __ T16B, v26, v13, v12);
3763     __ bcax(v12, __ T16B, v12, v14, v13);
3764     __ bcax(v13, __ T16B, v13, v29, v14);
3765     __ bcax(v14, __ T16B, v14, v26, v29);
3766 
3767     __ bcax(v7, __ T16B, v30, v9,  v4);
3768     __ bcax(v8, __ T16B, v4,  v5,  v9);
3769     __ bcax(v9, __ T16B, v9,  v6,  v5);
3770     __ bcax(v5, __ T16B, v5,  v30, v6);
3771     __ bcax(v6, __ T16B, v6,  v4,  v30);
3772 
3773     __ bcax(v3, __ T16B, v27, v0,  v28);
3774     __ bcax(v4, __ T16B, v28, v1,  v0);
3775     __ bcax(v0, __ T16B, v0,  v2,  v1);
3776     __ bcax(v1, __ T16B, v1,  v27, v2);
3777     __ bcax(v2, __ T16B, v2,  v28, v27);
3778 
3779     __ eor(v0, __ T16B, v0, v31);
3780 
3781     __ cbnzw(rscratch2, rounds24_loop);
3782 
3783     if (multi_block) {
3784       // block_size =  200 - 2 * digest_length, ofs += block_size
3785       __ add(ofs, ofs, 200);
3786       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3787 
3788       __ cmp(ofs, limit);
3789       __ br(Assembler::LE, sha3_loop);
3790       __ mov(c_rarg0, ofs); // return ofs
3791     }
3792 
3793     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3794     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3795     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3796     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3797     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3798     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3799     __ st1(v24, __ T1D, state);
3800 
3801     __ ldpd(v14, v15, Address(sp, 48));
3802     __ ldpd(v12, v13, Address(sp, 32));
3803     __ ldpd(v10, v11, Address(sp, 16));
3804     __ ldpd(v8, v9, __ post(sp, 64));
3805 
3806     __ ret(lr);
3807 
3808     return start;
3809   }
3810 
3811   // Safefetch stubs.
3812   void generate_safefetch(const char* name, int size, address* entry,
3813                           address* fault_pc, address* continuation_pc) {
3814     // safefetch signatures:
3815     //   int      SafeFetch32(int*      adr, int      errValue);
3816     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3817     //
3818     // arguments:
3819     //   c_rarg0 = adr
3820     //   c_rarg1 = errValue
3821     //
3822     // result:
    //   r0       = *adr or errValue
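    //
    // If the load below faults, the signal handler recognises fault_pc and
    // resumes execution at continuation_pc; c_rarg1 still holds errValue at
    // that point, so the caller observes, conceptually:
    //
    //   int SafeFetch32(int* adr, int errValue) {
    //     return can_read(adr) ? *adr : errValue;   // can_read is conceptual
    //   }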
3824 
3825     StubCodeMark mark(this, "StubRoutines", name);
3826 
3827     // Entry point, pc or function descriptor.
3828     *entry = __ pc();
3829 
3830     // Load *adr into c_rarg1, may fault.
3831     *fault_pc = __ pc();
3832     switch (size) {
3833       case 4:
3834         // int32_t
3835         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3836         break;
3837       case 8:
3838         // int64_t
3839         __ ldr(c_rarg1, Address(c_rarg0, 0));
3840         break;
3841       default:
3842         ShouldNotReachHere();
3843     }
3844 
3845     // return errValue or *adr
3846     *continuation_pc = __ pc();
3847     __ mov(r0, c_rarg1);
3848     __ ret(lr);
3849   }
3850 
3851   /**
3852    *  Arguments:
3853    *
3854    * Inputs:
3855    *   c_rarg0   - int crc
3856    *   c_rarg1   - byte* buf
3857    *   c_rarg2   - int length
3858    *
   * Output:
   *       r0    - int crc result
3861    */
3862   address generate_updateBytesCRC32() {
3863     assert(UseCRC32Intrinsics, "what are we doing here?");
3864 
3865     __ align(CodeEntryAlignment);
3866     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3867 
3868     address start = __ pc();
3869 
3870     const Register crc   = c_rarg0;  // crc
3871     const Register buf   = c_rarg1;  // source java byte array address
3872     const Register len   = c_rarg2;  // length
3873     const Register table0 = c_rarg3; // crc_table address
3874     const Register table1 = c_rarg4;
3875     const Register table2 = c_rarg5;
3876     const Register table3 = c_rarg6;
3877     const Register tmp3 = c_rarg7;
3878 
3879     BLOCK_COMMENT("Entry:");
3880     __ enter(); // required for proper stackwalking of RuntimeStub frame
3881 
3882     __ kernel_crc32(crc, buf, len,
3883               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3884 
3885     __ leave(); // required for proper stackwalking of RuntimeStub frame
3886     __ ret(lr);
3887 
3888     return start;
3889   }
3890 
3891   /**
3892    *  Arguments:
3893    *
3894    * Inputs:
3895    *   c_rarg0   - int crc
3896    *   c_rarg1   - byte* buf
3897    *   c_rarg2   - int length
3898    *   c_rarg3   - int* table
3899    *
   * Output:
3901    *       r0   - int crc result
3902    */
3903   address generate_updateBytesCRC32C() {
3904     assert(UseCRC32CIntrinsics, "what are we doing here?");
3905 
3906     __ align(CodeEntryAlignment);
3907     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3908 
3909     address start = __ pc();
3910 
3911     const Register crc   = c_rarg0;  // crc
3912     const Register buf   = c_rarg1;  // source java byte array address
3913     const Register len   = c_rarg2;  // length
3914     const Register table0 = c_rarg3; // crc_table address
3915     const Register table1 = c_rarg4;
3916     const Register table2 = c_rarg5;
3917     const Register table3 = c_rarg6;
3918     const Register tmp3 = c_rarg7;
3919 
3920     BLOCK_COMMENT("Entry:");
3921     __ enter(); // required for proper stackwalking of RuntimeStub frame
3922 
3923     __ kernel_crc32c(crc, buf, len,
3924               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3925 
3926     __ leave(); // required for proper stackwalking of RuntimeStub frame
3927     __ ret(lr);
3928 
3929     return start;
3930   }
3931 
3932   /***
3933    *  Arguments:
3934    *
3935    *  Inputs:
3936    *   c_rarg0   - int   adler
3937    *   c_rarg1   - byte* buff
3938    *   c_rarg2   - int   len
3939    *
3940    * Output:
3941    *   c_rarg0   - int adler result
3942    */
3943   address generate_updateBytesAdler32() {
3944     __ align(CodeEntryAlignment);
3945     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3946     address start = __ pc();
3947 
3948     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3949 
3950     // Aliases
3951     Register adler  = c_rarg0;
3952     Register s1     = c_rarg0;
3953     Register s2     = c_rarg3;
3954     Register buff   = c_rarg1;
3955     Register len    = c_rarg2;
3956     Register nmax  = r4;
3957     Register base  = r5;
3958     Register count = r6;
3959     Register temp0 = rscratch1;
3960     Register temp1 = rscratch2;
3961     FloatRegister vbytes = v0;
3962     FloatRegister vs1acc = v1;
3963     FloatRegister vs2acc = v2;
3964     FloatRegister vtable = v3;
3965 
3966     // Max number of bytes we can process before having to take the mod
3967     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3968     uint64_t BASE = 0xfff1;
3969     uint64_t NMAX = 0x15B0;
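    // BASE is 65521, the largest prime below 2^16.  Since 2^16 mod BASE == 15,
    // the reductions below fold x into (x & 0xffff) + 15 * (x >> 16) (applied
    // twice where needed) and finish with a single conditional subtraction
    // of BASE.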
3970 
3971     __ mov(base, BASE);
3972     __ mov(nmax, NMAX);
3973 
3974     // Load accumulation coefficients for the upper 16 bits
3975     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3976     __ ld1(vtable, __ T16B, Address(temp0));
3977 
3978     // s1 is initialized to the lower 16 bits of adler
3979     // s2 is initialized to the upper 16 bits of adler
3980     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3981     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3982 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but for short inputs it is more efficient to skip straight to the cleanup loop.
3985     __ cmp(len, (u1)16);
3986     __ br(Assembler::HS, L_nmax);
3987     __ cbz(len, L_combine);
3988 
3989     __ bind(L_simple_by1_loop);
3990     __ ldrb(temp0, Address(__ post(buff, 1)));
3991     __ add(s1, s1, temp0);
3992     __ add(s2, s2, s1);
3993     __ subs(len, len, 1);
3994     __ br(Assembler::HI, L_simple_by1_loop);
3995 
3996     // s1 = s1 % BASE
3997     __ subs(temp0, s1, base);
3998     __ csel(s1, temp0, s1, Assembler::HS);
3999 
4000     // s2 = s2 % BASE
4001     __ lsr(temp0, s2, 16);
4002     __ lsl(temp1, temp0, 4);
4003     __ sub(temp1, temp1, temp0);
4004     __ add(s2, temp1, s2, ext::uxth);
4005 
4006     __ subs(temp0, s2, base);
4007     __ csel(s2, temp0, s2, Assembler::HS);
4008 
4009     __ b(L_combine);
4010 
4011     __ bind(L_nmax);
4012     __ subs(len, len, nmax);
4013     __ sub(count, nmax, 16);
4014     __ br(Assembler::LO, L_by16);
4015 
4016     __ bind(L_nmax_loop);
4017 
4018     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4019                                       vbytes, vs1acc, vs2acc, vtable);
4020 
4021     __ subs(count, count, 16);
4022     __ br(Assembler::HS, L_nmax_loop);
4023 
4024     // s1 = s1 % BASE
4025     __ lsr(temp0, s1, 16);
4026     __ lsl(temp1, temp0, 4);
4027     __ sub(temp1, temp1, temp0);
4028     __ add(temp1, temp1, s1, ext::uxth);
4029 
4030     __ lsr(temp0, temp1, 16);
4031     __ lsl(s1, temp0, 4);
4032     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4034 
4035     __ subs(temp0, s1, base);
4036     __ csel(s1, temp0, s1, Assembler::HS);
4037 
4038     // s2 = s2 % BASE
4039     __ lsr(temp0, s2, 16);
4040     __ lsl(temp1, temp0, 4);
4041     __ sub(temp1, temp1, temp0);
4042     __ add(temp1, temp1, s2, ext::uxth);
4043 
4044     __ lsr(temp0, temp1, 16);
4045     __ lsl(s2, temp0, 4);
4046     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4048 
4049     __ subs(temp0, s2, base);
4050     __ csel(s2, temp0, s2, Assembler::HS);
4051 
4052     __ subs(len, len, nmax);
4053     __ sub(count, nmax, 16);
4054     __ br(Assembler::HS, L_nmax_loop);
4055 
4056     __ bind(L_by16);
4057     __ adds(len, len, count);
4058     __ br(Assembler::LO, L_by1);
4059 
4060     __ bind(L_by16_loop);
4061 
4062     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4063                                       vbytes, vs1acc, vs2acc, vtable);
4064 
4065     __ subs(len, len, 16);
4066     __ br(Assembler::HS, L_by16_loop);
4067 
4068     __ bind(L_by1);
4069     __ adds(len, len, 15);
4070     __ br(Assembler::LO, L_do_mod);
4071 
4072     __ bind(L_by1_loop);
4073     __ ldrb(temp0, Address(__ post(buff, 1)));
4074     __ add(s1, temp0, s1);
4075     __ add(s2, s2, s1);
4076     __ subs(len, len, 1);
4077     __ br(Assembler::HS, L_by1_loop);
4078 
4079     __ bind(L_do_mod);
4080     // s1 = s1 % BASE
4081     __ lsr(temp0, s1, 16);
4082     __ lsl(temp1, temp0, 4);
4083     __ sub(temp1, temp1, temp0);
4084     __ add(temp1, temp1, s1, ext::uxth);
4085 
4086     __ lsr(temp0, temp1, 16);
4087     __ lsl(s1, temp0, 4);
4088     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4090 
4091     __ subs(temp0, s1, base);
4092     __ csel(s1, temp0, s1, Assembler::HS);
4093 
4094     // s2 = s2 % BASE
4095     __ lsr(temp0, s2, 16);
4096     __ lsl(temp1, temp0, 4);
4097     __ sub(temp1, temp1, temp0);
4098     __ add(temp1, temp1, s2, ext::uxth);
4099 
4100     __ lsr(temp0, temp1, 16);
4101     __ lsl(s2, temp0, 4);
4102     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4104 
4105     __ subs(temp0, s2, base);
4106     __ csel(s2, temp0, s2, Assembler::HS);
4107 
4108     // Combine lower bits and higher bits
4109     __ bind(L_combine);
4110     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4111 
4112     __ ret(lr);
4113 
4114     return start;
4115   }
4116 
4117   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4118           Register temp0, Register temp1, FloatRegister vbytes,
4119           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4120     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4121     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4122     // In non-vectorized code, we update s1 and s2 as:
4123     //   s1 <- s1 + b1
4124     //   s2 <- s2 + s1
4125     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
4127     //   ...
4128     //   s1 <- s1 + b16
4129     //   s2 <- s2 + s1
4130     // Putting above assignments together, we have:
4131     //   s1_new = s1 + b1 + b2 + ... + b16
4132     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4133     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4134     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4135     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4136 
4137     // s2 = s2 + s1 * 16
4138     __ add(s2, s2, s1, Assembler::LSL, 4);
4139 
4140     // vs1acc = b1 + b2 + b3 + ... + b16
4141     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4142     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4143     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4144     __ uaddlv(vs1acc, __ T16B, vbytes);
4145     __ uaddlv(vs2acc, __ T8H, vs2acc);
4146 
4147     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4148     __ fmovd(temp0, vs1acc);
4149     __ fmovd(temp1, vs2acc);
4150     __ add(s1, s1, temp0);
4151     __ add(s2, s2, temp1);
4152   }
4153 
4154   /**
4155    *  Arguments:
4156    *
4157    *  Input:
4158    *    c_rarg0   - x address
4159    *    c_rarg1   - x length
4160    *    c_rarg2   - y address
   *    c_rarg3   - y length
4162    *    c_rarg4   - z address
4163    *    c_rarg5   - z length
4164    */
4165   address generate_multiplyToLen() {
4166     __ align(CodeEntryAlignment);
4167     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4168 
4169     address start = __ pc();
4170     const Register x     = r0;
4171     const Register xlen  = r1;
4172     const Register y     = r2;
4173     const Register ylen  = r3;
4174     const Register z     = r4;
4175     const Register zlen  = r5;
4176 
4177     const Register tmp1  = r10;
4178     const Register tmp2  = r11;
4179     const Register tmp3  = r12;
4180     const Register tmp4  = r13;
4181     const Register tmp5  = r14;
4182     const Register tmp6  = r15;
4183     const Register tmp7  = r16;
4184 
4185     BLOCK_COMMENT("Entry:");
4186     __ enter(); // required for proper stackwalking of RuntimeStub frame
4187     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4188     __ leave(); // required for proper stackwalking of RuntimeStub frame
4189     __ ret(lr);
4190 
4191     return start;
4192   }
4193 
4194   address generate_squareToLen() {
    // The dedicated squareToLen algorithm for sizes 1..127 (described in the
    // Java code) is faster than multiply_to_len on some CPUs and slower on
    // others, but multiply_to_len gives slightly better results overall, so
    // we simply delegate to it here.
4198     __ align(CodeEntryAlignment);
4199     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4200     address start = __ pc();
4201 
4202     const Register x     = r0;
4203     const Register xlen  = r1;
4204     const Register z     = r2;
4205     const Register zlen  = r3;
4206     const Register y     = r4; // == x
4207     const Register ylen  = r5; // == xlen
4208 
4209     const Register tmp1  = r10;
4210     const Register tmp2  = r11;
4211     const Register tmp3  = r12;
4212     const Register tmp4  = r13;
4213     const Register tmp5  = r14;
4214     const Register tmp6  = r15;
4215     const Register tmp7  = r16;
4216 
4217     RegSet spilled_regs = RegSet::of(y, ylen);
4218     BLOCK_COMMENT("Entry:");
4219     __ enter();
4220     __ push(spilled_regs, sp);
4221     __ mov(y, x);
4222     __ mov(ylen, xlen);
4223     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4224     __ pop(spilled_regs, sp);
4225     __ leave();
4226     __ ret(lr);
4227     return start;
4228   }
4229 
4230   address generate_mulAdd() {
4231     __ align(CodeEntryAlignment);
4232     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4233 
4234     address start = __ pc();
4235 
4236     const Register out     = r0;
4237     const Register in      = r1;
4238     const Register offset  = r2;
4239     const Register len     = r3;
4240     const Register k       = r4;
4241 
4242     BLOCK_COMMENT("Entry:");
4243     __ enter();
4244     __ mul_add(out, in, offset, len, k);
4245     __ leave();
4246     __ ret(lr);
4247 
4248     return start;
4249   }
4250 
4251   // Arguments:
4252   //
4253   // Input:
4254   //   c_rarg0   - newArr address
4255   //   c_rarg1   - oldArr address
4256   //   c_rarg2   - newIdx
4257   //   c_rarg3   - shiftCount
4258   //   c_rarg4   - numIter
4259   //
4260   address generate_bigIntegerRightShift() {
4261     __ align(CodeEntryAlignment);
4262     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4263     address start = __ pc();
4264 
4265     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4266 
4267     Register newArr        = c_rarg0;
4268     Register oldArr        = c_rarg1;
4269     Register newIdx        = c_rarg2;
4270     Register shiftCount    = c_rarg3;
4271     Register numIter       = c_rarg4;
4272     Register idx           = numIter;
4273 
4274     Register newArrCur     = rscratch1;
4275     Register shiftRevCount = rscratch2;
4276     Register oldArrCur     = r13;
4277     Register oldArrNext    = r14;
4278 
4279     FloatRegister oldElem0        = v0;
4280     FloatRegister oldElem1        = v1;
4281     FloatRegister newElem         = v2;
4282     FloatRegister shiftVCount     = v3;
4283     FloatRegister shiftVRevCount  = v4;
4284 
4285     __ cbz(idx, Exit);
4286 
4287     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4288 
4289     // left shift count
4290     __ movw(shiftRevCount, 32);
4291     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4292 
    // numIter is too small for the 4-word SIMD loop; fall back to the scalar tail
4294     __ cmp(numIter, (u1)4);
4295     __ br(Assembler::LT, ShiftThree);
4296 
4297     __ dup(shiftVCount,    __ T4S, shiftCount);
4298     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4299     __ negr(shiftVCount,   __ T4S, shiftVCount);
4300 
4301     __ BIND(ShiftSIMDLoop);
4302 
4303     // Calculate the load addresses
4304     __ sub(idx, idx, 4);
4305     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4306     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4307     __ add(oldArrCur,  oldArrNext, 4);
4308 
4309     // Load 4 words and process
4310     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4311     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4312     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4313     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4314     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4315     __ st1(newElem,   __ T4S,  Address(newArrCur));
4316 
4317     __ cmp(idx, (u1)4);
4318     __ br(Assembler::LT, ShiftTwoLoop);
4319     __ b(ShiftSIMDLoop);
4320 
4321     __ BIND(ShiftTwoLoop);
4322     __ cbz(idx, Exit);
4323     __ cmp(idx, (u1)1);
4324     __ br(Assembler::EQ, ShiftOne);
4325 
4326     // Calculate the load addresses
4327     __ sub(idx, idx, 2);
4328     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4329     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4330     __ add(oldArrCur,  oldArrNext, 4);
4331 
4332     // Load 2 words and process
4333     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4334     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4335     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4336     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4337     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4338     __ st1(newElem,   __ T2S, Address(newArrCur));
4339     __ b(ShiftTwoLoop);
4340 
4341     __ BIND(ShiftThree);
4342     __ tbz(idx, 1, ShiftOne);
4343     __ tbz(idx, 0, ShiftTwo);
4344     __ ldrw(r10,  Address(oldArr, 12));
4345     __ ldrw(r11,  Address(oldArr, 8));
4346     __ lsrvw(r10, r10, shiftCount);
4347     __ lslvw(r11, r11, shiftRevCount);
4348     __ orrw(r12,  r10, r11);
4349     __ strw(r12,  Address(newArr, 8));
4350 
4351     __ BIND(ShiftTwo);
4352     __ ldrw(r10,  Address(oldArr, 8));
4353     __ ldrw(r11,  Address(oldArr, 4));
4354     __ lsrvw(r10, r10, shiftCount);
4355     __ lslvw(r11, r11, shiftRevCount);
4356     __ orrw(r12,  r10, r11);
4357     __ strw(r12,  Address(newArr, 4));
4358 
4359     __ BIND(ShiftOne);
4360     __ ldrw(r10,  Address(oldArr, 4));
4361     __ ldrw(r11,  Address(oldArr));
4362     __ lsrvw(r10, r10, shiftCount);
4363     __ lslvw(r11, r11, shiftRevCount);
4364     __ orrw(r12,  r10, r11);
4365     __ strw(r12,  Address(newArr));
4366 
4367     __ BIND(Exit);
4368     __ ret(lr);
4369 
4370     return start;
4371   }
4372 
4373   // Arguments:
4374   //
4375   // Input:
4376   //   c_rarg0   - newArr address
4377   //   c_rarg1   - oldArr address
4378   //   c_rarg2   - newIdx
4379   //   c_rarg3   - shiftCount
4380   //   c_rarg4   - numIter
4381   //
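  // Conceptually (a sketch of the scalar equivalent, not the exact Java
  // source): for i in [0, numIter)
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount));
  // The SIMD loop below processes 4 such 32-bit words per iteration.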
4382   address generate_bigIntegerLeftShift() {
4383     __ align(CodeEntryAlignment);
4384     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4385     address start = __ pc();
4386 
4387     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4388 
4389     Register newArr        = c_rarg0;
4390     Register oldArr        = c_rarg1;
4391     Register newIdx        = c_rarg2;
4392     Register shiftCount    = c_rarg3;
4393     Register numIter       = c_rarg4;
4394 
4395     Register shiftRevCount = rscratch1;
4396     Register oldArrNext    = rscratch2;
4397 
4398     FloatRegister oldElem0        = v0;
4399     FloatRegister oldElem1        = v1;
4400     FloatRegister newElem         = v2;
4401     FloatRegister shiftVCount     = v3;
4402     FloatRegister shiftVRevCount  = v4;
4403 
4404     __ cbz(numIter, Exit);
4405 
4406     __ add(oldArrNext, oldArr, 4);
4407     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4408 
4409     // right shift count
4410     __ movw(shiftRevCount, 32);
4411     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4412 
    // numIter is too small for the 4-word SIMD loop; fall back to the scalar tail
4414     __ cmp(numIter, (u1)4);
4415     __ br(Assembler::LT, ShiftThree);
4416 
4417     __ dup(shiftVCount,     __ T4S, shiftCount);
4418     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4419     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4420 
4421     __ BIND(ShiftSIMDLoop);
4422 
4423     // load 4 words and process
4424     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4425     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4426     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4427     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4428     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4429     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4430     __ sub(numIter,   numIter, 4);
4431 
4432     __ cmp(numIter, (u1)4);
4433     __ br(Assembler::LT, ShiftTwoLoop);
4434     __ b(ShiftSIMDLoop);
4435 
4436     __ BIND(ShiftTwoLoop);
4437     __ cbz(numIter, Exit);
4438     __ cmp(numIter, (u1)1);
4439     __ br(Assembler::EQ, ShiftOne);
4440 
4441     // load 2 words and process
4442     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4443     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4444     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4445     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4446     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4447     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4448     __ sub(numIter,   numIter, 2);
4449     __ b(ShiftTwoLoop);
4450 
4451     __ BIND(ShiftThree);
4452     __ ldrw(r10,  __ post(oldArr, 4));
4453     __ ldrw(r11,  __ post(oldArrNext, 4));
4454     __ lslvw(r10, r10, shiftCount);
4455     __ lsrvw(r11, r11, shiftRevCount);
4456     __ orrw(r12,  r10, r11);
4457     __ strw(r12,  __ post(newArr, 4));
4458     __ tbz(numIter, 1, Exit);
4459     __ tbz(numIter, 0, ShiftOne);
4460 
4461     __ BIND(ShiftTwo);
4462     __ ldrw(r10,  __ post(oldArr, 4));
4463     __ ldrw(r11,  __ post(oldArrNext, 4));
4464     __ lslvw(r10, r10, shiftCount);
4465     __ lsrvw(r11, r11, shiftRevCount);
4466     __ orrw(r12,  r10, r11);
4467     __ strw(r12,  __ post(newArr, 4));
4468 
4469     __ BIND(ShiftOne);
4470     __ ldrw(r10,  Address(oldArr));
4471     __ ldrw(r11,  Address(oldArrNext));
4472     __ lslvw(r10, r10, shiftCount);
4473     __ lsrvw(r11, r11, shiftRevCount);
4474     __ orrw(r12,  r10, r11);
4475     __ strw(r12,  Address(newArr));
4476 
4477     __ BIND(Exit);
4478     __ ret(lr);
4479 
4480     return start;
4481   }
4482 
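  // has_negatives(ary1, len) returns 1 in r0 if any byte in ary1[0, len) has
  // its sign bit set (i.e. is negative when treated as a signed byte), and 0
  // otherwise. Bytes are tested a whole word (or several words) at a time
  // against UPPER_BIT_MASK, so most of the work needs only one branch per
  // 8..64 bytes checked.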
4483   address generate_has_negatives(address &has_negatives_long) {
4484     const u1 large_loop_size = 64;
4485     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4486     int dcache_line = VM_Version::dcache_line_size();
4487 
4488     Register ary1 = r1, len = r2, result = r0;
4489 
4490     __ align(CodeEntryAlignment);
4491 
4492     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4493 
4494     address entry = __ pc();
4495 
4496     __ enter();
4497 
4498   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4499         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4500 
4501   __ cmp(len, (u1)15);
4502   __ br(Assembler::GT, LEN_OVER_15);
  // The only case in which execution falls into this code is when the pointer
  // is near the end of a memory page and we have to avoid reading past it
4505   __ add(ary1, ary1, len);
4506   __ subs(len, len, 8);
4507   __ br(Assembler::GT, LEN_OVER_8);
4508   __ ldr(rscratch2, Address(ary1, -8));
4509   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4510   __ lsrv(rscratch2, rscratch2, rscratch1);
4511   __ tst(rscratch2, UPPER_BIT_MASK);
4512   __ cset(result, Assembler::NE);
4513   __ leave();
4514   __ ret(lr);
4515   __ bind(LEN_OVER_8);
4516   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4518   __ tst(rscratch2, UPPER_BIT_MASK);
4519   __ br(Assembler::NE, RET_TRUE_NO_POP);
4520   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4521   __ lsrv(rscratch1, rscratch1, rscratch2);
4522   __ tst(rscratch1, UPPER_BIT_MASK);
4523   __ cset(result, Assembler::NE);
4524   __ leave();
4525   __ ret(lr);
4526 
4527   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4528   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4529 
4530   has_negatives_long = __ pc(); // 2nd entry point
4531 
4532   __ enter();
4533 
4534   __ bind(LEN_OVER_15);
4535     __ push(spilled_regs, sp);
4536     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4537     __ cbz(rscratch2, ALIGNED);
4538     __ ldp(tmp6, tmp1, Address(ary1));
4539     __ mov(tmp5, 16);
4540     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4541     __ add(ary1, ary1, rscratch1);
4542     __ sub(len, len, rscratch1);
4543     __ orr(tmp6, tmp6, tmp1);
4544     __ tst(tmp6, UPPER_BIT_MASK);
4545     __ br(Assembler::NE, RET_TRUE);
4546 
4547   __ bind(ALIGNED);
4548     __ cmp(len, large_loop_size);
4549     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case where an initially aligned large array has negative values among
    // its first bytes; otherwise LARGE_LOOP would do up to 4 reads instead of
    // 1 in the worst case, which is slower. Cases with negative bytes further
    // ahead are not affected much; in fact they get faster thanks to the early
    // loads and to LARGE_LOOP having fewer instructions and branches.
4556     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4557     __ sub(len, len, 16);
4558     __ orr(tmp6, tmp6, tmp1);
4559     __ tst(tmp6, UPPER_BIT_MASK);
4560     __ br(Assembler::NE, RET_TRUE);
4561     __ cmp(len, large_loop_size);
4562     __ br(Assembler::LT, CHECK_16);
4563 
4564     if (SoftwarePrefetchHintDistance >= 0
4565         && SoftwarePrefetchHintDistance >= dcache_line) {
4566       // initial prefetch
4567       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4568     }
4569   __ bind(LARGE_LOOP);
4570     if (SoftwarePrefetchHintDistance >= 0) {
4571       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4572     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches; the drawback is that early return is impossible, so all 64
    // bytes are loaded and checked every time.
4578     __ ldp(tmp2, tmp3, Address(ary1));
4579     __ ldp(tmp4, tmp5, Address(ary1, 16));
4580     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4581     __ ldp(tmp6, tmp1, Address(ary1, 48));
4582     __ add(ary1, ary1, large_loop_size);
4583     __ sub(len, len, large_loop_size);
4584     __ orr(tmp2, tmp2, tmp3);
4585     __ orr(tmp4, tmp4, tmp5);
4586     __ orr(rscratch1, rscratch1, rscratch2);
4587     __ orr(tmp6, tmp6, tmp1);
4588     __ orr(tmp2, tmp2, tmp4);
4589     __ orr(rscratch1, rscratch1, tmp6);
4590     __ orr(tmp2, tmp2, rscratch1);
4591     __ tst(tmp2, UPPER_BIT_MASK);
4592     __ br(Assembler::NE, RET_TRUE);
4593     __ cmp(len, large_loop_size);
4594     __ br(Assembler::GE, LARGE_LOOP);
4595 
4596   __ bind(CHECK_16); // small 16-byte load pre-loop
4597     __ cmp(len, (u1)16);
4598     __ br(Assembler::LT, POST_LOOP16);
4599 
4600   __ bind(LOOP16); // small 16-byte load loop
4601     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4602     __ sub(len, len, 16);
4603     __ orr(tmp2, tmp2, tmp3);
4604     __ tst(tmp2, UPPER_BIT_MASK);
4605     __ br(Assembler::NE, RET_TRUE);
4606     __ cmp(len, (u1)16);
4607     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4608 
4609   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4610     __ cmp(len, (u1)8);
4611     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4612     __ ldr(tmp3, Address(__ post(ary1, 8)));
4613     __ sub(len, len, 8);
4614     __ tst(tmp3, UPPER_BIT_MASK);
4615     __ br(Assembler::NE, RET_TRUE);
4616 
4617   __ bind(POST_LOOP16_LOAD_TAIL);
4618     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4619     __ ldr(tmp1, Address(ary1));
4620     __ mov(tmp2, 64);
4621     __ sub(tmp4, tmp2, len, __ LSL, 3);
4622     __ lslv(tmp1, tmp1, tmp4);
4623     __ tst(tmp1, UPPER_BIT_MASK);
4624     __ br(Assembler::NE, RET_TRUE);
4625     // Fallthrough
4626 
4627   __ bind(RET_FALSE);
4628     __ pop(spilled_regs, sp);
4629     __ leave();
4630     __ mov(result, zr);
4631     __ ret(lr);
4632 
4633   __ bind(RET_TRUE);
4634     __ pop(spilled_regs, sp);
4635   __ bind(RET_TRUE_NO_POP);
4636     __ leave();
4637     __ mov(result, 1);
4638     __ ret(lr);
4639 
4640   __ bind(DONE);
4641     __ pop(spilled_regs, sp);
4642     __ leave();
4643     __ ret(lr);
4644     return entry;
4645   }
4646 
4647   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4648         bool usePrefetch, Label &NOT_EQUAL) {
4649     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4650         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4651         tmp7 = r12, tmp8 = r13;
4652     Label LOOP;
4653 
4654     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4655     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4656     __ bind(LOOP);
4657     if (usePrefetch) {
4658       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4659       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4660     }
4661     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4662     __ eor(tmp1, tmp1, tmp2);
4663     __ eor(tmp3, tmp3, tmp4);
4664     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4665     __ orr(tmp1, tmp1, tmp3);
4666     __ cbnz(tmp1, NOT_EQUAL);
4667     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4668     __ eor(tmp5, tmp5, tmp6);
4669     __ eor(tmp7, tmp7, tmp8);
4670     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4671     __ orr(tmp5, tmp5, tmp7);
4672     __ cbnz(tmp5, NOT_EQUAL);
4673     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4674     __ eor(tmp1, tmp1, tmp2);
4675     __ eor(tmp3, tmp3, tmp4);
4676     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4677     __ orr(tmp1, tmp1, tmp3);
4678     __ cbnz(tmp1, NOT_EQUAL);
4679     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4680     __ eor(tmp5, tmp5, tmp6);
4681     __ sub(cnt1, cnt1, 8 * wordSize);
4682     __ eor(tmp7, tmp7, tmp8);
4683     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
4686     __ subs(tmp6, cnt1, loopThreshold);
4687     __ orr(tmp5, tmp5, tmp7);
4688     __ cbnz(tmp5, NOT_EQUAL);
4689     __ br(__ GE, LOOP);
4690     // post-loop
4691     __ eor(tmp1, tmp1, tmp2);
4692     __ eor(tmp3, tmp3, tmp4);
4693     __ orr(tmp1, tmp1, tmp3);
4694     __ sub(cnt1, cnt1, 2 * wordSize);
4695     __ cbnz(tmp1, NOT_EQUAL);
4696   }
4697 
4698   void generate_large_array_equals_loop_simd(int loopThreshold,
4699         bool usePrefetch, Label &NOT_EQUAL) {
4700     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4701         tmp2 = rscratch2;
4702     Label LOOP;
4703 
4704     __ bind(LOOP);
4705     if (usePrefetch) {
4706       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4707       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4708     }
4709     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4710     __ sub(cnt1, cnt1, 8 * wordSize);
4711     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4712     __ subs(tmp1, cnt1, loopThreshold);
4713     __ eor(v0, __ T16B, v0, v4);
4714     __ eor(v1, __ T16B, v1, v5);
4715     __ eor(v2, __ T16B, v2, v6);
4716     __ eor(v3, __ T16B, v3, v7);
4717     __ orr(v0, __ T16B, v0, v1);
4718     __ orr(v1, __ T16B, v2, v3);
4719     __ orr(v0, __ T16B, v0, v1);
4720     __ umov(tmp1, v0, __ D, 0);
4721     __ umov(tmp2, v0, __ D, 1);
4722     __ orr(tmp1, tmp1, tmp2);
4723     __ cbnz(tmp1, NOT_EQUAL);
4724     __ br(__ GE, LOOP);
4725   }
4726 
4727   // a1 = r1 - array1 address
4728   // a2 = r2 - array2 address
4729   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
4731   // r3-r5 are reserved temporary registers
4732   address generate_large_array_equals() {
4733     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4734         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4735         tmp7 = r12, tmp8 = r13;
4736     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4737         SMALL_LOOP, POST_LOOP;
4738     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // threshold ensuring that at least 32 of the prefetched bytes are actually used
4740     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4741     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4742     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4743     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4744         tmp5, tmp6, tmp7, tmp8);
4745 
4746     __ align(CodeEntryAlignment);
4747 
4748     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4749 
4750     address entry = __ pc();
4751     __ enter();
4752     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4753     // also advance pointers to use post-increment instead of pre-increment
4754     __ add(a1, a1, wordSize);
4755     __ add(a2, a2, wordSize);
4756     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, do one extra
      // 8-byte load to make at least the first address 16-byte aligned.
4762       Label ALIGNED16;
4763       __ tbz(a1, 3, ALIGNED16);
4764       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4765       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4766       __ sub(cnt1, cnt1, wordSize);
4767       __ eor(tmp1, tmp1, tmp2);
4768       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4769       __ bind(ALIGNED16);
4770     }
4771     if (UseSIMDForArrayEquals) {
4772       if (SoftwarePrefetchHintDistance >= 0) {
4773         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4774         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4775         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4776             /* prfm = */ true, NOT_EQUAL);
4777         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4778         __ br(__ LT, TAIL);
4779       }
4780       __ bind(NO_PREFETCH_LARGE_LOOP);
4781       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4782           /* prfm = */ false, NOT_EQUAL);
4783     } else {
4784       __ push(spilled_regs, sp);
4785       if (SoftwarePrefetchHintDistance >= 0) {
4786         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4787         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4788         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4789             /* prfm = */ true, NOT_EQUAL);
4790         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4791         __ br(__ LT, TAIL);
4792       }
4793       __ bind(NO_PREFETCH_LARGE_LOOP);
4794       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4795           /* prfm = */ false, NOT_EQUAL);
4796     }
4797     __ bind(TAIL);
4798       __ cbz(cnt1, EQUAL);
4799       __ subs(cnt1, cnt1, wordSize);
4800       __ br(__ LE, POST_LOOP);
4801     __ bind(SMALL_LOOP);
4802       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4803       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4804       __ subs(cnt1, cnt1, wordSize);
4805       __ eor(tmp1, tmp1, tmp2);
4806       __ cbnz(tmp1, NOT_EQUAL);
4807       __ br(__ GT, SMALL_LOOP);
4808     __ bind(POST_LOOP);
4809       __ ldr(tmp1, Address(a1, cnt1));
4810       __ ldr(tmp2, Address(a2, cnt1));
4811       __ eor(tmp1, tmp1, tmp2);
4812       __ cbnz(tmp1, NOT_EQUAL);
4813     __ bind(EQUAL);
4814       __ mov(result, true);
4815     __ bind(NOT_EQUAL);
4816       if (!UseSIMDForArrayEquals) {
4817         __ pop(spilled_regs, sp);
4818       }
4819     __ bind(NOT_EQUAL_NO_POP);
4820     __ leave();
4821     __ ret(lr);
4822     return entry;
4823   }
4824 
4825   address generate_dsin_dcos(bool isCos) {
4826     __ align(CodeEntryAlignment);
4827     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4828     address start = __ pc();
4829     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4830         (address)StubRoutines::aarch64::_two_over_pi,
4831         (address)StubRoutines::aarch64::_pio2,
4832         (address)StubRoutines::aarch64::_dsin_coef,
4833         (address)StubRoutines::aarch64::_dcos_coef);
4834     return start;
4835   }
4836 
4837   address generate_dlog() {
4838     __ align(CodeEntryAlignment);
4839     StubCodeMark mark(this, "StubRoutines", "dlog");
4840     address entry = __ pc();
4841     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4842         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4843     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4844     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4845         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4846     return entry;
4847   }
4848 
4849 
4850   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4851   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4852       Label &DIFF2) {
4853     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4854     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4855 
4856     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4857     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4858     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4859     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4860 
4861     __ fmovd(tmpL, vtmp3);
4862     __ eor(rscratch2, tmp3, tmpL);
4863     __ cbnz(rscratch2, DIFF2);
4864 
4865     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4866     __ umov(tmpL, vtmp3, __ D, 1);
4867     __ eor(rscratch2, tmpU, tmpL);
4868     __ cbnz(rscratch2, DIFF1);
4869 
4870     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4871     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4872     __ fmovd(tmpL, vtmp);
4873     __ eor(rscratch2, tmp3, tmpL);
4874     __ cbnz(rscratch2, DIFF2);
4875 
4876     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4877     __ umov(tmpL, vtmp, __ D, 1);
4878     __ eor(rscratch2, tmpU, tmpL);
4879     __ cbnz(rscratch2, DIFF1);
4880   }
4881 
4882   // r0  = result
4883   // r1  = str1
4884   // r2  = cnt1
4885   // r3  = str2
4886   // r4  = cnt2
4887   // r10 = tmp1
4888   // r11 = tmp2
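  // isLU == true means str1 is Latin-1 and str2 is UTF-16; isLU == false is
  // the opposite. The Latin-1 side is inflated to UTF-16 on the fly (zip1
  // with a zero register) and the strings are then compared 16 characters at
  // a time (see compare_string_16_x_LU above).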
4889   address generate_compare_long_string_different_encoding(bool isLU) {
4890     __ align(CodeEntryAlignment);
4891     StubCodeMark mark(this, "StubRoutines", isLU
4892         ? "compare_long_string_different_encoding LU"
4893         : "compare_long_string_different_encoding UL");
4894     address entry = __ pc();
4895     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4896         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4897         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4898     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4899         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4900     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4901     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4902 
4903     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4904 
4905     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4908     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4909     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4910     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4911     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4912     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4913     __ eor(rscratch2, tmp1, tmp2);
4914     __ mov(rscratch1, tmp2);
4915     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4916     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4917              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4918     __ push(spilled_regs, sp);
4919     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4920     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4921 
4922     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4923 
4924     if (SoftwarePrefetchHintDistance >= 0) {
4925       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4926       __ br(__ LT, NO_PREFETCH);
4927       __ bind(LARGE_LOOP_PREFETCH);
4928         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4929         __ mov(tmp4, 2);
4930         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4931         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4932           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4933           __ subs(tmp4, tmp4, 1);
4934           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4935           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4936           __ mov(tmp4, 2);
4937         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4938           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4939           __ subs(tmp4, tmp4, 1);
4940           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4941           __ sub(cnt2, cnt2, 64);
4942           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4943           __ br(__ GE, LARGE_LOOP_PREFETCH);
4944     }
4945     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4946     __ bind(NO_PREFETCH);
4947     __ subs(cnt2, cnt2, 16);
4948     __ br(__ LT, TAIL);
4949     __ align(OptoLoopAlignment);
4950     __ bind(SMALL_LOOP); // smaller loop
4951       __ subs(cnt2, cnt2, 16);
4952       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4953       __ br(__ GE, SMALL_LOOP);
4954       __ cmn(cnt2, (u1)16);
4955       __ br(__ EQ, LOAD_LAST);
4956     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4957       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4958       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4959       __ ldr(tmp3, Address(cnt1, -8));
4960       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4961       __ b(LOAD_LAST);
4962     __ bind(DIFF2);
4963       __ mov(tmpU, tmp3);
4964     __ bind(DIFF1);
4965       __ pop(spilled_regs, sp);
4966       __ b(CALCULATE_DIFFERENCE);
4967     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
4970       __ mov(tmpU, tmp3);
4971       __ pop(spilled_regs, sp);
4972 
4973       // tmp2 points to the address of the last 4 Latin1 characters right now
4974       __ ldrs(vtmp, Address(tmp2));
4975       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4976       __ fmovd(tmpL, vtmp);
4977 
4978       __ eor(rscratch2, tmpU, tmpL);
4979       __ cbz(rscratch2, DONE);
4980 
4981     // Find the first different characters in the longwords and
4982     // compute their difference.
4983     __ bind(CALCULATE_DIFFERENCE);
4984       __ rev(rscratch2, rscratch2);
4985       __ clz(rscratch2, rscratch2);
4986       __ andr(rscratch2, rscratch2, -16);
4987       __ lsrv(tmp1, tmp1, rscratch2);
4988       __ uxthw(tmp1, tmp1);
4989       __ lsrv(rscratch1, rscratch1, rscratch2);
4990       __ uxthw(rscratch1, rscratch1);
4991       __ subw(result, tmp1, rscratch1);
4992     __ bind(DONE);
4993       __ ret(lr);
4994     return entry;
4995   }
4996 
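  // nmethod entry barrier stub: it calls
  // BarrierSetNMethod::nmethod_stub_entry_barrier and, if that call returns a
  // non-zero value, continues at the {sp, fp, lr, pc} stored in the four
  // stack words reserved below (the deoptimization path); otherwise it
  // returns to the nmethod normally.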
  address generate_method_entry_barrier() {
4998     __ align(CodeEntryAlignment);
4999     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5000 
5001     Label deoptimize_label;
5002 
5003     address start = __ pc();
5004 
5005     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5006 
5007     __ enter();
5008     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5009 
5010     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5011 
5012     __ push_call_clobbered_registers();
5013 
5014     __ mov(c_rarg0, rscratch2);
5015     __ call_VM_leaf
5016          (CAST_FROM_FN_PTR
5017           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5018 
5019     __ reset_last_Java_frame(true);
5020 
5021     __ mov(rscratch1, r0);
5022 
5023     __ pop_call_clobbered_registers();
5024 
5025     __ cbnz(rscratch1, deoptimize_label);
5026 
5027     __ leave();
5028     __ ret(lr);
5029 
5030     __ BIND(deoptimize_label);
5031 
5032     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5033     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5034 
5035     __ mov(sp, rscratch1);
5036     __ br(rscratch2);
5037 
5038     return start;
5039   }
5040 
5041   // r0  = result
5042   // r1  = str1
5043   // r2  = cnt1
5044   // r3  = str2
5045   // r4  = cnt2
5046   // r10 = tmp1
5047   // r11 = tmp2
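  // isLL == true means both strings are Latin-1 (one byte per character);
  // isLL == false means both are UTF-16 (two bytes per character). cnt2 is
  // measured in characters, so the byte counts below are scaled accordingly.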
5048   address generate_compare_long_string_same_encoding(bool isLL) {
5049     __ align(CodeEntryAlignment);
5050     StubCodeMark mark(this, "StubRoutines", isLL
5051         ? "compare_long_string_same_encoding LL"
5052         : "compare_long_string_same_encoding UU");
5053     address entry = __ pc();
5054     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5055         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5056 
5057     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5058 
    // exit the large loop when fewer than 64 bytes are left to read, or when we
    // are about to prefetch memory beyond the end of the array
5061     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5062 
    // the caller pre-loads 8 bytes before jumping to this stub, so compare them directly
5064     __ eor(rscratch2, tmp1, tmp2);
5065     __ cbnz(rscratch2, CAL_DIFFERENCE);
5066 
5067     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5068     // update pointers, because of previous read
5069     __ add(str1, str1, wordSize);
5070     __ add(str2, str2, wordSize);
5071     if (SoftwarePrefetchHintDistance >= 0) {
5072       __ bind(LARGE_LOOP_PREFETCH);
5073         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5074         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5075 
5076         __ align(OptoLoopAlignment);
5077         for (int i = 0; i < 4; i++) {
5078           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5079           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5080           __ cmp(tmp1, tmp2);
5081           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5082           __ br(Assembler::NE, DIFF);
5083         }
5084         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5085         __ add(str1, str1, 64);
5086         __ add(str2, str2, 64);
5087         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5088         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5089         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5090     }
5091 
5092     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5093     __ br(Assembler::LE, LESS16);
5094     __ align(OptoLoopAlignment);
5095     __ bind(LOOP_COMPARE16);
5096       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5097       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5098       __ cmp(tmp1, tmp2);
5099       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5100       __ br(Assembler::NE, DIFF);
5101       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5102       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5103       __ br(Assembler::LT, LESS16);
5104 
5105       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5106       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5107       __ cmp(tmp1, tmp2);
5108       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5109       __ br(Assembler::NE, DIFF);
5110       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5111       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5112       __ br(Assembler::GE, LOOP_COMPARE16);
5113       __ cbz(cnt2, LENGTH_DIFF);
5114 
5115     __ bind(LESS16);
      // compare the next 8 bytes (8 Latin-1 or 4 UTF-16 characters) if present
5117       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5118       __ br(Assembler::LE, LESS8);
5119       __ ldr(tmp1, Address(__ post(str1, 8)));
5120       __ ldr(tmp2, Address(__ post(str2, 8)));
5121       __ eor(rscratch2, tmp1, tmp2);
5122       __ cbnz(rscratch2, CAL_DIFFERENCE);
5123       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5124 
5125     __ bind(LESS8); // directly load last 8 bytes
5126       if (!isLL) {
5127         __ add(cnt2, cnt2, cnt2);
5128       }
5129       __ ldr(tmp1, Address(str1, cnt2));
5130       __ ldr(tmp2, Address(str2, cnt2));
5131       __ eor(rscratch2, tmp1, tmp2);
5132       __ cbz(rscratch2, LENGTH_DIFF);
5133       __ b(CAL_DIFFERENCE);
5134 
5135     __ bind(DIFF);
5136       __ cmp(tmp1, tmp2);
5137       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5138       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5139       // reuse rscratch2 register for the result of eor instruction
5140       __ eor(rscratch2, tmp1, tmp2);
5141 
5142     __ bind(CAL_DIFFERENCE);
5143       __ rev(rscratch2, rscratch2);
5144       __ clz(rscratch2, rscratch2);
5145       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5146       __ lsrv(tmp1, tmp1, rscratch2);
5147       __ lsrv(tmp2, tmp2, rscratch2);
5148       if (isLL) {
5149         __ uxtbw(tmp1, tmp1);
5150         __ uxtbw(tmp2, tmp2);
5151       } else {
5152         __ uxthw(tmp1, tmp1);
5153         __ uxthw(tmp2, tmp2);
5154       }
5155       __ subw(result, tmp1, tmp2);
5156 
5157     __ bind(LENGTH_DIFF);
5158       __ ret(lr);
5159     return entry;
5160   }
5161 
5162   void generate_compare_long_strings() {
5163       StubRoutines::aarch64::_compare_long_string_LL
5164           = generate_compare_long_string_same_encoding(true);
5165       StubRoutines::aarch64::_compare_long_string_UU
5166           = generate_compare_long_string_same_encoding(false);
5167       StubRoutines::aarch64::_compare_long_string_LU
5168           = generate_compare_long_string_different_encoding(true);
5169       StubRoutines::aarch64::_compare_long_string_UL
5170           = generate_compare_long_string_different_encoding(false);
5171   }
5172 
5173   // R0 = result
5174   // R1 = str2
5175   // R2 = cnt1
5176   // R3 = str1
5177   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8), which lets us skip the initial load (helps on systems with
  // a single load pipeline)
  // 2) we can use a "fast" algorithm for finding a single character, so the
  // first symbol is searched for with fewer branches (one branch per loaded
  // register instead of one branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after the 1st register of the source string has been loaded and
  // analyzed, it can be reused to search for every occurrence of the 1st
  // character, saving a few loads compared to a simpler-but-slower
  // implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
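  // Note on the constants: they implement the standard SWAR zero-byte test.
  // With x = <loaded source chunk> ^ <first pattern character replicated by
  // multiplying with 0x0101...01 (or 0x00010001...0001 for UTF-16)>, the
  // sub/orr/bics sequence below computes
  //   (x - 0x0101...01) & ~x & 0x8080...80
  // (or its 16-bit counterpart), which is non-zero exactly when some byte
  // (halfword) of x is zero, i.e. when the first pattern character occurs
  // somewhere in the loaded chunk.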
5192   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5193     const char* stubName = str1_isL
5194         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5195         : "indexof_linear_uu";
5196     __ align(CodeEntryAlignment);
5197     StubCodeMark mark(this, "StubRoutines", stubName);
5198     address entry = __ pc();
5199 
5200     int str1_chr_size = str1_isL ? 1 : 2;
5201     int str2_chr_size = str2_isL ? 1 : 2;
5202     int str1_chr_shift = str1_isL ? 0 : 1;
5203     int str2_chr_shift = str2_isL ? 0 : 1;
5204     bool isL = str1_isL && str2_isL;
    // parameters
5206     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5207     // temporary registers
5208     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5209     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5210     // redefinitions
5211     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5212 
5213     __ push(spilled_regs, sp);
5214     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5215         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5216         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5217         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5218         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5219         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5220     // Read whole register from str1. It is safe, because length >=8 here
5221     __ ldr(ch1, Address(str1));
5222     // Read whole register from str2. It is safe, because length >=8 here
5223     __ ldr(ch2, Address(str2));
5224     __ sub(cnt2, cnt2, cnt1);
5225     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5226     if (str1_isL != str2_isL) {
5227       __ eor(v0, __ T16B, v0, v0);
5228     }
5229     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5230     __ mul(first, first, tmp1);
    // check whether less than one register's worth of characters is left to check
5232     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5233     if (str1_isL != str2_isL) {
5234       __ fmovd(v1, ch1);
5235     }
5236     __ br(__ LE, L_SMALL);
5237     __ eor(ch2, first, ch2);
5238     if (str1_isL != str2_isL) {
5239       __ zip1(v1, __ T16B, v1, v0);
5240     }
5241     __ sub(tmp2, ch2, tmp1);
5242     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5243     __ bics(tmp2, tmp2, ch2);
5244     if (str1_isL != str2_isL) {
5245       __ fmovd(ch1, v1);
5246     }
5247     __ br(__ NE, L_HAS_ZERO);
5248     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5249     __ add(result, result, wordSize/str2_chr_size);
5250     __ add(str2, str2, wordSize);
5251     __ br(__ LT, L_POST_LOOP);
5252     __ BIND(L_LOOP);
5253       __ ldr(ch2, Address(str2));
5254       __ eor(ch2, first, ch2);
5255       __ sub(tmp2, ch2, tmp1);
5256       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5257       __ bics(tmp2, tmp2, ch2);
5258       __ br(__ NE, L_HAS_ZERO);
5259     __ BIND(L_LOOP_PROCEED);
5260       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5261       __ add(str2, str2, wordSize);
5262       __ add(result, result, wordSize/str2_chr_size);
5263       __ br(__ GE, L_LOOP);
5264     __ BIND(L_POST_LOOP);
5265       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5266       __ br(__ LE, NOMATCH);
5267       __ ldr(ch2, Address(str2));
5268       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5269       __ eor(ch2, first, ch2);
5270       __ sub(tmp2, ch2, tmp1);
5271       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5272       __ mov(tmp4, -1); // all bits set
5273       __ b(L_SMALL_PROCEED);
5274     __ align(OptoLoopAlignment);
5275     __ BIND(L_SMALL);
5276       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5277       __ eor(ch2, first, ch2);
5278       if (str1_isL != str2_isL) {
5279         __ zip1(v1, __ T16B, v1, v0);
5280       }
5281       __ sub(tmp2, ch2, tmp1);
5282       __ mov(tmp4, -1); // all bits set
5283       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5284       if (str1_isL != str2_isL) {
5285         __ fmovd(ch1, v1); // move converted 4 symbols
5286       }
5287     __ BIND(L_SMALL_PROCEED);
5288       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5289       __ bic(tmp2, tmp2, ch2);
5290       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5291       __ rbit(tmp2, tmp2);
5292       __ br(__ EQ, NOMATCH);
5293     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5294       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5295       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5296       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5297       if (str2_isL) { // LL
5298         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5299         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5300         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5301         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5302         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5303       } else {
5304         __ mov(ch2, 0xE); // all bits in byte set except last one
5305         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5306         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5307         __ lslv(tmp2, tmp2, tmp4);
5308         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5309         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5310         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5311         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5312       }
5313       __ cmp(ch1, ch2);
5314       __ mov(tmp4, wordSize/str2_chr_size);
5315       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5316     __ BIND(L_SMALL_CMP_LOOP);
5317       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5318                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5319       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5320                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5321       __ add(tmp4, tmp4, 1);
5322       __ cmp(tmp4, cnt1);
5323       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5324       __ cmp(first, ch2);
5325       __ br(__ EQ, L_SMALL_CMP_LOOP);
5326     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5327       __ cbz(tmp2, NOMATCH); // no more matches. exit
5328       __ clz(tmp4, tmp2);
5329       __ add(result, result, 1); // advance index
5330       __ add(str2, str2, str2_chr_size); // advance pointer
5331       __ b(L_SMALL_HAS_ZERO_LOOP);
5332     __ align(OptoLoopAlignment);
5333     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5334       __ cmp(first, ch2);
5335       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5336       __ b(DONE);
5337     __ align(OptoLoopAlignment);
5338     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5339       if (str2_isL) { // LL
5340         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5341         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5342         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5343         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5344         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5345       } else {
5346         __ mov(ch2, 0xE); // all bits in byte set except last one
5347         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5348         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5349         __ lslv(tmp2, tmp2, tmp4);
5350         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5351         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5352         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5353         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5354       }
5355       __ cmp(ch1, ch2);
5356       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5357       __ b(DONE);
5358     __ align(OptoLoopAlignment);
5359     __ BIND(L_HAS_ZERO);
5360       __ rbit(tmp2, tmp2);
5361       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in the loop.
5365       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5366       __ sub(result, result, 1);
5367     __ BIND(L_HAS_ZERO_LOOP);
5368       __ mov(cnt1, wordSize/str2_chr_size);
5369       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5370       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5371       if (str2_isL) {
5372         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5373         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5374         __ lslv(tmp2, tmp2, tmp4);
5375         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5376         __ add(tmp4, tmp4, 1);
5377         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5378         __ lsl(tmp2, tmp2, 1);
5379         __ mov(tmp4, wordSize/str2_chr_size);
5380       } else {
5381         __ mov(ch2, 0xE);
5382         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5383         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5384         __ lslv(tmp2, tmp2, tmp4);
5385         __ add(tmp4, tmp4, 1);
5386         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5387         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5388         __ lsl(tmp2, tmp2, 1);
5389         __ mov(tmp4, wordSize/str2_chr_size);
5390         __ sub(str2, str2, str2_chr_size);
5391       }
5392       __ cmp(ch1, ch2);
5393       __ mov(tmp4, wordSize/str2_chr_size);
5394       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5395     __ BIND(L_CMP_LOOP);
5396       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5397                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5398       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5399                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5400       __ add(tmp4, tmp4, 1);
5401       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5402       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5403       __ cmp(cnt1, ch2);
5404       __ br(__ EQ, L_CMP_LOOP);
5405     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
5407       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5408       __ clz(tmp4, tmp2);
5409       __ add(str2, str2, str2_chr_size); // advance pointer
5410       __ b(L_HAS_ZERO_LOOP);
5411     __ align(OptoLoopAlignment);
5412     __ BIND(L_CMP_LOOP_LAST_CMP);
5413       __ cmp(cnt1, ch2);
5414       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5415       __ b(DONE);
5416     __ align(OptoLoopAlignment);
5417     __ BIND(L_CMP_LOOP_LAST_CMP2);
5418       if (str2_isL) {
5419         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5420         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5421         __ lslv(tmp2, tmp2, tmp4);
5422         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5423         __ add(tmp4, tmp4, 1);
5424         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5425         __ lsl(tmp2, tmp2, 1);
5426       } else {
5427         __ mov(ch2, 0xE);
5428         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5429         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5430         __ lslv(tmp2, tmp2, tmp4);
5431         __ add(tmp4, tmp4, 1);
5432         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5433         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5434         __ lsl(tmp2, tmp2, 1);
5435         __ sub(str2, str2, str2_chr_size);
5436       }
5437       __ cmp(ch1, ch2);
5438       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5439       __ b(DONE);
5440     __ align(OptoLoopAlignment);
5441     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address and needs to be advanced to the
      // next octet.
5452       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5453       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5454       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5455       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5456       __ movw(cnt2, cnt2);
5457       __ b(L_LOOP_PROCEED);
5458     __ align(OptoLoopAlignment);
5459     __ BIND(NOMATCH);
5460       __ mov(result, -1);
5461     __ BIND(DONE);
5462       __ pop(spilled_regs, sp);
5463       __ ret(lr);
5464     return entry;
5465   }
5466 
5467   void generate_string_indexof_stubs() {
5468     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5469     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5470     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5471   }
5472 
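  // Inflate 32 Latin-1 bytes (src1:src2) into 64 bytes of UTF-16 characters
  // and store them at dst (r1), advancing dst by 64: each source byte is
  // interleaved with a zero byte taken from v0, which is expected to hold
  // zero (see the "V0 = 0" note below).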
5473   void inflate_and_store_2_fp_registers(bool generatePrfm,
5474       FloatRegister src1, FloatRegister src2) {
5475     Register dst = r1;
5476     __ zip1(v1, __ T16B, src1, v0);
5477     __ zip2(v2, __ T16B, src1, v0);
5478     if (generatePrfm) {
5479       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5480     }
5481     __ zip1(v3, __ T16B, src2, v0);
5482     __ zip2(v4, __ T16B, src2, v0);
5483     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5484   }
5485 
5486   // R0 = src
5487   // R1 = dst
5488   // R2 = len
5489   // R3 = len >> 3
5490   // V0 = 0
5491   // v1 = loaded 8 bytes
5492   address generate_large_byte_array_inflate() {
5493     __ align(CodeEntryAlignment);
5494     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5495     address entry = __ pc();
5496     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5497     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5498     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5499 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also allows a single store instruction to be used.
5502     __ ldrd(v2, __ post(src, 8));
5503     __ sub(octetCounter, octetCounter, 2);
5504     __ zip1(v1, __ T16B, v1, v0);
5505     __ zip1(v2, __ T16B, v2, v0);
5506     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5507     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5508     __ subs(rscratch1, octetCounter, large_loop_threshold);
5509     __ br(__ LE, LOOP_START);
5510     __ b(LOOP_PRFM_START);
5511     __ bind(LOOP_PRFM);
5512       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5513     __ bind(LOOP_PRFM_START);
5514       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5515       __ sub(octetCounter, octetCounter, 8);
5516       __ subs(rscratch1, octetCounter, large_loop_threshold);
5517       inflate_and_store_2_fp_registers(true, v3, v4);
5518       inflate_and_store_2_fp_registers(true, v5, v6);
5519       __ br(__ GT, LOOP_PRFM);
5520       __ cmp(octetCounter, (u1)8);
5521       __ br(__ LT, DONE);
5522     __ bind(LOOP);
5523       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5524       __ bind(LOOP_START);
5525       __ sub(octetCounter, octetCounter, 8);
5526       __ cmp(octetCounter, (u1)8);
5527       inflate_and_store_2_fp_registers(false, v3, v4);
5528       inflate_and_store_2_fp_registers(false, v5, v6);
5529       __ br(__ GE, LOOP);
5530     __ bind(DONE);
5531       __ ret(lr);
5532     return entry;
5533   }
5534 
5535   /**
5536    *  Arguments:
5537    *
5538    *  Input:
5539    *  c_rarg0   - current state address
5540    *  c_rarg1   - H key address
5541    *  c_rarg2   - data address
5542    *  c_rarg3   - number of blocks
5543    *
5544    *  Output:
5545    *  Updated state at c_rarg0
5546    */
5547   address generate_ghash_processBlocks() {
5548     // Bafflingly, GCM uses little-endian for the byte order, but
5549     // big-endian for the bit order.  For example, the polynomial 1 is
5550     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5551     //
5552     // So, we must either reverse the bytes in each word and do
5553     // everything big-endian or reverse the bits in each byte and do
5554     // it little-endian.  On AArch64 it's more idiomatic to reverse
5555     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
5557     // calculation, bit-reversing the inputs and outputs.
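    // For each 128-bit block X, the loop below computes
    //   state = (state ^ X) * H   in GF(2^128),
    // reduced modulo the field polynomial x^128 + x^7 + x^2 + x + 1, which is
    // the standard GHASH recurrence.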
5558 
5559     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5560     __ align(wordSize * 2);
5561     address p = __ pc();
5562     __ emit_int64(0x87);  // The low-order bits of the field
5563                           // polynomial (i.e. p = z^7+z^2+z+1)
5564                           // repeated in the low and high parts of a
5565                           // 128-bit vector
5566     __ emit_int64(0x87);
5567 
5568     __ align(CodeEntryAlignment);
5569     address start = __ pc();
5570 
5571     Register state   = c_rarg0;
5572     Register subkeyH = c_rarg1;
5573     Register data    = c_rarg2;
5574     Register blocks  = c_rarg3;
5575 
5576     FloatRegister vzr = v30;
5577     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5578 
5579     __ ldrq(v24, p);    // The field polynomial
5580 
5581     __ ldrq(v0, Address(state));
5582     __ ldrq(v1, Address(subkeyH));
5583 
5584     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5585     __ rbit(v0, __ T16B, v0);
5586     __ rev64(v1, __ T16B, v1);
5587     __ rbit(v1, __ T16B, v1);
5588 
5589     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5590     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5591 
5592     {
5593       Label L_ghash_loop;
5594       __ bind(L_ghash_loop);
5595 
5596       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5597                                                  // reversing each byte
5598       __ rbit(v2, __ T16B, v2);
5599       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5600 
5601       // Multiply state in v2 by subkey in v1
5602       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5603                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5604                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5605       // Reduce v7:v5 by the field polynomial
5606       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5607 
5608       __ sub(blocks, blocks, 1);
5609       __ cbnz(blocks, L_ghash_loop);
5610     }
5611 
5612     // The bit-reversed result is at this point in v0
5613     __ rev64(v0, __ T16B, v0);
5614     __ rbit(v0, __ T16B, v0);
5615 
5616     __ st1(v0, __ T16B, state);
5617     __ ret(lr);
5618 
5619     return start;
5620   }
5621 
5622   address generate_ghash_processBlocks_wide() {
5623     address small = generate_ghash_processBlocks();
5624 
5625     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5626     __ align(wordSize * 2);
5627     address p = __ pc();
5628     __ emit_int64(0x87);  // The low-order bits of the field
5629                           // polynomial (i.e. p = z^7+z^2+z+1)
5630                           // repeated in the low and high parts of a
5631                           // 128-bit vector
5632     __ emit_int64(0x87);
5633 
5634     __ align(CodeEntryAlignment);
5635     address start = __ pc();
5636 
5637     Register state   = c_rarg0;
5638     Register subkeyH = c_rarg1;
5639     Register data    = c_rarg2;
5640     Register blocks  = c_rarg3;
5641 
5642     const int unroll = 4;
5643 
5644     __ cmp(blocks, (unsigned char)(unroll * 2));
5645     __ br(__ LT, small);
5646 
5647     if (unroll > 1) {
5648       // Save the callee-saved SIMD registers v8..v15 before entering the routine
5649       __ sub(sp, sp, 4 * 16);
5650       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5651       __ sub(sp, sp, 4 * 16);
5652       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5653     }
5654 
5655     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5656 
5657     if (unroll > 1) {
5658       // And restore state
5659       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5660       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5661     }
5662 
5663     __ cmp(blocks, (unsigned char)0);
5664     __ br(__ GT, small);
5665 
5666     __ ret(lr);
5667 
5668     return start;
5669   }
5670 
5671   void generate_base64_encode_simdround(Register src, Register dst,
5672         FloatRegister codec, u8 size) {
5673 
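    // Each round reads 3 * size input bytes from src and writes 4 * size
    // Base64 characters to dst, post-incrementing both pointers.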
5674     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5675     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5676     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5677 
5678     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5679 
5680     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5681 
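    // Split each 3-byte group (b0, b1, b2) into four 6-bit indices:
    //   ind0 =   b0 >> 2
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
    //   ind3 =   b2 & 0x3f
    // The shift/or sequences below compute these without explicit masks,
    // because the byte-wise shifts discard the unwanted bits.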
5682     __ ushr(ind0, arrangement, in0,  2);
5683 
5684     __ ushr(ind1, arrangement, in1,  2);
5685     __ shl(in0,   arrangement, in0,  6);
5686     __ orr(ind1,  arrangement, ind1, in0);
5687     __ ushr(ind1, arrangement, ind1, 2);
5688 
5689     __ ushr(ind2, arrangement, in2,  4);
5690     __ shl(in1,   arrangement, in1,  4);
5691     __ orr(ind2,  arrangement, in1,  ind2);
5692     __ ushr(ind2, arrangement, ind2, 2);
5693 
5694     __ shl(ind3,  arrangement, in2,  2);
5695     __ ushr(ind3, arrangement, ind3, 2);
5696 
5697     __ tbl(out0,  arrangement, codec,  4, ind0);
5698     __ tbl(out1,  arrangement, codec,  4, ind1);
5699     __ tbl(out2,  arrangement, codec,  4, ind2);
5700     __ tbl(out3,  arrangement, codec,  4, ind3);
5701 
5702     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5703   }
5704 
5705    /**
5706    *  Arguments:
5707    *
5708    *  Input:
5709    *  c_rarg0   - src_start
5710    *  c_rarg1   - src_offset
5711    *  c_rarg2   - src_length
5712    *  c_rarg3   - dest_start
5713    *  c_rarg4   - dest_offset
5714    *  c_rarg5   - isURL
5715    *
5716    */
5717   address generate_base64_encodeBlock() {
5718 
5719     static const char toBase64[64] = {
5720       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5721       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5722       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5723       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5724       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5725     };
5726 
5727     static const char toBase64URL[64] = {
5728       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5729       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5730       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5731       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5732       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5733     };
5734 
5735     __ align(CodeEntryAlignment);
5736     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5737     address start = __ pc();
5738 
5739     Register src   = c_rarg0;  // source array
5740     Register soff  = c_rarg1;  // source start offset
5741     Register send  = c_rarg2;  // source end offset
5742     Register dst   = c_rarg3;  // dest array
5743     Register doff  = c_rarg4;  // position for writing to dest array
5744     Register isURL = c_rarg5;  // Base64 or URL character set
5745 
5746     // c_rarg6 and c_rarg7 are free to use as temps
5747     Register codec  = c_rarg6;
5748     Register length = c_rarg7;
5749 
5750     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5751 
5752     __ add(src, src, soff);
5753     __ add(dst, dst, doff);
5754     __ sub(length, send, soff);
5755 
5756     // load the codec base address
5757     __ lea(codec, ExternalAddress((address) toBase64));
5758     __ cbz(isURL, ProcessData);
5759     __ lea(codec, ExternalAddress((address) toBase64URL));
5760 
5761     __ BIND(ProcessData);
5762 
5763     // input too short to set up a SIMD loop; fall back to the byte-wise path below
5764     __ cmp(length, (u1)24);
5765     __ br(Assembler::LT, Process3B);
5766 
5767     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5768 
5769     __ BIND(Process48B);
5770     __ cmp(length, (u1)48);
5771     __ br(Assembler::LT, Process24B);
5772     generate_base64_encode_simdround(src, dst, v0, 16);
5773     __ sub(length, length, 48);
5774     __ b(Process48B);
5775 
5776     __ BIND(Process24B);
5777     __ cmp(length, (u1)24);
5778     __ br(Assembler::LT, SIMDExit);
5779     generate_base64_encode_simdround(src, dst, v0, 8);
5780     __ sub(length, length, 24);
5781 
5782     __ BIND(SIMDExit);
5783     __ cbz(length, Exit);
5784 
5785     __ BIND(Process3B);
5786     //  3 src bytes, 24 bits
5787     __ ldrb(r10, __ post(src, 1));
5788     __ ldrb(r11, __ post(src, 1));
5789     __ ldrb(r12, __ post(src, 1));
5790     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5791     __ orrw(r12, r12, r11, Assembler::LSL, 8);
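    // r12 now holds the packed 24-bit group (b0 << 16) | (b1 << 8) | b2;
    // extract its four 6-bit fields, most significant first.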
5792     // codec index
5793     __ ubfmw(r15, r12, 18, 23);
5794     __ ubfmw(r14, r12, 12, 17);
5795     __ ubfmw(r13, r12, 6,  11);
5796     __ andw(r12,  r12, 63);
5797     // get the code based on the codec
5798     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5799     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5800     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5801     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5802     __ strb(r15, __ post(dst, 1));
5803     __ strb(r14, __ post(dst, 1));
5804     __ strb(r13, __ post(dst, 1));
5805     __ strb(r12, __ post(dst, 1));
5806     __ sub(length, length, 3);
5807     __ cbnz(length, Process3B);
5808 
5809     __ BIND(Exit);
5810     __ ret(lr);
5811 
5812     return start;
5813   }
5814 
5815   void generate_base64_decode_simdround(Register src, Register dst,
5816         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5817 
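    // Each round reads 4 * size Base64 characters from src and normally
    // writes 3 * size decoded bytes to dst.  If an illegal character is
    // found, the bytes decoded before it are stored and we branch to Exit.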
5818     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5819     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5820 
5821     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5822     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5823 
5824     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5825 
5826     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5827 
5828     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5829 
5830     // we need an unsigned saturating subtract to make sure all input values
5831     // in the range [0, 63] yield 0 in the higher-half lookup
5832     __ uqsubv(decH0, __ T16B, in0, v27);
5833     __ uqsubv(decH1, __ T16B, in1, v27);
5834     __ uqsubv(decH2, __ T16B, in2, v27);
5835     __ uqsubv(decH3, __ T16B, in3, v27);
5836 
5837     // lower half lookup
5838     __ tbl(decL0, arrangement, codecL, 4, in0);
5839     __ tbl(decL1, arrangement, codecL, 4, in1);
5840     __ tbl(decL2, arrangement, codecL, 4, in2);
5841     __ tbl(decL3, arrangement, codecL, 4, in3);
5842 
5843     // higher half lookup
5844     __ tbx(decH0, arrangement, codecH, 4, decH0);
5845     __ tbx(decH1, arrangement, codecH, 4, decH1);
5846     __ tbx(decH2, arrangement, codecH, 4, decH2);
5847     __ tbx(decH3, arrangement, codecH, 4, decH3);
5848 
5849     // combine lower and higher
5850     __ orr(decL0, arrangement, decL0, decH0);
5851     __ orr(decL1, arrangement, decL1, decH1);
5852     __ orr(decL2, arrangement, decL2, decH2);
5853     __ orr(decL3, arrangement, decL3, decH3);
5854 
5855     // check illegal inputs, value larger than 63 (maximum of 6 bits)
5856     __ cmhi(decH0, arrangement, decL0, v27);
5857     __ cmhi(decH1, arrangement, decL1, v27);
5858     __ cmhi(decH2, arrangement, decL2, v27);
5859     __ cmhi(decH3, arrangement, decL3, v27);
5860     __ orr(in0, arrangement, decH0, decH1);
5861     __ orr(in1, arrangement, decH2, decH3);
5862     __ orr(in2, arrangement, in0,   in1);
5863     __ umaxv(in3, arrangement, in2);
5864     __ umov(rscratch2, in3, __ B, 0);
5865 
5866     // get the data to output
5867     __ shl(out0,  arrangement, decL0, 2);
5868     __ ushr(out1, arrangement, decL1, 4);
5869     __ orr(out0,  arrangement, out0,  out1);
5870     __ shl(out1,  arrangement, decL1, 4);
5871     __ ushr(out2, arrangement, decL2, 2);
5872     __ orr(out1,  arrangement, out1,  out2);
5873     __ shl(out2,  arrangement, decL2, 6);
5874     __ orr(out2,  arrangement, out2,  decL3);
5875 
5876     __ cbz(rscratch2, NoIllegalData);
5877 
5878     // handle illegal input
5879     __ umov(r10, in2, __ D, 0);
5880     if (size == 16) {
5881       __ cbnz(r10, ErrorInLowerHalf);
5882 
5883       // illegal input is in higher half, store the lower half now.
5884       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5885 
5886       __ umov(r10, in2,  __ D, 1);
5887       __ umov(r11, out0, __ D, 1);
5888       __ umov(r12, out1, __ D, 1);
5889       __ umov(r13, out2, __ D, 1);
5890       __ b(StoreLegalData);
5891 
5892       __ BIND(ErrorInLowerHalf);
5893     }
5894     __ umov(r11, out0, __ D, 0);
5895     __ umov(r12, out1, __ D, 0);
5896     __ umov(r13, out2, __ D, 0);
5897 
5898     __ BIND(StoreLegalData);
5899     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5900     __ strb(r11, __ post(dst, 1));
5901     __ strb(r12, __ post(dst, 1));
5902     __ strb(r13, __ post(dst, 1));
5903     __ lsr(r10, r10, 8);
5904     __ lsr(r11, r11, 8);
5905     __ lsr(r12, r12, 8);
5906     __ lsr(r13, r13, 8);
5907     __ b(StoreLegalData);
5908 
5909     __ BIND(NoIllegalData);
5910     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5911   }
5912 
5913 
5914    /**
5915    *  Arguments:
5916    *
5917    *  Input:
5918    *  c_rarg0   - src_start
5919    *  c_rarg1   - src_offset
5920    *  c_rarg2   - src_length
5921    *  c_rarg3   - dest_start
5922    *  c_rarg4   - dest_offset
5923    *  c_rarg5   - isURL
5924    *  c_rarg6   - isMIME
5925    *
5926    */
5927   address generate_base64_decodeBlock() {
5928 
5929     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
5930     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
5931     // titled "Base64 decoding".
5932 
5933     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
5934     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
5935     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
5936     static const uint8_t fromBase64ForNoSIMD[256] = {
5937       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5938       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5940        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5941       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5942        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5943       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5944        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5945       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5946       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5947       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5948       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5949       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5950       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5951       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5952       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5953     };
5954 
5955     static const uint8_t fromBase64URLForNoSIMD[256] = {
5956       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5957       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5958       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5959        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5960       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5961        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5962       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5963        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5964       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5965       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5966       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5967       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5968       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5969       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5970       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5971       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5972     };
5973 
5974     // A legal Base64 code value is in the range [0, 127].  We need two table
5975     // lookups with tbl/tbx and combine the results to get the decoded data.
5976     // The 1st table vector lookup uses tbl, which sets out-of-range indices to 0
5977     // in the destination.  The 2nd table vector lookup uses tbx, which leaves
5978     // out-of-range indices unchanged in the destination.  Input [64, 126] is
5979     // mapped to index [65, 127] in the second lookup.  The value at index 64 is 0,
5980     // so inputs already decoded by the 1st lookup are left unchanged when the
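    // two results are ORed together.
    //
    // For example, '0' (0x30): the 1st lookup yields table[48] = 52 and the
    // saturating subtract yields index 0, whose 2nd lookup yields table[64] = 0,
    // so the combined value is 52.  For 'z' (0x7a): the 1st lookup yields 0
    // (out of range), the subtract yields 122 - 63 = 59, and the 2nd lookup
    // yields table[64 + 59] = 51.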
5981     static const uint8_t fromBase64ForSIMD[128] = {
5982       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5983       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5984       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5985        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5986         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5987        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5988       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5989        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5990     };
5991 
5992     static const uint8_t fromBase64URLForSIMD[128] = {
5993       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5994       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5995       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5996        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5997         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5998        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5999        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6000        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6001     };
6002 
6003     __ align(CodeEntryAlignment);
6004     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6005     address start = __ pc();
6006 
6007     Register src    = c_rarg0;  // source array
6008     Register soff   = c_rarg1;  // source start offset
6009     Register send   = c_rarg2;  // source end offset
6010     Register dst    = c_rarg3;  // dest array
6011     Register doff   = c_rarg4;  // position for writing to dest array
6012     Register isURL  = c_rarg5;  // Base64 or URL character set
6013     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6014 
6015     Register length = send;    // reuse send as length of source data to process
6016 
6017     Register simd_codec   = c_rarg6;
6018     Register nosimd_codec = c_rarg7;
6019 
6020     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6021 
6022     __ enter();
6023 
6024     __ add(src, src, soff);
6025     __ add(dst, dst, doff);
6026 
6027     __ mov(doff, dst);
6028 
6029     __ sub(length, send, soff);
6030     __ bfm(length, zr, 0, 1); // round length down to a multiple of 4 (clear bits 1:0)
6031 
6032     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6033     __ cbz(isURL, ProcessData);
6034     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6035 
6036     __ BIND(ProcessData);
6037     __ mov(rscratch1, length);
6038     __ cmp(length, (u1)144); // 144 = 80 + 64
6039     __ br(Assembler::LT, Process4B);
6040 
6041     // In the MIME case, the line length cannot be more than 76
6042     // bytes (see RFC 2045). This is too short a block for SIMD
6043     // to be worthwhile, so we use non-SIMD here.
6044     __ movw(rscratch1, 79); // the byte-wise loop below decodes the first 80 bytes
6045 
6046     __ BIND(Process4B);
6047     __ ldrw(r14, __ post(src, 4));
6048     __ ubfxw(r10, r14, 0,  8);
6049     __ ubfxw(r11, r14, 8,  8);
6050     __ ubfxw(r12, r14, 16, 8);
6051     __ ubfxw(r13, r14, 24, 8);
6052     // look up the decoded values
6053     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6054     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6055     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6056     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6057     // error detection, 255u indicates an illegal input
6058     __ orrw(r14, r10, r11);
6059     __ orrw(r15, r12, r13);
6060     __ orrw(r14, r14, r15);
6061     __ tbnz(r14, 7, Exit);
6062     // recover the data
6063     __ lslw(r14, r10, 10);
6064     __ bfiw(r14, r11, 4, 6);
6065     __ bfmw(r14, r12, 2, 5);
6066     __ rev16w(r14, r14);
6067     __ bfiw(r13, r12, 6, 2);
6068     __ strh(r14, __ post(dst, 2));
6069     __ strb(r13, __ post(dst, 1));
6070     // non-simd loop
6071     __ subsw(rscratch1, rscratch1, 4);
6072     __ br(Assembler::GT, Process4B);
6073 
6074     // if we exit the loop after the 80-byte pre-pass above, rscratch1 == -1;
6075     // otherwise (the whole input was decoded byte-wise), rscratch1 == 0.
6076     __ cbzw(rscratch1, Exit);
6077     __ sub(length, length, 80); // the byte-wise pre-pass consumed 80 bytes
6078 
6079     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6080     __ cbz(isURL, SIMDEnter);
6081     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6082 
6083     __ BIND(SIMDEnter);
6084     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6085     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6086     __ mov(rscratch1, 63);
6087     __ dup(v27, __ T16B, rscratch1); // v27 = 63 in every byte lane, for range checks
6088 
6089     __ BIND(Process64B);
6090     __ cmp(length, (u1)64);
6091     __ br(Assembler::LT, Process32B);
6092     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6093     __ sub(length, length, 64);
6094     __ b(Process64B);
6095 
6096     __ BIND(Process32B);
6097     __ cmp(length, (u1)32);
6098     __ br(Assembler::LT, SIMDExit);
6099     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6100     __ sub(length, length, 32);
6101     __ b(Process32B);
6102 
6103     __ BIND(SIMDExit);
6104     __ cbz(length, Exit);
6105     __ movw(rscratch1, length);
6106     __ b(Process4B);
6107 
6108     __ BIND(Exit);
6109     __ sub(c_rarg0, dst, doff);
6110 
6111     __ leave();
6112     __ ret(lr);
6113 
6114     return start;
6115   }
6116 
6117 #ifdef LINUX
6118 
6119   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6120   //
6121   // If LSE is in use, generate LSE versions of all the stubs. The
6122   // non-LSE versions are in atomic_aarch64.S.
6123 
6124   // class AtomicStubMark records the entry point of a stub and the
6125   // stub pointer which will point to it. The stub pointer is set to
6126   // the entry point when ~AtomicStubMark() is called, which must be
6127   // after ICache::invalidate_range. This ensures safe publication of
6128   // the generated code.
6129   class AtomicStubMark {
6130     address _entry_point;
6131     aarch64_atomic_stub_t *_stub;
6132     MacroAssembler *_masm;
6133   public:
6134     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6135       _masm = masm;
6136       __ align(32);
6137       _entry_point = __ pc();
6138       _stub = stub;
6139     }
6140     ~AtomicStubMark() {
6141       *_stub = (aarch64_atomic_stub_t)_entry_point;
6142     }
6143   };
6144 
6145   // NB: For memory_order_conservative we need a trailing membar after
6146   // LSE atomic operations but not a leading membar.
6147   //
6148   // We don't need a leading membar because a clause in the Arm ARM
6149   // says:
6150   //
6151   //   Barrier-ordered-before
6152   //
6153   //   Barrier instructions order prior Memory effects before subsequent
6154   //   Memory effects generated by the same Observer. A read or a write
6155   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6156   //   Observer if and only if RW1 appears in program order before RW2
6157   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6158   //   instruction with both Acquire and Release semantics.
6159   //
6160   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6161   // and Release semantics, therefore we don't need a leading
6162   // barrier. However, there is no corresponding Barrier-ordered-after
6163   // relationship, therefore we need a trailing membar to prevent a
6164   // later store or load from being reordered with the store in an
6165   // atomic instruction.
6166   //
6167   // This was checked by using the herd7 consistency model simulator
6168   // (http://diy.inria.fr/) with this test case:
6169   //
6170   // AArch64 LseCas
6171   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6172   // P0 | P1;
6173   // LDR W4, [X2] | MOV W3, #0;
6174   // DMB LD       | MOV W4, #1;
6175   // LDR W3, [X1] | CASAL W3, W4, [X1];
6176   //              | DMB ISH;
6177   //              | STR W4, [X2];
6178   // exists
6179   // (0:X3=0 /\ 0:X4=1)
6180   //
6181   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6182   // with the store to x in P1. Without the DMB in P1 this may happen.
6183   //
6184   // At the time of writing we don't know of any AArch64 hardware that
6185   // reorders stores in this way, but the Reference Manual permits it.
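  //
  // As an illustration (a rough sketch, not the assembler's exact output),
  // the conservative word-sized CAS stub generated below amounts to:
  //
  //   mov    x3, x1          // prev = compare_val
  //   casal  w3, w2, [x0]    // compare-and-swap with Acquire and Release
  //   dmb    ish             // trailing barrier for memory_order_conservative
  //   mov    w0, w3          // return the previous value
  //   ret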
6186 
6187   void gen_cas_entry(Assembler::operand_size size,
6188                      atomic_memory_order order) {
6189     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6190       exchange_val = c_rarg2;
6191     bool acquire, release;
6192     switch (order) {
6193       case memory_order_relaxed:
6194         acquire = false;
6195         release = false;
6196         break;
6197       case memory_order_release:
6198         acquire = false;
6199         release = true;
6200         break;
6201       default:
6202         acquire = true;
6203         release = true;
6204         break;
6205     }
6206     __ mov(prev, compare_val);
6207     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6208     if (order == memory_order_conservative) {
6209       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6210     }
6211     if (size == Assembler::xword) {
6212       __ mov(r0, prev);
6213     } else {
6214       __ movw(r0, prev);
6215     }
6216     __ ret(lr);
6217   }
6218 
6219   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6220     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6221     // If not relaxed, then default to conservative.  Relaxed is the only
6222     // case we use enough to be worth specializing.
6223     if (order == memory_order_relaxed) {
6224       __ ldadd(size, incr, prev, addr);
6225     } else {
6226       __ ldaddal(size, incr, prev, addr);
6227       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6228     }
6229     if (size == Assembler::xword) {
6230       __ mov(r0, prev);
6231     } else {
6232       __ movw(r0, prev);
6233     }
6234     __ ret(lr);
6235   }
6236 
6237   void gen_swpal_entry(Assembler::operand_size size) {
6238     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6239     __ swpal(size, incr, prev, addr);
6240     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6241     if (size == Assembler::xword) {
6242       __ mov(r0, prev);
6243     } else {
6244       __ movw(r0, prev);
6245     }
6246     __ ret(lr);
6247   }
6248 
6249   void generate_atomic_entry_points() {
6250     if (! UseLSE) {
6251       return;
6252     }
6253 
6254     __ align(CodeEntryAlignment);
6255     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6256     address first_entry = __ pc();
6257 
6258     // ADD, memory_order_conservative
6259     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6260     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6261     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6262     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6263 
6264     // ADD, memory_order_relaxed
6265     AtomicStubMark mark_fetch_add_4_relaxed
6266       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6267     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6268     AtomicStubMark mark_fetch_add_8_relaxed
6269       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6270     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6271 
6272     // XCHG, memory_order_conservative
6273     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6274     gen_swpal_entry(Assembler::word);
6275     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6276     gen_swpal_entry(Assembler::xword);
6277 
6278     // CAS, memory_order_conservative
6279     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6280     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6281     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6282     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6283     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6284     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6285 
6286     // CAS, memory_order_relaxed
6287     AtomicStubMark mark_cmpxchg_1_relaxed
6288       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6289     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6290     AtomicStubMark mark_cmpxchg_4_relaxed
6291       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6292     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6293     AtomicStubMark mark_cmpxchg_8_relaxed
6294       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6295     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6296 
6297     AtomicStubMark mark_cmpxchg_4_release
6298       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6299     gen_cas_entry(MacroAssembler::word, memory_order_release);
6300     AtomicStubMark mark_cmpxchg_8_release
6301       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6302     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6303 
6304     AtomicStubMark mark_cmpxchg_4_seq_cst
6305       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6306     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6307     AtomicStubMark mark_cmpxchg_8_seq_cst
6308       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6309     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6310 
6311     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6312   }
6313 #endif // LINUX
6314 
6315   RuntimeStub* generate_cont_doYield() {
6316     const char *name = "cont_doYield";
6317 
6318     enum layout {
6319       rfp_off1,
6320       rfp_off2,
6321       lr_off,
6322       lr_off2,
6323       framesize // inclusive of return address
6324     };
6325     // assert(is_even(framesize/2), "sp not 16-byte aligned");
6326     
6327     int insts_size = 512;
6328     int locs_size  = 64;
6329     CodeBuffer code(name, insts_size, locs_size);
6330     OopMapSet* oop_maps  = new OopMapSet();
6331     MacroAssembler* masm = new MacroAssembler(&code);
6332     MacroAssembler* _masm = masm;
6333 
6334     address start = __ pc();
6335 
6336     __ enter();
6337 
6338     __ mov(c_rarg1, sp);
6339 
6340     int frame_complete = __ pc() - start;
6341     address the_pc = __ pc();
6342 
6343     __ post_call_nop(); // must come immediately after the pc value pushed into the frame info; we use this nop for fast CodeBlob lookup
6344 
6345     __ mov(c_rarg0, rthread);
6346     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6347 
6348     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::freeze), 2);
6349       
6350     __ reset_last_Java_frame(true);
6351 
6352     Label pinned;
6353 
6354     __ cbnz(r0, pinned);
6355 
6356     __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6357     __ mov(sp, rscratch1);
6358     continuation_enter_cleanup(masm);
6359 
6360     __ bind(pinned); // pinned -- return to caller
6361     
6362     __ leave();
6363     __ ret(lr);
6364 
6365     OopMap* map = new OopMap(framesize, 1);
6366     // map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
6367     oop_maps->add_gc_map(the_pc - start, map);
6368 
6369     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6370     RuntimeStub::new_runtime_stub(name,
6371                                   &code,
6372                                   frame_complete,
6373                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6374                                   oop_maps, false);
6375     return stub;
6376   }
6377 
6378   address generate_cont_jump_from_safepoint() {
6379     __ align(CodeEntryAlignment);
6380     StubCodeMark mark(this, "StubRoutines","Continuation jump from safepoint");
6381 
6382     address start = __ pc();
6383 
6384 #ifdef ASSERT
6385     { // verify that threads correspond
6386       Label L;
6387       __ get_thread(rscratch1);
6388       __ cmp(rthread, rscratch1);
6389       __ br(Assembler::EQ, L);
6390       __ stop("StubRoutines::cont_jump_from_safepoint: threads must correspond");
6391       __ BIND(L);
6392     }
6393 #endif
6394 
6395     __ reset_last_Java_frame(true); // false would be fine, too, I guess
6396     __ reinit_heapbase();
6397     
6398     __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6399     __ mov(sp, rscratch1);
6400     continuation_enter_cleanup(_masm);
6401     __ leave();
6402     __ ret(lr);
6403 
6404     return start;
6405   }
6406 
6407   address generate_cont_thaw(bool return_barrier, bool exception) {
6408     assert (return_barrier || !exception, "must be");
6409 
6410     address start = __ pc();
6411 
6412     if (return_barrier) {
6413       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6414       __ mov(sp, rscratch1);
6415     }
6416     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6417 
6418     if (return_barrier) {
6419       // preserve possible return value from a method returning to the return barrier
6420       __ fmovd(rscratch1, v0);
6421       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6422     }
6423 
6424     __ movw(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
6425     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6426     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6427 
6428     if (return_barrier) {
6429       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6430       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6431       __ fmovd(v0, rscratch1);
6432     }
6433     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6434 
6435 
6436     Label thaw_success;
6437     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6438     __ cbnz(rscratch2, thaw_success);
6439     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6440     __ br(rscratch1);
6441     __ bind(thaw_success);
6442     
6443     // make room for the thawed frames
6444     __ sub(rscratch1, sp, rscratch2);
6445     __ andr(rscratch1, rscratch1, -16); // align
6446     __ mov(sp, rscratch1);
6447     
6448     if (return_barrier) {
6449       // save original return value -- again
6450       __ fmovd(rscratch1, v0);
6451       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6452     }
6453 
6454     __ movw(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
6455     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), rthread, c_rarg1);
6456     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6457 
6458     if (return_barrier) {
6459       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6460       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6461       __ fmovd(v0, rscratch1);
6462     } else {
6463       __ mov(r0, zr); // return 0 (success) from doYield
6464     }
6465 
6466     // we're now on the yield frame (which is at a higher address than the current sp, because sp has been moved down)
6467     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6468     __ mov(rfp, sp);
6469 
6470     if (exception) {
6471       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6472       __ verify_oop(r0);
6473       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6474 
6475       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6476 
6477       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6478       // __ reinitialize_ptrue();
6479 
6480       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6481 
6482       __ mov(r1, r0); // the exception handler
6483       __ mov(r0, r19); // restore return value containing the exception oop
6484       __ verify_oop(r0);
6485 
6486       __ leave();
6487       __ mov(r3, lr);
6488       __ br(r1); // the exception handler
6489     }
6490 
6491     // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6492     __ leave();
6493     __ ret(lr);
6494 
6495     return start;
6496   }
6497 
6498   address generate_cont_thaw() {
6499     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6500     address start = __ pc();
6501     generate_cont_thaw(false, false);
6502     return start;
6503   }
6504 
6505   address generate_cont_returnBarrier() {
6506     // TODO: will probably need multiple return barriers depending on return type
6507     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6508     address start = __ pc();
6509 
6510     generate_cont_thaw(true, false);
6511 
6512     return start;
6513   }
6514 
6515   address generate_cont_returnBarrier_exception() {
6516     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6517     address start = __ pc();
6518 
6519     generate_cont_thaw(true, true);
6520 
6521     return start;
6522   }
6523 
6524   address generate_cont_interpreter_forced_preempt_return() {
6525       StubCodeMark mark(this, "StubRoutines", "cont interpreter forced preempt return");
6526       address start = __ pc();
6527 
6528       // This is necessary for forced yields: the return address is captured in a call_VM, which skips the restoration of rbcp and the locals
6529 
6530       assert_asm(_masm, __ cmp(sp, rfp), Assembler::EQ, "sp != fp"); // __ mov(rfp, sp);
6531       __ leave(); // we're now on the last thawed frame
6532 
6533       __ ldr(rbcp,    Address(rfp, frame::interpreter_frame_bcp_offset    * wordSize)); // InterpreterMacroAssembler::restore_bcp()
6534       __ ldr(rlocals, Address(rfp, frame::interpreter_frame_locals_offset * wordSize)); // InterpreterMacroAssembler::restore_locals()
6535       __ ldr(rcpool,  Address(rfp, frame::interpreter_frame_cache_offset  * wordSize)); // InterpreterMacroAssembler::restore_constant_pool_cache()
6536       __ ldr(rmethod, Address(rfp, frame::interpreter_frame_method_offset * wordSize)); // InterpreterMacroAssembler::get_method(rmethod) -- might not be necessary
6537       // __ reinit_heapbase();
6538 
6539       // Restore the stack bottom in case i2c adjusted the stack, and clear that slot as a marker that esp is now tos until the next Java call
6540       __ ldr(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
6541       __ str(zr,  Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
6542 
6543       __ ret(lr);
6544 
6545       return start;
6546     }
6547 
6548 #if INCLUDE_JFR
6549 
6550   static void jfr_set_last_java_frame(MacroAssembler* _masm, Register thread) {
6551     Register last_java_pc = c_rarg0;
6552     Register last_java_sp = c_rarg2;
6553     __ ldr(last_java_pc, Address(sp, 0));
6554     __ lea(last_java_sp, Address(sp, wordSize));
6555     // __ vzeroupper();
6556     Address anchor_java_pc(thread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
6557     __ str(last_java_pc, anchor_java_pc);
6558     __ str(last_java_sp, Address(thread, JavaThread::last_Java_sp_offset()));
6559   }
6560 
6561   static void jfr_prologue(MacroAssembler* _masm, Register thread) {
6562     jfr_set_last_java_frame(_masm, thread);
6563     __ mov(c_rarg0, rthread);
6564   }
6565 
6566   // The handle is dereferenced here using the correct load constructs.
6567   static void jfr_epilogue(MacroAssembler* _masm, Register thread) {
6568     __ reset_last_Java_frame(false);
6569     Label null_jobject;
6570     __ cbz(r0, null_jobject);
6571     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6572     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6573     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rthread);
6574     __ bind(null_jobject);
6575   }
6576 
6577   // For c2: c_rarg0 is junk, c_rarg1 is the thread id. Calls into the runtime to write a checkpoint.
6578   // The runtime returns a jobject handle to the event writer. The handle is dereferenced and the return value
6579   // is the event writer oop.
6580   address generate_jfr_write_checkpoint() {
6581     StubCodeMark mark(this, "jfr_write_checkpoint", "JFR C2 support for Virtual Threads");
6582     address start = __ pc();
6583 
6584     __ enter();
6585     jfr_prologue(_masm, rthread);
6586     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_WRITE_CHECKPOINT_FUNCTION), 2);
6587     jfr_epilogue(_masm, rthread);
6588     __ leave();
6589     __ ret(lr);
6590 
6591     return start;
6592   }
6593 
6594   // For c1: call the corresponding runtime routine, it returns a jobject handle to the event writer.
6595   // The handle is dereferenced and the return value is the event writer oop.
6596   address generate_jfr_get_event_writer() {
6597     StubCodeMark mark(this, "jfr_get_event_writer", "JFR C1 support for Virtual Threads");
6598     address start = __ pc();
6599 
6600     __ enter();
6601     jfr_prologue(_masm, rthread);
6602     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_GET_EVENT_WRITER_FUNCTION), 1);
6603     jfr_epilogue(_masm, rthread);
6604     __ leave();
6605     __ ret(lr);
6606 
6607     return start;
6608   }
6609 
6610 #endif // INCLUDE_JFR
6611 
6612   // Continuation point for throwing of implicit exceptions that are
6613   // not handled in the current activation. Fabricates an exception
6614   // oop and initiates normal exception dispatching in this
6615   // frame. Since we need to preserve callee-saved values (currently
6616   // only for C2, but done for C1 as well) we need a callee-saved oop
6617   // map and therefore have to make these stubs into RuntimeStubs
6618   // rather than BufferBlobs.  If the compiler needs all registers to
6619   // be preserved between the fault point and the exception handler
6620   // then it must assume responsibility for that in
6621   // AbstractCompiler::continuation_for_implicit_null_exception or
6622   // continuation_for_implicit_division_by_zero_exception. All other
6623   // implicit exceptions (e.g., NullPointerException or
6624   // AbstractMethodError on entry) are either at call sites or
6625   // otherwise assume that stack unwinding will be initiated, so
6626   // caller saved registers were assumed volatile in the compiler.
6627 
6628 #undef __
6629 #define __ masm->
6630 
6631   address generate_throw_exception(const char* name,
6632                                    address runtime_entry,
6633                                    Register arg1 = noreg,
6634                                    Register arg2 = noreg) {
6635     // Information about frame layout at time of blocking runtime call.
6636     // Note that we only have to preserve callee-saved registers since
6637     // the compilers are responsible for supplying a continuation point
6638     // if they expect all registers to be preserved.
6639     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6640     enum layout {
6641       rfp_off = 0,
6642       rfp_off2,
6643       return_off,
6644       return_off2,
6645       framesize // inclusive of return address
6646     };
6647 
6648     int insts_size = 512;
6649     int locs_size  = 64;
6650 
6651     CodeBuffer code(name, insts_size, locs_size);
6652     OopMapSet* oop_maps  = new OopMapSet();
6653     MacroAssembler* masm = new MacroAssembler(&code);
6654 
6655     address start = __ pc();
6656 
6657     // This is an inlined and slightly modified version of call_VM
6658     // which has the ability to fetch the return PC out of
6659     // thread-local storage and also sets up last_Java_sp slightly
6660     // differently than the real call_VM
6661 
6662     __ enter(); // Save FP and LR before call
6663 
6664     assert(is_even(framesize/2), "sp not 16-byte aligned");
6665 
6666     // lr and fp are already in place
6667     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6668 
6669     int frame_complete = __ pc() - start;
6670 
6671     // Set up last_Java_sp and last_Java_fp
6672     address the_pc = __ pc();
6673     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6674 
6675     // Call runtime
6676     if (arg1 != noreg) {
6677       assert(arg2 != c_rarg1, "clobbered");
6678       __ mov(c_rarg1, arg1);
6679     }
6680     if (arg2 != noreg) {
6681       __ mov(c_rarg2, arg2);
6682     }
6683     __ mov(c_rarg0, rthread);
6684     BLOCK_COMMENT("call runtime_entry");
6685     __ mov(rscratch1, runtime_entry);
6686     __ blr(rscratch1);
6687 
6688     // Generate oop map
6689     OopMap* map = new OopMap(framesize, 0);
6690 
6691     oop_maps->add_gc_map(the_pc - start, map);
6692 
6693     __ reset_last_Java_frame(true);
6694 
6695     // Reinitialize the ptrue predicate register, in case the external runtime
6696     // call clobbers ptrue reg, as we may return to SVE compiled code.
6697     __ reinitialize_ptrue();
6698 
6699     __ leave();
6700 
6701     // check for pending exceptions
6702 #ifdef ASSERT
6703     Label L;
6704     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6705     __ cbnz(rscratch1, L);
6706     __ should_not_reach_here();
6707     __ bind(L);
6708 #endif // ASSERT
6709     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6710 
6711 
6712     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6713     RuntimeStub* stub =
6714       RuntimeStub::new_runtime_stub(name,
6715                                     &code,
6716                                     frame_complete,
6717                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6718                                     oop_maps, false);
6719     return stub->entry_point();
6720   }
6721 
6722   class MontgomeryMultiplyGenerator : public MacroAssembler {
6723 
6724     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6725       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6726 
6727     RegSet _toSave;
6728     bool _squaring;
6729 
6730   public:
6731     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6732       : MacroAssembler(as->code()), _squaring(squaring) {
6733 
6734       // Register allocation
6735 
6736       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6737       Pa_base = *regs;       // Argument registers
6738       if (squaring)
6739         Pb_base = Pa_base;
6740       else
6741         Pb_base = *++regs;
6742       Pn_base = *++regs;
6743       Rlen= *++regs;
6744       inv = *++regs;
6745       Pm_base = *++regs;
6746 
6747                           // Working registers:
6748       Ra =  *++regs;        // The current digit of a, b, n, and m.
6749       Rb =  *++regs;
6750       Rm =  *++regs;
6751       Rn =  *++regs;
6752 
6753       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6754       Pb =  *++regs;
6755       Pm =  *++regs;
6756       Pn =  *++regs;
6757 
6758       t0 =  *++regs;        // Three registers which form a
6759       t1 =  *++regs;        // triple-precision accumulator.
6760       t2 =  *++regs;
6761 
6762       Ri =  *++regs;        // Inner and outer loop indexes.
6763       Rj =  *++regs;
6764 
6765       Rhi_ab = *++regs;     // Product registers: low and high parts
6766       Rlo_ab = *++regs;     // of a*b and m*n.
6767       Rhi_mn = *++regs;
6768       Rlo_mn = *++regs;
6769 
6770       // r19 and up are callee-saved.
6771       _toSave = RegSet::range(r19, *regs) + Pm_base;
6772     }
6773 
6774   private:
6775     void save_regs() {
6776       push(_toSave, sp);
6777     }
6778 
6779     void restore_regs() {
6780       pop(_toSave, sp);
6781     }
6782 
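    // Emit the block body twice per loop iteration.  If the count is odd we
    // first branch into the middle of the unrolled body, so exactly 'count'
    // copies of the block are executed.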
6783     template <typename T>
6784     void unroll_2(Register count, T block) {
6785       Label loop, end, odd;
6786       tbnz(count, 0, odd);
6787       cbz(count, end);
6788       align(16);
6789       bind(loop);
6790       (this->*block)();
6791       bind(odd);
6792       (this->*block)();
6793       subs(count, count, 2);
6794       br(Assembler::GT, loop);
6795       bind(end);
6796     }
6797 
6798     template <typename T>
6799     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6800       Label loop, end, odd;
6801       tbnz(count, 0, odd);
6802       cbz(count, end);
6803       align(16);
6804       bind(loop);
6805       (this->*block)(d, s, tmp);
6806       bind(odd);
6807       (this->*block)(d, s, tmp);
6808       subs(count, count, 2);
6809       br(Assembler::GT, loop);
6810       bind(end);
6811     }
6812 
6813     void pre1(RegisterOrConstant i) {
6814       block_comment("pre1");
6815       // Pa = Pa_base;
6816       // Pb = Pb_base + i;
6817       // Pm = Pm_base;
6818       // Pn = Pn_base + i;
6819       // Ra = *Pa;
6820       // Rb = *Pb;
6821       // Rm = *Pm;
6822       // Rn = *Pn;
6823       ldr(Ra, Address(Pa_base));
6824       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6825       ldr(Rm, Address(Pm_base));
6826       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6827       lea(Pa, Address(Pa_base));
6828       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6829       lea(Pm, Address(Pm_base));
6830       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6831 
6832       // Zero the m*n result.
6833       mov(Rhi_mn, zr);
6834       mov(Rlo_mn, zr);
6835     }
6836 
6837     // The core multiply-accumulate step of a Montgomery
6838     // multiplication.  The idea is to schedule operations as a
6839     // pipeline so that instructions with long latencies (loads and
6840     // multiplies) have time to complete before their results are
6841     // used.  This most benefits in-order implementations of the
6842     // architecture but out-of-order ones also benefit.
6843     void step() {
6844       block_comment("step");
6845       // MACC(Ra, Rb, t0, t1, t2);
6846       // Ra = *++Pa;
6847       // Rb = *--Pb;
6848       umulh(Rhi_ab, Ra, Rb);
6849       mul(Rlo_ab, Ra, Rb);
6850       ldr(Ra, pre(Pa, wordSize));
6851       ldr(Rb, pre(Pb, -wordSize));
6852       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6853                                        // previous iteration.
6854       // MACC(Rm, Rn, t0, t1, t2);
6855       // Rm = *++Pm;
6856       // Rn = *--Pn;
6857       umulh(Rhi_mn, Rm, Rn);
6858       mul(Rlo_mn, Rm, Rn);
6859       ldr(Rm, pre(Pm, wordSize));
6860       ldr(Rn, pre(Pn, -wordSize));
6861       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6862     }
6863 
6864     void post1() {
6865       block_comment("post1");
6866 
6867       // MACC(Ra, Rb, t0, t1, t2);
6868       // Ra = *++Pa;
6869       // Rb = *--Pb;
6870       umulh(Rhi_ab, Ra, Rb);
6871       mul(Rlo_ab, Ra, Rb);
6872       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6873       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6874 
6875       // *Pm = Rm = t0 * inv;
6876       mul(Rm, t0, inv);
6877       str(Rm, Address(Pm));
6878 
6879       // MACC(Rm, Rn, t0, t1, t2);
6880       // t0 = t1; t1 = t2; t2 = 0;
6881       umulh(Rhi_mn, Rm, Rn);
6882 
6883 #ifndef PRODUCT
6884       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6885       {
6886         mul(Rlo_mn, Rm, Rn);
6887         add(Rlo_mn, t0, Rlo_mn);
6888         Label ok;
6889         cbz(Rlo_mn, ok); {
6890           stop("broken Montgomery multiply");
6891         } bind(ok);
6892       }
6893 #endif
6894       // We have very carefully set things up so that
6895       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6896       // the lower half of Rm * Rn because we know the result already:
6897       // it must be -t0.  t0 + (-t0) must generate a carry iff
6898       // t0 != 0.  So, rather than do a mul and an adds we just set
6899       // the carry flag iff t0 is nonzero.
6900       //
6901       // mul(Rlo_mn, Rm, Rn);
6902       // adds(zr, t0, Rlo_mn);
6903       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6904       adcs(t0, t1, Rhi_mn);
6905       adc(t1, t2, zr);
6906       mov(t2, zr);
6907     }
6908 
6909     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6910       block_comment("pre2");
6911       // Pa = Pa_base + i-len;
6912       // Pb = Pb_base + len;
6913       // Pm = Pm_base + i-len;
6914       // Pn = Pn_base + len;
6915 
6916       if (i.is_register()) {
6917         sub(Rj, i.as_register(), len);
6918       } else {
6919         mov(Rj, i.as_constant());
6920         sub(Rj, Rj, len);
6921       }
6922       // Rj == i-len
6923 
6924       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6925       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6926       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6927       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6928 
6929       // Ra = *++Pa;
6930       // Rb = *--Pb;
6931       // Rm = *++Pm;
6932       // Rn = *--Pn;
6933       ldr(Ra, pre(Pa, wordSize));
6934       ldr(Rb, pre(Pb, -wordSize));
6935       ldr(Rm, pre(Pm, wordSize));
6936       ldr(Rn, pre(Pn, -wordSize));
6937 
6938       mov(Rhi_mn, zr);
6939       mov(Rlo_mn, zr);
6940     }
6941 
6942     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6943       block_comment("post2");
6944       if (i.is_constant()) {
6945         mov(Rj, i.as_constant()-len.as_constant());
6946       } else {
6947         sub(Rj, i.as_register(), len);
6948       }
6949 
6950       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6951 
6952       // As soon as we know the least significant digit of our result,
6953       // store it.
6954       // Pm_base[i-len] = t0;
6955       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6956 
6957       // t0 = t1; t1 = t2; t2 = 0;
6958       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6959       adc(t1, t2, zr);
6960       mov(t2, zr);
6961     }
6962 
6963     // A carry in t0 after Montgomery multiplication means that we
6964     // should subtract multiples of n from our result in m.  We'll
6965     // keep doing that until there is no carry.
6966     void normalize(RegisterOrConstant len) {
6967       block_comment("normalize");
6968       // while (t0)
6969       //   t0 = sub(Pm_base, Pn_base, t0, len);
6970       Label loop, post, again;
6971       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6972       cbz(t0, post); {
6973         bind(again); {
6974           mov(i, zr);
6975           mov(cnt, len);
6976           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6977           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6978           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6979           align(16);
6980           bind(loop); {
6981             sbcs(Rm, Rm, Rn);
6982             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6983             add(i, i, 1);
6984             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6985             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6986             sub(cnt, cnt, 1);
6987           } cbnz(cnt, loop);
6988           sbc(t0, t0, zr);
6989         } cbnz(t0, again);
6990       } bind(post);
6991     }
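         // In C, the subtraction step performed by the loop above is
         // approximately the following (an illustrative sketch; "sub" is the
         // helper referred to in the C reference code further down, not a
         // function defined in this file):
         //
         // static julong sub(julong Pm[], julong Pn[], julong t0, int len) {
         //   julong borrow = 0;
         //   for (int i = 0; i < len; i++) {
         //     julong a = Pm[i], b = Pn[i];
         //     julong ab = a - b;
         //     Pm[i] = ab - borrow;
         //     borrow = (a < b) | (ab < borrow); // at most one of these is set
         //   }
         //   return t0 - borrow;
         // }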
6992 
6993     // Move memory at s to d, reversing words.
6994     //    Increments d to end of copied memory
6995     //    Destroys tmp1, tmp2
6996     //    Preserves len
6997     //    Leaves s pointing to the address which was in d at start
6998     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6999       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
7000 
7001       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7002       mov(tmp1, len);
7003       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7004       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7005     }
7006     // where
7007     void reverse1(Register d, Register s, Register tmp) {
7008       ldr(tmp, pre(s, -wordSize));
7009       ror(tmp, tmp, 32);
7010       str(tmp, post(d, wordSize));
7011     }
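         // In C, approximately (a sketch: len counts 64-bit words, and the
         // rotate by 32 swaps the two 32-bit halves of each word, converting
         // between the caller's int-array digit order and the 64-bit word
         // order used internally):
         //
         // void reverse(julong *d, const julong *s, int len) {
         //   for (int i = 0; i < len; i++) {
         //     julong w = s[len - 1 - i];
         //     d[i] = (w << 32) | (w >> 32);
         //   }
         // }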
7012 
7013     void step_squaring() {
7014       // An extra ACC
7015       step();
7016       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7017     }
7018 
7019     void last_squaring(RegisterOrConstant i) {
7020       Label dont;
7021       // if ((i & 1) == 0) {
7022       tbnz(i.as_register(), 0, dont); {
7023         // MACC(Ra, Rb, t0, t1, t2);
7024         // Ra = *++Pa;
7025         // Rb = *--Pb;
7026         umulh(Rhi_ab, Ra, Rb);
7027         mul(Rlo_ab, Ra, Rb);
7028         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7029       } bind(dont);
7030     }
7031 
7032     void extra_step_squaring() {
7033       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7034 
7035       // MACC(Rm, Rn, t0, t1, t2);
7036       // Rm = *++Pm;
7037       // Rn = *--Pn;
7038       umulh(Rhi_mn, Rm, Rn);
7039       mul(Rlo_mn, Rm, Rn);
7040       ldr(Rm, pre(Pm, wordSize));
7041       ldr(Rn, pre(Pn, -wordSize));
7042     }
7043 
7044     void post1_squaring() {
7045       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7046 
7047       // *Pm = Rm = t0 * inv;
7048       mul(Rm, t0, inv);
7049       str(Rm, Address(Pm));
7050 
7051       // MACC(Rm, Rn, t0, t1, t2);
7052       // t0 = t1; t1 = t2; t2 = 0;
7053       umulh(Rhi_mn, Rm, Rn);
7054 
7055 #ifndef PRODUCT
7056       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7057       {
7058         mul(Rlo_mn, Rm, Rn);
7059         add(Rlo_mn, t0, Rlo_mn);
7060         Label ok;
7061         cbz(Rlo_mn, ok); {
7062           stop("broken Montgomery multiply");
7063         } bind(ok);
7064       }
7065 #endif
7066       // We have very carefully set things up so that
7067       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7068       // the lower half of Rm * Rn because we know the result already:
7069       // it must be -t0.  t0 + (-t0) must generate a carry iff
7070       // t0 != 0.  So, rather than do a mul and an adds we just set
7071       // the carry flag iff t0 is nonzero.
7072       //
7073       // mul(Rlo_mn, Rm, Rn);
7074       // adds(zr, t0, Rlo_mn);
7075       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7076       adcs(t0, t1, Rhi_mn);
7077       adc(t1, t2, zr);
7078       mov(t2, zr);
7079     }
7080 
7081     void acc(Register Rhi, Register Rlo,
7082              Register t0, Register t1, Register t2) {
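           // (t2:t1:t0) += (Rhi:Rlo): add a 128-bit value into the 192-bit
           // triple-precision accumulator, propagating the carries.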
7083       adds(t0, t0, Rlo);
7084       adcs(t1, t1, Rhi);
7085       adc(t2, t2, zr);
7086     }
7087 
7088   public:
7089     /**
7090      * Fast Montgomery multiplication.  The derivation of the
7091      * algorithm is in A Cryptographic Library for the Motorola
7092      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7093      *
7094      * Arguments:
7095      *
7096      * Inputs for multiplication:
7097      *   c_rarg0   - int array elements a
7098      *   c_rarg1   - int array elements b
7099      *   c_rarg2   - int array elements n (the modulus)
7100      *   c_rarg3   - int length
7101      *   c_rarg4   - int inv
7102      *   c_rarg5   - int array elements m (the result)
7103      *
7104      * Inputs for squaring:
7105      *   c_rarg0   - int array elements a
7106      *   c_rarg1   - int array elements n (the modulus)
7107      *   c_rarg2   - int length
7108      *   c_rarg3   - int inv
7109      *   c_rarg4   - int array elements m (the result)
7110      *
7111      */
7112     address generate_multiply() {
7113       Label argh, nothing;
7114       bind(argh);
7115       stop("MontgomeryMultiply total_allocation must be <= 8192");
7116 
7117       align(CodeEntryAlignment);
7118       address entry = pc();
7119 
7120       cbzw(Rlen, nothing);
7121 
7122       enter();
7123 
7124       // Make room.
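           // We need scratch space for reversed copies of the inputs and for
           // the result: at most four arrays (a, b, n and m) of Rlen ints
           // each, i.e. Rlen * 4 * sizeof(jint) bytes, hence the 512-int
           // (8192-byte) cap checked below.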
7125       cmpw(Rlen, 512);
7126       br(Assembler::HI, argh);
7127       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7128       andr(sp, Ra, -2 * wordSize);
7129 
7130       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7131 
7132       {
7133         // Copy input args, reversing as we go.  We use Ra as a
7134         // temporary variable.
7135         reverse(Ra, Pa_base, Rlen, t0, t1);
7136         if (!_squaring)
7137           reverse(Ra, Pb_base, Rlen, t0, t1);
7138         reverse(Ra, Pn_base, Rlen, t0, t1);
7139       }
7140 
7141       // Push all call-saved registers and also Pm_base which we'll need
7142       // at the end.
7143       save_regs();
7144 
7145 #ifndef PRODUCT
7146       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7147       {
7148         ldr(Rn, Address(Pn_base, 0));
7149         mul(Rlo_mn, Rn, inv);
7150         subs(zr, Rlo_mn, -1);
7151         Label ok;
7152         br(EQ, ok); {
7153           stop("broken inverse in Montgomery multiply");
7154         } bind(ok);
7155       }
7156 #endif
7157 
7158       mov(Pm_base, Ra);
7159 
7160       mov(t0, zr);
7161       mov(t1, zr);
7162       mov(t2, zr);
7163 
7164       block_comment("for (int i = 0; i < len; i++) {");
7165       mov(Ri, zr); {
7166         Label loop, end;
7167         cmpw(Ri, Rlen);
7168         br(Assembler::GE, end);
7169 
7170         bind(loop);
7171         pre1(Ri);
7172 
7173         block_comment("  for (j = i; j; j--) {"); {
7174           movw(Rj, Ri);
7175           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7176         } block_comment("  } // j");
7177 
7178         post1();
7179         addw(Ri, Ri, 1);
7180         cmpw(Ri, Rlen);
7181         br(Assembler::LT, loop);
7182         bind(end);
7183         block_comment("} // i");
7184       }
7185 
7186       block_comment("for (int i = len; i < 2*len; i++) {");
7187       mov(Ri, Rlen); {
7188         Label loop, end;
7189         cmpw(Ri, Rlen, Assembler::LSL, 1);
7190         br(Assembler::GE, end);
7191 
7192         bind(loop);
7193         pre2(Ri, Rlen);
7194 
7195         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7196           lslw(Rj, Rlen, 1);
7197           subw(Rj, Rj, Ri);
7198           subw(Rj, Rj, 1);
7199           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7200         } block_comment("  } // j");
7201 
7202         post2(Ri, Rlen);
7203         addw(Ri, Ri, 1);
7204         cmpw(Ri, Rlen, Assembler::LSL, 1);
7205         br(Assembler::LT, loop);
7206         bind(end);
7207       }
7208       block_comment("} // i");
7209 
7210       normalize(Rlen);
7211 
7212       mov(Ra, Pm_base);  // Save Pm_base in Ra
7213       restore_regs();  // Restore caller's Pm_base
7214 
7215       // Copy our result into caller's Pm_base
7216       reverse(Pm_base, Ra, Rlen, t0, t1);
7217 
7218       leave();
7219       bind(nothing);
7220       ret(lr);
7221 
7222       return entry;
7223     }
7224     // In C, approximately:
7225 
7226     // void
7227     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7228     //                     julong Pn_base[], julong Pm_base[],
7229     //                     julong inv, int len) {
7230     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7231     //   julong *Pa, *Pb, *Pn, *Pm;
7232     //   julong Ra, Rb, Rn, Rm;
7233 
7234     //   int i;
7235 
7236     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7237 
7238     //   for (i = 0; i < len; i++) {
7239     //     int j;
7240 
7241     //     Pa = Pa_base;
7242     //     Pb = Pb_base + i;
7243     //     Pm = Pm_base;
7244     //     Pn = Pn_base + i;
7245 
7246     //     Ra = *Pa;
7247     //     Rb = *Pb;
7248     //     Rm = *Pm;
7249     //     Rn = *Pn;
7250 
7251     //     int iters = i;
7252     //     for (j = 0; iters--; j++) {
7253     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7254     //       MACC(Ra, Rb, t0, t1, t2);
7255     //       Ra = *++Pa;
7256     //       Rb = *--Pb;
7257     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7258     //       MACC(Rm, Rn, t0, t1, t2);
7259     //       Rm = *++Pm;
7260     //       Rn = *--Pn;
7261     //     }
7262 
7263     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7264     //     MACC(Ra, Rb, t0, t1, t2);
7265     //     *Pm = Rm = t0 * inv;
7266     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7267     //     MACC(Rm, Rn, t0, t1, t2);
7268 
7269     //     assert(t0 == 0, "broken Montgomery multiply");
7270 
7271     //     t0 = t1; t1 = t2; t2 = 0;
7272     //   }
7273 
7274     //   for (i = len; i < 2*len; i++) {
7275     //     int j;
7276 
7277     //     Pa = Pa_base + i-len;
7278     //     Pb = Pb_base + len;
7279     //     Pm = Pm_base + i-len;
7280     //     Pn = Pn_base + len;
7281 
7282     //     Ra = *++Pa;
7283     //     Rb = *--Pb;
7284     //     Rm = *++Pm;
7285     //     Rn = *--Pn;
7286 
7287     //     int iters = len*2-i-1;
7288     //     for (j = i-len+1; iters--; j++) {
7289     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7290     //       MACC(Ra, Rb, t0, t1, t2);
7291     //       Ra = *++Pa;
7292     //       Rb = *--Pb;
7293     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7294     //       MACC(Rm, Rn, t0, t1, t2);
7295     //       Rm = *++Pm;
7296     //       Rn = *--Pn;
7297     //     }
7298 
7299     //     Pm_base[i-len] = t0;
7300     //     t0 = t1; t1 = t2; t2 = 0;
7301     //   }
7302 
7303     //   while (t0)
7304     //     t0 = sub(Pm_base, Pn_base, t0, len);
7305     // }
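         // For reference, MACC(A, B, t0, t1, t2) in the pseudo-code above
         // accumulates the 128-bit product A*B into the triple-precision
         // accumulator (t0, t1, t2); the generated code implements it with
         // umulh/mul plus the acc() carry chain.  A C sketch, assuming the
         // compiler provides unsigned __int128:
         //
         // static void MACC(julong A, julong B, julong &t0, julong &t1, julong &t2) {
         //   unsigned __int128 p = (unsigned __int128)A * B + t0;
         //   t0 = (julong)p;
         //   p = (p >> 64) + t1;
         //   t1 = (julong)p;
         //   t2 += (julong)(p >> 64);
         // }
         //
         // MACC2, used by the squaring code below, accumulates the product
         // twice (it handles the doubled cross terms a[i]*a[j] + a[j]*a[i]).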
7306 
7307     /**
7308      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7309      * multiplies than Montgomery multiplication so it should be up to
7310      * 25% faster.  However, its loop control is more complex and it
7311      * may actually run slower on some machines.
7312      *
7313      * Arguments:
7314      *
7315      * Inputs:
7316      *   c_rarg0   - int array elements a
7317      *   c_rarg1   - int array elements n (the modulus)
7318      *   c_rarg2   - int length
7319      *   c_rarg3   - int inv
7320      *   c_rarg4   - int array elements m (the result)
7321      *
7322      */
7323     address generate_square() {
7324       Label argh;
7325       bind(argh);
7326       stop("MontgomeryMultiply total_allocation must be <= 8192");
7327 
7328       align(CodeEntryAlignment);
7329       address entry = pc();
7330 
7331       enter();
7332 
7333       // Make room.
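           // (Same stack budget as in generate_multiply: at most
           // Rlen * 16 bytes, capped at 8192 bytes.)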
7334       cmpw(Rlen, 512);
7335       br(Assembler::HI, argh);
7336       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7337       andr(sp, Ra, -2 * wordSize);
7338 
7339       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7340 
7341       {
7342         // Copy input args, reversing as we go.  We use Ra as a
7343         // temporary variable.
7344         reverse(Ra, Pa_base, Rlen, t0, t1);
7345         reverse(Ra, Pn_base, Rlen, t0, t1);
7346       }
7347 
7348       // Push all call-saved registers and also Pm_base which we'll need
7349       // at the end.
7350       save_regs();
7351 
7352       mov(Pm_base, Ra);
7353 
7354       mov(t0, zr);
7355       mov(t1, zr);
7356       mov(t2, zr);
7357 
7358       block_comment("for (int i = 0; i < len; i++) {");
7359       mov(Ri, zr); {
7360         Label loop, end;
7361         bind(loop);
7362         cmp(Ri, Rlen);
7363         br(Assembler::GE, end);
7364 
7365         pre1(Ri);
7366 
7367         block_comment("for (j = (i+1)/2; j; j--) {"); {
7368           add(Rj, Ri, 1);
7369           lsr(Rj, Rj, 1);
7370           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7371         } block_comment("  } // j");
7372 
7373         last_squaring(Ri);
7374 
7375         block_comment("  for (j = i/2; j; j--) {"); {
7376           lsr(Rj, Ri, 1);
7377           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7378         } block_comment("  } // j");
7379 
7380         post1_squaring();
7381         add(Ri, Ri, 1);
7382         cmp(Ri, Rlen);
7383         br(Assembler::LT, loop);
7384 
7385         bind(end);
7386         block_comment("} // i");
7387       }
7388 
7389       block_comment("for (int i = len; i < 2*len; i++) {");
7390       mov(Ri, Rlen); {
7391         Label loop, end;
7392         bind(loop);
7393         cmp(Ri, Rlen, Assembler::LSL, 1);
7394         br(Assembler::GE, end);
7395 
7396         pre2(Ri, Rlen);
7397 
7398         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7399           lsl(Rj, Rlen, 1);
7400           sub(Rj, Rj, Ri);
7401           sub(Rj, Rj, 1);
7402           lsr(Rj, Rj, 1);
7403           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7404         } block_comment("  } // j");
7405 
7406         last_squaring(Ri);
7407 
7408         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7409           lsl(Rj, Rlen, 1);
7410           sub(Rj, Rj, Ri);
7411           lsr(Rj, Rj, 1);
7412           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7413         } block_comment("  } // j");
7414 
7415         post2(Ri, Rlen);
7416         add(Ri, Ri, 1);
7417         cmp(Ri, Rlen, Assembler::LSL, 1);
7418 
7419         br(Assembler::LT, loop);
7420         bind(end);
7421         block_comment("} // i");
7422       }
7423 
7424       normalize(Rlen);
7425 
7426       mov(Ra, Pm_base);  // Save Pm_base in Ra
7427       restore_regs();  // Restore caller's Pm_base
7428 
7429       // Copy our result into caller's Pm_base
7430       reverse(Pm_base, Ra, Rlen, t0, t1);
7431 
7432       leave();
7433       ret(lr);
7434 
7435       return entry;
7436     }
7437     // In C, approximately:
7438 
7439     // void
7440     // montgomery_square(julong Pa_base[], julong Pn_base[],
7441     //                   julong Pm_base[], julong inv, int len) {
7442     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7443     //   julong *Pa, *Pb, *Pn, *Pm;
7444     //   julong Ra, Rb, Rn, Rm;
7445 
7446     //   int i;
7447 
7448     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7449 
7450     //   for (i = 0; i < len; i++) {
7451     //     int j;
7452 
7453     //     Pa = Pa_base;
7454     //     Pb = Pa_base + i;
7455     //     Pm = Pm_base;
7456     //     Pn = Pn_base + i;
7457 
7458     //     Ra = *Pa;
7459     //     Rb = *Pb;
7460     //     Rm = *Pm;
7461     //     Rn = *Pn;
7462 
7463     //     int iters = (i+1)/2;
7464     //     for (j = 0; iters--; j++) {
7465     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7466     //       MACC2(Ra, Rb, t0, t1, t2);
7467     //       Ra = *++Pa;
7468     //       Rb = *--Pb;
7469     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7470     //       MACC(Rm, Rn, t0, t1, t2);
7471     //       Rm = *++Pm;
7472     //       Rn = *--Pn;
7473     //     }
7474     //     if ((i & 1) == 0) {
7475     //       assert(Ra == Pa_base[j], "must be");
7476     //       MACC(Ra, Ra, t0, t1, t2);
7477     //     }
7478     //     iters = i/2;
7479     //     assert(iters == i-j, "must be");
7480     //     for (; iters--; j++) {
7481     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7482     //       MACC(Rm, Rn, t0, t1, t2);
7483     //       Rm = *++Pm;
7484     //       Rn = *--Pn;
7485     //     }
7486 
7487     //     *Pm = Rm = t0 * inv;
7488     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7489     //     MACC(Rm, Rn, t0, t1, t2);
7490 
7491     //     assert(t0 == 0, "broken Montgomery multiply");
7492 
7493     //     t0 = t1; t1 = t2; t2 = 0;
7494     //   }
7495 
7496     //   for (i = len; i < 2*len; i++) {
7497     //     int start = i-len+1;
7498     //     int end = start + (len - start)/2;
7499     //     int j;
7500 
7501     //     Pa = Pa_base + i-len;
7502     //     Pb = Pa_base + len;
7503     //     Pm = Pm_base + i-len;
7504     //     Pn = Pn_base + len;
7505 
7506     //     Ra = *++Pa;
7507     //     Rb = *--Pb;
7508     //     Rm = *++Pm;
7509     //     Rn = *--Pn;
7510 
7511     //     int iters = (2*len-i-1)/2;
7512     //     assert(iters == end-start, "must be");
7513     //     for (j = start; iters--; j++) {
7514     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7515     //       MACC2(Ra, Rb, t0, t1, t2);
7516     //       Ra = *++Pa;
7517     //       Rb = *--Pb;
7518     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7519     //       MACC(Rm, Rn, t0, t1, t2);
7520     //       Rm = *++Pm;
7521     //       Rn = *--Pn;
7522     //     }
7523     //     if ((i & 1) == 0) {
7524     //       assert(Ra == Pa_base[j], "must be");
7525     //       MACC(Ra, Ra, t0, t1, t2);
7526     //     }
7527     //     iters =  (2*len-i)/2;
7528     //     assert(iters == len-j, "must be");
7529     //     for (; iters--; j++) {
7530     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7531     //       MACC(Rm, Rn, t0, t1, t2);
7532     //       Rm = *++Pm;
7533     //       Rn = *--Pn;
7534     //     }
7535     //     Pm_base[i-len] = t0;
7536     //     t0 = t1; t1 = t2; t2 = 0;
7537     //   }
7538 
7539     //   while (t0)
7540     //     t0 = sub(Pm_base, Pn_base, t0, len);
7541     // }
7542   };
7543 
7544 
7545   // Initialization
7546   void generate_initial() {
7547     // Generate the initial stubs and initialize the entry points.
7548 
7549     // Entry points that exist on all platforms.  Note: this is code
7550     // that could be shared among different platforms; however, the
7551     // benefit seems smaller than the disadvantage of having a much
7552     // more complicated generator structure.  See also the comment in
7553     // stubRoutines.hpp.
7554 
7555     StubRoutines::_forward_exception_entry = generate_forward_exception();
7556 
7557     StubRoutines::_call_stub_entry =
7558       generate_call_stub(StubRoutines::_call_stub_return_address);
7559 
7560     // Referenced by megamorphic calls.
7561     StubRoutines::_catch_exception_entry = generate_catch_exception();
7562 
7563     // Build this early so it's available for the interpreter.
7564     StubRoutines::_throw_StackOverflowError_entry =
7565       generate_throw_exception("StackOverflowError throw_exception",
7566                                CAST_FROM_FN_PTR(address,
7567                                                 SharedRuntime::throw_StackOverflowError));
7568     StubRoutines::_throw_delayed_StackOverflowError_entry =
7569       generate_throw_exception("delayed StackOverflowError throw_exception",
7570                                CAST_FROM_FN_PTR(address,
7571                                                 SharedRuntime::throw_delayed_StackOverflowError));
7572     if (UseCRC32Intrinsics) {
7573       // Set the table address before generating the stubs that use it.
7574       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7575       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7576     }
7577 
7578     if (UseCRC32CIntrinsics) {
7579       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7580     }
7581 
7582     // Disabled until JDK-8210858 is fixed
7583     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7584     //   StubRoutines::_dlog = generate_dlog();
7585     // }
7586 
7587     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7588       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7589     }
7590 
7591     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7592       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7593     }
7594 
7595     // Safefetch stubs.
7596     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7597                                                        &StubRoutines::_safefetch32_fault_pc,
7598                                                        &StubRoutines::_safefetch32_continuation_pc);
7599     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7600                                                        &StubRoutines::_safefetchN_fault_pc,
7601                                                        &StubRoutines::_safefetchN_continuation_pc);
7602   }
7603 
7604   void generate_phase1() {
7605     // Continuation stubs:
7606     StubRoutines::_cont_thaw          = generate_cont_thaw();
7607     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7608     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7609     StubRoutines::_cont_doYield_stub = generate_cont_doYield();
7610     StubRoutines::_cont_doYield    = StubRoutines::_cont_doYield_stub->entry_point();
7611     StubRoutines::_cont_jump_from_sp = generate_cont_jump_from_safepoint();
7612     StubRoutines::_cont_interpreter_forced_preempt_return = generate_cont_interpreter_forced_preempt_return();
7613 
7614     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = generate_jfr_write_checkpoint();)
7615     JFR_ONLY(StubRoutines::_jfr_get_event_writer = generate_jfr_get_event_writer();)
7616   }
7617 
7618   void generate_all() {
7619     // support for verify_oop (must happen after universe_init)
7620     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7621     StubRoutines::_throw_AbstractMethodError_entry =
7622       generate_throw_exception("AbstractMethodError throw_exception",
7623                                CAST_FROM_FN_PTR(address,
7624                                                 SharedRuntime::
7625                                                 throw_AbstractMethodError));
7626 
7627     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7628       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7629                                CAST_FROM_FN_PTR(address,
7630                                                 SharedRuntime::
7631                                                 throw_IncompatibleClassChangeError));
7632 
7633     StubRoutines::_throw_NullPointerException_at_call_entry =
7634       generate_throw_exception("NullPointerException at call throw_exception",
7635                                CAST_FROM_FN_PTR(address,
7636                                                 SharedRuntime::
7637                                                 throw_NullPointerException_at_call));
7638 
7639     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7640 
7641     // arraycopy stubs used by compilers
7642     generate_arraycopy_stubs();
7643 
7644     // has_negatives stub for large arrays.
7645     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7646 
7647     // array equals stub for large arrays.
7648     if (!UseSimpleArrayEquals) {
7649       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7650     }
7651 
7652     generate_compare_long_strings();
7653 
7654     generate_string_indexof_stubs();
7655 
7656     // byte_array_inflate stub for large arrays.
7657     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7658 
7659     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7660     if (bs_nm != NULL) {
7661       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7662     }
7663 #ifdef COMPILER2
7664     if (UseMultiplyToLenIntrinsic) {
7665       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7666     }
7667 
7668     if (UseSquareToLenIntrinsic) {
7669       StubRoutines::_squareToLen = generate_squareToLen();
7670     }
7671 
7672     if (UseMulAddIntrinsic) {
7673       StubRoutines::_mulAdd = generate_mulAdd();
7674     }
7675 
7676     if (UseSIMDForBigIntegerShiftIntrinsics) {
7677       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7678       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7679     }
7680 
7681     if (UseMontgomeryMultiplyIntrinsic) {
7682       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7683       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7684       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7685     }
7686 
7687     if (UseMontgomerySquareIntrinsic) {
7688       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7689       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7690       // We use generate_multiply() rather than generate_square()
7691       // because it's faster for the sizes of modulus we care about.
7692       StubRoutines::_montgomerySquare = g.generate_multiply();
7693     }
7694 #endif // COMPILER2
7695 
7696     // generate GHASH intrinsics code
7697     if (UseGHASHIntrinsics) {
7698       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7699       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7700     }
7701 
7702     if (UseBASE64Intrinsics) {
7703         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7704         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7705     }
7706 
7707     // data cache line writeback
7708     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7709     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7710 
7711     if (UseAESIntrinsics) {
7712       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7713       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7714       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7715       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7716       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7717       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7718     }
7719 
7720     if (UseSHA1Intrinsics) {
7721       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7722       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7723     }
7724     if (UseSHA256Intrinsics) {
7725       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7726       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7727     }
7728     if (UseSHA512Intrinsics) {
7729       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7730       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7731     }
7732     if (UseSHA3Intrinsics) {
7733       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7734       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7735     }
7736 
7737     // generate Adler32 intrinsics code
7738     if (UseAdler32Intrinsics) {
7739       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7740     }
7741 
7742 #ifdef LINUX
7743 
7744     generate_atomic_entry_points();
7745 
7746 #endif // LINUX
7747 
7748     StubRoutines::aarch64::set_completed();
7749   }
7750 
7751  public:
7752   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
7753     if (phase == 0) {
7754       generate_initial();
7755     } else if (phase == 1) {
7756       generate_phase1(); // stubs that must be available for the interpreter
7757     } else {
7758       generate_all();
7759     }
7760   }
7761 }; // end class declaration
7762 
7763 #define UCM_TABLE_MAX_ENTRIES 8
7764 void StubGenerator_generate(CodeBuffer* code, int phase) {
7765   if (UnsafeCopyMemory::_table == NULL) {
7766     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7767   }
7768   StubGenerator g(code, phase);
7769 }
7770 
7771 
7772 #ifdef LINUX
7773 
7774 // Define pointers to atomic stubs and initialize them to point to the
7775 // code in atomic_aarch64.S.
7776 
7777 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7778   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7779     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7780   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7781     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
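     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, roughly, to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;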
7782 
7783 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7784 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7785 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7786 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7787 DEFAULT_ATOMIC_OP(xchg, 4, )
7788 DEFAULT_ATOMIC_OP(xchg, 8, )
7789 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7790 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7791 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7792 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7793 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7794 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7795 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7796 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7797 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7798 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7799 
7800 #undef DEFAULT_ATOMIC_OP
7801 
7802 #endif // LINUX
7803 
7804 
7805 #undef __
7806 #define __ masm->
7807 
7808 // on exit, sp points to the ContinuationEntry
7809 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
7810   assert (ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
7811   assert (in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
7812   assert (in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
7813 
7814   stack_slots += (int)ContinuationEntry::size()/wordSize;
7815   __ sub(sp, sp, (int)ContinuationEntry::size()); // place Continuation metadata
7816 
7817   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize) / VMRegImpl::stack_slot_size, 0 /* arg_slots */);
7818   ContinuationEntry::setup_oopmap(map);
7819 
7820   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7821   __ str(rscratch1, Address(sp, ContinuationEntry::parent_offset()));
7822   __ mov(rscratch1, sp); // we can't use sp as the source in str
7823   __ str(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7824 
7825   return map;
7826 }
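     // The function above, in pseudo-C (field names are illustrative):
     //
     //   sp -= ContinuationEntry::size();          // entry == new sp
     //   entry->parent      = thread->cont_entry;
     //   thread->cont_entry = entry;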
7827 
7828 // on entry, c_rarg1 points to the continuation
7829 //           sp points to the ContinuationEntry
7830 void fill_continuation_entry(MacroAssembler* masm) {
7831 #ifdef ASSERT
7832   __ movw(rscratch1, 0x1234);
7833   __ strw(rscratch1, Address(sp, ContinuationEntry::cookie_offset()));
7834 #endif
7835 
7836   __ str(c_rarg1, Address(sp, ContinuationEntry::cont_offset()));
7837   __ str(zr, Address(sp, ContinuationEntry::chunk_offset()));
7838   __ strw(zr, Address(sp, ContinuationEntry::argsize_offset()));
7839 
7840   __ ldr(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
7841   __ str(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
7842   __ ldr(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
7843   __ str(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
7844   
7845   __ str(zr, Address(rthread, JavaThread::cont_fastpath_offset()));
7846   __ reset_held_monitor_count(rthread);
7847 }
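     // The function above, in pseudo-C (field names are illustrative):
     //
     //   entry->cont    = continuation;            // c_rarg1
     //   entry->chunk   = NULL;
     //   entry->argsize = 0;
     //   entry->parent_cont_fastpath      = thread->cont_fastpath;
     //   entry->parent_held_monitor_count = thread->held_monitor_count;
     //   thread->cont_fastpath = NULL;
     //   thread->held_monitor_count = 0;           // reset_held_monitor_count()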
7848 
7849 // on entry, sp points to the ContinuationEntry
7850 // on exit, rfp points to the spilled rfp in the entry frame
7851 void continuation_enter_cleanup(MacroAssembler* masm) {
7852 #ifndef PRODUCT
7853   Label OK;
7854   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7855   __ cmp(sp, rscratch1);
7856   __ br(Assembler::EQ, OK);
7857   __ stop("incorrect sp1");
7858   __ bind(OK);
7859 #endif
7860   
7861   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
7862   __ str(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
7863   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
7864   __ str(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
7865 
7866   __ ldr(rscratch2, Address(sp, ContinuationEntry::parent_offset()));
7867   __ str(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7868   __ add(rfp, sp, (int)ContinuationEntry::size());
7869 }
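     // The function above, in pseudo-C (field names are illustrative):
     //
     //   thread->cont_fastpath      = entry->parent_cont_fastpath;
     //   thread->held_monitor_count = entry->parent_held_monitor_count;
     //   thread->cont_entry         = entry->parent;
     //   rfp = sp + ContinuationEntry::size();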
7870 
7871 #undef __