1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #include "utilities/align.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     __ lea(rscratch2, ExternalAddress((address)&counter));
  86     __ ldrw(rscratch1, Address(rscratch2));
  87     __ addw(rscratch1, rscratch1, 1);
  88     __ strw(rscratch1, Address(rscratch2));
  89   }
  90 #define inc_counter_np(counter) \
  91   BLOCK_COMMENT("inc_counter " #counter); \
  92   inc_counter_np_(counter);
  93 #endif
  94 
  95   // Call stubs are used to call Java from C
  96   //
  97   // Arguments:
  98   //    c_rarg0:   call wrapper address                   address
  99   //    c_rarg1:   result                                 address
 100   //    c_rarg2:   result type                            BasicType
 101   //    c_rarg3:   method                                 Method*
 102   //    c_rarg4:   (interpreter) entry point              address
 103   //    c_rarg5:   parameters                             intptr_t*
 104   //    c_rarg6:   parameter size (in words)              int
 105   //    c_rarg7:   thread                                 Thread*
 106   //
 107   // There is no return from the stub itself as any Java result
 108   // is written to result
 109   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
 113   //
 114   // we save r0-r7, which accounts for all the c arguments.
 115   //
 116   // TODO: strictly do we need to save them all? they are treated as
 117   // volatile by C so could we omit saving the ones we are going to
 118   // place in global registers (thread? method?) or those we only use
 119   // during setup of the Java call?
 120   //
 121   // we don't need to save r8 which C uses as an indirect result location
 122   // return register.
 123   //
 124   // we don't need to save r9-r15 which both C and Java treat as
 125   // volatile
 126   //
  // we don't need to save r16-r18 because Java does not use them
 128   //
 129   // we save r19-r28 which Java uses as scratch registers and C
 130   // expects to be callee-save
 131   //
 132   // we save the bottom 64 bits of each value stored in v8-v15; it is
 133   // the responsibility of the caller to preserve larger values.
 134   //
 135   // so the stub frame looks like this when we enter Java code
 136   //
 137   //     [ return_from_Java     ] <--- sp
 138   //     [ argument word n      ]
 139   //      ...
 140   // -27 [ argument word 1      ]
 141   // -26 [ saved v15            ] <--- sp_after_call
 142   // -25 [ saved v14            ]
 143   // -24 [ saved v13            ]
 144   // -23 [ saved v12            ]
 145   // -22 [ saved v11            ]
 146   // -21 [ saved v10            ]
 147   // -20 [ saved v9             ]
 148   // -19 [ saved v8             ]
 149   // -18 [ saved r28            ]
 150   // -17 [ saved r27            ]
 151   // -16 [ saved r26            ]
 152   // -15 [ saved r25            ]
 153   // -14 [ saved r24            ]
 154   // -13 [ saved r23            ]
 155   // -12 [ saved r22            ]
 156   // -11 [ saved r21            ]
 157   // -10 [ saved r20            ]
 158   //  -9 [ saved r19            ]
 159   //  -8 [ call wrapper    (r0) ]
 160   //  -7 [ result          (r1) ]
 161   //  -6 [ result type     (r2) ]
 162   //  -5 [ method          (r3) ]
 163   //  -4 [ entry point     (r4) ]
 164   //  -3 [ parameters      (r5) ]
 165   //  -2 [ parameter size  (r6) ]
 166   //  -1 [ thread (r7)          ]
 167   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 168   //   1 [ saved lr       (r30) ]
 169 
 170   // Call stub stack layout word offsets from fp
 171   enum call_stub_layout {
 172     sp_after_call_off = -26,
 173 
 174     d15_off            = -26,
 175     d13_off            = -24,
 176     d11_off            = -22,
 177     d9_off             = -20,
 178 
 179     r28_off            = -18,
 180     r26_off            = -16,
 181     r24_off            = -14,
 182     r22_off            = -12,
 183     r20_off            = -10,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameter_size_off =  -2,
 190     thread_off         =  -1,
 191     fp_f               =   0,
 192     retaddr_off        =   1,
 193   };
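  // A minimal illustrative sketch (not compiled): how the word offsets above
  // combine with the 8-byte wordSize to produce frame slot addresses relative
  // to rfp.  The helper name and the local word_size constant are assumptions
  // made for the sketch only.
#if 0
  static intptr_t* frame_slot(intptr_t* rfp_val, int word_off) {
    const int word_size = 8;   // wordSize on aarch64
    return (intptr_t*)((char*)rfp_val + word_off * word_size);
  }
  // frame_slot(rfp, call_wrapper_off)  == rfp -  64 bytes : saved c_rarg0
  // frame_slot(rfp, thread_off)        == rfp -   8 bytes : saved c_rarg7
  // frame_slot(rfp, sp_after_call_off) == rfp - 208 bytes : saved v15
#endif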
 194 
 195   address generate_call_stub(address& return_address) {
 196     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 197            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 198            "adjust this code");
 199 
 200     StubCodeMark mark(this, "StubRoutines", "call_stub");
 201     address start = __ pc();
 202 
 203     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 204 
 205     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 206     const Address result        (rfp, result_off         * wordSize);
 207     const Address result_type   (rfp, result_type_off    * wordSize);
 208     const Address method        (rfp, method_off         * wordSize);
 209     const Address entry_point   (rfp, entry_point_off    * wordSize);
 210     const Address parameter_size(rfp, parameter_size_off * wordSize);
 211 
 212     const Address thread        (rfp, thread_off         * wordSize);
 213 
 214     const Address d15_save      (rfp, d15_off * wordSize);
 215     const Address d13_save      (rfp, d13_off * wordSize);
 216     const Address d11_save      (rfp, d11_off * wordSize);
 217     const Address d9_save       (rfp, d9_off * wordSize);
 218 
 219     const Address r28_save      (rfp, r28_off * wordSize);
 220     const Address r26_save      (rfp, r26_off * wordSize);
 221     const Address r24_save      (rfp, r24_off * wordSize);
 222     const Address r22_save      (rfp, r22_off * wordSize);
 223     const Address r20_save      (rfp, r20_off * wordSize);
 224 
 225     // stub code
 226 
 227     address aarch64_entry = __ pc();
 228 
 229     // set up frame and move sp to end of save area
 230     __ enter();
 231     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 232 
 233     // save register parameters and Java scratch/global registers
 234     // n.b. we save thread even though it gets installed in
 235     // rthread because we want to sanity check rthread later
 236     __ str(c_rarg7,  thread);
 237     __ strw(c_rarg6, parameter_size);
 238     __ stp(c_rarg4, c_rarg5,  entry_point);
 239     __ stp(c_rarg2, c_rarg3,  result_type);
 240     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 241 
 242     __ stp(r20, r19,   r20_save);
 243     __ stp(r22, r21,   r22_save);
 244     __ stp(r24, r23,   r24_save);
 245     __ stp(r26, r25,   r26_save);
 246     __ stp(r28, r27,   r28_save);
 247 
 248     __ stpd(v9,  v8,   d9_save);
 249     __ stpd(v11, v10,  d11_save);
 250     __ stpd(v13, v12,  d13_save);
 251     __ stpd(v15, v14,  d15_save);
 252 
 253     // install Java thread in global register now we have saved
 254     // whatever value it held
 255     __ mov(rthread, c_rarg7);
 256     // And method
 257     __ mov(rmethod, c_rarg3);
 258 
 259     // set up the heapbase register
 260     __ reinit_heapbase();
 261 
 262 #ifdef ASSERT
 263     // make sure we have no pending exceptions
 264     {
 265       Label L;
 266       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 267       __ cmp(rscratch1, (u1)NULL_WORD);
 268       __ br(Assembler::EQ, L);
 269       __ stop("StubRoutines::call_stub: entered with pending exception");
 270       __ BIND(L);
 271     }
 272 #endif
 273     // pass parameters if any
 274     __ mov(esp, sp);
 275     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 276     __ andr(sp, rscratch1, -2 * wordSize);
 277 
 278     BLOCK_COMMENT("pass parameters if any");
 279     Label parameters_done;
 280     // parameter count is still in c_rarg6
 281     // and parameter pointer identifying param 1 is in c_rarg5
 282     __ cbzw(c_rarg6, parameters_done);
 283 
 284     address loop = __ pc();
 285     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 286     __ subsw(c_rarg6, c_rarg6, 1);
 287     __ push(rscratch1);
 288     __ br(Assembler::GT, loop);
 289 
 290     __ BIND(parameters_done);
 291 
    // call Java entry -- passing Method* and current sp
 293     //      rmethod: Method*
 294     //      r13: sender sp
 295     BLOCK_COMMENT("call Java function");
 296     __ mov(r13, sp);
 297     __ blr(c_rarg4);
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     // save current address for use by exception handling code
 309 
 310     return_address = __ pc();
 311 
 312     // store result depending on type (everything that is not
 313     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 314     // n.b. this assumes Java returns an integral result in r0
 315     // and a floating result in j_farg0
 316     __ ldr(j_rarg2, result);
 317     Label is_long, is_float, is_double, exit;
 318     __ ldr(j_rarg1, result_type);
 319     __ cmp(j_rarg1, (u1)T_OBJECT);
 320     __ br(Assembler::EQ, is_long);
 321     __ cmp(j_rarg1, (u1)T_LONG);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_FLOAT);
 324     __ br(Assembler::EQ, is_float);
 325     __ cmp(j_rarg1, (u1)T_DOUBLE);
 326     __ br(Assembler::EQ, is_double);
 327 
 328     // handle T_INT case
 329     __ strw(r0, Address(j_rarg2));
 330 
 331     __ BIND(exit);
 332 
 333     // pop parameters
 334     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 335 
 336 #ifdef ASSERT
 337     // verify that threads correspond
 338     {
 339       Label L, S;
 340       __ ldr(rscratch1, thread);
 341       __ cmp(rthread, rscratch1);
 342       __ br(Assembler::NE, S);
 343       __ get_thread(rscratch1);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::EQ, L);
 346       __ BIND(S);
 347       __ stop("StubRoutines::call_stub: threads must correspond");
 348       __ BIND(L);
 349     }
 350 #endif
 351 
 352     // restore callee-save registers
 353     __ ldpd(v15, v14,  d15_save);
 354     __ ldpd(v13, v12,  d13_save);
 355     __ ldpd(v11, v10,  d11_save);
 356     __ ldpd(v9,  v8,   d9_save);
 357 
 358     __ ldp(r28, r27,   r28_save);
 359     __ ldp(r26, r25,   r26_save);
 360     __ ldp(r24, r23,   r24_save);
 361     __ ldp(r22, r21,   r22_save);
 362     __ ldp(r20, r19,   r20_save);
 363 
 364     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 365     __ ldrw(c_rarg2, result_type);
 366     __ ldr(c_rarg3,  method);
 367     __ ldp(c_rarg4, c_rarg5,  entry_point);
 368     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 369 
 370     // leave frame and return to caller
 371     __ leave();
 372     __ ret(lr);
 373 
 374     // handle return types different from T_INT
 375 
 376     __ BIND(is_long);
 377     __ str(r0, Address(j_rarg2, 0));
 378     __ br(Assembler::AL, exit);
 379 
 380     __ BIND(is_float);
 381     __ strs(j_farg0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_double);
 385     __ strd(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     return start;
 389   }
 390 
 391   // Return point for a Java call if there's an exception thrown in
 392   // Java code.  The exception is caught and transformed into a
 393   // pending exception stored in JavaThread that can be tested from
 394   // within the VM.
 395   //
 396   // Note: Usually the parameters are removed by the callee. In case
 397   // of an exception crossing an activation frame boundary, that is
 398   // not the case if the callee is compiled code => need to setup the
 399   // rsp.
 400   //
 401   // r0: exception oop
 402 
 403   address generate_catch_exception() {
 404     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 405     address start = __ pc();
 406 
 407     // same as in generate_call_stub():
 408     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 409     const Address thread        (rfp, thread_off         * wordSize);
 410 
 411 #ifdef ASSERT
 412     // verify that threads correspond
 413     {
 414       Label L, S;
 415       __ ldr(rscratch1, thread);
 416       __ cmp(rthread, rscratch1);
 417       __ br(Assembler::NE, S);
 418       __ get_thread(rscratch1);
 419       __ cmp(rthread, rscratch1);
 420       __ br(Assembler::EQ, L);
 421       __ bind(S);
 422       __ stop("StubRoutines::catch_exception: threads must correspond");
 423       __ bind(L);
 424     }
 425 #endif
 426 
 427     // set pending exception
 428     __ verify_oop(r0);
 429 
 430     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 431     __ mov(rscratch1, (address)__FILE__);
 432     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 433     __ movw(rscratch1, (int)__LINE__);
 434     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 435 
 436     // complete return to VM
 437     assert(StubRoutines::_call_stub_return_address != NULL,
 438            "_call_stub_return_address must have been generated before");
 439     __ b(StubRoutines::_call_stub_return_address);
 440 
 441     return start;
 442   }
 443 
 444   // Continuation point for runtime calls returning with a pending
 445   // exception.  The pending exception check happened in the runtime
 446   // or native call stub.  The pending exception in Thread is
 447   // converted into a Java-level exception.
 448   //
 449   // Contract with Java-level exception handlers:
 450   // r0: exception
 451   // r3: throwing pc
 452   //
 453   // NOTE: At entry of this stub, exception-pc must be in LR !!
 454 
 455   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 457 
 458   address generate_forward_exception() {
 459     StubCodeMark mark(this, "StubRoutines", "forward exception");
 460     address start = __ pc();
 461 
 462     // Upon entry, LR points to the return address returning into
 463     // Java (interpreted or compiled) code; i.e., the return address
 464     // becomes the throwing pc.
 465     //
 466     // Arguments pushed before the runtime call are still on the stack
 467     // but the exception handler will reset the stack pointer ->
 468     // ignore them.  A potential result in registers can be ignored as
 469     // well.
 470 
 471 #ifdef ASSERT
 472     // make sure this code is only executed if there is a pending exception
 473     {
 474       Label L;
 475       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 476       __ cbnz(rscratch1, L);
 477       __ stop("StubRoutines::forward exception: no pending exception (1)");
 478       __ bind(L);
 479     }
 480 #endif
 481 
 482     // compute exception handler into r19
 483 
 484     // call the VM to find the handler address associated with the
 485     // caller address. pass thread in r0 and caller pc (ret address)
 486     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 487     // the stack.
 488     __ mov(c_rarg1, lr);
 489     // lr will be trashed by the VM call so we move it to R19
 490     // (callee-saved) because we also need to pass it to the handler
 491     // returned by this call.
 492     __ mov(r19, lr);
 493     BLOCK_COMMENT("call exception_handler_for_return_address");
 494     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 495                          SharedRuntime::exception_handler_for_return_address),
 496                     rthread, c_rarg1);
 497     // Reinitialize the ptrue predicate register, in case the external runtime
 498     // call clobbers ptrue reg, as we may return to SVE compiled code.
 499     __ reinitialize_ptrue();
 500 
 501     // we should not really care that lr is no longer the callee
 502     // address. we saved the value the handler needs in r19 so we can
 503     // just copy it to r3. however, the C2 handler will push its own
 504     // frame and then calls into the VM and the VM code asserts that
 505     // the PC for the frame above the handler belongs to a compiled
 506     // Java method. So, we restore lr here to satisfy that assert.
 507     __ mov(lr, r19);
 508     // setup r0 & r3 & clear pending exception
 509     __ mov(r3, r19);
 510     __ mov(r19, r0);
 511     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 512     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 513 
 514 #ifdef ASSERT
 515     // make sure exception is set
 516     {
 517       Label L;
 518       __ cbnz(r0, L);
 519       __ stop("StubRoutines::forward exception: no pending exception (2)");
 520       __ bind(L);
 521     }
 522 #endif
 523 
 524     // continue at exception handler
 525     // r0: exception
 526     // r3: throwing pc
 527     // r19: exception handler
 528     __ verify_oop(r0);
 529     __ br(r19);
 530 
 531     return start;
 532   }
 533 
 534   // Non-destructive plausibility checks for oops
 535   //
 536   // Arguments:
 537   //    r0: oop to verify
 538   //    rscratch1: error message
 539   //
 540   // Stack after saving c_rarg3:
 541   //    [tos + 0]: saved c_rarg3
 542   //    [tos + 1]: saved c_rarg2
 543   //    [tos + 2]: saved lr
 544   //    [tos + 3]: saved rscratch2
 545   //    [tos + 4]: saved r0
 546   //    [tos + 5]: saved rscratch1
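  // A rough C-level model (not compiled) of the plausibility checks generated
  // below, ignoring the ZGC colour check.  The function and parameter names are
  // illustrative; mask/bits stand for Universe::verify_oop_mask()/_bits() and
  // klass for the word that load_klass() reads.
#if 0
  static bool oop_looks_reasonable(uintptr_t obj, uintptr_t mask,
                                   uintptr_t bits, uintptr_t klass) {
    if (obj == 0)             return true;    // a NULL oop is OK
    if ((obj & mask) != bits) return false;   // wrong area of memory
    return klass != 0;                        // broken if klass is NULL
  }
#endif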
 547   address generate_verify_oop() {
 548 
 549     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 550     address start = __ pc();
 551 
 552     Label exit, error;
 553 
 554     // save c_rarg2 and c_rarg3
 555     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 556 
 557     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 559     __ ldr(c_rarg3, Address(c_rarg2));
 560     __ add(c_rarg3, c_rarg3, 1);
 561     __ str(c_rarg3, Address(c_rarg2));
 562 
 563     // object is in r0
 564     // make sure object is 'reasonable'
 565     __ cbz(r0, exit); // if obj is NULL it is OK
 566 
 567 #if INCLUDE_ZGC
 568     if (UseZGC) {
 569       // Check if mask is good.
 570       // verifies that ZAddressBadMask & r0 == 0
 571       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 572       __ andr(c_rarg2, r0, c_rarg3);
 573       __ cbnz(c_rarg2, error);
 574     }
 575 #endif
 576 
 577     // Check if the oop is in the right area of memory
 578     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 579     __ andr(c_rarg2, r0, c_rarg3);
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 581 
 582     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 583     // instruction here because the flags register is live.
 584     __ eor(c_rarg2, c_rarg2, c_rarg3);
 585     __ cbnz(c_rarg2, error);
 586 
 587     // make sure klass is 'reasonable', which is not zero.
 588     __ load_klass(r0, r0);  // get klass
 589     __ cbz(r0, error);      // if klass is NULL it is broken
 590 
 591     // return if everything seems ok
 592     __ bind(exit);
 593 
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595     __ ret(lr);
 596 
 597     // handle errors
 598     __ bind(error);
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600 
 601     __ push(RegSet::range(r0, r29), sp);
 602     // debug(char* msg, int64_t pc, int64_t regs[])
 603     __ mov(c_rarg0, rscratch1);      // pass address of error message
 604     __ mov(c_rarg1, lr);             // pass return address
 605     __ mov(c_rarg2, sp);             // pass address of regs on stack
 606 #ifndef PRODUCT
 607     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 608 #endif
 609     BLOCK_COMMENT("call MacroAssembler::debug");
 610     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 611     __ blr(rscratch1);
 612     __ hlt(0);
 613 
 614     return start;
 615   }
 616 
 617   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 618 
 619   // Generate indices for iota vector.
 620   address generate_iota_indices(const char *stub_name) {
 621     __ align(CodeEntryAlignment);
 622     StubCodeMark mark(this, "StubRoutines", stub_name);
 623     address start = __ pc();
 624     __ emit_data64(0x0706050403020100, relocInfo::none);
 625     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 626     return start;
 627   }
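  // Not compiled: the two 64-bit constants above, emitted little-endian, lay
  // down the byte sequence 0..15 in memory -- a simple "iota" index vector.
#if 0
  static const uint8_t iota_bytes[16] =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
#endif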
 628 
 629   // The inner part of zero_words().  This is the bulk operation,
 630   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 631   // caller is responsible for zeroing the last few words.
 632   //
 633   // Inputs:
 634   // r10: the HeapWord-aligned base address of an array to zero.
 635   // r11: the count in HeapWords, r11 > 0.
 636   //
 637   // Returns r10 and r11, adjusted for the caller to clear.
 638   // r10: the base address of the tail of words left to clear.
 639   // r11: the number of words in the tail.
 640   //      r11 < MacroAssembler::zero_words_block_size.
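  //
  // A rough C-level model (not compiled) of that contract, ignoring the
  // DC ZVA fast path.  The block size of 8 words is illustrative only; the
  // real value is MacroAssembler::zero_words_block_size.
#if 0
  static void zero_blocks_model(int64_t*& base, int64_t& count) {
    const int64_t block = 8;              // assumed block size in words
    while (count >= block) {              // clear whole blocks only
      for (int64_t i = 0; i < block; i++) {
        *base++ = 0;
      }
      count -= block;
    }
    // on return base/count describe the tail (count < block words),
    // which the caller (zero_words) clears itself
  }
#endif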
 641 
 642   address generate_zero_blocks() {
 643     Label done;
 644     Label base_aligned;
 645 
 646     Register base = r10, cnt = r11;
 647 
 648     __ align(CodeEntryAlignment);
 649     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 650     address start = __ pc();
 651 
 652     if (UseBlockZeroing) {
 653       int zva_length = VM_Version::zva_length();
 654 
 655       // Ensure ZVA length can be divided by 16. This is required by
 656       // the subsequent operations.
 657       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 658 
 659       __ tbz(base, 3, base_aligned);
 660       __ str(zr, Address(__ post(base, 8)));
 661       __ sub(cnt, cnt, 1);
 662       __ bind(base_aligned);
 663 
 664       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 665       // alignment.
 666       Label small;
 667       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 668       __ subs(rscratch1, cnt, low_limit >> 3);
 669       __ br(Assembler::LT, small);
 670       __ zero_dcache_blocks(base, cnt);
 671       __ bind(small);
 672     }
 673 
 674     {
 675       // Number of stp instructions we'll unroll
 676       const int unroll =
 677         MacroAssembler::zero_words_block_size / 2;
 678       // Clear the remaining blocks.
 679       Label loop;
 680       __ subs(cnt, cnt, unroll * 2);
 681       __ br(Assembler::LT, done);
 682       __ bind(loop);
 683       for (int i = 0; i < unroll; i++)
 684         __ stp(zr, zr, __ post(base, 16));
 685       __ subs(cnt, cnt, unroll * 2);
 686       __ br(Assembler::GE, loop);
 687       __ bind(done);
 688       __ add(cnt, cnt, unroll * 2);
 689     }
 690 
 691     __ ret(lr);
 692 
 693     return start;
 694   }
 695 
 696 
 697   typedef enum {
 698     copy_forwards = 1,
 699     copy_backwards = -1
 700   } copy_direction;
 701 
 702   // Bulk copy of blocks of 8 words.
 703   //
 704   // count is a count of words.
 705   //
 706   // Precondition: count >= 8
 707   //
 708   // Postconditions:
 709   //
 710   // The least significant bit of count contains the remaining count
 711   // of words to copy.  The rest of count is trash.
 712   //
 713   // s and d are adjusted to point to the remaining words to copy
 714   //
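  // A rough C-level model (not compiled) of the contract above, ignoring the
  // prefetching and load/store scheduling the generated stub actually does.
  // Everything except a possible final odd word is copied; the caller handles
  // that last word.
#if 0
  static void copy_longs_model(int64_t*& s, int64_t*& d, int64_t count,
                               copy_direction direction) {
    int64_t n = count & ~(int64_t)1;      // all but the low bit of count
    for (int64_t i = 0; i < n; i++) {
      if (direction == copy_forwards) { *d++ = *s++; }
      else                            { *--d = *--s; }
    }
  }
#endif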
 715   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 716                            copy_direction direction) {
 717     int unit = wordSize * direction;
 718     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 719 
 720     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 721       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 722     const Register stride = r13;
 723 
 724     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 725     assert_different_registers(s, d, count, rscratch1);
 726 
 727     Label again, drain;
 728     const char *stub_name;
 729     if (direction == copy_forwards)
 730       stub_name = "forward_copy_longs";
 731     else
 732       stub_name = "backward_copy_longs";
 733 
 734     __ align(CodeEntryAlignment);
 735 
 736     StubCodeMark mark(this, "StubRoutines", stub_name);
 737 
 738     __ bind(start);
 739 
 740     Label unaligned_copy_long;
 741     if (AvoidUnalignedAccesses) {
 742       __ tbnz(d, 3, unaligned_copy_long);
 743     }
 744 
 745     if (direction == copy_forwards) {
 746       __ sub(s, s, bias);
 747       __ sub(d, d, bias);
 748     }
 749 
 750 #ifdef ASSERT
 751     // Make sure we are never given < 8 words
 752     {
 753       Label L;
 754       __ cmp(count, (u1)8);
 755       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 757       __ bind(L);
 758     }
 759 #endif
 760 
 761     // Fill 8 registers
 762     if (UseSIMDForMemoryOps) {
 763       __ ldpq(v0, v1, Address(s, 4 * unit));
 764       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 765     } else {
 766       __ ldp(t0, t1, Address(s, 2 * unit));
 767       __ ldp(t2, t3, Address(s, 4 * unit));
 768       __ ldp(t4, t5, Address(s, 6 * unit));
 769       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 770     }
 771 
 772     __ subs(count, count, 16);
 773     __ br(Assembler::LO, drain);
 774 
 775     int prefetch = PrefetchCopyIntervalInBytes;
 776     bool use_stride = false;
 777     if (direction == copy_backwards) {
 778        use_stride = prefetch > 256;
 779        prefetch = -prefetch;
 780        if (use_stride) __ mov(stride, prefetch);
 781     }
 782 
 783     __ bind(again);
 784 
 785     if (PrefetchCopyIntervalInBytes > 0)
 786       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 787 
 788     if (UseSIMDForMemoryOps) {
 789       __ stpq(v0, v1, Address(d, 4 * unit));
 790       __ ldpq(v0, v1, Address(s, 4 * unit));
 791       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 792       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 793     } else {
 794       __ stp(t0, t1, Address(d, 2 * unit));
 795       __ ldp(t0, t1, Address(s, 2 * unit));
 796       __ stp(t2, t3, Address(d, 4 * unit));
 797       __ ldp(t2, t3, Address(s, 4 * unit));
 798       __ stp(t4, t5, Address(d, 6 * unit));
 799       __ ldp(t4, t5, Address(s, 6 * unit));
 800       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 801       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 802     }
 803 
 804     __ subs(count, count, 8);
 805     __ br(Assembler::HS, again);
 806 
 807     // Drain
 808     __ bind(drain);
 809     if (UseSIMDForMemoryOps) {
 810       __ stpq(v0, v1, Address(d, 4 * unit));
 811       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 812     } else {
 813       __ stp(t0, t1, Address(d, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ stp(t4, t5, Address(d, 6 * unit));
 816       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 817     }
 818 
 819     {
 820       Label L1, L2;
 821       __ tbz(count, exact_log2(4), L1);
 822       if (UseSIMDForMemoryOps) {
 823         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 824         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 825       } else {
 826         __ ldp(t0, t1, Address(s, 2 * unit));
 827         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 828         __ stp(t0, t1, Address(d, 2 * unit));
 829         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 830       }
 831       __ bind(L1);
 832 
 833       if (direction == copy_forwards) {
 834         __ add(s, s, bias);
 835         __ add(d, d, bias);
 836       }
 837 
 838       __ tbz(count, 1, L2);
 839       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 840       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 841       __ bind(L2);
 842     }
 843 
 844     __ ret(lr);
 845 
 846     if (AvoidUnalignedAccesses) {
 847       Label drain, again;
 848       // Register order for storing. Order is different for backward copy.
 849 
 850       __ bind(unaligned_copy_long);
 851 
 852       // source address is even aligned, target odd aligned
 853       //
 854       // when forward copying word pairs we read long pairs at offsets
 855       // {0, 2, 4, 6} (in long words). when backwards copying we read
 856       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 857       // address by -2 in the forwards case so we can compute the
 858       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 859       // or -1.
 860       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
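      // Not compiled: the forward store pattern just described, in rough C.
      // With d 8-byte but not 16-byte aligned, writing one word, three pairs
      // and one word keeps every paired store 16-byte aligned.  The names are
      // illustrative; t[0..7] stand for t0..t7.
#if 0
      auto store_block_model = [](int64_t* d, const int64_t t[8]) {
        d[0] = t[0];                 // single word at byte offset 0
        d[1] = t[1];  d[2] = t[2];   // pair at byte offset 8
        d[3] = t[3];  d[4] = t[4];   // pair at byte offset 24
        d[5] = t[5];  d[6] = t[6];   // pair at byte offset 40
        d[7] = t[7];                 // single word at byte offset 56
      };
#endif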
 869 
 870       if (direction == copy_forwards) {
 871         __ sub(s, s, 16);
 872         __ sub(d, d, 8);
 873       }
 874 
 875       // Fill 8 registers
 876       //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
 881       //
 882       // t0 at offset 0,  t1 at offset 8
 883       // t2 at offset 16, t3 at offset 24
 884       // t4 at offset 32, t5 at offset 40
 885       // t6 at offset 48, t7 at offset 56
 886 
 887       // for backwards copy s was not offset so the register contents
 888       // are at these offsets into the preceding 64 byte block
 889       // relative to that original input and so on for each successive
 890       // preceding 64 byte block when s is updated. this explains the
 891       // slightly counter-intuitive looking pattern of register usage
 892       // in the stp instructions for backwards copy.
 893       //
 894       // t0 at offset -16, t1 at offset -8
 895       // t2 at offset -32, t3 at offset -24
 896       // t4 at offset -48, t5 at offset -40
 897       // t6 at offset -64, t7 at offset -56
 898 
 899       __ ldp(t0, t1, Address(s, 2 * unit));
 900       __ ldp(t2, t3, Address(s, 4 * unit));
 901       __ ldp(t4, t5, Address(s, 6 * unit));
 902       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 903 
 904       __ subs(count, count, 16);
 905       __ br(Assembler::LO, drain);
 906 
 907       int prefetch = PrefetchCopyIntervalInBytes;
 908       bool use_stride = false;
 909       if (direction == copy_backwards) {
 910          use_stride = prefetch > 256;
 911          prefetch = -prefetch;
 912          if (use_stride) __ mov(stride, prefetch);
 913       }
 914 
 915       __ bind(again);
 916 
 917       if (PrefetchCopyIntervalInBytes > 0)
 918         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 919 
 920       if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
       // offsets
 924        //
 925        // t0 at offset 0
 926        // t1 at offset 8,  t2 at offset 16
 927        // t3 at offset 24, t4 at offset 32
 928        // t5 at offset 40, t6 at offset 48
 929        // t7 at offset 56
 930 
 931         __ str(t0, Address(d, 1 * unit));
 932         __ stp(t1, t2, Address(d, 2 * unit));
 933         __ ldp(t0, t1, Address(s, 2 * unit));
 934         __ stp(t3, t4, Address(d, 4 * unit));
 935         __ ldp(t2, t3, Address(s, 4 * unit));
 936         __ stp(t5, t6, Address(d, 6 * unit));
 937         __ ldp(t4, t5, Address(s, 6 * unit));
 938         __ str(t7, Address(__ pre(d, 8 * unit)));
 939         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 940       } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
 949        //                   t6 at offset -64
 950        //
 951        // note that this matches the offsets previously noted for the
 952        // loads
 953 
 954         __ str(t1, Address(d, 1 * unit));
 955         __ stp(t3, t0, Address(d, 3 * unit));
 956         __ ldp(t0, t1, Address(s, 2 * unit));
 957         __ stp(t5, t2, Address(d, 5 * unit));
 958         __ ldp(t2, t3, Address(s, 4 * unit));
 959         __ stp(t7, t4, Address(d, 7 * unit));
 960         __ ldp(t4, t5, Address(s, 6 * unit));
 961         __ str(t6, Address(__ pre(d, 8 * unit)));
 962         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 963       }
 964 
 965       __ subs(count, count, 8);
 966       __ br(Assembler::HS, again);
 967 
 968       // Drain
 969       //
 970       // this uses the same pattern of offsets and register arguments
 971       // as above
 972       __ bind(drain);
 973       if (direction == copy_forwards) {
 974         __ str(t0, Address(d, 1 * unit));
 975         __ stp(t1, t2, Address(d, 2 * unit));
 976         __ stp(t3, t4, Address(d, 4 * unit));
 977         __ stp(t5, t6, Address(d, 6 * unit));
 978         __ str(t7, Address(__ pre(d, 8 * unit)));
 979       } else {
 980         __ str(t1, Address(d, 1 * unit));
 981         __ stp(t3, t0, Address(d, 3 * unit));
 982         __ stp(t5, t2, Address(d, 5 * unit));
 983         __ stp(t7, t4, Address(d, 7 * unit));
 984         __ str(t6, Address(__ pre(d, 8 * unit)));
 985       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 990       {
 991         Label L1, L2;
 992         __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
 997         __ ldp(t0, t1, Address(s, 2 * unit));
 998         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 999         if (direction == copy_forwards) {
1000           __ str(t0, Address(d, 1 * unit));
1001           __ stp(t1, t2, Address(d, 2 * unit));
1002           __ str(t3, Address(__ pre(d, 4 * unit)));
1003         } else {
1004           __ str(t1, Address(d, 1 * unit));
1005           __ stp(t3, t0, Address(d, 3 * unit));
1006           __ str(t2, Address(__ pre(d, 4 * unit)));
1007         }
1008         __ bind(L1);
1009 
1010         __ tbz(count, 1, L2);
1011        // this is the same as above but copying only 2 longs hence
1012        // there is no intervening stp between the str instructions
1013        // but note that the offset and register patterns are still
1014        // the same
1015         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1016         if (direction == copy_forwards) {
1017           __ str(t0, Address(d, 1 * unit));
1018           __ str(t1, Address(__ pre(d, 2 * unit)));
1019         } else {
1020           __ str(t1, Address(d, 1 * unit));
1021           __ str(t0, Address(__ pre(d, 2 * unit)));
1022         }
1023         __ bind(L2);
1024 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1027 
1028        if (direction == copy_forwards) {
1029          __ add(s, s, 16);
1030          __ add(d, d, 8);
1031        }
1032 
1033       }
1034 
1035       __ ret(lr);
1036       }
1037   }
1038 
1039   // Small copy: less than 16 bytes.
1040   //
1041   // NB: Ignores all of the bits of count which represent more than 15
1042   // bytes, so a caller doesn't have to mask them.
1043 
1044   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1045     bool is_backwards = step < 0;
1046     size_t granularity = uabs(step);
1047     int direction = is_backwards ? -1 : 1;
1048     int unit = wordSize * direction;
1049 
1050     Label Lword, Lint, Lshort, Lbyte;
1051 
1052     assert(granularity
1053            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1054 
1055     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1056 
1057     // ??? I don't know if this bit-test-and-branch is the right thing
1058     // to do.  It does a lot of jumping, resulting in several
1059     // mispredicted branches.  It might make more sense to do this
1060     // with something like Duff's device with a single computed branch.
1061 
1062     __ tbz(count, 3 - exact_log2(granularity), Lword);
1063     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1064     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1065     __ bind(Lword);
1066 
1067     if (granularity <= sizeof (jint)) {
1068       __ tbz(count, 2 - exact_log2(granularity), Lint);
1069       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1070       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1071       __ bind(Lint);
1072     }
1073 
1074     if (granularity <= sizeof (jshort)) {
1075       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1076       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1077       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1078       __ bind(Lshort);
1079     }
1080 
1081     if (granularity <= sizeof (jbyte)) {
1082       __ tbz(count, 0, Lbyte);
1083       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1084       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1085       __ bind(Lbyte);
1086     }
1087   }
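  // A C-level model (not compiled) of the tail copy above for the forward
  // byte case (granularity == 1).  Each low bit of count selects one
  // progressively smaller chunk, so at most 8 + 4 + 2 + 1 = 15 bytes move.
  // memcpy stands in for the individual load/store pairs.
#if 0
  static void copy_tail_model(uint8_t*& s, uint8_t*& d, int64_t count) {
    if (count & 8) { memcpy(d, s, 8); s += 8; d += 8; }
    if (count & 4) { memcpy(d, s, 4); s += 4; d += 4; }
    if (count & 2) { memcpy(d, s, 2); s += 2; d += 2; }
    if (count & 1) { *d++ = *s++; }
  }
#endif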
1088 
1089   Label copy_f, copy_b;
1090 
1091   // All-singing all-dancing memory copy.
1092   //
1093   // Copy count units of memory from s to d.  The size of a unit is
1094   // step, which can be positive or negative depending on the direction
1095   // of copy.  If is_aligned is false, we align the source address.
1096   //
1097 
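  // A rough C-level model (not compiled) of the forward-copy shape below.
  // Small copies are done inline with possibly overlapping loads and stores;
  // larger ones align the source, bulk-copy whole words via copy_f and let
  // copy_memory_small finish the tail.  memmove stands in for the generated
  // load/store sequences, and the byte count is an assumed simplification of
  // the (count, granularity) pair the real code works with.
#if 0
  static void copy_memory_model(uint8_t* s, uint8_t* d, size_t bytes) {
    if (bytes <= 80) {                              // 96 when SIMD pairs are used
      memmove(d, s, bytes);                         // the inline small cases
      return;
    }
    size_t head = (16 - ((uintptr_t)s & 15)) & 15;  // bytes to 2-word-align s
    memmove(d, s, head);   s += head;  d += head;  bytes -= head;
    size_t words = bytes / 8;
    memmove(d, s, words * 8);                       // models the call to copy_f
    s += words * 8;  d += words * 8;  bytes &= 7;
    memmove(d, s, bytes);                           // models copy_memory_small
  }
#endif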
1098   void copy_memory(bool is_aligned, Register s, Register d,
1099                    Register count, Register tmp, int step) {
1100     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1101     bool is_backwards = step < 0;
1102     unsigned int granularity = uabs(step);
1103     const Register t0 = r3, t1 = r4;
1104 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
    // matter because we always load all the data before writing anything
1107     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1108     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1109     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1110     const Register send = r17, dend = r16;
1111 
1112     if (PrefetchCopyIntervalInBytes > 0)
1113       __ prfm(Address(s, 0), PLDL1KEEP);
1114     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1115     __ br(Assembler::HI, copy_big);
1116 
1117     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1118     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1119 
1120     __ cmp(count, u1(16/granularity));
1121     __ br(Assembler::LS, copy16);
1122 
1123     __ cmp(count, u1(64/granularity));
1124     __ br(Assembler::HI, copy80);
1125 
1126     __ cmp(count, u1(32/granularity));
1127     __ br(Assembler::LS, copy32);
1128 
1129     // 33..64 bytes
1130     if (UseSIMDForMemoryOps) {
1131       __ ldpq(v0, v1, Address(s, 0));
1132       __ ldpq(v2, v3, Address(send, -32));
1133       __ stpq(v0, v1, Address(d, 0));
1134       __ stpq(v2, v3, Address(dend, -32));
1135     } else {
1136       __ ldp(t0, t1, Address(s, 0));
1137       __ ldp(t2, t3, Address(s, 16));
1138       __ ldp(t4, t5, Address(send, -32));
1139       __ ldp(t6, t7, Address(send, -16));
1140 
1141       __ stp(t0, t1, Address(d, 0));
1142       __ stp(t2, t3, Address(d, 16));
1143       __ stp(t4, t5, Address(dend, -32));
1144       __ stp(t6, t7, Address(dend, -16));
1145     }
1146     __ b(finish);
1147 
1148     // 17..32 bytes
1149     __ bind(copy32);
1150     __ ldp(t0, t1, Address(s, 0));
1151     __ ldp(t2, t3, Address(send, -16));
1152     __ stp(t0, t1, Address(d, 0));
1153     __ stp(t2, t3, Address(dend, -16));
1154     __ b(finish);
1155 
1156     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1158     __ bind(copy80);
1159     if (UseSIMDForMemoryOps) {
1160       __ ldpq(v0, v1, Address(s, 0));
1161       __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 bytes (sizeof(jint)): pointers for arrays of jint are
      // at least 4 byte aligned and pointers for arrays of jlong are 8 byte
      // aligned, while byte and short arrays give no such guarantee.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases, using a pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
1169       if (granularity < sizeof (jint)) {
1170         Label copy96;
1171         __ cmp(count, u1(80/granularity));
1172         __ br(Assembler::HI, copy96);
1173         __ ldp(t0, t1, Address(send, -16));
1174 
1175         __ stpq(v0, v1, Address(d, 0));
1176         __ stpq(v2, v3, Address(d, 32));
1177         __ stp(t0, t1, Address(dend, -16));
1178         __ b(finish);
1179 
1180         __ bind(copy96);
1181       }
1182       __ ldpq(v4, v5, Address(send, -32));
1183 
1184       __ stpq(v0, v1, Address(d, 0));
1185       __ stpq(v2, v3, Address(d, 32));
1186       __ stpq(v4, v5, Address(dend, -32));
1187     } else {
1188       __ ldp(t0, t1, Address(s, 0));
1189       __ ldp(t2, t3, Address(s, 16));
1190       __ ldp(t4, t5, Address(s, 32));
1191       __ ldp(t6, t7, Address(s, 48));
1192       __ ldp(t8, t9, Address(send, -16));
1193 
1194       __ stp(t0, t1, Address(d, 0));
1195       __ stp(t2, t3, Address(d, 16));
1196       __ stp(t4, t5, Address(d, 32));
1197       __ stp(t6, t7, Address(d, 48));
1198       __ stp(t8, t9, Address(dend, -16));
1199     }
1200     __ b(finish);
1201 
1202     // 0..16 bytes
1203     __ bind(copy16);
1204     __ cmp(count, u1(8/granularity));
1205     __ br(Assembler::LO, copy8);
1206 
1207     // 8..16 bytes
1208     __ ldr(t0, Address(s, 0));
1209     __ ldr(t1, Address(send, -8));
1210     __ str(t0, Address(d, 0));
1211     __ str(t1, Address(dend, -8));
1212     __ b(finish);
1213 
1214     if (granularity < 8) {
1215       // 4..7 bytes
1216       __ bind(copy8);
1217       __ tbz(count, 2 - exact_log2(granularity), copy4);
1218       __ ldrw(t0, Address(s, 0));
1219       __ ldrw(t1, Address(send, -4));
1220       __ strw(t0, Address(d, 0));
1221       __ strw(t1, Address(dend, -4));
1222       __ b(finish);
1223       if (granularity < 4) {
1224         // 0..3 bytes
1225         __ bind(copy4);
1226         __ cbz(count, finish); // get rid of 0 case
1227         if (granularity == 2) {
1228           __ ldrh(t0, Address(s, 0));
1229           __ strh(t0, Address(d, 0));
1230         } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by also loading and storing base + count/2
          // (count == 1: (s+0)->(d+0), count == 2,3: (s+1)->(d+1)).
          // This means in the 1 byte case we load/store the same
          // byte 3 times.  (A C sketch of this follows the enclosing block.)
1237           __ lsr(count, count, 1);
1238           __ ldrb(t0, Address(s, 0));
1239           __ ldrb(t1, Address(send, -1));
1240           __ ldrb(t2, Address(s, count));
1241           __ strb(t0, Address(d, 0));
1242           __ strb(t1, Address(dend, -1));
1243           __ strb(t2, Address(d, count));
1244         }
1245         __ b(finish);
1246       }
1247     }
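    // Not compiled: the 1..3 byte trick above in plain C++.  Loading src[0],
    // src[n - 1] and src[n / 2] covers all three lengths; for n == 1 the same
    // byte is simply moved three times.
#if 0
    auto copy_1_to_3_model = [](const uint8_t* src, uint8_t* dst, size_t n) {
      uint8_t first = src[0], last = src[n - 1], mid = src[n >> 1];
      dst[0] = first;  dst[n - 1] = last;  dst[n >> 1] = mid;
    };
#endif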
1248 
1249     __ bind(copy_big);
1250     if (is_backwards) {
1251       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1252       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1253     }
1254 
    // Now that we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.
1257 
1258     Label aligned;
1259 
1260     if (is_aligned) {
1261       // We may have to adjust by 1 word to get s 2-word-aligned.
1262       __ tbz(s, exact_log2(wordSize), aligned);
1263       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1264       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1265       __ sub(count, count, wordSize/granularity);
1266     } else {
1267       if (is_backwards) {
1268         __ andr(rscratch2, s, 2 * wordSize - 1);
1269       } else {
1270         __ neg(rscratch2, s);
1271         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1272       }
1273       // rscratch2 is the byte adjustment needed to align s.
1274       __ cbz(rscratch2, aligned);
1275       int shift = exact_log2(granularity);
1276       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1277       __ sub(count, count, rscratch2);
1278 
1279 #if 0
1280       // ?? This code is only correct for a disjoint copy.  It may or
1281       // may not make sense to use it in that case.
1282 
1283       // Copy the first pair; s and d may not be aligned.
1284       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1285       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1286 
1287       // Align s and d, adjust count
1288       if (is_backwards) {
1289         __ sub(s, s, rscratch2);
1290         __ sub(d, d, rscratch2);
1291       } else {
1292         __ add(s, s, rscratch2);
1293         __ add(d, d, rscratch2);
1294       }
1295 #else
1296       copy_memory_small(s, d, rscratch2, rscratch1, step);
1297 #endif
1298     }
1299 
1300     __ bind(aligned);
1301 
1302     // s is now 2-word-aligned.
1303 
1304     // We have a count of units and some trailing bytes.  Adjust the
1305     // count and do a bulk copy of words.
1306     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1307     if (direction == copy_forwards)
1308       __ bl(copy_f);
1309     else
1310       __ bl(copy_b);
1311 
1312     // And the tail.
1313     copy_memory_small(s, d, count, tmp, step);
1314 
1315     if (granularity >= 8) __ bind(copy8);
1316     if (granularity >= 4) __ bind(copy4);
1317     __ bind(finish);
1318   }
1319 
1320 
1321   void clobber_registers() {
1322 #ifdef ASSERT
1323     RegSet clobbered
1324       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1325     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1326     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1327     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1328       __ mov(*it, rscratch1);
1329     }
1330 #endif
1331 
1332   }
1333 
1334   // Scan over array at a for count oops, verifying each one.
1335   // Preserves a and count, clobbers rscratch1 and rscratch2.
1336   void verify_oop_array (int size, Register a, Register count, Register temp) {
1337     Label loop, end;
1338     __ mov(rscratch1, a);
1339     __ mov(rscratch2, zr);
1340     __ bind(loop);
1341     __ cmp(rscratch2, count);
1342     __ br(Assembler::HS, end);
1343     if (size == wordSize) {
1344       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1345       __ verify_oop(temp);
1346     } else {
1347       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1348       __ decode_heap_oop(temp); // calls verify_oop
1349     }
1350     __ add(rscratch2, rscratch2, 1);
1351     __ b(loop);
1352     __ bind(end);
1353   }
1354 
1355   // Arguments:
1356   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1357   //             ignored
1358   //   is_oop  - true => oop array, so generate store check code
1359   //   name    - stub name string
1360   //
1361   // Inputs:
1362   //   c_rarg0   - source array address
1363   //   c_rarg1   - destination array address
1364   //   c_rarg2   - element count, treated as ssize_t, can be zero
1365   //
1366   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1367   // the hardware handle it.  The two dwords within qwords that span
1368   // cache line boundaries will still be loaded and stored atomically.
1369   //
1370   // Side Effects:
1371   //   disjoint_int_copy_entry is set to the no-overlap entry point
1372   //   used by generate_conjoint_int_oop_copy().
1373   //
1374   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1375                                   const char *name, bool dest_uninitialized = false) {
1376     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1377     RegSet saved_reg = RegSet::of(s, d, count);
1378     __ align(CodeEntryAlignment);
1379     StubCodeMark mark(this, "StubRoutines", name);
1380     address start = __ pc();
1381     __ enter();
1382 
1383     if (entry != NULL) {
1384       *entry = __ pc();
1385       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1386       BLOCK_COMMENT("Entry:");
1387     }
1388 
1389     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1390     if (dest_uninitialized) {
1391       decorators |= IS_DEST_UNINITIALIZED;
1392     }
1393     if (aligned) {
1394       decorators |= ARRAYCOPY_ALIGNED;
1395     }
1396 
1397     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1398     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1399 
1400     if (is_oop) {
1401       // save regs before copy_memory
1402       __ push(RegSet::of(d, count), sp);
1403     }
1404     {
1405       // UnsafeCopyMemory page error: continue after ucm
1406       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1407       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1408       copy_memory(aligned, s, d, count, rscratch1, size);
1409     }
1410 
1411     if (is_oop) {
1412       __ pop(RegSet::of(d, count), sp);
1413       if (VerifyOops)
1414         verify_oop_array(size, d, count, r16);
1415     }
1416 
1417     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1418 
1419     __ leave();
1420     __ mov(r0, zr); // return 0
1421     __ ret(lr);
1422     return start;
1423   }
1424 
1425   // Arguments:
1426   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1427   //             ignored
1428   //   is_oop  - true => oop array, so generate store check code
1429   //   name    - stub name string
1430   //
1431   // Inputs:
1432   //   c_rarg0   - source array address
1433   //   c_rarg1   - destination array address
1434   //   c_rarg2   - element count, treated as ssize_t, can be zero
1435   //
1436   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1437   // the hardware handle it.  The two dwords within qwords that span
1438   // cache line boundaries will still be loaded and stored atomically.
1439   //
1440   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1441                                  address *entry, const char *name,
1442                                  bool dest_uninitialized = false) {
1443     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1444     RegSet saved_regs = RegSet::of(s, d, count);
1445     StubCodeMark mark(this, "StubRoutines", name);
1446     address start = __ pc();
1447     __ enter();
1448 
1449     if (entry != NULL) {
1450       *entry = __ pc();
1451       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1452       BLOCK_COMMENT("Entry:");
1453     }
1454 
1455     // use fwd copy when (d-s) above_equal (count*size)
1456     __ sub(rscratch1, d, s);
1457     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1458     __ br(Assembler::HS, nooverlap_target);
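    // Not compiled: the three-instruction test above in plain C++.  The
    // subtraction and comparison are unsigned, so a destination below the
    // source also takes the forward (no-overlap) path.  Names are illustrative.
#if 0
    auto can_use_forward_copy = [](const char* src, const char* dst,
                                   size_t count, size_t elem_size) {
      return (uintptr_t)(dst - src) >= count * elem_size;
    };
#endif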
1459 
1460     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1461     if (dest_uninitialized) {
1462       decorators |= IS_DEST_UNINITIALIZED;
1463     }
1464     if (aligned) {
1465       decorators |= ARRAYCOPY_ALIGNED;
1466     }
1467 
1468     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1469     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1470 
1471     if (is_oop) {
1472       // save regs before copy_memory
1473       __ push(RegSet::of(d, count), sp);
1474     }
1475     {
1476       // UnsafeCopyMemory page error: continue after ucm
1477       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1478       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1479       copy_memory(aligned, s, d, count, rscratch1, -size);
1480     }
1481     if (is_oop) {
1482       __ pop(RegSet::of(d, count), sp);
1483       if (VerifyOops)
1484         verify_oop_array(size, d, count, r16);
1485     }
1486     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1487     __ leave();
1488     __ mov(r0, zr); // return 0
1489     __ ret(lr);
1490     return start;
1491   }
1492 
1493   // Arguments:
1494   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1495   //             ignored
1496   //   name    - stub name string
1497   //
1498   // Inputs:
1499   //   c_rarg0   - source array address
1500   //   c_rarg1   - destination array address
1501   //   c_rarg2   - element count, treated as ssize_t, can be zero
1502   //
1503   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1504   // we let the hardware handle it.  The one to eight bytes within words,
1505   // dwords or qwords that span cache line boundaries will still be loaded
1506   // and stored atomically.
1507   //
1515   // Side Effects:
1516   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1517   //   used by generate_conjoint_byte_copy().
1518   //
1519   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1520     const bool not_oop = false;
1521     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1522   }
1523 
1524   // Arguments:
1525   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1526   //             ignored
1527   //   name    - stub name string
1528   //
1529   // Inputs:
1530   //   c_rarg0   - source array address
1531   //   c_rarg1   - destination array address
1532   //   c_rarg2   - element count, treated as ssize_t, can be zero
1533   //
1534   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1535   // we let the hardware handle it.  The one to eight bytes within words,
1536   // dwords or qwords that span cache line boundaries will still be loaded
1537   // and stored atomically.
1538   //
1539   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1540                                       address* entry, const char *name) {
1541     const bool not_oop = false;
1542     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1543   }
1544 
1545   // Arguments:
1546   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1547   //             ignored
1548   //   name    - stub name string
1549   //
1550   // Inputs:
1551   //   c_rarg0   - source array address
1552   //   c_rarg1   - destination array address
1553   //   c_rarg2   - element count, treated as ssize_t, can be zero
1554   //
1555   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1556   // let the hardware handle it.  The two or four words within dwords
1557   // or qwords that span cache line boundaries will still be loaded
1558   // and stored atomically.
1559   //
1560   // Side Effects:
1561   //   disjoint_short_copy_entry is set to the no-overlap entry point
1562   //   used by generate_conjoint_short_copy().
1563   //
1564   address generate_disjoint_short_copy(bool aligned,
1565                                        address* entry, const char *name) {
1566     const bool not_oop = false;
1567     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1568   }
1569 
1570   // Arguments:
1571   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1572   //             ignored
1573   //   name    - stub name string
1574   //
1575   // Inputs:
1576   //   c_rarg0   - source array address
1577   //   c_rarg1   - destination array address
1578   //   c_rarg2   - element count, treated as ssize_t, can be zero
1579   //
1580   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1581   // let the hardware handle it.  The two or four words within dwords
1582   // or qwords that span cache line boundaries will still be loaded
1583   // and stored atomically.
1584   //
1585   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1586                                        address *entry, const char *name) {
1587     const bool not_oop = false;
1588     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1590   }

1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
1600   //
1601   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1602   // the hardware handle it.  The two dwords within qwords that span
1603   // cache line boundaries will still be loaded and stored atomically.
1604   //
1605   // Side Effects:
1606   //   disjoint_int_copy_entry is set to the no-overlap entry point
1607   //   used by generate_conjoint_int_oop_copy().
1608   //
1609   address generate_disjoint_int_copy(bool aligned, address *entry,
1610                                          const char *name, bool dest_uninitialized = false) {
1611     const bool not_oop = false;
1612     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1613   }
1614 
1615   // Arguments:
1616   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1617   //             ignored
1618   //   name    - stub name string
1619   //
1620   // Inputs:
1621   //   c_rarg0   - source array address
1622   //   c_rarg1   - destination array address
1623   //   c_rarg2   - element count, treated as ssize_t, can be zero
1624   //
1625   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1626   // the hardware handle it.  The two dwords within qwords that span
1627   // cache line boundaries will still be loaded and stored atomically.
1628   //
1629   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1630                                      address *entry, const char *name,
1631                                      bool dest_uninitialized = false) {
1632     const bool not_oop = false;
1633     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1634   }
1635 
1636 
1637   // Arguments:
1638   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1639   //             ignored
1640   //   name    - stub name string
1641   //
1642   // Inputs:
1643   //   c_rarg0   - source array address
1644   //   c_rarg1   - destination array address
1645   //   c_rarg2   - element count, treated as size_t, can be zero
1646   //
1647   // Side Effects:
1648   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1649   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1650   //
1651   address generate_disjoint_long_copy(bool aligned, address *entry,
1652                                           const char *name, bool dest_uninitialized = false) {
1653     const bool not_oop = false;
1654     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1655   }
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as size_t, can be zero
1666   //
1667   address generate_conjoint_long_copy(bool aligned,
1668                                       address nooverlap_target, address *entry,
1669                                       const char *name, bool dest_uninitialized = false) {
1670     const bool not_oop = false;
1671     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1672   }
1673 
1674   // Arguments:
1675   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1676   //             ignored
1677   //   name    - stub name string
1678   //
1679   // Inputs:
1680   //   c_rarg0   - source array address
1681   //   c_rarg1   - destination array address
1682   //   c_rarg2   - element count, treated as size_t, can be zero
1683   //
1684   // Side Effects:
1685   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1686   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1687   //
1688   address generate_disjoint_oop_copy(bool aligned, address *entry,
1689                                      const char *name, bool dest_uninitialized) {
1690     const bool is_oop = true;
1691     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1692     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1693   }
1694 
1695   // Arguments:
1696   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1697   //             ignored
1698   //   name    - stub name string
1699   //
1700   // Inputs:
1701   //   c_rarg0   - source array address
1702   //   c_rarg1   - destination array address
1703   //   c_rarg2   - element count, treated as size_t, can be zero
1704   //
1705   address generate_conjoint_oop_copy(bool aligned,
1706                                      address nooverlap_target, address *entry,
1707                                      const char *name, bool dest_uninitialized) {
1708     const bool is_oop = true;
1709     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1710     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1711                                   name, dest_uninitialized);
1712   }
1713 
1714 
1715   // Helper for generating a dynamic type check.
1716   // Smashes rscratch1, rscratch2.
1717   void generate_type_check(Register sub_klass,
1718                            Register super_check_offset,
1719                            Register super_klass,
1720                            Label& L_success) {
1721     assert_different_registers(sub_klass, super_check_offset, super_klass);
1722 
1723     BLOCK_COMMENT("type_check:");
1724 
1725     Label L_miss;
1726 
1727     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1728                                      super_check_offset);
1729     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1730 
1731     // Fall through on failure!
1732     __ BIND(L_miss);
1733   }
1734 
1735   //
1736   //  Generate checkcasting array copy stub
1737   //
1738   //  Input:
1739   //    c_rarg0   - source array address
1740   //    c_rarg1   - destination array address
1741   //    c_rarg2   - element count, treated as ssize_t, can be zero
1742   //    c_rarg3   - size_t ckoff (super_check_offset)
1743   //    c_rarg4   - oop ckval (super_klass)
1744   //
1745   //  Output:
1746   //    r0 ==  0  -  success
1747   //    r0 == -1^K - failure, where K is partial transfer count
1748   //
1749   address generate_checkcast_copy(const char *name, address *entry,
1750                                   bool dest_uninitialized = false) {
1751 
1752     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1753 
1754     // Input registers (after setup_arg_regs)
1755     const Register from        = c_rarg0;   // source array address
1756     const Register to          = c_rarg1;   // destination array address
1757     const Register count       = c_rarg2;   // elements count
1758     const Register ckoff       = c_rarg3;   // super_check_offset
1759     const Register ckval       = c_rarg4;   // super_klass
1760 
1761     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1762     RegSet wb_post_saved_regs = RegSet::of(count);
1763 
1764     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1765     const Register copied_oop  = r22;       // actual oop copied
1766     const Register count_save  = r21;       // orig elements count
1767     const Register start_to    = r20;       // destination array start address
1768     const Register r19_klass   = r19;       // oop._klass
1769 
1770     //---------------------------------------------------------------
1771     // Assembler stub will be used for this call to arraycopy
1772     // if the two arrays are subtypes of Object[] but the
1773     // destination array type is not equal to or a supertype
1774     // of the source type.  Each element must be separately
1775     // checked.
1776 
1777     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1778                                copied_oop, r19_klass, count_save);
1779 
1780     __ align(CodeEntryAlignment);
1781     StubCodeMark mark(this, "StubRoutines", name);
1782     address start = __ pc();
1783 
1784     __ enter(); // required for proper stackwalking of RuntimeStub frame
1785 
1786 #ifdef ASSERT
1787     // caller guarantees that the arrays really are different
1788     // otherwise, we would have to make conjoint checks
1789     { Label L;
1790       array_overlap_test(L, TIMES_OOP);
1791       __ stop("checkcast_copy within a single array");
1792       __ bind(L);
1793     }
1794 #endif //ASSERT
1795 
1796     // Caller of this entry point must set up the argument registers.
1797     if (entry != NULL) {
1798       *entry = __ pc();
1799       BLOCK_COMMENT("Entry:");
1800     }
1801 
1802     // Empty array:  Nothing to do.
1803     __ cbz(count, L_done);
1804     __ push(RegSet::of(r19, r20, r21, r22), sp);
1805 
1806 #ifdef ASSERT
1807     BLOCK_COMMENT("assert consistent ckoff/ckval");
1808     // The ckoff and ckval must be mutually consistent,
1809     // even though caller generates both.
1810     { Label L;
1811       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1812       __ ldrw(start_to, Address(ckval, sco_offset));
1813       __ cmpw(ckoff, start_to);
1814       __ br(Assembler::EQ, L);
1815       __ stop("super_check_offset inconsistent");
1816       __ bind(L);
1817     }
1818 #endif //ASSERT
1819 
1820     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1821     bool is_oop = true;
1822     if (dest_uninitialized) {
1823       decorators |= IS_DEST_UNINITIALIZED;
1824     }
1825 
1826     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1827     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1828 
1829     // save the original count
1830     __ mov(count_save, count);
1831 
1832     // Copy from low to high addresses
1833     __ mov(start_to, to);              // Save destination array start address
1834     __ b(L_load_element);
1835 
1836     // ======== begin loop ========
1837     // (Loop is rotated; its entry is L_load_element.)
1838     // Loop control:
1839     //   for (; count != 0; count--) {
1840     //     copied_oop = load_heap_oop(from++);
1841     //     ... generate_type_check ...;
1842     //     store_heap_oop(to++, copied_oop);
1843     //   }
1844     __ align(OptoLoopAlignment);
1845 
1846     __ BIND(L_store_element);
1847     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1848     __ sub(count, count, 1);
1849     __ cbz(count, L_do_card_marks);
1850 
1851     // ======== loop entry is here ========
1852     __ BIND(L_load_element);
1853     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1854     __ cbz(copied_oop, L_store_element);
1855 
1856     __ load_klass(r19_klass, copied_oop);// query the object klass
1857     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1858     // ======== end loop ========
1859 
1860     // It was a real error; we must depend on the caller to finish the job.
1861     // Register count = remaining oops, count_orig = total oops.
1862     // Emit GC store barriers for the oops we have copied and report
1863     // their number to the caller.
1864 
1865     __ subs(count, count_save, count);     // K = partially copied oop count
1866     __ eon(count, count, zr);                   // report (-1^K) to caller
1867     __ br(Assembler::EQ, L_done_pop);
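         // Note: -1 ^ K == ~K, so count now holds ~K and is returned in r0 at
         // L_done; the caller recovers the number of oops copied before the
         // failure as ~r0.  The EQ branch above (flags still set by the subs)
         // skips the card-marking epilogue when nothing was copied.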
1868 
1869     __ BIND(L_do_card_marks);
1870     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1871 
1872     __ bind(L_done_pop);
1873     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1874     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1875 
1876     __ bind(L_done);
1877     __ mov(r0, count);
1878     __ leave();
1879     __ ret(lr);
1880 
1881     return start;
1882   }
1883 
1884   // Perform range checks on the proposed arraycopy.
1885   // Kills temp, but nothing else.
1886   // Also, clean the sign bits of src_pos and dst_pos.
1887   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1888                               Register src_pos, // source position (c_rarg1)
1889                               Register dst,     // destination array oop (c_rarg2)
1890                               Register dst_pos, // destination position (c_rarg3)
1891                               Register length,
1892                               Register temp,
1893                               Label& L_failed) {
1894     BLOCK_COMMENT("arraycopy_range_checks:");
1895 
1896     assert_different_registers(rscratch1, temp);
1897 
1898     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1899     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1900     __ addw(temp, length, src_pos);
1901     __ cmpw(temp, rscratch1);
1902     __ br(Assembler::HI, L_failed);
1903 
1904     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1905     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1906     __ addw(temp, length, dst_pos);
1907     __ cmpw(temp, rscratch1);
1908     __ br(Assembler::HI, L_failed);
1909 
1910     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1911     __ movw(src_pos, src_pos);
1912     __ movw(dst_pos, dst_pos);
1913 
1914     BLOCK_COMMENT("arraycopy_range_checks done");
1915   }
1916 
1917   // These stubs are currently only reached from a simple test routine;
1918   // they will be implemented properly once they have a real caller.
1920   static void fake_arraycopy_stub(address src, address dst, int count) {
1921     assert(count == 0, "huh?");
1922   }
1923 
1924 
1925   //
1926   //  Generate 'unsafe' array copy stub
1927   //  Though just as safe as the other stubs, it takes an unscaled
1928   //  size_t argument instead of an element count.
1929   //
1930   //  Input:
1931   //    c_rarg0   - source array address
1932   //    c_rarg1   - destination array address
1933   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1934   //
1935   // Examines the alignment of the operands and dispatches
1936   // to a long, int, short, or byte copy loop.
1937   //
1938   address generate_unsafe_copy(const char *name,
1939                                address byte_copy_entry,
1940                                address short_copy_entry,
1941                                address int_copy_entry,
1942                                address long_copy_entry) {
1943     Label L_long_aligned, L_int_aligned, L_short_aligned;
1944     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1945 
1946     __ align(CodeEntryAlignment);
1947     StubCodeMark mark(this, "StubRoutines", name);
1948     address start = __ pc();
1949     __ enter(); // required for proper stackwalking of RuntimeStub frame
1950 
1951     // bump this on entry, not on exit:
1952     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1953 
1954     __ orr(rscratch1, s, d);
1955     __ orr(rscratch1, rscratch1, count);
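         // rscratch1 now holds the OR of source address, destination address
         // and byte count; its low bits give the coarsest alignment shared by
         // all three.  Sketch of the dispatch below (explanatory only):
         //
         //   bits = s | d | count;
         //   if ((bits & (BytesPerLong - 1)) == 0) goto L_long_aligned;
         //   if ((bits & (BytesPerInt  - 1)) == 0) goto L_int_aligned;
         //   if ((bits & 1)                  == 0) goto L_short_aligned;
         //   // otherwise tail-call the byte copy stub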
1956 
1957     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1958     __ cbz(rscratch1, L_long_aligned);
1959     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1960     __ cbz(rscratch1, L_int_aligned);
1961     __ tbz(rscratch1, 0, L_short_aligned);
1962     __ b(RuntimeAddress(byte_copy_entry));
1963 
1964     __ BIND(L_short_aligned);
1965     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1966     __ b(RuntimeAddress(short_copy_entry));
1967     __ BIND(L_int_aligned);
1968     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1969     __ b(RuntimeAddress(int_copy_entry));
1970     __ BIND(L_long_aligned);
1971     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1972     __ b(RuntimeAddress(long_copy_entry));
1973 
1974     return start;
1975   }
1976 
1977   //
1978   //  Generate generic array copy stubs
1979   //
1980   //  Input:
1981   //    c_rarg0    -  src oop
1982   //    c_rarg1    -  src_pos (32-bits)
1983   //    c_rarg2    -  dst oop
1984   //    c_rarg3    -  dst_pos (32-bits)
1985   //    c_rarg4    -  element count (32-bits)
1986   //
1987   //  Output:
1988   //    r0 ==  0  -  success
1989   //    r0 == -1^K - failure, where K is partial transfer count
1990   //
1991   address generate_generic_copy(const char *name,
1992                                 address byte_copy_entry, address short_copy_entry,
1993                                 address int_copy_entry, address oop_copy_entry,
1994                                 address long_copy_entry, address checkcast_copy_entry) {
1995 
1996     Label L_failed, L_objArray;
1997     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1998 
1999     // Input registers
2000     const Register src        = c_rarg0;  // source array oop
2001     const Register src_pos    = c_rarg1;  // source position
2002     const Register dst        = c_rarg2;  // destination array oop
2003     const Register dst_pos    = c_rarg3;  // destination position
2004     const Register length     = c_rarg4;
2005 
2006 
2007     // Registers used as temps
2008     const Register dst_klass  = c_rarg5;
2009 
2010     __ align(CodeEntryAlignment);
2011 
2012     StubCodeMark mark(this, "StubRoutines", name);
2013 
2014     address start = __ pc();
2015 
2016     __ enter(); // required for proper stackwalking of RuntimeStub frame
2017 
2018     // bump this on entry, not on exit:
2019     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2020 
2021     //-----------------------------------------------------------------------
2022     // Assembler stub will be used for this call to arraycopy
2023     // if the following conditions are met:
2024     //
2025     // (1) src and dst must not be null.
2026     // (2) src_pos must not be negative.
2027     // (3) dst_pos must not be negative.
2028     // (4) length  must not be negative.
2029     // (5) src klass and dst klass should be the same and not NULL.
2030     // (6) src and dst should be arrays.
2031     // (7) src_pos + length must not exceed length of src.
2032     // (8) dst_pos + length must not exceed length of dst.
2033     //
2034 
2035     //  if (src == NULL) return -1;
2036     __ cbz(src, L_failed);
2037 
2038     //  if (src_pos < 0) return -1;
2039     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2040 
2041     //  if (dst == NULL) return -1;
2042     __ cbz(dst, L_failed);
2043 
2044     //  if (dst_pos < 0) return -1;
2045     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2046 
2047     // registers used as temp
2048     const Register scratch_length    = r16; // elements count to copy
2049     const Register scratch_src_klass = r17; // array klass
2050     const Register lh                = r15; // layout helper
2051 
2052     //  if (length < 0) return -1;
2053     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2054     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2055 
2056     __ load_klass(scratch_src_klass, src);
2057 #ifdef ASSERT
2058     //  assert(src->klass() != NULL);
2059     {
2060       BLOCK_COMMENT("assert klasses not null {");
2061       Label L1, L2;
2062       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2063       __ bind(L1);
2064       __ stop("broken null klass");
2065       __ bind(L2);
2066       __ load_klass(rscratch1, dst);
2067       __ cbz(rscratch1, L1);     // this would be broken also
2068       BLOCK_COMMENT("} assert klasses not null done");
2069     }
2070 #endif
2071 
2072     // Load layout helper (32-bits)
2073     //
2074     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2075     // 32        30    24            16              8     2                 0
2076     //
2077     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2078     //
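         // Illustrative decoding of lh, using the same Klass constants as the
         // extraction code below (explanatory only, not generated code):
         //
         //   log2_element_size = lh & _lh_log2_element_size_mask;
         //   header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //   array_tag         = (juint)lh >> _lh_array_tag_shift;  // lh < 0 for arrays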
2079 
2080     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2081 
2082     // Handle objArrays completely differently...
2083     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2084     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2085     __ movw(rscratch1, objArray_lh);
2086     __ eorw(rscratch2, lh, rscratch1);
2087     __ cbzw(rscratch2, L_objArray);
2088 
2089     //  if (src->klass() != dst->klass()) return -1;
2090     __ load_klass(rscratch2, dst);
2091     __ eor(rscratch2, rscratch2, scratch_src_klass);
2092     __ cbnz(rscratch2, L_failed);
2093 
2094     //  if (!src->is_Array()) return -1;
2095     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2096 
2097     // At this point, it is known to be a typeArray (array_tag 0x3).
2098 #ifdef ASSERT
2099     {
2100       BLOCK_COMMENT("assert primitive array {");
2101       Label L;
2102       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2103       __ cmpw(lh, rscratch2);
2104       __ br(Assembler::GE, L);
2105       __ stop("must be a primitive array");
2106       __ bind(L);
2107       BLOCK_COMMENT("} assert primitive array done");
2108     }
2109 #endif
2110 
2111     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2112                            rscratch2, L_failed);
2113 
2114     // TypeArrayKlass
2115     //
2116     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2117     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2118     //
2119 
2120     const Register rscratch1_offset = rscratch1;    // array offset
2121     const Register r15_elsize = lh; // element size
2122 
2123     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2124            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2125     __ add(src, src, rscratch1_offset);           // src array offset
2126     __ add(dst, dst, rscratch1_offset);           // dst array offset
2127     BLOCK_COMMENT("choose copy loop based on element size");
2128 
2129     // next registers should be set before the jump to corresponding stub
2130     const Register from     = c_rarg0;  // source array address
2131     const Register to       = c_rarg1;  // destination array address
2132     const Register count    = c_rarg2;  // elements count
2133 
2134     // 'from', 'to' and 'count' must be written in this order, since they
2135     // alias 'src', 'src_pos' and 'dst' respectively; setting one too early
     // would clobber an input that is still needed.
2136 
2137     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2138 
2139     // The possible values of elsize are 0-3, i.e. exact_log2(element
2140     // size in bytes).  We do a simple bitwise binary search.
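         // elsize bit patterns: 00 = byte, 01 = short, 10 = int, 11 = long.
         // The first tbnz (bit 1) separates {int, long} from {byte, short};
         // a second tbnz on bit 0 then picks the exact size within each pair.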
2141   __ BIND(L_copy_bytes);
2142     __ tbnz(r15_elsize, 1, L_copy_ints);
2143     __ tbnz(r15_elsize, 0, L_copy_shorts);
2144     __ lea(from, Address(src, src_pos));// src_addr
2145     __ lea(to,   Address(dst, dst_pos));// dst_addr
2146     __ movw(count, scratch_length); // length
2147     __ b(RuntimeAddress(byte_copy_entry));
2148 
2149   __ BIND(L_copy_shorts);
2150     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2151     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2152     __ movw(count, scratch_length); // length
2153     __ b(RuntimeAddress(short_copy_entry));
2154 
2155   __ BIND(L_copy_ints);
2156     __ tbnz(r15_elsize, 0, L_copy_longs);
2157     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2158     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2159     __ movw(count, scratch_length); // length
2160     __ b(RuntimeAddress(int_copy_entry));
2161 
2162   __ BIND(L_copy_longs);
2163 #ifdef ASSERT
2164     {
2165       BLOCK_COMMENT("assert long copy {");
2166       Label L;
2167       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2168       __ cmpw(r15_elsize, LogBytesPerLong);
2169       __ br(Assembler::EQ, L);
2170       __ stop("must be long copy, but elsize is wrong");
2171       __ bind(L);
2172       BLOCK_COMMENT("} assert long copy done");
2173     }
2174 #endif
2175     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(long_copy_entry));
2179 
2180     // ObjArrayKlass
2181   __ BIND(L_objArray);
2182     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2183 
2184     Label L_plain_copy, L_checkcast_copy;
2185     //  test array classes for subtyping
2186     __ load_klass(r15, dst);
2187     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2188     __ br(Assembler::NE, L_checkcast_copy);
2189 
2190     // Identically typed arrays can be copied without element-wise checks.
2191     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2192                            rscratch2, L_failed);
2193 
2194     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2195     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2196     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2197     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2198     __ movw(count, scratch_length); // length
2199   __ BIND(L_plain_copy);
2200     __ b(RuntimeAddress(oop_copy_entry));
2201 
2202   __ BIND(L_checkcast_copy);
2203     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2204     {
2205       // Before looking at dst.length, make sure dst is also an objArray.
2206       __ ldrw(rscratch1, Address(r15, lh_offset));
2207       __ movw(rscratch2, objArray_lh);
2208       __ eorw(rscratch1, rscratch1, rscratch2);
2209       __ cbnzw(rscratch1, L_failed);
2210 
2211       // It is safe to examine both src.length and dst.length.
2212       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2213                              r15, L_failed);
2214 
2215       __ load_klass(dst_klass, dst); // reload
2216 
2217       // Marshal the base address arguments now, freeing registers.
2218       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2219       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2220       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2221       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2222       __ movw(count, length);           // length (reloaded)
2223       Register sco_temp = c_rarg3;      // this register is free now
2224       assert_different_registers(from, to, count, sco_temp,
2225                                  dst_klass, scratch_src_klass);
2226       // assert_clean_int(count, sco_temp);
2227 
2228       // Generate the type check.
2229       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2230       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2231 
2232       // Smashes rscratch1, rscratch2
2233       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2234 
2235       // Fetch destination element klass from the ObjArrayKlass header.
2236       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2237       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2238       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2239 
2240       // the checkcast_copy loop needs two extra arguments:
2241       assert(c_rarg3 == sco_temp, "#3 already in place");
2242       // Set up arguments for checkcast_copy_entry.
2243       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2244       __ b(RuntimeAddress(checkcast_copy_entry));
2245     }
2246 
2247   __ BIND(L_failed);
2248     __ mov(r0, -1);
2249     __ leave();   // required for proper stackwalking of RuntimeStub frame
2250     __ ret(lr);
2251 
2252     return start;
2253   }
2254 
2255   //
2256   // Generate stub for array fill. If "aligned" is true, the
2257   // "to" address is assumed to be heapword aligned.
2258   //
2259   // Arguments for generated stub:
2260   //   to:    c_rarg0
2261   //   value: c_rarg1
2262   //   count: c_rarg2 treated as signed
2263   //
2264   address generate_fill(BasicType t, bool aligned, const char *name) {
2265     __ align(CodeEntryAlignment);
2266     StubCodeMark mark(this, "StubRoutines", name);
2267     address start = __ pc();
2268 
2269     BLOCK_COMMENT("Entry:");
2270 
2271     const Register to        = c_rarg0;  // destination array address
2272     const Register value     = c_rarg1;  // value
2273     const Register count     = c_rarg2;  // elements count
2274 
2275     const Register bz_base = r10;        // base for block_zero routine
2276     const Register cnt_words = r11;      // temp register
2277 
2278     __ enter();
2279 
2280     Label L_fill_elements, L_exit1;
2281 
2282     int shift = -1;
2283     switch (t) {
2284       case T_BYTE:
2285         shift = 0;
2286         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2287         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2288         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2289         __ br(Assembler::LO, L_fill_elements);
2290         break;
2291       case T_SHORT:
2292         shift = 1;
2293         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2294         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2295         __ br(Assembler::LO, L_fill_elements);
2296         break;
2297       case T_INT:
2298         shift = 2;
2299         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2300         __ br(Assembler::LO, L_fill_elements);
2301         break;
2302       default: ShouldNotReachHere();
2303     }
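         // 'value' now holds the fill pattern replicated across the low 32 bits
         // (e.g. a T_BYTE fill of 0xAB has become 0xABABABAB); it is widened to
         // 64 bits with bfi(value, value, 32, 32) before the bulk fill below.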
2304 
2305     // Align source address at 8 bytes address boundary.
2306     Label L_skip_align1, L_skip_align2, L_skip_align4;
2307     if (!aligned) {
2308       switch (t) {
2309         case T_BYTE:
2310           // One byte misalignment happens only for byte arrays.
2311           __ tbz(to, 0, L_skip_align1);
2312           __ strb(value, Address(__ post(to, 1)));
2313           __ subw(count, count, 1);
2314           __ bind(L_skip_align1);
2315           // Fallthrough
2316         case T_SHORT:
2317           // Two bytes misalignment happens only for byte and short (char) arrays.
2318           __ tbz(to, 1, L_skip_align2);
2319           __ strh(value, Address(__ post(to, 2)));
2320           __ subw(count, count, 2 >> shift);
2321           __ bind(L_skip_align2);
2322           // Fallthrough
2323         case T_INT:
2324           // Align to 8 bytes, we know we are 4 byte aligned to start.
2325           __ tbz(to, 2, L_skip_align4);
2326           __ strw(value, Address(__ post(to, 4)));
2327           __ subw(count, count, 4 >> shift);
2328           __ bind(L_skip_align4);
2329           break;
2330         default: ShouldNotReachHere();
2331       }
2332     }
2333 
2334     //
2335     //  Fill large chunks
2336     //
2337     __ lsrw(cnt_words, count, 3 - shift); // number of words
2338     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2339     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2340     if (UseBlockZeroing) {
2341       Label non_block_zeroing, rest;
2342       // If the fill value is zero we can use the fast zero_words().
2343       __ cbnz(value, non_block_zeroing);
2344       __ mov(bz_base, to);
2345       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2346       __ zero_words(bz_base, cnt_words);
2347       __ b(rest);
2348       __ bind(non_block_zeroing);
2349       __ fill_words(to, cnt_words, value);
2350       __ bind(rest);
2351     } else {
2352       __ fill_words(to, cnt_words, value);
2353     }
2354 
2355     // Remaining count is less than 8 bytes. Fill it by a single store.
2356     // Note that the total length is no less than 8 bytes.
2357     if (t == T_BYTE || t == T_SHORT) {
2358       Label L_exit1;
2359       __ cbzw(count, L_exit1);
2360       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2361       __ str(value, Address(to, -8));    // overwrite some elements
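           // This store deliberately overlaps memory that is already filled:
           // 'to' now points at the end of the region and the str rewrites its
           // last 8 bytes, refilling 8 - (count << shift) earlier bytes with
           // the same replicated pattern.  This is safe because the total fill
           // length is at least 8 bytes (shorter fills took L_fill_elements).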
2362       __ bind(L_exit1);
2363       __ leave();
2364       __ ret(lr);
2365     }
2366 
2367     // Handle copies less than 8 bytes.
2368     Label L_fill_2, L_fill_4, L_exit2;
2369     __ bind(L_fill_elements);
2370     switch (t) {
2371       case T_BYTE:
2372         __ tbz(count, 0, L_fill_2);
2373         __ strb(value, Address(__ post(to, 1)));
2374         __ bind(L_fill_2);
2375         __ tbz(count, 1, L_fill_4);
2376         __ strh(value, Address(__ post(to, 2)));
2377         __ bind(L_fill_4);
2378         __ tbz(count, 2, L_exit2);
2379         __ strw(value, Address(to));
2380         break;
2381       case T_SHORT:
2382         __ tbz(count, 0, L_fill_4);
2383         __ strh(value, Address(__ post(to, 2)));
2384         __ bind(L_fill_4);
2385         __ tbz(count, 1, L_exit2);
2386         __ strw(value, Address(to));
2387         break;
2388       case T_INT:
2389         __ cbzw(count, L_exit2);
2390         __ strw(value, Address(to));
2391         break;
2392       default: ShouldNotReachHere();
2393     }
2394     __ bind(L_exit2);
2395     __ leave();
2396     __ ret(lr);
2397     return start;
2398   }
2399 
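       // Write back the data cache line containing the address passed in
       // c_rarg0.  Together with generate_data_cache_writeback_sync() below,
       // this backs the Unsafe cache write-back intrinsics used for
       // persistent memory support.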
2400   address generate_data_cache_writeback() {
2401     const Register line        = c_rarg0;  // address of line to write back
2402 
2403     __ align(CodeEntryAlignment);
2404 
2405     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2406 
2407     address start = __ pc();
2408     __ enter();
2409     __ cache_wb(Address(line, 0));
2410     __ leave();
2411     __ ret(lr);
2412 
2413     return start;
2414   }
2415 
2416   address generate_data_cache_writeback_sync() {
2417     const Register is_pre     = c_rarg0;  // pre or post sync
2418 
2419     __ align(CodeEntryAlignment);
2420 
2421     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2422 
2423     // pre wbsync is a no-op
2424     // post wbsync emits a full memory barrier
2425 
2426     Label skip;
2427     address start = __ pc();
2428     __ enter();
2429     __ cbnz(is_pre, skip);
2430     __ cache_wbsync(false);
2431     __ bind(skip);
2432     __ leave();
2433     __ ret(lr);
2434 
2435     return start;
2436   }
2437 
2438   void generate_arraycopy_stubs() {
2439     address entry;
2440     address entry_jbyte_arraycopy;
2441     address entry_jshort_arraycopy;
2442     address entry_jint_arraycopy;
2443     address entry_oop_arraycopy;
2444     address entry_jlong_arraycopy;
2445     address entry_checkcast_arraycopy;
2446 
2447     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2448     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2449 
2450     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2451 
2452     //*** jbyte
2453     // Always need aligned and unaligned versions
2454     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2455                                                                                   "jbyte_disjoint_arraycopy");
2456     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2457                                                                                   &entry_jbyte_arraycopy,
2458                                                                                   "jbyte_arraycopy");
2459     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2460                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2461     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2462                                                                                   "arrayof_jbyte_arraycopy");
2463 
2464     //*** jshort
2465     // Always need aligned and unaligned versions
2466     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2467                                                                                     "jshort_disjoint_arraycopy");
2468     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2469                                                                                     &entry_jshort_arraycopy,
2470                                                                                     "jshort_arraycopy");
2471     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2472                                                                                     "arrayof_jshort_disjoint_arraycopy");
2473     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2474                                                                                     "arrayof_jshort_arraycopy");
2475 
2476     //*** jint
2477     // Aligned versions
2478     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2479                                                                                 "arrayof_jint_disjoint_arraycopy");
2480     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2481                                                                                 "arrayof_jint_arraycopy");
2482     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2483     // entry_jint_arraycopy always points to the unaligned version
2484     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2485                                                                                 "jint_disjoint_arraycopy");
2486     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2487                                                                                 &entry_jint_arraycopy,
2488                                                                                 "jint_arraycopy");
2489 
2490     //*** jlong
2491     // It is always aligned
2492     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2493                                                                                   "arrayof_jlong_disjoint_arraycopy");
2494     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2495                                                                                   "arrayof_jlong_arraycopy");
2496     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2497     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2498 
2499     //*** oops
2500     {
2501       // With compressed oops we need unaligned versions; notice that
2502       // we overwrite entry_oop_arraycopy.
2503       bool aligned = !UseCompressedOops;
2504 
2505       StubRoutines::_arrayof_oop_disjoint_arraycopy
2506         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2507                                      /*dest_uninitialized*/false);
2508       StubRoutines::_arrayof_oop_arraycopy
2509         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2510                                      /*dest_uninitialized*/false);
2511       // Aligned versions without pre-barriers
2512       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2513         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2514                                      /*dest_uninitialized*/true);
2515       StubRoutines::_arrayof_oop_arraycopy_uninit
2516         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2517                                      /*dest_uninitialized*/true);
2518     }
2519 
2520     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2521     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2522     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2523     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2524 
2525     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2526     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2527                                                                         /*dest_uninitialized*/true);
2528 
2529     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2530                                                               entry_jbyte_arraycopy,
2531                                                               entry_jshort_arraycopy,
2532                                                               entry_jint_arraycopy,
2533                                                               entry_jlong_arraycopy);
2534 
2535     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2536                                                                entry_jbyte_arraycopy,
2537                                                                entry_jshort_arraycopy,
2538                                                                entry_jint_arraycopy,
2539                                                                entry_oop_arraycopy,
2540                                                                entry_jlong_arraycopy,
2541                                                                entry_checkcast_arraycopy);
2542 
2543     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2544     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2545     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2546     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2547     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2548     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2549   }
2550 
2551   void generate_math_stubs() { Unimplemented(); }
2552 
2553   // Arguments:
2554   //
2555   // Inputs:
2556   //   c_rarg0   - source byte array address
2557   //   c_rarg1   - destination byte array address
2558   //   c_rarg2   - K (key) in little endian int array
2559   //
2560   address generate_aescrypt_encryptBlock() {
2561     __ align(CodeEntryAlignment);
2562     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2563 
2564     const Register from        = c_rarg0;  // source array address
2565     const Register to          = c_rarg1;  // destination array address
2566     const Register key         = c_rarg2;  // key array address
2567     const Register keylen      = rscratch1;
2568 
2569     address start = __ pc();
2570     __ enter();
2571 
2572     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2573 
2574     __ aesenc_loadkeys(key, keylen);
2575     __ aesecb_encrypt(from, to, keylen);
2576 
2577     __ mov(r0, 0);
2578 
2579     __ leave();
2580     __ ret(lr);
2581 
2582     return start;
2583   }
2584 
2585   // Arguments:
2586   //
2587   // Inputs:
2588   //   c_rarg0   - source byte array address
2589   //   c_rarg1   - destination byte array address
2590   //   c_rarg2   - K (key) in little endian int array
2591   //
2592   address generate_aescrypt_decryptBlock() {
2593     assert(UseAES, "need AES cryptographic extension support");
2594     __ align(CodeEntryAlignment);
2595     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2596     Label L_doLast;
2597 
2598     const Register from        = c_rarg0;  // source array address
2599     const Register to          = c_rarg1;  // destination array address
2600     const Register key         = c_rarg2;  // key array address
2601     const Register keylen      = rscratch1;
2602 
2603     address start = __ pc();
2604     __ enter(); // required for proper stackwalking of RuntimeStub frame
2605 
2606     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2607 
2608     __ aesecb_decrypt(from, to, key, keylen);
2609 
2610     __ mov(r0, 0);
2611 
2612     __ leave();
2613     __ ret(lr);
2614 
2615     return start;
2616   }
2617 
2618   // Arguments:
2619   //
2620   // Inputs:
2621   //   c_rarg0   - source byte array address
2622   //   c_rarg1   - destination byte array address
2623   //   c_rarg2   - K (key) in little endian int array
2624   //   c_rarg3   - r vector byte array address
2625   //   c_rarg4   - input length
2626   //
2627   // Output:
2628   //   x0        - input length
2629   //
2630   address generate_cipherBlockChaining_encryptAESCrypt() {
2631     assert(UseAES, "need AES cryptographic extension support");
2632     __ align(CodeEntryAlignment);
2633     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2634 
2635     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2636 
2637     const Register from        = c_rarg0;  // source array address
2638     const Register to          = c_rarg1;  // destination array address
2639     const Register key         = c_rarg2;  // key array address
2640     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector address
2641                                            // and left holding the last encrypted block on exit
2642     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2643     const Register keylen      = rscratch1;
2644 
2645     address start = __ pc();
2646 
2647       __ enter();
2648 
2649       __ movw(rscratch2, len_reg);
2650 
2651       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2652 
2653       __ ld1(v0, __ T16B, rvec);
2654 
2655       __ cmpw(keylen, 52);
2656       __ br(Assembler::CC, L_loadkeys_44);
2657       __ br(Assembler::EQ, L_loadkeys_52);
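           // keylen is the expanded key length in 32-bit words: 44, 52 or 60
           // for AES-128, AES-192 and AES-256 respectively.  The two branches
           // above skip loading the extra round keys that shorter keys do not
           // have (CC: keylen < 52 -> AES-128, EQ: keylen == 52 -> AES-192).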
2658 
2659       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2660       __ rev32(v17, __ T16B, v17);
2661       __ rev32(v18, __ T16B, v18);
2662     __ BIND(L_loadkeys_52);
2663       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2664       __ rev32(v19, __ T16B, v19);
2665       __ rev32(v20, __ T16B, v20);
2666     __ BIND(L_loadkeys_44);
2667       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2668       __ rev32(v21, __ T16B, v21);
2669       __ rev32(v22, __ T16B, v22);
2670       __ rev32(v23, __ T16B, v23);
2671       __ rev32(v24, __ T16B, v24);
2672       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2673       __ rev32(v25, __ T16B, v25);
2674       __ rev32(v26, __ T16B, v26);
2675       __ rev32(v27, __ T16B, v27);
2676       __ rev32(v28, __ T16B, v28);
2677       __ ld1(v29, v30, v31, __ T16B, key);
2678       __ rev32(v29, __ T16B, v29);
2679       __ rev32(v30, __ T16B, v30);
2680       __ rev32(v31, __ T16B, v31);
2681 
2682     __ BIND(L_aes_loop);
2683       __ ld1(v1, __ T16B, __ post(from, 16));
2684       __ eor(v0, __ T16B, v0, v1);
2685 
2686       __ br(Assembler::CC, L_rounds_44);
2687       __ br(Assembler::EQ, L_rounds_52);
2688 
2689       __ aese(v0, v17); __ aesmc(v0, v0);
2690       __ aese(v0, v18); __ aesmc(v0, v0);
2691     __ BIND(L_rounds_52);
2692       __ aese(v0, v19); __ aesmc(v0, v0);
2693       __ aese(v0, v20); __ aesmc(v0, v0);
2694     __ BIND(L_rounds_44);
2695       __ aese(v0, v21); __ aesmc(v0, v0);
2696       __ aese(v0, v22); __ aesmc(v0, v0);
2697       __ aese(v0, v23); __ aesmc(v0, v0);
2698       __ aese(v0, v24); __ aesmc(v0, v0);
2699       __ aese(v0, v25); __ aesmc(v0, v0);
2700       __ aese(v0, v26); __ aesmc(v0, v0);
2701       __ aese(v0, v27); __ aesmc(v0, v0);
2702       __ aese(v0, v28); __ aesmc(v0, v0);
2703       __ aese(v0, v29); __ aesmc(v0, v0);
2704       __ aese(v0, v30);
2705       __ eor(v0, __ T16B, v0, v31);
2706 
2707       __ st1(v0, __ T16B, __ post(to, 16));
2708 
2709       __ subw(len_reg, len_reg, 16);
2710       __ cbnzw(len_reg, L_aes_loop);
2711 
2712       __ st1(v0, __ T16B, rvec);
2713 
2714       __ mov(r0, rscratch2);
2715 
2716       __ leave();
2717       __ ret(lr);
2718 
2719       return start;
2720   }
2721 
2722   // Arguments:
2723   //
2724   // Inputs:
2725   //   c_rarg0   - source byte array address
2726   //   c_rarg1   - destination byte array address
2727   //   c_rarg2   - K (key) in little endian int array
2728   //   c_rarg3   - r vector byte array address
2729   //   c_rarg4   - input length
2730   //
2731   // Output:
2732   //   r0        - input length
2733   //
2734   address generate_cipherBlockChaining_decryptAESCrypt() {
2735     assert(UseAES, "need AES cryptographic extension support");
2736     __ align(CodeEntryAlignment);
2737     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2738 
2739     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2740 
2741     const Register from        = c_rarg0;  // source array address
2742     const Register to          = c_rarg1;  // destination array address
2743     const Register key         = c_rarg2;  // key array address
2744     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector address
2745                                            // and left holding the last input (ciphertext) block on exit
2746     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2747     const Register keylen      = rscratch1;
2748 
2749     address start = __ pc();
2750 
2751       __ enter();
2752 
2753       __ movw(rscratch2, len_reg);
2754 
2755       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2756 
2757       __ ld1(v2, __ T16B, rvec);
2758 
2759       __ ld1(v31, __ T16B, __ post(key, 16));
2760       __ rev32(v31, __ T16B, v31);
2761 
2762       __ cmpw(keylen, 52);
2763       __ br(Assembler::CC, L_loadkeys_44);
2764       __ br(Assembler::EQ, L_loadkeys_52);
2765 
2766       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2767       __ rev32(v17, __ T16B, v17);
2768       __ rev32(v18, __ T16B, v18);
2769     __ BIND(L_loadkeys_52);
2770       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2771       __ rev32(v19, __ T16B, v19);
2772       __ rev32(v20, __ T16B, v20);
2773     __ BIND(L_loadkeys_44);
2774       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2775       __ rev32(v21, __ T16B, v21);
2776       __ rev32(v22, __ T16B, v22);
2777       __ rev32(v23, __ T16B, v23);
2778       __ rev32(v24, __ T16B, v24);
2779       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2780       __ rev32(v25, __ T16B, v25);
2781       __ rev32(v26, __ T16B, v26);
2782       __ rev32(v27, __ T16B, v27);
2783       __ rev32(v28, __ T16B, v28);
2784       __ ld1(v29, v30, __ T16B, key);
2785       __ rev32(v29, __ T16B, v29);
2786       __ rev32(v30, __ T16B, v30);
2787 
2788     __ BIND(L_aes_loop);
2789       __ ld1(v0, __ T16B, __ post(from, 16));
2790       __ orr(v1, __ T16B, v0, v0);
2791 
2792       __ br(Assembler::CC, L_rounds_44);
2793       __ br(Assembler::EQ, L_rounds_52);
2794 
2795       __ aesd(v0, v17); __ aesimc(v0, v0);
2796       __ aesd(v0, v18); __ aesimc(v0, v0);
2797     __ BIND(L_rounds_52);
2798       __ aesd(v0, v19); __ aesimc(v0, v0);
2799       __ aesd(v0, v20); __ aesimc(v0, v0);
2800     __ BIND(L_rounds_44);
2801       __ aesd(v0, v21); __ aesimc(v0, v0);
2802       __ aesd(v0, v22); __ aesimc(v0, v0);
2803       __ aesd(v0, v23); __ aesimc(v0, v0);
2804       __ aesd(v0, v24); __ aesimc(v0, v0);
2805       __ aesd(v0, v25); __ aesimc(v0, v0);
2806       __ aesd(v0, v26); __ aesimc(v0, v0);
2807       __ aesd(v0, v27); __ aesimc(v0, v0);
2808       __ aesd(v0, v28); __ aesimc(v0, v0);
2809       __ aesd(v0, v29); __ aesimc(v0, v0);
2810       __ aesd(v0, v30);
2811       __ eor(v0, __ T16B, v0, v31);
2812       __ eor(v0, __ T16B, v0, v2);
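      // The first eor applies the final round key (v31); the second undoes the
      // CBC chaining by XORing with the previous ciphertext block (v2, which
      // holds the IV on the first iteration).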
2813 
2814       __ st1(v0, __ T16B, __ post(to, 16));
2815       __ orr(v2, __ T16B, v1, v1);
2816 
2817       __ subw(len_reg, len_reg, 16);
2818       __ cbnzw(len_reg, L_aes_loop);
2819 
2820       __ st1(v2, __ T16B, rvec);
2821 
2822       __ mov(r0, rscratch2);
2823 
2824       __ leave();
2825       __ ret(lr);
2826 
2827     return start;
2828   }
2829 
2830   // CTR AES crypt.
2831   // Arguments:
2832   //
2833   // Inputs:
2834   //   c_rarg0   - source byte array address
2835   //   c_rarg1   - destination byte array address
2836   //   c_rarg2   - K (key) in little endian int array
2837   //   c_rarg3   - counter vector byte array address
2838   //   c_rarg4   - input length
2839   //   c_rarg5   - saved encryptedCounter start
2840   //   c_rarg6   - saved used length
2841   //
2842   // Output:
2843   //   r0       - input length
2844   //
2845   address generate_counterMode_AESCrypt() {
2846     const Register in = c_rarg0;
2847     const Register out = c_rarg1;
2848     const Register key = c_rarg2;
2849     const Register counter = c_rarg3;
2850     const Register saved_len = c_rarg4, len = r10;
2851     const Register saved_encrypted_ctr = c_rarg5;
2852     const Register used_ptr = c_rarg6, used = r12;
2853 
2854     const Register offset = r7;
2855     const Register keylen = r11;
2856 
2857     const unsigned char block_size = 16;
2858     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until there are at least 8 blocks, so up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also to avoid blowing away too much icache, 4
    // blocks seems like a sensible compromise.
2865 
2866     // Algorithm:
2867     //
2868     //    if (len == 0) {
2869     //        goto DONE;
2870     //    }
2871     //    int result = len;
2872     //    do {
2873     //        if (used >= blockSize) {
2874     //            if (len >= bulk_width * blockSize) {
2875     //                CTR_large_block();
2876     //                if (len == 0)
2877     //                    goto DONE;
2878     //            }
2879     //            for (;;) {
2880     //                16ByteVector v0 = counter;
2881     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2882     //                used = 0;
2883     //                if (len < blockSize)
2884     //                    break;    /* goto NEXT */
2885     //                16ByteVector v1 = load16Bytes(in, offset);
2886     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
2888     //                used = blockSize;
2889     //                offset += blockSize;
2890     //                len -= blockSize;
2891     //                if (len == 0)
2892     //                    goto DONE;
2893     //            }
2894     //        }
2895     //      NEXT:
2896     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2897     //        len--;
2898     //    } while (len != 0);
2899     //  DONE:
2900     //    return result;
2901     //
2902     // CTR_large_block()
2903     //    Wide bulk encryption of whole blocks.
2904 
2905     __ align(CodeEntryAlignment);
2906     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2907     const address start = __ pc();
2908     __ enter();
2909 
2910     Label DONE, CTR_large_block, large_block_return;
2911     __ ldrw(used, Address(used_ptr));
2912     __ cbzw(saved_len, DONE);
2913 
2914     __ mov(len, saved_len);
2915     __ mov(offset, 0);
2916 
2917     // Compute #rounds for AES based on the length of the key array
2918     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2919 
2920     __ aesenc_loadkeys(key, keylen);
2921 
2922     {
2923       Label L_CTR_loop, NEXT;
2924 
2925       __ bind(L_CTR_loop);
2926 
2927       __ cmp(used, block_size);
2928       __ br(__ LO, NEXT);
2929 
2930       // Maybe we have a lot of data
2931       __ subsw(rscratch1, len, bulk_width * block_size);
2932       __ br(__ HS, CTR_large_block);
2933       __ BIND(large_block_return);
2934       __ cbzw(len, DONE);
2935 
2936       // Setup the counter
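      // The counter is kept big-endian in memory: rev32 byte-swaps each 32-bit
      // word so that addv can add one to the last word, and a second rev32
      // restores wire order before the value is stored back.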
2937       __ movi(v4, __ T4S, 0);
2938       __ movi(v5, __ T4S, 1);
2939       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2940 
2941       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2942       __ rev32(v16, __ T16B, v0);
2943       __ addv(v16, __ T4S, v16, v4);
2944       __ rev32(v16, __ T16B, v16);
2945       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2946 
2947       {
2948         // We have fewer than bulk_width blocks of data left. Encrypt
2949         // them one by one until there is less than a full block
2950         // remaining, being careful to save both the encrypted counter
2951         // and the counter.
2952 
2953         Label inner_loop;
2954         __ bind(inner_loop);
2955         // Counter to encrypt is in v0
2956         __ aesecb_encrypt(noreg, noreg, keylen);
2957         __ st1(v0, __ T16B, saved_encrypted_ctr);
2958 
2959         // Do we have a remaining full block?
2960 
2961         __ mov(used, 0);
2962         __ cmp(len, block_size);
2963         __ br(__ LO, NEXT);
2964 
2965         // Yes, we have a full block
2966         __ ldrq(v1, Address(in, offset));
2967         __ eor(v1, __ T16B, v1, v0);
2968         __ strq(v1, Address(out, offset));
2969         __ mov(used, block_size);
2970         __ add(offset, offset, block_size);
2971 
2972         __ subw(len, len, block_size);
2973         __ cbzw(len, DONE);
2974 
2975         // Increment the counter, store it back
2976         __ orr(v0, __ T16B, v16, v16);
2977         __ rev32(v16, __ T16B, v16);
2978         __ addv(v16, __ T4S, v16, v4);
2979         __ rev32(v16, __ T16B, v16);
2980         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2981 
2982         __ b(inner_loop);
2983       }
2984 
2985       __ BIND(NEXT);
2986 
2987       // Encrypt a single byte, and loop.
2988       // We expect this to be a rare event.
2989       __ ldrb(rscratch1, Address(in, offset));
2990       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2991       __ eor(rscratch1, rscratch1, rscratch2);
2992       __ strb(rscratch1, Address(out, offset));
2993       __ add(offset, offset, 1);
2994       __ add(used, used, 1);
      __ subw(len, len, 1);
2996       __ cbnzw(len, L_CTR_loop);
2997     }
2998 
2999     __ bind(DONE);
3000     __ strw(used, Address(used_ptr));
3001     __ mov(r0, saved_len);
3002 
3003     __ leave(); // required for proper stackwalking of RuntimeStub frame
3004     __ ret(lr);
3005 
3006     // Bulk encryption
3007 
3008     __ BIND (CTR_large_block);
3009     assert(bulk_width == 4 || bulk_width == 8, "must be");
3010 
3011     if (bulk_width == 8) {
3012       __ sub(sp, sp, 4 * 16);
3013       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3014     }
3015     __ sub(sp, sp, 4 * 16);
3016     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3017     RegSet saved_regs = (RegSet::of(in, out, offset)
3018                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3019     __ push(saved_regs, sp);
3020     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3021     __ add(in, in, offset);
3022     __ add(out, out, offset);
3023 
3024     // Keys should already be loaded into the correct registers
3025 
3026     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3027     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3028 
3029     // AES/CTR loop
3030     {
3031       Label L_CTR_loop;
3032       __ BIND(L_CTR_loop);
3033 
3034       // Setup the counters
3035       __ movi(v8, __ T4S, 0);
3036       __ movi(v9, __ T4S, 1);
3037       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3038 
3039       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3040         __ rev32(f, __ T16B, v16);
3041         __ addv(v16, __ T4S, v16, v8);
3042       }
3043 
3044       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3045 
3046       // Encrypt the counters
3047       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3048 
3049       if (bulk_width == 8) {
3050         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3051       }
3052 
3053       // XOR the encrypted counters with the inputs
3054       for (int i = 0; i < bulk_width; i++) {
3055         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3056       }
3057 
3058       // Write the encrypted data
3059       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3060       if (bulk_width == 8) {
3061         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3062       }
3063 
3064       __ subw(len, len, 16 * bulk_width);
3065       __ cbnzw(len, L_CTR_loop);
3066     }
3067 
3068     // Save the counter back where it goes
3069     __ rev32(v16, __ T16B, v16);
3070     __ st1(v16, __ T16B, counter);
3071 
3072     __ pop(saved_regs, sp);
3073 
3074     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3075     if (bulk_width == 8) {
3076       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3077     }
3078 
3079     __ andr(rscratch1, len, -16 * bulk_width);
3080     __ sub(len, len, rscratch1);
3081     __ add(offset, offset, rscratch1);
3082     __ mov(used, 16);
3083     __ strw(used, Address(used_ptr));
3084     __ b(large_block_return);
3085 
3086     return start;
3087   }
3088 
3089   // Vector AES Galois Counter Mode implementation. Parameters:
3090   //
3091   // in = c_rarg0
3092   // len = c_rarg1
3093   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3094   // out = c_rarg3
3095   // key = c_rarg4
3096   // state = c_rarg5 - GHASH.state
3097   // subkeyHtbl = c_rarg6 - powers of H
3098   // counter = c_rarg7 - 16 bytes of CTR
3099   // return - number of processed bytes
3100   address generate_galoisCounterMode_AESCrypt() {
3101     address ghash_polynomial = __ pc();
3102     __ emit_int64(0x87);  // The low-order bits of the field
3103                           // polynomial (i.e. p = z^7+z^2+z+1)
3104                           // repeated in the low and high parts of a
3105                           // 128-bit vector
3106     __ emit_int64(0x87);
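
    // GHASH background (reference, not the stub itself): the hash state is
    // updated for each 16-byte ciphertext block X as
    //
    //   state = (state ^ X) * H   in GF(2^128) mod x^128 + x^7 + x^2 + x + 1
    //
    // and the 0x87 constant above encodes the low-order reduction terms
    // (x^7 + x^2 + x + 1) used by the carry-less multiply reduction.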
3107 
3108     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3110     address start = __ pc();
3111     __ enter();
3112 
3113     const Register in = c_rarg0;
3114     const Register len = c_rarg1;
3115     const Register ct = c_rarg2;
    const Register out = c_rarg3;
3118 
3119     const Register key = c_rarg4;
3120     const Register state = c_rarg5;
3121 
3122     const Register subkeyHtbl = c_rarg6;
3123 
    const Register counter = c_rarg7; // updated with the incremented counter at the end
3125 
3126     const Register keylen = r10;
3127     // Save state before entering routine
3128     __ sub(sp, sp, 4 * 16);
3129     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3130     __ sub(sp, sp, 4 * 16);
3131     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3132 
3134     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3135     __ str(len, __ pre(sp, -2 * wordSize));
3136 
3137     Label DONE;
3138     __ cbz(len, DONE);
3139 
3140     // Compute #rounds for AES based on the length of the key array
3141     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3142 
3143     __ aesenc_loadkeys(key, keylen);
3144     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3145     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3146 
3147     // AES/CTR loop
3148     {
3149       Label L_CTR_loop;
3150       __ BIND(L_CTR_loop);
3151 
3152       // Setup the counters
3153       __ movi(v8, __ T4S, 0);
3154       __ movi(v9, __ T4S, 1);
3155       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3156       for (FloatRegister f = v0; f < v8; f++) {
3157         __ rev32(f, __ T16B, v16);
3158         __ addv(v16, __ T4S, v16, v8);
3159       }
3160 
3161       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3162 
3163       // Encrypt the counters
3164       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3165 
3166       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3167 
3168       // XOR the encrypted counters with the inputs
3169       for (int i = 0; i < 8; i++) {
3170         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3171       }
3172       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3173       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3174 
3175       __ subw(len, len, 16 * 8);
3176       __ cbnzw(len, L_CTR_loop);
3177     }
3178 
3179     __ rev32(v16, __ T16B, v16);
3180     __ st1(v16, __ T16B, counter);
3181 
3182     __ ldr(len, Address(sp));
3183     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3184 
3185     // GHASH/CTR loop
3186     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3187                                 len, /*unrolls*/4);
3188 
#ifdef ASSERT
    {
      Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif
3197 
    __ bind(DONE);
3199     // Return the number of bytes processed
3200     __ ldr(r0, __ post(sp, 2 * wordSize));
3201 
3202     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3203     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3204 
3205     __ leave(); // required for proper stackwalking of RuntimeStub frame
3206     __ ret(lr);
    return start;
3208   }
3209 
3210   // Arguments:
3211   //
3212   // Inputs:
3213   //   c_rarg0   - byte[]  source+offset
3214   //   c_rarg1   - int[]   SHA.state
3215   //   c_rarg2   - int     offset
3216   //   c_rarg3   - int     limit
3217   //
3218   address generate_md5_implCompress(bool multi_block, const char *name) {
3219     __ align(CodeEntryAlignment);
3220     StubCodeMark mark(this, "StubRoutines", name);
3221     address start = __ pc();
3222 
3223     Register buf       = c_rarg0;
3224     Register state     = c_rarg1;
3225     Register ofs       = c_rarg2;
3226     Register limit     = c_rarg3;
3227     Register a         = r4;
3228     Register b         = r5;
3229     Register c         = r6;
3230     Register d         = r7;
3231     Register rscratch3 = r10;
3232     Register rscratch4 = r11;
3233 
3234     Label keys;
3235     Label md5_loop;
3236 
3237     __ BIND(md5_loop);
3238 
3239     // Save hash values for addition after rounds
3240     __ ldrw(a, Address(state,  0));
3241     __ ldrw(b, Address(state,  4));
3242     __ ldrw(c, Address(state,  8));
3243     __ ldrw(d, Address(state, 12));
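
    // Each of the FF/GG/HH/II macros below implements one MD5 step of the form
    //
    //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
    //
    // where f is F(x,y,z) = (x & y) | (~x & z)  for FF,
    //            G(x,y,z) = (x & z) | (y & ~z)  for GG,
    //            H(x,y,z) = x ^ y ^ z           for HH,
    //        and I(x,y,z) = y ^ (x | ~z)        for II.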
3244 
3245 #define FF(r1, r2, r3, r4, k, s, t)              \
3246     __ eorw(rscratch3, r3, r4);                  \
3247     __ movw(rscratch2, t);                       \
3248     __ andw(rscratch3, rscratch3, r2);           \
3249     __ addw(rscratch4, r1, rscratch2);           \
3250     __ ldrw(rscratch1, Address(buf, k*4));       \
3251     __ eorw(rscratch3, rscratch3, r4);           \
3252     __ addw(rscratch3, rscratch3, rscratch1);    \
3253     __ addw(rscratch3, rscratch3, rscratch4);    \
3254     __ rorw(rscratch2, rscratch3, 32 - s);       \
3255     __ addw(r1, rscratch2, r2);
3256 
3257 #define GG(r1, r2, r3, r4, k, s, t)              \
3258     __ eorw(rscratch2, r2, r3);                  \
3259     __ ldrw(rscratch1, Address(buf, k*4));       \
3260     __ andw(rscratch3, rscratch2, r4);           \
3261     __ movw(rscratch2, t);                       \
3262     __ eorw(rscratch3, rscratch3, r3);           \
3263     __ addw(rscratch4, r1, rscratch2);           \
3264     __ addw(rscratch3, rscratch3, rscratch1);    \
3265     __ addw(rscratch3, rscratch3, rscratch4);    \
3266     __ rorw(rscratch2, rscratch3, 32 - s);       \
3267     __ addw(r1, rscratch2, r2);
3268 
3269 #define HH(r1, r2, r3, r4, k, s, t)              \
3270     __ eorw(rscratch3, r3, r4);                  \
3271     __ movw(rscratch2, t);                       \
3272     __ addw(rscratch4, r1, rscratch2);           \
3273     __ ldrw(rscratch1, Address(buf, k*4));       \
3274     __ eorw(rscratch3, rscratch3, r2);           \
3275     __ addw(rscratch3, rscratch3, rscratch1);    \
3276     __ addw(rscratch3, rscratch3, rscratch4);    \
3277     __ rorw(rscratch2, rscratch3, 32 - s);       \
3278     __ addw(r1, rscratch2, r2);
3279 
3280 #define II(r1, r2, r3, r4, k, s, t)              \
3281     __ movw(rscratch3, t);                       \
3282     __ ornw(rscratch2, r2, r4);                  \
3283     __ addw(rscratch4, r1, rscratch3);           \
3284     __ ldrw(rscratch1, Address(buf, k*4));       \
3285     __ eorw(rscratch3, rscratch2, r3);           \
3286     __ addw(rscratch3, rscratch3, rscratch1);    \
3287     __ addw(rscratch3, rscratch3, rscratch4);    \
3288     __ rorw(rscratch2, rscratch3, 32 - s);       \
3289     __ addw(r1, rscratch2, r2);
3290 
3291     // Round 1
3292     FF(a, b, c, d,  0,  7, 0xd76aa478)
3293     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3294     FF(c, d, a, b,  2, 17, 0x242070db)
3295     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3296     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3297     FF(d, a, b, c,  5, 12, 0x4787c62a)
3298     FF(c, d, a, b,  6, 17, 0xa8304613)
3299     FF(b, c, d, a,  7, 22, 0xfd469501)
3300     FF(a, b, c, d,  8,  7, 0x698098d8)
3301     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3302     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3303     FF(b, c, d, a, 11, 22, 0x895cd7be)
3304     FF(a, b, c, d, 12,  7, 0x6b901122)
3305     FF(d, a, b, c, 13, 12, 0xfd987193)
3306     FF(c, d, a, b, 14, 17, 0xa679438e)
3307     FF(b, c, d, a, 15, 22, 0x49b40821)
3308 
3309     // Round 2
3310     GG(a, b, c, d,  1,  5, 0xf61e2562)
3311     GG(d, a, b, c,  6,  9, 0xc040b340)
3312     GG(c, d, a, b, 11, 14, 0x265e5a51)
3313     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3314     GG(a, b, c, d,  5,  5, 0xd62f105d)
3315     GG(d, a, b, c, 10,  9, 0x02441453)
3316     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3317     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3318     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3319     GG(d, a, b, c, 14,  9, 0xc33707d6)
3320     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3321     GG(b, c, d, a,  8, 20, 0x455a14ed)
3322     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3323     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3324     GG(c, d, a, b,  7, 14, 0x676f02d9)
3325     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3326 
3327     // Round 3
3328     HH(a, b, c, d,  5,  4, 0xfffa3942)
3329     HH(d, a, b, c,  8, 11, 0x8771f681)
3330     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3331     HH(b, c, d, a, 14, 23, 0xfde5380c)
3332     HH(a, b, c, d,  1,  4, 0xa4beea44)
3333     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3334     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3335     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3336     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3337     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3338     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3339     HH(b, c, d, a,  6, 23, 0x04881d05)
3340     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3341     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3342     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3343     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3344 
3345     // Round 4
3346     II(a, b, c, d,  0,  6, 0xf4292244)
3347     II(d, a, b, c,  7, 10, 0x432aff97)
3348     II(c, d, a, b, 14, 15, 0xab9423a7)
3349     II(b, c, d, a,  5, 21, 0xfc93a039)
3350     II(a, b, c, d, 12,  6, 0x655b59c3)
3351     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3352     II(c, d, a, b, 10, 15, 0xffeff47d)
3353     II(b, c, d, a,  1, 21, 0x85845dd1)
3354     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3355     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3356     II(c, d, a, b,  6, 15, 0xa3014314)
3357     II(b, c, d, a, 13, 21, 0x4e0811a1)
3358     II(a, b, c, d,  4,  6, 0xf7537e82)
3359     II(d, a, b, c, 11, 10, 0xbd3af235)
3360     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3361     II(b, c, d, a,  9, 21, 0xeb86d391)
3362 
3363 #undef FF
3364 #undef GG
3365 #undef HH
3366 #undef II
3367 
3368     // write hash values back in the correct order
3369     __ ldrw(rscratch1, Address(state,  0));
3370     __ addw(rscratch1, rscratch1, a);
3371     __ strw(rscratch1, Address(state,  0));
3372 
3373     __ ldrw(rscratch2, Address(state,  4));
3374     __ addw(rscratch2, rscratch2, b);
3375     __ strw(rscratch2, Address(state,  4));
3376 
3377     __ ldrw(rscratch3, Address(state,  8));
3378     __ addw(rscratch3, rscratch3, c);
3379     __ strw(rscratch3, Address(state,  8));
3380 
3381     __ ldrw(rscratch4, Address(state, 12));
3382     __ addw(rscratch4, rscratch4, d);
3383     __ strw(rscratch4, Address(state, 12));
3384 
3385     if (multi_block) {
3386       __ add(buf, buf, 64);
3387       __ add(ofs, ofs, 64);
3388       __ cmp(ofs, limit);
3389       __ br(Assembler::LE, md5_loop);
3390       __ mov(c_rarg0, ofs); // return ofs
3391     }
3392 
3393     __ ret(lr);
3394 
3395     return start;
3396   }
3397 
3398   // Arguments:
3399   //
3400   // Inputs:
3401   //   c_rarg0   - byte[]  source+offset
3402   //   c_rarg1   - int[]   SHA.state
3403   //   c_rarg2   - int     offset
3404   //   c_rarg3   - int     limit
3405   //
3406   address generate_sha1_implCompress(bool multi_block, const char *name) {
3407     __ align(CodeEntryAlignment);
3408     StubCodeMark mark(this, "StubRoutines", name);
3409     address start = __ pc();
3410 
3411     Register buf   = c_rarg0;
3412     Register state = c_rarg1;
3413     Register ofs   = c_rarg2;
3414     Register limit = c_rarg3;
3415 
3416     Label keys;
3417     Label sha1_loop;
3418 
3419     // load the keys into v0..v3
3420     __ adr(rscratch1, keys);
3421     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (160-bit) state into v6 and v7
3423     __ ldrq(v6, Address(state, 0));
3424     __ ldrs(v7, Address(state, 16));
3425 
3426 
3427     __ BIND(sha1_loop);
3428     // load 64 bytes of data into v16..v19
3429     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3430     __ rev32(v16, __ T16B, v16);
3431     __ rev32(v17, __ T16B, v17);
3432     __ rev32(v18, __ T16B, v18);
3433     __ rev32(v19, __ T16B, v19);
3434 
3435     // do the sha1
3436     __ addv(v4, __ T4S, v16, v0);
3437     __ orr(v20, __ T16B, v6, v6);
3438 
3439     FloatRegister d0 = v16;
3440     FloatRegister d1 = v17;
3441     FloatRegister d2 = v18;
3442     FloatRegister d3 = v19;
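
    // The 80 SHA-1 rounds are processed four at a time: sha1c applies the Ch
    // function (rounds 0..19), sha1p the Parity function (rounds 20..39 and
    // 60..79), and sha1m the Maj function (rounds 40..59), which is why the
    // loop below selects sha1c for round < 5, sha1m for rounds 10..14, and
    // sha1p otherwise.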
3443 
3444     for (int round = 0; round < 20; round++) {
3445       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3446       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3447       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3448       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3449       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3450 
3451       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3452       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3453       __ sha1h(tmp2, __ T4S, v20);
3454       if (round < 5)
3455         __ sha1c(v20, __ T4S, tmp3, tmp4);
3456       else if (round < 10 || round >= 15)
3457         __ sha1p(v20, __ T4S, tmp3, tmp4);
3458       else
3459         __ sha1m(v20, __ T4S, tmp3, tmp4);
3460       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3461 
3462       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3463     }
3464 
3465     __ addv(v7, __ T2S, v7, v21);
3466     __ addv(v6, __ T4S, v6, v20);
3467 
3468     if (multi_block) {
3469       __ add(ofs, ofs, 64);
3470       __ cmp(ofs, limit);
3471       __ br(Assembler::LE, sha1_loop);
3472       __ mov(c_rarg0, ofs); // return ofs
3473     }
3474 
3475     __ strq(v6, Address(state, 0));
3476     __ strs(v7, Address(state, 16));
3477 
3478     __ ret(lr);
3479 
3480     __ bind(keys);
3481     __ emit_int32(0x5a827999);
3482     __ emit_int32(0x6ed9eba1);
3483     __ emit_int32(0x8f1bbcdc);
3484     __ emit_int32(0xca62c1d6);
3485 
3486     return start;
3487   }
3488 
3489 
3490   // Arguments:
3491   //
3492   // Inputs:
3493   //   c_rarg0   - byte[]  source+offset
3494   //   c_rarg1   - int[]   SHA.state
3495   //   c_rarg2   - int     offset
3496   //   c_rarg3   - int     limit
3497   //
3498   address generate_sha256_implCompress(bool multi_block, const char *name) {
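    // The 64 SHA-256 round constants K[0..63] from FIPS 180-4 (the first
    // 32 bits of the fractional parts of the cube roots of the first
    // 64 primes).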
3499     static const uint32_t round_consts[64] = {
3500       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3501       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3502       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3503       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3504       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3505       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3506       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3507       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3508       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3509       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3510       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3511       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3512       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3513       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3514       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3515       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3516     };
3517     __ align(CodeEntryAlignment);
3518     StubCodeMark mark(this, "StubRoutines", name);
3519     address start = __ pc();
3520 
3521     Register buf   = c_rarg0;
3522     Register state = c_rarg1;
3523     Register ofs   = c_rarg2;
3524     Register limit = c_rarg3;
3525 
    Label sha256_loop;
3527 
3528     __ stpd(v8, v9, __ pre(sp, -32));
3529     __ stpd(v10, v11, Address(sp, 16));
3530 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3538 
3539     // load 16 keys to v16..v31
3540     __ lea(rscratch1, ExternalAddress((address)round_consts));
3541     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3542     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3543     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3544     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3545 
    // load the 8-word (256-bit) state
3547     __ ldpq(v0, v1, state);
3548 
    __ BIND(sha256_loop);
3550     // load 64 bytes of data into v8..v11
3551     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3552     __ rev32(v8, __ T16B, v8);
3553     __ rev32(v9, __ T16B, v9);
3554     __ rev32(v10, __ T16B, v10);
3555     __ rev32(v11, __ T16B, v11);
3556 
3557     __ addv(v6, __ T4S, v8, v16);
3558     __ orr(v2, __ T16B, v0, v0);
3559     __ orr(v3, __ T16B, v1, v1);
3560 
3561     FloatRegister d0 = v8;
3562     FloatRegister d1 = v9;
3563     FloatRegister d2 = v10;
3564     FloatRegister d3 = v11;
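    // Each iteration below advances the 256-bit state by four of the 64
    // rounds: sha256h/sha256h2 consume one 4-word message+key sum to update
    // the two state halves, while sha256su0/sha256su1 extend the message
    // schedule for later rounds.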
3565 
3566 
3567     for (int round = 0; round < 16; round++) {
3568       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3569       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3570       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3571       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3572 
3573       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3574        __ orr(v4, __ T16B, v2, v2);
3575       if (round < 15)
3576         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3577       __ sha256h(v2, __ T4S, v3, tmp2);
3578       __ sha256h2(v3, __ T4S, v4, tmp2);
3579       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3580 
3581       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3582     }
3583 
3584     __ addv(v0, __ T4S, v0, v2);
3585     __ addv(v1, __ T4S, v1, v3);
3586 
3587     if (multi_block) {
3588       __ add(ofs, ofs, 64);
3589       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3591       __ mov(c_rarg0, ofs); // return ofs
3592     }
3593 
3594     __ ldpd(v10, v11, Address(sp, 16));
3595     __ ldpd(v8, v9, __ post(sp, 32));
3596 
3597     __ stpq(v0, v1, state);
3598 
3599     __ ret(lr);
3600 
3601     return start;
3602   }
3603 
3604   // Arguments:
3605   //
3606   // Inputs:
3607   //   c_rarg0   - byte[]  source+offset
3608   //   c_rarg1   - int[]   SHA.state
3609   //   c_rarg2   - int     offset
3610   //   c_rarg3   - int     limit
3611   //
3612   address generate_sha512_implCompress(bool multi_block, const char *name) {
3613     static const uint64_t round_consts[80] = {
3614       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3615       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3616       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3617       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3618       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3619       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3620       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3621       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3622       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3623       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3624       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3625       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3626       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3627       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3628       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3629       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3630       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3631       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3632       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3633       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3634       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3635       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3636       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3637       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3638       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3639       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3640       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3641     };
3642 
3643     // Double rounds for sha512.
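    // Each sha512_dround invocation performs two of the 80 SHA-512 rounds:
    // sha512h/sha512h2 update the state, sha512su0/sha512su1 extend the
    // message schedule (only needed for the first 32 double-rounds), and the
    // conditional ld1 prefetches the next pair of round constants.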
3644     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3645       if (dr < 36)                                                                   \
3646         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3647       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3648       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3649       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3650       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3651       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3652       if (dr < 32) {                                                                 \
3653         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3654         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3655       }                                                                              \
3656       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3657       if (dr < 32)                                                                   \
3658         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3659       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
      __ sha512h2(v##i3, __ T2D, v##i1, v##i0);
3661 
3662     __ align(CodeEntryAlignment);
3663     StubCodeMark mark(this, "StubRoutines", name);
3664     address start = __ pc();
3665 
3666     Register buf   = c_rarg0;
3667     Register state = c_rarg1;
3668     Register ofs   = c_rarg2;
3669     Register limit = c_rarg3;
3670 
3671     __ stpd(v8, v9, __ pre(sp, -64));
3672     __ stpd(v10, v11, Address(sp, 16));
3673     __ stpd(v12, v13, Address(sp, 32));
3674     __ stpd(v14, v15, Address(sp, 48));
3675 
3676     Label sha512_loop;
3677 
3678     // load state
3679     __ ld1(v8, v9, v10, v11, __ T2D, state);
3680 
3681     // load first 4 round constants
3682     __ lea(rscratch1, ExternalAddress((address)round_consts));
3683     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3684 
3685     __ BIND(sha512_loop);
3686     // load 128B of data into v12..v19
3687     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3688     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3689     __ rev64(v12, __ T16B, v12);
3690     __ rev64(v13, __ T16B, v13);
3691     __ rev64(v14, __ T16B, v14);
3692     __ rev64(v15, __ T16B, v15);
3693     __ rev64(v16, __ T16B, v16);
3694     __ rev64(v17, __ T16B, v17);
3695     __ rev64(v18, __ T16B, v18);
3696     __ rev64(v19, __ T16B, v19);
3697 
3698     __ mov(rscratch2, rscratch1);
3699 
3700     __ mov(v0, __ T16B, v8);
3701     __ mov(v1, __ T16B, v9);
3702     __ mov(v2, __ T16B, v10);
3703     __ mov(v3, __ T16B, v11);
3704 
3705     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3706     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3707     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3708     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3709     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3710     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3711     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3712     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3713     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3714     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3715     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3716     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3717     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3718     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3719     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3720     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3721     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3722     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3723     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3724     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3725     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3726     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3727     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3728     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3729     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3730     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3731     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3732     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3733     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3734     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3735     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3736     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3737     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3738     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3739     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3740     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3741     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3742     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3743     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3744     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3745 
3746     __ addv(v8, __ T2D, v8, v0);
3747     __ addv(v9, __ T2D, v9, v1);
3748     __ addv(v10, __ T2D, v10, v2);
3749     __ addv(v11, __ T2D, v11, v3);
3750 
3751     if (multi_block) {
3752       __ add(ofs, ofs, 128);
3753       __ cmp(ofs, limit);
3754       __ br(Assembler::LE, sha512_loop);
3755       __ mov(c_rarg0, ofs); // return ofs
3756     }
3757 
3758     __ st1(v8, v9, v10, v11, __ T2D, state);
3759 
3760     __ ldpd(v14, v15, Address(sp, 48));
3761     __ ldpd(v12, v13, Address(sp, 32));
3762     __ ldpd(v10, v11, Address(sp, 16));
3763     __ ldpd(v8, v9, __ post(sp, 64));
3764 
3765     __ ret(lr);
3766 
3767     return start;
3768   }
3769 
3770   // Arguments:
3771   //
3772   // Inputs:
3773   //   c_rarg0   - byte[]  source+offset
3774   //   c_rarg1   - byte[]   SHA.state
3775   //   c_rarg2   - int     digest_length
3776   //   c_rarg3   - int     offset
3777   //   c_rarg4   - int     limit
3778   //
3779   address generate_sha3_implCompress(bool multi_block, const char *name) {
3780     static const uint64_t round_consts[24] = {
3781       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3782       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3783       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3784       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3785       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3786       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3787       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3788       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3789     };
3790 
3791     __ align(CodeEntryAlignment);
3792     StubCodeMark mark(this, "StubRoutines", name);
3793     address start = __ pc();
3794 
3795     Register buf           = c_rarg0;
3796     Register state         = c_rarg1;
3797     Register digest_length = c_rarg2;
3798     Register ofs           = c_rarg3;
3799     Register limit         = c_rarg4;
3800 
3801     Label sha3_loop, rounds24_loop;
3802     Label sha3_512, sha3_384_or_224, sha3_256;
3803 
3804     __ stpd(v8, v9, __ pre(sp, -64));
3805     __ stpd(v10, v11, Address(sp, 16));
3806     __ stpd(v12, v13, Address(sp, 32));
3807     __ stpd(v14, v15, Address(sp, 48));
3808 
3809     // load state
3810     __ add(rscratch1, state, 32);
3811     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3812     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3813     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3814     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3815     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3816     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3817     __ ld1(v24, __ T1D, rscratch1);
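
    // The number of input bytes absorbed per block is the Keccak rate
    // r = 200 - 2 * digest_length: 144 (SHA3-224), 136 (SHA3-256),
    // 104 (SHA3-384) and 72 (SHA3-512). The bit tests on digest_length
    // below pick how many state lanes are XORed with input accordingly.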
3818 
3819     __ BIND(sha3_loop);
3820 
3821     // 24 keccak rounds
3822     __ movw(rscratch2, 24);
3823 
3824     // load round_constants base
3825     __ lea(rscratch1, ExternalAddress((address) round_consts));
3826 
3827     // load input
3828     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3829     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3830     __ eor(v0, __ T8B, v0, v25);
3831     __ eor(v1, __ T8B, v1, v26);
3832     __ eor(v2, __ T8B, v2, v27);
3833     __ eor(v3, __ T8B, v3, v28);
3834     __ eor(v4, __ T8B, v4, v29);
3835     __ eor(v5, __ T8B, v5, v30);
3836     __ eor(v6, __ T8B, v6, v31);
3837 
3838     // digest_length == 64, SHA3-512
3839     __ tbnz(digest_length, 6, sha3_512);
3840 
3841     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3842     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3843     __ eor(v7, __ T8B, v7, v25);
3844     __ eor(v8, __ T8B, v8, v26);
3845     __ eor(v9, __ T8B, v9, v27);
3846     __ eor(v10, __ T8B, v10, v28);
3847     __ eor(v11, __ T8B, v11, v29);
3848     __ eor(v12, __ T8B, v12, v30);
3849 
3850     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3851     __ tbnz(digest_length, 4, sha3_384_or_224);
3852 
3853     // SHA3-256
3854     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3855     __ eor(v13, __ T8B, v13, v25);
3856     __ eor(v14, __ T8B, v14, v26);
3857     __ eor(v15, __ T8B, v15, v27);
3858     __ eor(v16, __ T8B, v16, v28);
3859     __ b(rounds24_loop);
3860 
3861     __ BIND(sha3_384_or_224);
3862     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3863 
3864     // SHA3-224
3865     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3866     __ ld1(v29, __ T8B, __ post(buf, 8));
3867     __ eor(v13, __ T8B, v13, v25);
3868     __ eor(v14, __ T8B, v14, v26);
3869     __ eor(v15, __ T8B, v15, v27);
3870     __ eor(v16, __ T8B, v16, v28);
3871     __ eor(v17, __ T8B, v17, v29);
3872     __ b(rounds24_loop);
3873 
3874     __ BIND(sha3_512);
3875     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3876     __ eor(v7, __ T8B, v7, v25);
3877     __ eor(v8, __ T8B, v8, v26);
3878 
3879     __ BIND(rounds24_loop);
3880     __ subw(rscratch2, rscratch2, 1);
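
    // One Keccak-f[1600] round follows: theta (column parities via eor3 and
    // the rax1 combines), the theta xor folded together with the rho rotations
    // and pi permutation via the xar instructions, chi via bcax, and iota via
    // the final eor with the round constant loaded by ld1r.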
3881 
3882     __ eor3(v29, __ T16B, v4, v9, v14);
3883     __ eor3(v26, __ T16B, v1, v6, v11);
3884     __ eor3(v28, __ T16B, v3, v8, v13);
3885     __ eor3(v25, __ T16B, v0, v5, v10);
3886     __ eor3(v27, __ T16B, v2, v7, v12);
3887     __ eor3(v29, __ T16B, v29, v19, v24);
3888     __ eor3(v26, __ T16B, v26, v16, v21);
3889     __ eor3(v28, __ T16B, v28, v18, v23);
3890     __ eor3(v25, __ T16B, v25, v15, v20);
3891     __ eor3(v27, __ T16B, v27, v17, v22);
3892 
3893     __ rax1(v30, __ T2D, v29, v26);
3894     __ rax1(v26, __ T2D, v26, v28);
3895     __ rax1(v28, __ T2D, v28, v25);
3896     __ rax1(v25, __ T2D, v25, v27);
3897     __ rax1(v27, __ T2D, v27, v29);
3898 
3899     __ eor(v0, __ T16B, v0, v30);
3900     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3901     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3902     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3903     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3904     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3905     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3906     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3907     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3908     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3909     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3910     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3911     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3912     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3913     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3914     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3915     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3916     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3917     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3918     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3919     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3920     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3921     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3922     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3923     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3924 
3925     __ bcax(v20, __ T16B, v31, v22, v8);
3926     __ bcax(v21, __ T16B, v8,  v23, v22);
3927     __ bcax(v22, __ T16B, v22, v24, v23);
3928     __ bcax(v23, __ T16B, v23, v31, v24);
3929     __ bcax(v24, __ T16B, v24, v8,  v31);
3930 
3931     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3932 
3933     __ bcax(v17, __ T16B, v25, v19, v3);
3934     __ bcax(v18, __ T16B, v3,  v15, v19);
3935     __ bcax(v19, __ T16B, v19, v16, v15);
3936     __ bcax(v15, __ T16B, v15, v25, v16);
3937     __ bcax(v16, __ T16B, v16, v3,  v25);
3938 
3939     __ bcax(v10, __ T16B, v29, v12, v26);
3940     __ bcax(v11, __ T16B, v26, v13, v12);
3941     __ bcax(v12, __ T16B, v12, v14, v13);
3942     __ bcax(v13, __ T16B, v13, v29, v14);
3943     __ bcax(v14, __ T16B, v14, v26, v29);
3944 
3945     __ bcax(v7, __ T16B, v30, v9,  v4);
3946     __ bcax(v8, __ T16B, v4,  v5,  v9);
3947     __ bcax(v9, __ T16B, v9,  v6,  v5);
3948     __ bcax(v5, __ T16B, v5,  v30, v6);
3949     __ bcax(v6, __ T16B, v6,  v4,  v30);
3950 
3951     __ bcax(v3, __ T16B, v27, v0,  v28);
3952     __ bcax(v4, __ T16B, v28, v1,  v0);
3953     __ bcax(v0, __ T16B, v0,  v2,  v1);
3954     __ bcax(v1, __ T16B, v1,  v27, v2);
3955     __ bcax(v2, __ T16B, v2,  v28, v27);
3956 
3957     __ eor(v0, __ T16B, v0, v31);
3958 
3959     __ cbnzw(rscratch2, rounds24_loop);
3960 
3961     if (multi_block) {
3962       // block_size =  200 - 2 * digest_length, ofs += block_size
3963       __ add(ofs, ofs, 200);
3964       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3965 
3966       __ cmp(ofs, limit);
3967       __ br(Assembler::LE, sha3_loop);
3968       __ mov(c_rarg0, ofs); // return ofs
3969     }
3970 
3971     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3972     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3973     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3974     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3975     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3976     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3977     __ st1(v24, __ T1D, state);
3978 
3979     __ ldpd(v14, v15, Address(sp, 48));
3980     __ ldpd(v12, v13, Address(sp, 32));
3981     __ ldpd(v10, v11, Address(sp, 16));
3982     __ ldpd(v8, v9, __ post(sp, 64));
3983 
3984     __ ret(lr);
3985 
3986     return start;
3987   }
3988 
3989   // Safefetch stubs.
3990   void generate_safefetch(const char* name, int size, address* entry,
3991                           address* fault_pc, address* continuation_pc) {
3992     // safefetch signatures:
3993     //   int      SafeFetch32(int*      adr, int      errValue);
3994     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3995     //
3996     // arguments:
3997     //   c_rarg0 = adr
3998     //   c_rarg1 = errValue
3999     //
4000     // result:
    //   r0       = *adr or errValue
4002 
4003     StubCodeMark mark(this, "StubRoutines", name);
4004 
4005     // Entry point, pc or function descriptor.
4006     *entry = __ pc();
4007 
4008     // Load *adr into c_rarg1, may fault.
4009     *fault_pc = __ pc();
4010     switch (size) {
4011       case 4:
4012         // int32_t
4013         __ ldrw(c_rarg1, Address(c_rarg0, 0));
4014         break;
4015       case 8:
4016         // int64_t
4017         __ ldr(c_rarg1, Address(c_rarg0, 0));
4018         break;
4019       default:
4020         ShouldNotReachHere();
4021     }
4022 
4023     // return errValue or *adr
4024     *continuation_pc = __ pc();
4025     __ mov(r0, c_rarg1);
4026     __ ret(lr);
4027   }
4028 
4029   /**
4030    *  Arguments:
4031    *
4032    * Inputs:
4033    *   c_rarg0   - int crc
4034    *   c_rarg1   - byte* buf
4035    *   c_rarg2   - int length
4036    *
   * Output:
   *       r0   - int crc result
4039    */
4040   address generate_updateBytesCRC32() {
4041     assert(UseCRC32Intrinsics, "what are we doing here?");
4042 
4043     __ align(CodeEntryAlignment);
4044     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4045 
4046     address start = __ pc();
4047 
4048     const Register crc   = c_rarg0;  // crc
4049     const Register buf   = c_rarg1;  // source java byte array address
4050     const Register len   = c_rarg2;  // length
4051     const Register table0 = c_rarg3; // crc_table address
4052     const Register table1 = c_rarg4;
4053     const Register table2 = c_rarg5;
4054     const Register table3 = c_rarg6;
4055     const Register tmp3 = c_rarg7;
4056 
4057     BLOCK_COMMENT("Entry:");
4058     __ enter(); // required for proper stackwalking of RuntimeStub frame
4059 
4060     __ kernel_crc32(crc, buf, len,
4061               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4062 
4063     __ leave(); // required for proper stackwalking of RuntimeStub frame
4064     __ ret(lr);
4065 
4066     return start;
4067   }
4068 
4069   /**
4070    *  Arguments:
4071    *
4072    * Inputs:
4073    *   c_rarg0   - int crc
4074    *   c_rarg1   - byte* buf
4075    *   c_rarg2   - int length
4076    *   c_rarg3   - int* table
4077    *
   * Output:
4079    *       r0   - int crc result
4080    */
4081   address generate_updateBytesCRC32C() {
4082     assert(UseCRC32CIntrinsics, "what are we doing here?");
4083 
4084     __ align(CodeEntryAlignment);
4085     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4086 
4087     address start = __ pc();
4088 
4089     const Register crc   = c_rarg0;  // crc
4090     const Register buf   = c_rarg1;  // source java byte array address
4091     const Register len   = c_rarg2;  // length
4092     const Register table0 = c_rarg3; // crc_table address
4093     const Register table1 = c_rarg4;
4094     const Register table2 = c_rarg5;
4095     const Register table3 = c_rarg6;
4096     const Register tmp3 = c_rarg7;
4097 
4098     BLOCK_COMMENT("Entry:");
4099     __ enter(); // required for proper stackwalking of RuntimeStub frame
4100 
4101     __ kernel_crc32c(crc, buf, len,
4102               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4103 
4104     __ leave(); // required for proper stackwalking of RuntimeStub frame
4105     __ ret(lr);
4106 
4107     return start;
4108   }
4109 
4110   /***
4111    *  Arguments:
4112    *
4113    *  Inputs:
4114    *   c_rarg0   - int   adler
4115    *   c_rarg1   - byte* buff
4116    *   c_rarg2   - int   len
4117    *
4118    * Output:
4119    *   c_rarg0   - int adler result
4120    */
4121   address generate_updateBytesAdler32() {
4122     __ align(CodeEntryAlignment);
4123     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4124     address start = __ pc();
4125 
4126     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4127 
4128     // Aliases
4129     Register adler  = c_rarg0;
4130     Register s1     = c_rarg0;
4131     Register s2     = c_rarg3;
4132     Register buff   = c_rarg1;
4133     Register len    = c_rarg2;
4134     Register nmax  = r4;
4135     Register base  = r5;
4136     Register count = r6;
4137     Register temp0 = rscratch1;
4138     Register temp1 = rscratch2;
4139     FloatRegister vbytes = v0;
4140     FloatRegister vs1acc = v1;
4141     FloatRegister vs2acc = v2;
4142     FloatRegister vtable = v3;
4143 
4144     // Max number of bytes we can process before having to take the mod
4145     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4146     uint64_t BASE = 0xfff1;
4147     uint64_t NMAX = 0x15B0;
4148 
4149     __ mov(base, BASE);
4150     __ mov(nmax, NMAX);
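
    // The "% BASE" sequences below rely on 2^16 == 15 (mod 65521): a value x
    // is folded as (x & 0xffff) + 15 * (x >> 16) (the lsr/lsl/sub/add groups,
    // using 15*y = (y << 4) - y), repeated if needed, and finished with a
    // single conditional subtraction of BASE.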
4151 
4152     // Load accumulation coefficients for the upper 16 bits
4153     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4154     __ ld1(vtable, __ T16B, Address(temp0));
4155 
4156     // s1 is initialized to the lower 16 bits of adler
4157     // s2 is initialized to the upper 16 bits of adler
4158     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4159     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4160 
    // The pipelined loop needs at least 16 elements for one iteration;
    // if fewer remain, it is more effective to skip straight to the cleanup loop
4163     __ cmp(len, (u1)16);
4164     __ br(Assembler::HS, L_nmax);
4165     __ cbz(len, L_combine);
4166 
4167     __ bind(L_simple_by1_loop);
4168     __ ldrb(temp0, Address(__ post(buff, 1)));
4169     __ add(s1, s1, temp0);
4170     __ add(s2, s2, s1);
4171     __ subs(len, len, 1);
4172     __ br(Assembler::HI, L_simple_by1_loop);
4173 
4174     // s1 = s1 % BASE
4175     __ subs(temp0, s1, base);
4176     __ csel(s1, temp0, s1, Assembler::HS);
4177 
4178     // s2 = s2 % BASE
4179     __ lsr(temp0, s2, 16);
4180     __ lsl(temp1, temp0, 4);
4181     __ sub(temp1, temp1, temp0);
4182     __ add(s2, temp1, s2, ext::uxth);
4183 
4184     __ subs(temp0, s2, base);
4185     __ csel(s2, temp0, s2, Assembler::HS);
4186 
4187     __ b(L_combine);
4188 
4189     __ bind(L_nmax);
4190     __ subs(len, len, nmax);
4191     __ sub(count, nmax, 16);
4192     __ br(Assembler::LO, L_by16);
4193 
4194     __ bind(L_nmax_loop);
4195 
4196     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4197                                       vbytes, vs1acc, vs2acc, vtable);
4198 
4199     __ subs(count, count, 16);
4200     __ br(Assembler::HS, L_nmax_loop);
4201 
4202     // s1 = s1 % BASE
4203     __ lsr(temp0, s1, 16);
4204     __ lsl(temp1, temp0, 4);
4205     __ sub(temp1, temp1, temp0);
4206     __ add(temp1, temp1, s1, ext::uxth);
4207 
4208     __ lsr(temp0, temp1, 16);
4209     __ lsl(s1, temp0, 4);
4210     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4212 
4213     __ subs(temp0, s1, base);
4214     __ csel(s1, temp0, s1, Assembler::HS);
4215 
4216     // s2 = s2 % BASE
4217     __ lsr(temp0, s2, 16);
4218     __ lsl(temp1, temp0, 4);
4219     __ sub(temp1, temp1, temp0);
4220     __ add(temp1, temp1, s2, ext::uxth);
4221 
4222     __ lsr(temp0, temp1, 16);
4223     __ lsl(s2, temp0, 4);
4224     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4226 
4227     __ subs(temp0, s2, base);
4228     __ csel(s2, temp0, s2, Assembler::HS);
4229 
4230     __ subs(len, len, nmax);
4231     __ sub(count, nmax, 16);
4232     __ br(Assembler::HS, L_nmax_loop);
4233 
4234     __ bind(L_by16);
4235     __ adds(len, len, count);
4236     __ br(Assembler::LO, L_by1);
4237 
4238     __ bind(L_by16_loop);
4239 
4240     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4241                                       vbytes, vs1acc, vs2acc, vtable);
4242 
4243     __ subs(len, len, 16);
4244     __ br(Assembler::HS, L_by16_loop);
4245 
4246     __ bind(L_by1);
4247     __ adds(len, len, 15);
4248     __ br(Assembler::LO, L_do_mod);
4249 
4250     __ bind(L_by1_loop);
4251     __ ldrb(temp0, Address(__ post(buff, 1)));
4252     __ add(s1, temp0, s1);
4253     __ add(s2, s2, s1);
4254     __ subs(len, len, 1);
4255     __ br(Assembler::HS, L_by1_loop);
4256 
4257     __ bind(L_do_mod);
4258     // s1 = s1 % BASE
4259     __ lsr(temp0, s1, 16);
4260     __ lsl(temp1, temp0, 4);
4261     __ sub(temp1, temp1, temp0);
4262     __ add(temp1, temp1, s1, ext::uxth);
4263 
4264     __ lsr(temp0, temp1, 16);
4265     __ lsl(s1, temp0, 4);
4266     __ sub(s1, s1, temp0);
4267     __ add(s1, s1, temp1, ext::uxth);
4268 
4269     __ subs(temp0, s1, base);
4270     __ csel(s1, temp0, s1, Assembler::HS);
4271 
4272     // s2 = s2 % BASE
4273     __ lsr(temp0, s2, 16);
4274     __ lsl(temp1, temp0, 4);
4275     __ sub(temp1, temp1, temp0);
4276     __ add(temp1, temp1, s2, ext::uxth);
4277 
4278     __ lsr(temp0, temp1, 16);
4279     __ lsl(s2, temp0, 4);
4280     __ sub(s2, s2, temp0);
4281     __ add(s2, s2, temp1, ext::uxth);
4282 
4283     __ subs(temp0, s2, base);
4284     __ csel(s2, temp0, s2, Assembler::HS);
4285 
4286     // Combine lower bits and higher bits
4287     __ bind(L_combine);
4288     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4289 
4290     __ ret(lr);
4291 
4292     return start;
4293   }
4294 
4295   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4296           Register temp0, Register temp1, FloatRegister vbytes,
4297           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4298     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4299     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4300     // In non-vectorized code, we update s1 and s2 as:
4301     //   s1 <- s1 + b1
4302     //   s2 <- s2 + s1
4303     //   s1 <- s1 + b2
4304     //   s2 <- s2 + s1
4305     //   ...
4306     //   s1 <- s1 + b16
4307     //   s2 <- s2 + s1
4308     // Putting above assignments together, we have:
4309     //   s1_new = s1 + b1 + b2 + ... + b16
4310     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4311     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4312     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
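         // Equivalent scalar sketch of one 16-byte step (illustrative only; vtable is
         // assumed to hold the weights 16, 15, ..., 1):
         //   int sum = 0, dot = 0;
         //   for (int i = 1; i <= 16; i++) { sum += b[i]; dot += (17 - i) * b[i]; }
         //   s2 += 16 * s1 + dot;   // dot is computed below via umull/umlal + uaddlv
         //   s1 += sum;             // sum is computed below via uaddlv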
4313     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4314 
4315     // s2 = s2 + s1 * 16
4316     __ add(s2, s2, s1, Assembler::LSL, 4);
4317 
4318     // vs1acc = b1 + b2 + b3 + ... + b16
4319     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4320     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4321     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4322     __ uaddlv(vs1acc, __ T16B, vbytes);
4323     __ uaddlv(vs2acc, __ T8H, vs2acc);
4324 
4325     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4326     __ fmovd(temp0, vs1acc);
4327     __ fmovd(temp1, vs2acc);
4328     __ add(s1, s1, temp0);
4329     __ add(s2, s2, temp1);
4330   }
4331 
4332   /**
4333    *  Arguments:
4334    *
4335    *  Input:
4336    *    c_rarg0   - x address
4337    *    c_rarg1   - x length
4338    *    c_rarg2   - y address
4339    *    c_rarg3   - y length
4340    *    c_rarg4   - z address
4341    *    c_rarg5   - z length
4342    */
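       // A thin wrapper: the register assignments below simply map the incoming
       // c_rarg0..c_rarg5 onto (x, xlen, y, ylen, z, zlen) and delegate the actual
       // multi-word multiplication to MacroAssembler::multiply_to_len.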
4343   address generate_multiplyToLen() {
4344     __ align(CodeEntryAlignment);
4345     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4346 
4347     address start = __ pc();
4348     const Register x     = r0;
4349     const Register xlen  = r1;
4350     const Register y     = r2;
4351     const Register ylen  = r3;
4352     const Register z     = r4;
4353     const Register zlen  = r5;
4354 
4355     const Register tmp1  = r10;
4356     const Register tmp2  = r11;
4357     const Register tmp3  = r12;
4358     const Register tmp4  = r13;
4359     const Register tmp5  = r14;
4360     const Register tmp6  = r15;
4361     const Register tmp7  = r16;
4362 
4363     BLOCK_COMMENT("Entry:");
4364     __ enter(); // required for proper stackwalking of RuntimeStub frame
4365     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4366     __ leave(); // required for proper stackwalking of RuntimeStub frame
4367     __ ret(lr);
4368 
4369     return start;
4370   }
4371 
4372   address generate_squareToLen() {
4373     // The squareToLen algorithm for sizes 1..127, implemented in Java code, runs
4374     // faster than multiply_to_len on some CPUs and slower on others, but
4375     // multiply_to_len shows slightly better overall results.
4376     __ align(CodeEntryAlignment);
4377     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4378     address start = __ pc();
4379 
4380     const Register x     = r0;
4381     const Register xlen  = r1;
4382     const Register z     = r2;
4383     const Register zlen  = r3;
4384     const Register y     = r4; // == x
4385     const Register ylen  = r5; // == xlen
4386 
4387     const Register tmp1  = r10;
4388     const Register tmp2  = r11;
4389     const Register tmp3  = r12;
4390     const Register tmp4  = r13;
4391     const Register tmp5  = r14;
4392     const Register tmp6  = r15;
4393     const Register tmp7  = r16;
4394 
4395     RegSet spilled_regs = RegSet::of(y, ylen);
4396     BLOCK_COMMENT("Entry:");
4397     __ enter();
4398     __ push(spilled_regs, sp);
4399     __ mov(y, x);
4400     __ mov(ylen, xlen);
4401     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4402     __ pop(spilled_regs, sp);
4403     __ leave();
4404     __ ret(lr);
4405     return start;
4406   }
4407 
4408   address generate_mulAdd() {
4409     __ align(CodeEntryAlignment);
4410     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4411 
4412     address start = __ pc();
4413 
4414     const Register out     = r0;
4415     const Register in      = r1;
4416     const Register offset  = r2;
4417     const Register len     = r3;
4418     const Register k       = r4;
4419 
4420     BLOCK_COMMENT("Entry:");
4421     __ enter();
4422     __ mul_add(out, in, offset, len, k);
4423     __ leave();
4424     __ ret(lr);
4425 
4426     return start;
4427   }
4428 
4429   // Arguments:
4430   //
4431   // Input:
4432   //   c_rarg0   - newArr address
4433   //   c_rarg1   - oldArr address
4434   //   c_rarg2   - newIdx
4435   //   c_rarg3   - shiftCount
4436   //   c_rarg4   - numIter
4437   //
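       // Per the scalar tail below (ShiftOne/ShiftTwo/ShiftThree), each output word
       // combines two adjacent input words; in Java-like terms, roughly:
       //   newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount))
       //                      | (oldArr[i + 1] >>> shiftCount)   for i in [0, numIter)
       // The SIMD loops do the same 4 (or 2) words at a time, using ushl with a
       // negated count for the logical right shift.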
4438   address generate_bigIntegerRightShift() {
4439     __ align(CodeEntryAlignment);
4440     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4441     address start = __ pc();
4442 
4443     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4444 
4445     Register newArr        = c_rarg0;
4446     Register oldArr        = c_rarg1;
4447     Register newIdx        = c_rarg2;
4448     Register shiftCount    = c_rarg3;
4449     Register numIter       = c_rarg4;
4450     Register idx           = numIter;
4451 
4452     Register newArrCur     = rscratch1;
4453     Register shiftRevCount = rscratch2;
4454     Register oldArrCur     = r13;
4455     Register oldArrNext    = r14;
4456 
4457     FloatRegister oldElem0        = v0;
4458     FloatRegister oldElem1        = v1;
4459     FloatRegister newElem         = v2;
4460     FloatRegister shiftVCount     = v3;
4461     FloatRegister shiftVRevCount  = v4;
4462 
4463     __ cbz(idx, Exit);
4464 
4465     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4466 
4467     // left shift count
4468     __ movw(shiftRevCount, 32);
4469     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4470 
4471     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4472     __ cmp(numIter, (u1)4);
4473     __ br(Assembler::LT, ShiftThree);
4474 
4475     __ dup(shiftVCount,    __ T4S, shiftCount);
4476     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4477     __ negr(shiftVCount,   __ T4S, shiftVCount);
4478 
4479     __ BIND(ShiftSIMDLoop);
4480 
4481     // Calculate the load addresses
4482     __ sub(idx, idx, 4);
4483     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4484     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4485     __ add(oldArrCur,  oldArrNext, 4);
4486 
4487     // Load 4 words and process
4488     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4489     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4490     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4491     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4492     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4493     __ st1(newElem,   __ T4S,  Address(newArrCur));
4494 
4495     __ cmp(idx, (u1)4);
4496     __ br(Assembler::LT, ShiftTwoLoop);
4497     __ b(ShiftSIMDLoop);
4498 
4499     __ BIND(ShiftTwoLoop);
4500     __ cbz(idx, Exit);
4501     __ cmp(idx, (u1)1);
4502     __ br(Assembler::EQ, ShiftOne);
4503 
4504     // Calculate the load addresses
4505     __ sub(idx, idx, 2);
4506     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4507     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4508     __ add(oldArrCur,  oldArrNext, 4);
4509 
4510     // Load 2 words and process
4511     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4512     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4513     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4514     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4515     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4516     __ st1(newElem,   __ T2S, Address(newArrCur));
4517     __ b(ShiftTwoLoop);
4518 
4519     __ BIND(ShiftThree);
4520     __ tbz(idx, 1, ShiftOne);
4521     __ tbz(idx, 0, ShiftTwo);
4522     __ ldrw(r10,  Address(oldArr, 12));
4523     __ ldrw(r11,  Address(oldArr, 8));
4524     __ lsrvw(r10, r10, shiftCount);
4525     __ lslvw(r11, r11, shiftRevCount);
4526     __ orrw(r12,  r10, r11);
4527     __ strw(r12,  Address(newArr, 8));
4528 
4529     __ BIND(ShiftTwo);
4530     __ ldrw(r10,  Address(oldArr, 8));
4531     __ ldrw(r11,  Address(oldArr, 4));
4532     __ lsrvw(r10, r10, shiftCount);
4533     __ lslvw(r11, r11, shiftRevCount);
4534     __ orrw(r12,  r10, r11);
4535     __ strw(r12,  Address(newArr, 4));
4536 
4537     __ BIND(ShiftOne);
4538     __ ldrw(r10,  Address(oldArr, 4));
4539     __ ldrw(r11,  Address(oldArr));
4540     __ lsrvw(r10, r10, shiftCount);
4541     __ lslvw(r11, r11, shiftRevCount);
4542     __ orrw(r12,  r10, r11);
4543     __ strw(r12,  Address(newArr));
4544 
4545     __ BIND(Exit);
4546     __ ret(lr);
4547 
4548     return start;
4549   }
4550 
4551   // Arguments:
4552   //
4553   // Input:
4554   //   c_rarg0   - newArr address
4555   //   c_rarg1   - oldArr address
4556   //   c_rarg2   - newIdx
4557   //   c_rarg3   - shiftCount
4558   //   c_rarg4   - numIter
4559   //
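       // Mirror image of the right-shift worker; per the scalar tail below, roughly:
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                      | (oldArr[i + 1] >>> (32 - shiftCount))   for i in [0, numIter)
       // Here it is the reverse count that gets negated, since ushl with a negative
       // element count performs the logical right shift.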
4560   address generate_bigIntegerLeftShift() {
4561     __ align(CodeEntryAlignment);
4562     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4563     address start = __ pc();
4564 
4565     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4566 
4567     Register newArr        = c_rarg0;
4568     Register oldArr        = c_rarg1;
4569     Register newIdx        = c_rarg2;
4570     Register shiftCount    = c_rarg3;
4571     Register numIter       = c_rarg4;
4572 
4573     Register shiftRevCount = rscratch1;
4574     Register oldArrNext    = rscratch2;
4575 
4576     FloatRegister oldElem0        = v0;
4577     FloatRegister oldElem1        = v1;
4578     FloatRegister newElem         = v2;
4579     FloatRegister shiftVCount     = v3;
4580     FloatRegister shiftVRevCount  = v4;
4581 
4582     __ cbz(numIter, Exit);
4583 
4584     __ add(oldArrNext, oldArr, 4);
4585     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4586 
4587     // right shift count
4588     __ movw(shiftRevCount, 32);
4589     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4590 
4591     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4592     __ cmp(numIter, (u1)4);
4593     __ br(Assembler::LT, ShiftThree);
4594 
4595     __ dup(shiftVCount,     __ T4S, shiftCount);
4596     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4597     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4598 
4599     __ BIND(ShiftSIMDLoop);
4600 
4601     // load 4 words and process
4602     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4603     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4604     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4605     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4606     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4607     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4608     __ sub(numIter,   numIter, 4);
4609 
4610     __ cmp(numIter, (u1)4);
4611     __ br(Assembler::LT, ShiftTwoLoop);
4612     __ b(ShiftSIMDLoop);
4613 
4614     __ BIND(ShiftTwoLoop);
4615     __ cbz(numIter, Exit);
4616     __ cmp(numIter, (u1)1);
4617     __ br(Assembler::EQ, ShiftOne);
4618 
4619     // load 2 words and process
4620     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4621     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4622     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4623     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4624     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4625     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4626     __ sub(numIter,   numIter, 2);
4627     __ b(ShiftTwoLoop);
4628 
4629     __ BIND(ShiftThree);
4630     __ ldrw(r10,  __ post(oldArr, 4));
4631     __ ldrw(r11,  __ post(oldArrNext, 4));
4632     __ lslvw(r10, r10, shiftCount);
4633     __ lsrvw(r11, r11, shiftRevCount);
4634     __ orrw(r12,  r10, r11);
4635     __ strw(r12,  __ post(newArr, 4));
4636     __ tbz(numIter, 1, Exit);
4637     __ tbz(numIter, 0, ShiftOne);
4638 
4639     __ BIND(ShiftTwo);
4640     __ ldrw(r10,  __ post(oldArr, 4));
4641     __ ldrw(r11,  __ post(oldArrNext, 4));
4642     __ lslvw(r10, r10, shiftCount);
4643     __ lsrvw(r11, r11, shiftRevCount);
4644     __ orrw(r12,  r10, r11);
4645     __ strw(r12,  __ post(newArr, 4));
4646 
4647     __ BIND(ShiftOne);
4648     __ ldrw(r10,  Address(oldArr));
4649     __ ldrw(r11,  Address(oldArrNext));
4650     __ lslvw(r10, r10, shiftCount);
4651     __ lsrvw(r11, r11, shiftRevCount);
4652     __ orrw(r12,  r10, r11);
4653     __ strw(r12,  Address(newArr));
4654 
4655     __ BIND(Exit);
4656     __ ret(lr);
4657 
4658     return start;
4659   }
4660 
4661   address generate_count_positives(address &count_positives_long) {
4662     const u1 large_loop_size = 64;
4663     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
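         // A byte is "negative" iff its top bit is set, so OR-ing loaded words
         // together and testing the result against UPPER_BIT_MASK detects whether
         // any byte in a block has its sign bit set, without per-byte branches.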
4664     int dcache_line = VM_Version::dcache_line_size();
4665 
4666     Register ary1 = r1, len = r2, result = r0;
4667 
4668     __ align(CodeEntryAlignment);
4669 
4670     StubCodeMark mark(this, "StubRoutines", "count_positives");
4671 
4672     address entry = __ pc();
4673 
4674     __ enter();
4675     // precondition: a copy of len is already in result
4676     // __ mov(result, len);
4677 
4678   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4679         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4680 
4681   __ cmp(len, (u1)15);
4682   __ br(Assembler::GT, LEN_OVER_15);
4683   // The only case in which execution falls into this code is when the pointer
4684   // is near the end of a memory page and we have to avoid reading past it
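       // Trick: read the 8 (or 16) bytes that end exactly at the last array byte and
       // shift out the low-order bytes that precede the array, so nothing past the
       // end of the array is ever touched.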
4685   __ add(ary1, ary1, len);
4686   __ subs(len, len, 8);
4687   __ br(Assembler::GT, LEN_OVER_8);
4688   __ ldr(rscratch2, Address(ary1, -8));
4689   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4690   __ lsrv(rscratch2, rscratch2, rscratch1);
4691   __ tst(rscratch2, UPPER_BIT_MASK);
4692   __ csel(result, zr, result, Assembler::NE);
4693   __ leave();
4694   __ ret(lr);
4695   __ bind(LEN_OVER_8);
4696   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4697   __ sub(len, len, 8); // no data dependency, so the sub can execute during the load
4698   __ tst(rscratch2, UPPER_BIT_MASK);
4699   __ br(Assembler::NE, RET_NO_POP);
4700   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4701   __ lsrv(rscratch1, rscratch1, rscratch2);
4702   __ tst(rscratch1, UPPER_BIT_MASK);
4703   __ bind(RET_NO_POP);
4704   __ csel(result, zr, result, Assembler::NE);
4705   __ leave();
4706   __ ret(lr);
4707 
4708   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4709   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4710 
4711   count_positives_long = __ pc(); // 2nd entry point
4712 
4713   __ enter();
4714 
4715   __ bind(LEN_OVER_15);
4716     __ push(spilled_regs, sp);
4717     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4718     __ cbz(rscratch2, ALIGNED);
4719     __ ldp(tmp6, tmp1, Address(ary1));
4720     __ mov(tmp5, 16);
4721     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4722     __ add(ary1, ary1, rscratch1);
4723     __ orr(tmp6, tmp6, tmp1);
4724     __ tst(tmp6, UPPER_BIT_MASK);
4725     __ br(Assembler::NE, RET_ADJUST);
4726     __ sub(len, len, rscratch1);
4727 
4728   __ bind(ALIGNED);
4729     __ cmp(len, large_loop_size);
4730     __ br(Assembler::LT, CHECK_16);
4731     // Perform a 16-byte load as an early return in the pre-loop, to handle the
4732     // case where an initially aligned large array has negative values in its
4733     // starting bytes: LARGE_LOOP would otherwise do 4 reads instead of 1 (in the
4734     // worst case), which is slower. Cases with negative bytes further ahead are
4735     // not affected much; in fact they become faster due to the early loads,
4736     // fewer instructions and fewer branches in LARGE_LOOP.
4737     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4738     __ sub(len, len, 16);
4739     __ orr(tmp6, tmp6, tmp1);
4740     __ tst(tmp6, UPPER_BIT_MASK);
4741     __ br(Assembler::NE, RET_ADJUST_16);
4742     __ cmp(len, large_loop_size);
4743     __ br(Assembler::LT, CHECK_16);
4744 
4745     if (SoftwarePrefetchHintDistance >= 0
4746         && SoftwarePrefetchHintDistance >= dcache_line) {
4747       // initial prefetch
4748       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4749     }
4750   __ bind(LARGE_LOOP);
4751     if (SoftwarePrefetchHintDistance >= 0) {
4752       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4753     }
4754     // Issue the load instructions first, since that can save a few CPU/memory
4755     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
4756     // per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses
4757     // fewer instructions and branches per iteration. The drawback is that this
4758     // disables early return, so all 64 bytes are loaded and checked every time.
4759     __ ldp(tmp2, tmp3, Address(ary1));
4760     __ ldp(tmp4, tmp5, Address(ary1, 16));
4761     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4762     __ ldp(tmp6, tmp1, Address(ary1, 48));
4763     __ add(ary1, ary1, large_loop_size);
4764     __ sub(len, len, large_loop_size);
4765     __ orr(tmp2, tmp2, tmp3);
4766     __ orr(tmp4, tmp4, tmp5);
4767     __ orr(rscratch1, rscratch1, rscratch2);
4768     __ orr(tmp6, tmp6, tmp1);
4769     __ orr(tmp2, tmp2, tmp4);
4770     __ orr(rscratch1, rscratch1, tmp6);
4771     __ orr(tmp2, tmp2, rscratch1);
4772     __ tst(tmp2, UPPER_BIT_MASK);
4773     __ br(Assembler::NE, RET_ADJUST_LONG);
4774     __ cmp(len, large_loop_size);
4775     __ br(Assembler::GE, LARGE_LOOP);
4776 
4777   __ bind(CHECK_16); // small 16-byte load pre-loop
4778     __ cmp(len, (u1)16);
4779     __ br(Assembler::LT, POST_LOOP16);
4780 
4781   __ bind(LOOP16); // small 16-byte load loop
4782     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4783     __ sub(len, len, 16);
4784     __ orr(tmp2, tmp2, tmp3);
4785     __ tst(tmp2, UPPER_BIT_MASK);
4786     __ br(Assembler::NE, RET_ADJUST_16);
4787     __ cmp(len, (u1)16);
4788     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4789 
4790   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4791     __ cmp(len, (u1)8);
4792     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4793     __ ldr(tmp3, Address(__ post(ary1, 8)));
4794     __ tst(tmp3, UPPER_BIT_MASK);
4795     __ br(Assembler::NE, RET_ADJUST);
4796     __ sub(len, len, 8);
4797 
4798   __ bind(POST_LOOP16_LOAD_TAIL);
4799     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4800     __ ldr(tmp1, Address(ary1));
4801     __ mov(tmp2, 64);
4802     __ sub(tmp4, tmp2, len, __ LSL, 3);
4803     __ lslv(tmp1, tmp1, tmp4);
4804     __ tst(tmp1, UPPER_BIT_MASK);
4805     __ br(Assembler::NE, RET_ADJUST);
4806     // Fallthrough
4807 
4808   __ bind(RET_LEN);
4809     __ pop(spilled_regs, sp);
4810     __ leave();
4811     __ ret(lr);
4812 
4813     // result - len is the count of bytes that are guaranteed to be
4814     // positive
4815 
4816   __ bind(RET_ADJUST_LONG);
4817     __ add(len, len, (u1)(large_loop_size - 16));
4818   __ bind(RET_ADJUST_16);
4819     __ add(len, len, 16);
4820   __ bind(RET_ADJUST);
4821     __ pop(spilled_regs, sp);
4822     __ leave();
4823     __ sub(result, result, len);
4824     __ ret(lr);
4825 
4826     return entry;
4827   }
4828 
4829   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4830         bool usePrefetch, Label &NOT_EQUAL) {
4831     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4832         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4833         tmp7 = r12, tmp8 = r13;
4834     Label LOOP;
4835 
4836     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4837     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4838     __ bind(LOOP);
4839     if (usePrefetch) {
4840       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4841       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4842     }
4843     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4844     __ eor(tmp1, tmp1, tmp2);
4845     __ eor(tmp3, tmp3, tmp4);
4846     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4847     __ orr(tmp1, tmp1, tmp3);
4848     __ cbnz(tmp1, NOT_EQUAL);
4849     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4850     __ eor(tmp5, tmp5, tmp6);
4851     __ eor(tmp7, tmp7, tmp8);
4852     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4853     __ orr(tmp5, tmp5, tmp7);
4854     __ cbnz(tmp5, NOT_EQUAL);
4855     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4856     __ eor(tmp1, tmp1, tmp2);
4857     __ eor(tmp3, tmp3, tmp4);
4858     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4859     __ orr(tmp1, tmp1, tmp3);
4860     __ cbnz(tmp1, NOT_EQUAL);
4861     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4862     __ eor(tmp5, tmp5, tmp6);
4863     __ sub(cnt1, cnt1, 8 * wordSize);
4864     __ eor(tmp7, tmp7, tmp8);
4865     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4866     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4867     // cmp) because subs allows an unlimited range of immediate operands.
4868     __ subs(tmp6, cnt1, loopThreshold);
4869     __ orr(tmp5, tmp5, tmp7);
4870     __ cbnz(tmp5, NOT_EQUAL);
4871     __ br(__ GE, LOOP);
4872     // post-loop
4873     __ eor(tmp1, tmp1, tmp2);
4874     __ eor(tmp3, tmp3, tmp4);
4875     __ orr(tmp1, tmp1, tmp3);
4876     __ sub(cnt1, cnt1, 2 * wordSize);
4877     __ cbnz(tmp1, NOT_EQUAL);
4878   }
4879 
4880   void generate_large_array_equals_loop_simd(int loopThreshold,
4881         bool usePrefetch, Label &NOT_EQUAL) {
4882     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4883         tmp2 = rscratch2;
4884     Label LOOP;
4885 
4886     __ bind(LOOP);
4887     if (usePrefetch) {
4888       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4889       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4890     }
4891     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4892     __ sub(cnt1, cnt1, 8 * wordSize);
4893     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4894     __ subs(tmp1, cnt1, loopThreshold);
4895     __ eor(v0, __ T16B, v0, v4);
4896     __ eor(v1, __ T16B, v1, v5);
4897     __ eor(v2, __ T16B, v2, v6);
4898     __ eor(v3, __ T16B, v3, v7);
4899     __ orr(v0, __ T16B, v0, v1);
4900     __ orr(v1, __ T16B, v2, v3);
4901     __ orr(v0, __ T16B, v0, v1);
4902     __ umov(tmp1, v0, __ D, 0);
4903     __ umov(tmp2, v0, __ D, 1);
4904     __ orr(tmp1, tmp1, tmp2);
4905     __ cbnz(tmp1, NOT_EQUAL);
4906     __ br(__ GE, LOOP);
4907   }
4908 
4909   // a1 = r1 - array1 address
4910   // a2 = r2 - array2 address
4911   // result = r0 - return value. Already contains "false"
4912   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4913   // r3-r5 are reserved temporary registers
4914   address generate_large_array_equals() {
4915     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4916         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4917         tmp7 = r12, tmp8 = r13;
4918     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4919         SMALL_LOOP, POST_LOOP;
4920     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4921     // prefetch-loop threshold: require that at least 32 of the prefetched bytes get used
4922     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4923     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4924     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4925     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4926         tmp5, tmp6, tmp7, tmp8);
4927 
4928     __ align(CodeEntryAlignment);
4929 
4930     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4931 
4932     address entry = __ pc();
4933     __ enter();
4934     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4935     // also advance pointers to use post-increment instead of pre-increment
4936     __ add(a1, a1, wordSize);
4937     __ add(a2, a2, wordSize);
4938     if (AvoidUnalignedAccesses) {
4939       // Both implementations (SIMD/non-SIMD) use relatively large load
4940       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
4941       // time) on some CPUs when the address is not at least 16-byte aligned.
4942       // Arrays are currently 8-byte aligned, so make an additional 8-byte load
4943       // if needed, at least for the 1st address, to make it 16-byte aligned.
4944       Label ALIGNED16;
4945       __ tbz(a1, 3, ALIGNED16);
4946       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4947       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4948       __ sub(cnt1, cnt1, wordSize);
4949       __ eor(tmp1, tmp1, tmp2);
4950       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4951       __ bind(ALIGNED16);
4952     }
4953     if (UseSIMDForArrayEquals) {
4954       if (SoftwarePrefetchHintDistance >= 0) {
4955         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4956         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4957         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4958             /* prfm = */ true, NOT_EQUAL);
4959         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4960         __ br(__ LT, TAIL);
4961       }
4962       __ bind(NO_PREFETCH_LARGE_LOOP);
4963       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4964           /* prfm = */ false, NOT_EQUAL);
4965     } else {
4966       __ push(spilled_regs, sp);
4967       if (SoftwarePrefetchHintDistance >= 0) {
4968         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4969         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4970         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4971             /* prfm = */ true, NOT_EQUAL);
4972         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4973         __ br(__ LT, TAIL);
4974       }
4975       __ bind(NO_PREFETCH_LARGE_LOOP);
4976       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4977           /* prfm = */ false, NOT_EQUAL);
4978     }
4979     __ bind(TAIL);
4980       __ cbz(cnt1, EQUAL);
4981       __ subs(cnt1, cnt1, wordSize);
4982       __ br(__ LE, POST_LOOP);
4983     __ bind(SMALL_LOOP);
4984       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4985       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4986       __ subs(cnt1, cnt1, wordSize);
4987       __ eor(tmp1, tmp1, tmp2);
4988       __ cbnz(tmp1, NOT_EQUAL);
4989       __ br(__ GT, SMALL_LOOP);
4990     __ bind(POST_LOOP);
4991       __ ldr(tmp1, Address(a1, cnt1));
4992       __ ldr(tmp2, Address(a2, cnt1));
4993       __ eor(tmp1, tmp1, tmp2);
4994       __ cbnz(tmp1, NOT_EQUAL);
4995     __ bind(EQUAL);
4996       __ mov(result, true);
4997     __ bind(NOT_EQUAL);
4998       if (!UseSIMDForArrayEquals) {
4999         __ pop(spilled_regs, sp);
5000       }
5001     __ bind(NOT_EQUAL_NO_POP);
5002     __ leave();
5003     __ ret(lr);
5004     return entry;
5005   }
5006 
5007   address generate_dsin_dcos(bool isCos) {
5008     __ align(CodeEntryAlignment);
5009     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5010     address start = __ pc();
5011     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5012         (address)StubRoutines::aarch64::_two_over_pi,
5013         (address)StubRoutines::aarch64::_pio2,
5014         (address)StubRoutines::aarch64::_dsin_coef,
5015         (address)StubRoutines::aarch64::_dcos_coef);
5016     return start;
5017   }
5018 
5019   address generate_dlog() {
5020     __ align(CodeEntryAlignment);
5021     StubCodeMark mark(this, "StubRoutines", "dlog");
5022     address entry = __ pc();
5023     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5024         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5025     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5026     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5027         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5028     return entry;
5029   }
5030 
5031 
5032   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5033   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5034       Label &DIFF2) {
5035     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5036     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5037 
5038     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5039     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5040     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5041     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5042 
5043     __ fmovd(tmpL, vtmp3);
5044     __ eor(rscratch2, tmp3, tmpL);
5045     __ cbnz(rscratch2, DIFF2);
5046 
5047     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5048     __ umov(tmpL, vtmp3, __ D, 1);
5049     __ eor(rscratch2, tmpU, tmpL);
5050     __ cbnz(rscratch2, DIFF1);
5051 
5052     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5053     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5054     __ fmovd(tmpL, vtmp);
5055     __ eor(rscratch2, tmp3, tmpL);
5056     __ cbnz(rscratch2, DIFF2);
5057 
5058     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5059     __ umov(tmpL, vtmp, __ D, 1);
5060     __ eor(rscratch2, tmpU, tmpL);
5061     __ cbnz(rscratch2, DIFF1);
5062   }
5063 
5064   // r0  = result
5065   // r1  = str1
5066   // r2  = cnt1
5067   // r3  = str2
5068   // r4  = cnt2
5069   // r10 = tmp1
5070   // r11 = tmp2
5071   address generate_compare_long_string_different_encoding(bool isLU) {
5072     __ align(CodeEntryAlignment);
5073     StubCodeMark mark(this, "StubRoutines", isLU
5074         ? "compare_long_string_different_encoding LU"
5075         : "compare_long_string_different_encoding UL");
5076     address entry = __ pc();
5077     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5078         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5079         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5080     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5081         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5082     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5083     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5084 
5085     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5086 
5087     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5088     // cnt2 == amount of characters left to compare
5089     // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
5090     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5091     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5092     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5093     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5094     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5095     __ eor(rscratch2, tmp1, tmp2);
5096     __ mov(rscratch1, tmp2);
5097     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5098     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5099              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5100     __ push(spilled_regs, sp);
5101     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5102     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5103 
5104     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5105 
5106     if (SoftwarePrefetchHintDistance >= 0) {
5107       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5108       __ br(__ LT, NO_PREFETCH);
5109       __ bind(LARGE_LOOP_PREFETCH);
5110         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5111         __ mov(tmp4, 2);
5112         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5113         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5114           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5115           __ subs(tmp4, tmp4, 1);
5116           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5117           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5118           __ mov(tmp4, 2);
5119         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5120           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5121           __ subs(tmp4, tmp4, 1);
5122           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5123           __ sub(cnt2, cnt2, 64);
5124           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5125           __ br(__ GE, LARGE_LOOP_PREFETCH);
5126     }
5127     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5128     __ bind(NO_PREFETCH);
5129     __ subs(cnt2, cnt2, 16);
5130     __ br(__ LT, TAIL);
5131     __ align(OptoLoopAlignment);
5132     __ bind(SMALL_LOOP); // smaller loop
5133       __ subs(cnt2, cnt2, 16);
5134       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5135       __ br(__ GE, SMALL_LOOP);
5136       __ cmn(cnt2, (u1)16);
5137       __ br(__ EQ, LOAD_LAST);
5138     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5139       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5140       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5141       __ ldr(tmp3, Address(cnt1, -8));
5142       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5143       __ b(LOAD_LAST);
5144     __ bind(DIFF2);
5145       __ mov(tmpU, tmp3);
5146     __ bind(DIFF1);
5147       __ pop(spilled_regs, sp);
5148       __ b(CALCULATE_DIFFERENCE);
5149     __ bind(LOAD_LAST);
5150       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5151       // No need to load it again
5152       __ mov(tmpU, tmp3);
5153       __ pop(spilled_regs, sp);
5154 
5155       // tmp2 points to the address of the last 4 Latin1 characters right now
5156       __ ldrs(vtmp, Address(tmp2));
5157       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5158       __ fmovd(tmpL, vtmp);
5159 
5160       __ eor(rscratch2, tmpU, tmpL);
5161       __ cbz(rscratch2, DONE);
5162 
5163     // Find the first different characters in the longwords and
5164     // compute their difference.
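         // rev + clz yields (roughly) the bit offset of the lowest-addressed
         // differing byte; rounding it down to a multiple of 16 (andr with -16)
         // gives the offset of the first differing char, and shifting both
         // operands right by that amount brings that char into the low 16 bits.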
5165     __ bind(CALCULATE_DIFFERENCE);
5166       __ rev(rscratch2, rscratch2);
5167       __ clz(rscratch2, rscratch2);
5168       __ andr(rscratch2, rscratch2, -16);
5169       __ lsrv(tmp1, tmp1, rscratch2);
5170       __ uxthw(tmp1, tmp1);
5171       __ lsrv(rscratch1, rscratch1, rscratch2);
5172       __ uxthw(rscratch1, rscratch1);
5173       __ subw(result, tmp1, rscratch1);
5174     __ bind(DONE);
5175       __ ret(lr);
5176     return entry;
5177   }
5178 
5179   address generate_method_entry_barrier() {
5180     __ align(CodeEntryAlignment);
5181     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5182 
5183     Label deoptimize_label;
5184 
5185     address start = __ pc();
5186 
5187     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5188 
5189     __ enter();
5190     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5191 
5192     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5193 
5194     __ push_call_clobbered_registers();
5195 
5196     __ mov(c_rarg0, rscratch2);
5197     __ call_VM_leaf
5198          (CAST_FROM_FN_PTR
5199           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5200 
5201     __ reset_last_Java_frame(true);
5202 
5203     __ mov(rscratch1, r0);
5204 
5205     __ pop_call_clobbered_registers();
5206 
5207     __ cbnz(rscratch1, deoptimize_label);
5208 
5209     __ leave();
5210     __ ret(lr);
5211 
5212     __ BIND(deoptimize_label);
5213 
5214     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5215     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5216 
5217     __ mov(sp, rscratch1);
5218     __ br(rscratch2);
5219 
5220     return start;
5221   }
5222 
5223   // r0  = result
5224   // r1  = str1
5225   // r2  = cnt1
5226   // r3  = str2
5227   // r4  = cnt2
5228   // r10 = tmp1
5229   // r11 = tmp2
5230   address generate_compare_long_string_same_encoding(bool isLL) {
5231     __ align(CodeEntryAlignment);
5232     StubCodeMark mark(this, "StubRoutines", isLL
5233         ? "compare_long_string_same_encoding LL"
5234         : "compare_long_string_same_encoding UU");
5235     address entry = __ pc();
5236     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5237         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5238 
5239     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5240 
5241     // Exit the large loop when fewer than 64 bytes are left to read, or when we
5242     // are about to prefetch memory beyond the array bound
5243     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5244 
5245     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly
5246     __ eor(rscratch2, tmp1, tmp2);
5247     __ cbnz(rscratch2, CAL_DIFFERENCE);
5248 
5249     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5250     // update pointers, because of previous read
5251     __ add(str1, str1, wordSize);
5252     __ add(str2, str2, wordSize);
5253     if (SoftwarePrefetchHintDistance >= 0) {
5254       __ align(OptoLoopAlignment);
5255       __ bind(LARGE_LOOP_PREFETCH);
5256         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5257         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5258 
5259         for (int i = 0; i < 4; i++) {
5260           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5261           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5262           __ cmp(tmp1, tmp2);
5263           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5264           __ br(Assembler::NE, DIFF);
5265         }
5266         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5267         __ add(str1, str1, 64);
5268         __ add(str2, str2, 64);
5269         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5270         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5271         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5272     }
5273 
5274     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5275     __ br(Assembler::LE, LESS16);
5276     __ align(OptoLoopAlignment);
5277     __ bind(LOOP_COMPARE16);
5278       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5279       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5280       __ cmp(tmp1, tmp2);
5281       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5282       __ br(Assembler::NE, DIFF);
5283       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5284       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5285       __ br(Assembler::LT, LESS16);
5286 
5287       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5288       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5289       __ cmp(tmp1, tmp2);
5290       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5291       __ br(Assembler::NE, DIFF);
5292       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5293       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5294       __ br(Assembler::GE, LOOP_COMPARE16);
5295       __ cbz(cnt2, LENGTH_DIFF);
5296 
5297     __ bind(LESS16);
5298       // one more 8-byte compare if enough data remains
5299       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5300       __ br(Assembler::LE, LESS8);
5301       __ ldr(tmp1, Address(__ post(str1, 8)));
5302       __ ldr(tmp2, Address(__ post(str2, 8)));
5303       __ eor(rscratch2, tmp1, tmp2);
5304       __ cbnz(rscratch2, CAL_DIFFERENCE);
5305       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5306 
5307     __ bind(LESS8); // directly load last 8 bytes
5308       if (!isLL) {
5309         __ add(cnt2, cnt2, cnt2);
5310       }
5311       __ ldr(tmp1, Address(str1, cnt2));
5312       __ ldr(tmp2, Address(str2, cnt2));
5313       __ eor(rscratch2, tmp1, tmp2);
5314       __ cbz(rscratch2, LENGTH_DIFF);
5315       __ b(CAL_DIFFERENCE);
5316 
5317     __ bind(DIFF);
5318       __ cmp(tmp1, tmp2);
5319       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5320       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5321       // reuse rscratch2 register for the result of eor instruction
5322       __ eor(rscratch2, tmp1, tmp2);
5323 
5324     __ bind(CAL_DIFFERENCE);
5325       __ rev(rscratch2, rscratch2);
5326       __ clz(rscratch2, rscratch2);
5327       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5328       __ lsrv(tmp1, tmp1, rscratch2);
5329       __ lsrv(tmp2, tmp2, rscratch2);
5330       if (isLL) {
5331         __ uxtbw(tmp1, tmp1);
5332         __ uxtbw(tmp2, tmp2);
5333       } else {
5334         __ uxthw(tmp1, tmp1);
5335         __ uxthw(tmp2, tmp2);
5336       }
5337       __ subw(result, tmp1, tmp2);
5338 
5339     __ bind(LENGTH_DIFF);
5340       __ ret(lr);
5341     return entry;
5342   }
5343 
5344   void generate_compare_long_strings() {
5345       StubRoutines::aarch64::_compare_long_string_LL
5346           = generate_compare_long_string_same_encoding(true);
5347       StubRoutines::aarch64::_compare_long_string_UU
5348           = generate_compare_long_string_same_encoding(false);
5349       StubRoutines::aarch64::_compare_long_string_LU
5350           = generate_compare_long_string_different_encoding(true);
5351       StubRoutines::aarch64::_compare_long_string_UL
5352           = generate_compare_long_string_different_encoding(false);
5353   }
5354 
5355   // R0 = result
5356   // R1 = str2
5357   // R2 = cnt1
5358   // R3 = str1
5359   // R4 = cnt2
5360   // This generic linear code uses a few additional ideas that make it faster:
5361   // 1) we can safely keep at least the 1st register of the pattern (since
5362   // length >= 8) in order to skip the initial load (helps on systems with a
5363   // single load pipeline)
5364   // 2) we can use a "fast" algorithm for finding the first character, with
5365   // fewer branches (1 branch per loaded register instead of one per symbol);
5366   // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f
5367   // and 0x7fff7fff...7fff come from (see the note right after this list)
5368   // 3) after loading and analyzing the 1st register of the source string, it
5369   // can be used to search for every occurrence of the 1st character, saving
5370   // a few loads compared with a simpler-but-slower implementation
5371   // 4) to avoid lots of push/pop operations, the code below heavily re-uses and
5372   // compresses register values, which makes it larger and a bit less readable;
5373   // most of the extra operations overlap loads or branches, so the penalty is minimal
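       // The "fast" first-character search mentioned in 2) is the classic SWAR
       // zero-byte test: with X = chunk ^ (first * 0x0101...01), a byte (halfword
       // for UTF-16) of X is zero exactly where the chunk matches the first
       // pattern character, and
       //   (X - 0x0101...01) & ~X & 0x8080...80
       // has its top bit set at (at least) each such position. The code computes
       // the same value as  bics(tmp2, X - 0x0101...01, X | 0x7f7f...7f),  since
       // ~(X | 0x7f7f...7f) == ~X & 0x8080...80. Candidate positions are later
       // verified by the compare loops, so spurious candidates are harmless.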
5374   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5375     const char* stubName = str1_isL
5376         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5377         : "indexof_linear_uu";
5378     __ align(CodeEntryAlignment);
5379     StubCodeMark mark(this, "StubRoutines", stubName);
5380     address entry = __ pc();
5381 
5382     int str1_chr_size = str1_isL ? 1 : 2;
5383     int str2_chr_size = str2_isL ? 1 : 2;
5384     int str1_chr_shift = str1_isL ? 0 : 1;
5385     int str2_chr_shift = str2_isL ? 0 : 1;
5386     bool isL = str1_isL && str2_isL;
5387     // parameters
5388     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5389     // temporary registers
5390     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5391     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5392     // redefinitions
5393     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5394 
5395     __ push(spilled_regs, sp);
5396     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5397         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5398         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5399         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5400         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5401         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5402     // Read whole register from str1. It is safe, because length >=8 here
5403     __ ldr(ch1, Address(str1));
5404     // Read whole register from str2. It is safe, because length >=8 here
5405     __ ldr(ch2, Address(str2));
5406     __ sub(cnt2, cnt2, cnt1);
5407     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5408     if (str1_isL != str2_isL) {
5409       __ eor(v0, __ T16B, v0, v0);
5410     }
5411     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5412     __ mul(first, first, tmp1);
5413     // check if we have less than 1 register to check
5414     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5415     if (str1_isL != str2_isL) {
5416       __ fmovd(v1, ch1);
5417     }
5418     __ br(__ LE, L_SMALL);
5419     __ eor(ch2, first, ch2);
5420     if (str1_isL != str2_isL) {
5421       __ zip1(v1, __ T16B, v1, v0);
5422     }
5423     __ sub(tmp2, ch2, tmp1);
5424     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5425     __ bics(tmp2, tmp2, ch2);
5426     if (str1_isL != str2_isL) {
5427       __ fmovd(ch1, v1);
5428     }
5429     __ br(__ NE, L_HAS_ZERO);
5430     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5431     __ add(result, result, wordSize/str2_chr_size);
5432     __ add(str2, str2, wordSize);
5433     __ br(__ LT, L_POST_LOOP);
5434     __ BIND(L_LOOP);
5435       __ ldr(ch2, Address(str2));
5436       __ eor(ch2, first, ch2);
5437       __ sub(tmp2, ch2, tmp1);
5438       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5439       __ bics(tmp2, tmp2, ch2);
5440       __ br(__ NE, L_HAS_ZERO);
5441     __ BIND(L_LOOP_PROCEED);
5442       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5443       __ add(str2, str2, wordSize);
5444       __ add(result, result, wordSize/str2_chr_size);
5445       __ br(__ GE, L_LOOP);
5446     __ BIND(L_POST_LOOP);
5447       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5448       __ br(__ LE, NOMATCH);
5449       __ ldr(ch2, Address(str2));
5450       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5451       __ eor(ch2, first, ch2);
5452       __ sub(tmp2, ch2, tmp1);
5453       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5454       __ mov(tmp4, -1); // all bits set
5455       __ b(L_SMALL_PROCEED);
5456     __ align(OptoLoopAlignment);
5457     __ BIND(L_SMALL);
5458       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5459       __ eor(ch2, first, ch2);
5460       if (str1_isL != str2_isL) {
5461         __ zip1(v1, __ T16B, v1, v0);
5462       }
5463       __ sub(tmp2, ch2, tmp1);
5464       __ mov(tmp4, -1); // all bits set
5465       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5466       if (str1_isL != str2_isL) {
5467         __ fmovd(ch1, v1); // move converted 4 symbols
5468       }
5469     __ BIND(L_SMALL_PROCEED);
5470       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5471       __ bic(tmp2, tmp2, ch2);
5472       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5473       __ rbit(tmp2, tmp2);
5474       __ br(__ EQ, NOMATCH);
5475     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5476       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5477       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5478       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5479       if (str2_isL) { // LL
5480         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5481         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5482         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5483         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5484         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5485       } else {
5486         __ mov(ch2, 0xE); // all bits in byte set except last one
5487         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5488         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5489         __ lslv(tmp2, tmp2, tmp4);
5490         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5491         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5492         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5493         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5494       }
5495       __ cmp(ch1, ch2);
5496       __ mov(tmp4, wordSize/str2_chr_size);
5497       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5498     __ BIND(L_SMALL_CMP_LOOP);
5499       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5500                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5501       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5502                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5503       __ add(tmp4, tmp4, 1);
5504       __ cmp(tmp4, cnt1);
5505       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5506       __ cmp(first, ch2);
5507       __ br(__ EQ, L_SMALL_CMP_LOOP);
5508     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5509       __ cbz(tmp2, NOMATCH); // no more matches. exit
5510       __ clz(tmp4, tmp2);
5511       __ add(result, result, 1); // advance index
5512       __ add(str2, str2, str2_chr_size); // advance pointer
5513       __ b(L_SMALL_HAS_ZERO_LOOP);
5514     __ align(OptoLoopAlignment);
5515     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5516       __ cmp(first, ch2);
5517       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5518       __ b(DONE);
5519     __ align(OptoLoopAlignment);
5520     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5521       if (str2_isL) { // LL
5522         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5523         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5524         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5525         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5526         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5527       } else {
5528         __ mov(ch2, 0xE); // all bits in byte set except last one
5529         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5530         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5531         __ lslv(tmp2, tmp2, tmp4);
5532         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5533         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5534         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5535         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5536       }
5537       __ cmp(ch1, ch2);
5538       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5539       __ b(DONE);
5540     __ align(OptoLoopAlignment);
5541     __ BIND(L_HAS_ZERO);
5542       __ rbit(tmp2, tmp2);
5543       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5544       // Now, compress the two counters (cnt2 and cnt1) into one register, i.e.
5545       // cnt2 := (cnt1 << 32) | cnt2. This is fine: both counters are 32-bit and
5546       // not changed in this loop; restore them on exit, so cnt1 can be re-used here.
5547       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5548       __ sub(result, result, 1);
5549     __ BIND(L_HAS_ZERO_LOOP);
5550       __ mov(cnt1, wordSize/str2_chr_size);
5551       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5552       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5553       if (str2_isL) {
5554         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5555         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5556         __ lslv(tmp2, tmp2, tmp4);
5557         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5558         __ add(tmp4, tmp4, 1);
5559         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5560         __ lsl(tmp2, tmp2, 1);
5561         __ mov(tmp4, wordSize/str2_chr_size);
5562       } else {
5563         __ mov(ch2, 0xE);
5564         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5565         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5566         __ lslv(tmp2, tmp2, tmp4);
5567         __ add(tmp4, tmp4, 1);
5568         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5569         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5570         __ lsl(tmp2, tmp2, 1);
5571         __ mov(tmp4, wordSize/str2_chr_size);
5572         __ sub(str2, str2, str2_chr_size);
5573       }
5574       __ cmp(ch1, ch2);
5575       __ mov(tmp4, wordSize/str2_chr_size);
5576       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5577     __ BIND(L_CMP_LOOP);
5578       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5579                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5580       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5581                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5582       __ add(tmp4, tmp4, 1);
5583       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5584       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5585       __ cmp(cnt1, ch2);
5586       __ br(__ EQ, L_CMP_LOOP);
5587     __ BIND(L_CMP_LOOP_NOMATCH);
5588       // no match at this position
5589       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5590       __ clz(tmp4, tmp2);
5591       __ add(str2, str2, str2_chr_size); // advance pointer
5592       __ b(L_HAS_ZERO_LOOP);
5593     __ align(OptoLoopAlignment);
5594     __ BIND(L_CMP_LOOP_LAST_CMP);
5595       __ cmp(cnt1, ch2);
5596       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5597       __ b(DONE);
5598     __ align(OptoLoopAlignment);
5599     __ BIND(L_CMP_LOOP_LAST_CMP2);
5600       if (str2_isL) {
5601         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5602         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5603         __ lslv(tmp2, tmp2, tmp4);
5604         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5605         __ add(tmp4, tmp4, 1);
5606         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5607         __ lsl(tmp2, tmp2, 1);
5608       } else {
5609         __ mov(ch2, 0xE);
5610         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5611         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5612         __ lslv(tmp2, tmp2, tmp4);
5613         __ add(tmp4, tmp4, 1);
5614         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5615         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5616         __ lsl(tmp2, tmp2, 1);
5617         __ sub(str2, str2, str2_chr_size);
5618       }
5619       __ cmp(ch1, ch2);
5620       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5621       __ b(DONE);
5622     __ align(OptoLoopAlignment);
5623     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5624       // 1) Restore the "result" index. The index was a multiple of
5625       // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
5626       // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
5627       // wordSize/str2_chr_size - 1 and the respective higher bits were left
5628       // unchanged. L_LOOP_PROCEED will increase result by the number of
5629       // analyzed characters, so we can simply reset the lower bits of result
5630       // here: clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
5631       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5632       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is the index of the
5633       // last analyzed substring inside the current octet; str2 points at its start address and must be advanced to the next octet.
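           // In C, the restore below is approximately (a sketch using the
           // register names as variables):
           //   tmp2    = result & (wordSize/str2_chr_size - 1); // symbols analyzed
           //   cnt1    = cnt2 >> (BitsPerByte * wordSize / 2);  // upper half of cnt2
           //   result &= ~(wordSize/str2_chr_size - 1);         // reset lower bits
           //   str2   -= tmp2 << str2_chr_shift;                // restore str2
           //   cnt2    = (uint32_t)cnt2;                        // lower half of cnt2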
5634       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5635       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5636       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5637       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5638       __ movw(cnt2, cnt2);
5639       __ b(L_LOOP_PROCEED);
5640     __ align(OptoLoopAlignment);
5641     __ BIND(NOMATCH);
5642       __ mov(result, -1);
5643     __ BIND(DONE);
5644       __ pop(spilled_regs, sp);
5645       __ ret(lr);
5646     return entry;
5647   }
5648 
5649   void generate_string_indexof_stubs() {
5650     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5651     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5652     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5653   }
5654 
5655   void inflate_and_store_2_fp_registers(bool generatePrfm,
5656       FloatRegister src1, FloatRegister src2) {
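         // Widen 2 x 16 Latin-1 bytes into 32 UTF-16 chars: zip1/zip2 with the
         // all-zero register v0 interleave a zero byte after each source byte
         // (little-endian), producing zero-extended 16-bit values, which are then
         // stored as a single 64-byte block.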
5657     Register dst = r1;
5658     __ zip1(v1, __ T16B, src1, v0);
5659     __ zip2(v2, __ T16B, src1, v0);
5660     if (generatePrfm) {
5661       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5662     }
5663     __ zip1(v3, __ T16B, src2, v0);
5664     __ zip2(v4, __ T16B, src2, v0);
5665     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5666   }
5667 
5668   // R0 = src
5669   // R1 = dst
5670   // R2 = len
5671   // R3 = len >> 3
5672   // V0 = 0
5673   // v1 = loaded 8 bytes
5674   address generate_large_byte_array_inflate() {
5675     __ align(CodeEntryAlignment);
5676     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5677     address entry = __ pc();
5678     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5679     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5680     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5681 
5682     // do one more 8-byte read so that the address is 16-byte aligned in most
5683     // cases; this also lets us use a single store instruction
5684     __ ldrd(v2, __ post(src, 8));
5685     __ sub(octetCounter, octetCounter, 2);
5686     __ zip1(v1, __ T16B, v1, v0);
5687     __ zip1(v2, __ T16B, v2, v0);
5688     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5689     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5690     __ subs(rscratch1, octetCounter, large_loop_threshold);
5691     __ br(__ LE, LOOP_START);
5692     __ b(LOOP_PRFM_START);
5693     __ bind(LOOP_PRFM);
5694       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5695     __ bind(LOOP_PRFM_START);
5696       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5697       __ sub(octetCounter, octetCounter, 8);
5698       __ subs(rscratch1, octetCounter, large_loop_threshold);
5699       inflate_and_store_2_fp_registers(true, v3, v4);
5700       inflate_and_store_2_fp_registers(true, v5, v6);
5701       __ br(__ GT, LOOP_PRFM);
5702       __ cmp(octetCounter, (u1)8);
5703       __ br(__ LT, DONE);
5704     __ bind(LOOP);
5705       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5706       __ bind(LOOP_START);
5707       __ sub(octetCounter, octetCounter, 8);
5708       __ cmp(octetCounter, (u1)8);
5709       inflate_and_store_2_fp_registers(false, v3, v4);
5710       inflate_and_store_2_fp_registers(false, v5, v6);
5711       __ br(__ GE, LOOP);
5712     __ bind(DONE);
5713       __ ret(lr);
5714     return entry;
5715   }
5716 
5717   /**
5718    *  Arguments:
5719    *
5720    *  Input:
5721    *  c_rarg0   - current state address
5722    *  c_rarg1   - H key address
5723    *  c_rarg2   - data address
5724    *  c_rarg3   - number of blocks
5725    *
5726    *  Output:
5727    *  Updated state at c_rarg0
5728    */
5729   address generate_ghash_processBlocks() {
5730     // Bafflingly, GCM uses little-endian for the byte order, but
5731     // big-endian for the bit order.  For example, the polynomial 1 is
5732     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5733     //
5734     // So, we must either reverse the bytes in each word and do
5735     // everything big-endian or reverse the bits in each byte and do
5736     // it little-endian.  On AArch64 it's more idiomatic to reverse
5737     // the bits in each byte (we have an instruction, RBIT, to do
5738     // that) and keep the data in little-endian bit order throughout the
5739     // calculation, bit-reversing the inputs and outputs.
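         //
         // A sketch of the per-block loop below in C-like pseudocode, where
         // gf128_mul is an assumed helper denoting GF(2^128) multiplication
         // reduced by the field polynomial:
         //
         //   for (int i = 0; i < blocks; i++) {
         //     state = gf128_mul(state ^ block[i], subkeyH);
         //   }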
5740 
5741     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5742     __ align(wordSize * 2);
5743     address p = __ pc();
5744     __ emit_int64(0x87);  // The low-order bits of the field
5745                           // polynomial (i.e. p = z^7+z^2+z+1)
5746                           // repeated in the low and high parts of a
5747                           // 128-bit vector
5748     __ emit_int64(0x87);
5749 
5750     __ align(CodeEntryAlignment);
5751     address start = __ pc();
5752 
5753     Register state   = c_rarg0;
5754     Register subkeyH = c_rarg1;
5755     Register data    = c_rarg2;
5756     Register blocks  = c_rarg3;
5757 
5758     FloatRegister vzr = v30;
5759     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5760 
5761     __ ldrq(v24, p);    // The field polynomial
5762 
5763     __ ldrq(v0, Address(state));
5764     __ ldrq(v1, Address(subkeyH));
5765 
5766     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5767     __ rbit(v0, __ T16B, v0);
5768     __ rev64(v1, __ T16B, v1);
5769     __ rbit(v1, __ T16B, v1);
5770 
5771     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5772     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5773 
5774     {
5775       Label L_ghash_loop;
5776       __ bind(L_ghash_loop);
5777 
5778       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5779                                                  // reversing each byte
5780       __ rbit(v2, __ T16B, v2);
5781       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5782 
5783       // Multiply state in v2 by subkey in v1
5784       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5785                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5786                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5787       // Reduce v7:v5 by the field polynomial
5788       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5789 
5790       __ sub(blocks, blocks, 1);
5791       __ cbnz(blocks, L_ghash_loop);
5792     }
5793 
5794     // The bit-reversed result is at this point in v0
5795     __ rev64(v0, __ T16B, v0);
5796     __ rbit(v0, __ T16B, v0);
5797 
5798     __ st1(v0, __ T16B, state);
5799     __ ret(lr);
5800 
5801     return start;
5802   }
5803 
5804   address generate_ghash_processBlocks_wide() {
5805     address small = generate_ghash_processBlocks();
5806 
5807     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5808     __ align(wordSize * 2);
5809     address p = __ pc();
5810     __ emit_int64(0x87);  // The low-order bits of the field
5811                           // polynomial (i.e. p = z^7+z^2+z+1)
5812                           // repeated in the low and high parts of a
5813                           // 128-bit vector
5814     __ emit_int64(0x87);
5815 
5816     __ align(CodeEntryAlignment);
5817     address start = __ pc();
5818 
5819     Register state   = c_rarg0;
5820     Register subkeyH = c_rarg1;
5821     Register data    = c_rarg2;
5822     Register blocks  = c_rarg3;
5823 
5824     const int unroll = 4;
5825 
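         // Inputs shorter than 2 * unroll blocks are handled entirely by the
         // single-block stub generated above; the wide loop below processes
         // groups of 'unroll' blocks and hands any remaining blocks back to the
         // single-block stub.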
5826     __ cmp(blocks, (unsigned char)(unroll * 2));
5827     __ br(__ LT, small);
5828 
5829     if (unroll > 1) {
5830       // Save state before entering routine
5831       __ sub(sp, sp, 4 * 16);
5832       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5833       __ sub(sp, sp, 4 * 16);
5834       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5835     }
5836 
5837     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5838 
5839     if (unroll > 1) {
5840       // And restore state
5841       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5842       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5843     }
5844 
5845     __ cmp(blocks, (unsigned char)0);
5846     __ br(__ GT, small);
5847 
5848     __ ret(lr);
5849 
5850     return start;
5851   }
5852 
5853   void generate_base64_encode_simdround(Register src, Register dst,
5854         FloatRegister codec, u8 size) {
5855 
5856     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5857     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5858     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5859 
5860     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5861 
5862     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5863 
5864     __ ushr(ind0, arrangement, in0,  2);
5865 
5866     __ ushr(ind1, arrangement, in1,  2);
5867     __ shl(in0,   arrangement, in0,  6);
5868     __ orr(ind1,  arrangement, ind1, in0);
5869     __ ushr(ind1, arrangement, ind1, 2);
5870 
5871     __ ushr(ind2, arrangement, in2,  4);
5872     __ shl(in1,   arrangement, in1,  4);
5873     __ orr(ind2,  arrangement, in1,  ind2);
5874     __ ushr(ind2, arrangement, ind2, 2);
5875 
5876     __ shl(ind3,  arrangement, in2,  2);
5877     __ ushr(ind3, arrangement, ind3, 2);
5878 
5879     __ tbl(out0,  arrangement, codec,  4, ind0);
5880     __ tbl(out1,  arrangement, codec,  4, ind1);
5881     __ tbl(out2,  arrangement, codec,  4, ind2);
5882     __ tbl(out3,  arrangement, codec,  4, ind3);
5883 
5884     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5885   }
5886 
5887    /**
5888    *  Arguments:
5889    *
5890    *  Input:
5891    *  c_rarg0   - src_start
5892    *  c_rarg1   - src_offset
5893    *  c_rarg2   - src_length
5894    *  c_rarg3   - dest_start
5895    *  c_rarg4   - dest_offset
5896    *  c_rarg5   - isURL
5897    *
5898    */
5899   address generate_base64_encodeBlock() {
5900 
5901     static const char toBase64[64] = {
5902       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5903       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5904       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5905       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5906       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5907     };
5908 
5909     static const char toBase64URL[64] = {
5910       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5911       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5912       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5913       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5914       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5915     };
5916 
5917     __ align(CodeEntryAlignment);
5918     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5919     address start = __ pc();
5920 
5921     Register src   = c_rarg0;  // source array
5922     Register soff  = c_rarg1;  // source start offset
5923     Register send  = c_rarg2;  // source end offset
5924     Register dst   = c_rarg3;  // dest array
5925     Register doff  = c_rarg4;  // position for writing to dest array
5926     Register isURL = c_rarg5;  // Base64 or URL character set
5927 
5928     // c_rarg6 and c_rarg7 are free to use as temps
5929     Register codec  = c_rarg6;
5930     Register length = c_rarg7;
5931 
5932     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5933 
5934     __ add(src, src, soff);
5935     __ add(dst, dst, doff);
5936     __ sub(length, send, soff);
5937 
5938     // load the codec base address
5939     __ lea(codec, ExternalAddress((address) toBase64));
5940     __ cbz(isURL, ProcessData);
5941     __ lea(codec, ExternalAddress((address) toBase64URL));
5942 
5943     __ BIND(ProcessData);
5944 
5945     // too short to be worth a SIMD loop; fall back to the 3-byte scalar loop
5946     __ cmp(length, (u1)24);
5947     __ br(Assembler::LT, Process3B);
5948 
5949     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5950 
5951     __ BIND(Process48B);
5952     __ cmp(length, (u1)48);
5953     __ br(Assembler::LT, Process24B);
5954     generate_base64_encode_simdround(src, dst, v0, 16);
5955     __ sub(length, length, 48);
5956     __ b(Process48B);
5957 
5958     __ BIND(Process24B);
5959     __ cmp(length, (u1)24);
5960     __ br(Assembler::LT, SIMDExit);
5961     generate_base64_encode_simdround(src, dst, v0, 8);
5962     __ sub(length, length, 24);
5963 
5964     __ BIND(SIMDExit);
5965     __ cbz(length, Exit);
5966 
5967     __ BIND(Process3B);
5968     //  3 src bytes, 24 bits
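         // In C, approximately:
         //   uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
         //   dst[0] = codec[(bits >> 18) & 63];
         //   dst[1] = codec[(bits >> 12) & 63];
         //   dst[2] = codec[(bits >>  6) & 63];
         //   dst[3] = codec[ bits        & 63];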
5969     __ ldrb(r10, __ post(src, 1));
5970     __ ldrb(r11, __ post(src, 1));
5971     __ ldrb(r12, __ post(src, 1));
5972     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5973     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5974     // codec index
5975     __ ubfmw(r15, r12, 18, 23);
5976     __ ubfmw(r14, r12, 12, 17);
5977     __ ubfmw(r13, r12, 6,  11);
5978     __ andw(r12,  r12, 63);
5979     // get the code based on the codec
5980     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5981     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5982     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5983     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5984     __ strb(r15, __ post(dst, 1));
5985     __ strb(r14, __ post(dst, 1));
5986     __ strb(r13, __ post(dst, 1));
5987     __ strb(r12, __ post(dst, 1));
5988     __ sub(length, length, 3);
5989     __ cbnz(length, Process3B);
5990 
5991     __ BIND(Exit);
5992     __ ret(lr);
5993 
5994     return start;
5995   }
5996 
5997   void generate_base64_decode_simdround(Register src, Register dst,
5998         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5999 
6000     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6001     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6002 
6003     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6004     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6005 
6006     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6007 
6008     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6009 
6010     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6011 
6012     // We need an unsigned saturating subtract so that all input values in the
6013     // range [0, 63] yield 0 from the higher-half lookup
6014     __ uqsubv(decH0, __ T16B, in0, v27);
6015     __ uqsubv(decH1, __ T16B, in1, v27);
6016     __ uqsubv(decH2, __ T16B, in2, v27);
6017     __ uqsubv(decH3, __ T16B, in3, v27);
6018 
6019     // lower half lookup
6020     __ tbl(decL0, arrangement, codecL, 4, in0);
6021     __ tbl(decL1, arrangement, codecL, 4, in1);
6022     __ tbl(decL2, arrangement, codecL, 4, in2);
6023     __ tbl(decL3, arrangement, codecL, 4, in3);
6024 
6025     // higher half lookup
6026     __ tbx(decH0, arrangement, codecH, 4, decH0);
6027     __ tbx(decH1, arrangement, codecH, 4, decH1);
6028     __ tbx(decH2, arrangement, codecH, 4, decH2);
6029     __ tbx(decH3, arrangement, codecH, 4, decH3);
6030 
6031     // combine lower and higher
6032     __ orr(decL0, arrangement, decL0, decH0);
6033     __ orr(decL1, arrangement, decL1, decH1);
6034     __ orr(decL2, arrangement, decL2, decH2);
6035     __ orr(decL3, arrangement, decL3, decH3);
6036 
6037     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6038     __ cmhi(decH0, arrangement, decL0, v27);
6039     __ cmhi(decH1, arrangement, decL1, v27);
6040     __ cmhi(decH2, arrangement, decL2, v27);
6041     __ cmhi(decH3, arrangement, decL3, v27);
6042     __ orr(in0, arrangement, decH0, decH1);
6043     __ orr(in1, arrangement, decH2, decH3);
6044     __ orr(in2, arrangement, in0,   in1);
6045     __ umaxv(in3, arrangement, in2);
6046     __ umov(rscratch2, in3, __ B, 0);
6047 
6048     // get the data to output
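         // (per lane: out0 = (d0 << 2) | (d1 >> 4), out1 = (d1 << 4) | (d2 >> 2),
         //  out2 = (d2 << 6) | d3, where d0..d3 are the decoded 6-bit values)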
6049     __ shl(out0,  arrangement, decL0, 2);
6050     __ ushr(out1, arrangement, decL1, 4);
6051     __ orr(out0,  arrangement, out0,  out1);
6052     __ shl(out1,  arrangement, decL1, 4);
6053     __ ushr(out2, arrangement, decL2, 2);
6054     __ orr(out1,  arrangement, out1,  out2);
6055     __ shl(out2,  arrangement, decL2, 6);
6056     __ orr(out2,  arrangement, out2,  decL3);
6057 
6058     __ cbz(rscratch2, NoIllegalData);
6059 
6060     // handle illegal input
6061     __ umov(r10, in2, __ D, 0);
6062     if (size == 16) {
6063       __ cbnz(r10, ErrorInLowerHalf);
6064 
6065       // illegal input is in higher half, store the lower half now.
6066       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6067 
6068       __ umov(r10, in2,  __ D, 1);
6069       __ umov(r11, out0, __ D, 1);
6070       __ umov(r12, out1, __ D, 1);
6071       __ umov(r13, out2, __ D, 1);
6072       __ b(StoreLegalData);
6073 
6074       __ BIND(ErrorInLowerHalf);
6075     }
6076     __ umov(r11, out0, __ D, 0);
6077     __ umov(r12, out1, __ D, 0);
6078     __ umov(r13, out2, __ D, 0);
6079 
6080     __ BIND(StoreLegalData);
6081     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6082     __ strb(r11, __ post(dst, 1));
6083     __ strb(r12, __ post(dst, 1));
6084     __ strb(r13, __ post(dst, 1));
6085     __ lsr(r10, r10, 8);
6086     __ lsr(r11, r11, 8);
6087     __ lsr(r12, r12, 8);
6088     __ lsr(r13, r13, 8);
6089     __ b(StoreLegalData);
6090 
6091     __ BIND(NoIllegalData);
6092     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6093   }
6094 
6095 
6096    /**
6097    *  Arguments:
6098    *
6099    *  Input:
6100    *  c_rarg0   - src_start
6101    *  c_rarg1   - src_offset
6102    *  c_rarg2   - src_length
6103    *  c_rarg3   - dest_start
6104    *  c_rarg4   - dest_offset
6105    *  c_rarg5   - isURL
6106    *  c_rarg6   - isMIME
6107    *
6108    */
6109   address generate_base64_decodeBlock() {
6110 
6111     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6112     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6113     // titled "Base64 decoding".
6114 
6115     // The non-SIMD lookup tables are mostly copied from the fromBase64 array used in
6116     // java.util.Base64, except that the padding character '=' is also treated as an illegal
6117     // value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
6118     static const uint8_t fromBase64ForNoSIMD[256] = {
6119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6121       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6122        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6123       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6124        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6125       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6126        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6127       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6128       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6129       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6130       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6133       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6134       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6135     };
6136 
6137     static const uint8_t fromBase64URLForNoSIMD[256] = {
6138       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6139       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6140       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6141        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6142       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6143        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6144       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6145        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6148       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6149       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6150       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6151       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6152       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6154     };
6155 
6156     // A legal base64 code value is in the range [0, 127].  We need two lookups
6157     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
6158     // lookup uses tbl; out-of-range indices are set to 0 in the destination. The
6159     // 2nd table vector lookup uses tbx; out-of-range indices are left unchanged
6160     // in the destination. Inputs [64..126] map to indices [65, 127] in the second
6161     // lookup. The value at index 64 is set to 0, so that we know the decoded data
6162     // was already obtained by the 1st lookup.
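         //
         // A scalar sketch of the combined lookup (illustrative only; the code
         // below does this 16 lanes at a time with uqsub/tbl/tbx):
         //
         //   uint8_t decode1(uint8_t in, const uint8_t tab[128]) {
         //     uint8_t lo  = (in < 64) ? tab[in] : 0;          // tbl: OOR index -> 0
         //     uint8_t idx = (in > 63) ? in - 63 : 0;          // uqsub(in, 63)
         //     uint8_t hi  = (idx < 64) ? tab[64 + idx] : idx; // tbx: OOR unchanged
         //     return lo | hi;  // values > 63 mark illegal input (checked with cmhi)
         //   }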
6163     static const uint8_t fromBase64ForSIMD[128] = {
6164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6167        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6168         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6169        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6170       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6171        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6172     };
6173 
6174     static const uint8_t fromBase64URLForSIMD[128] = {
6175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6176       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6177       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6178        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6179         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6180        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6181        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6182        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6183     };
6184 
6185     __ align(CodeEntryAlignment);
6186     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6187     address start = __ pc();
6188 
6189     Register src    = c_rarg0;  // source array
6190     Register soff   = c_rarg1;  // source start offset
6191     Register send   = c_rarg2;  // source end offset
6192     Register dst    = c_rarg3;  // dest array
6193     Register doff   = c_rarg4;  // position for writing to dest array
6194     Register isURL  = c_rarg5;  // Base64 or URL character set
6195     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6196 
6197     Register length = send;    // reuse send as length of source data to process
6198 
6199     Register simd_codec   = c_rarg6;
6200     Register nosimd_codec = c_rarg7;
6201 
6202     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6203 
6204     __ enter();
6205 
6206     __ add(src, src, soff);
6207     __ add(dst, dst, doff);
6208 
6209     __ mov(doff, dst);
6210 
6211     __ sub(length, send, soff);
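         // round the length down to a multiple of 4 (one base64 quantum) by
         // clearing its two low bits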
6212     __ bfm(length, zr, 0, 1);
6213 
6214     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6215     __ cbz(isURL, ProcessData);
6216     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6217 
6218     __ BIND(ProcessData);
6219     __ mov(rscratch1, length);
6220     __ cmp(length, (u1)144); // 144 = 80 + 64
6221     __ br(Assembler::LT, Process4B);
6222 
6223     // In the MIME case, the line length cannot be more than 76
6224     // bytes (see RFC 2045). This is too short a block for SIMD
6225     // to be worthwhile, so we use non-SIMD here.
6226     __ movw(rscratch1, 79);
6227 
6228     __ BIND(Process4B);
6229     __ ldrw(r14, __ post(src, 4));
6230     __ ubfxw(r10, r14, 0,  8);
6231     __ ubfxw(r11, r14, 8,  8);
6232     __ ubfxw(r12, r14, 16, 8);
6233     __ ubfxw(r13, r14, 24, 8);
6234     // look up the decoded 6-bit values
6235     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6236     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6237     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6238     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6239     // error detection, 255u indicates an illegal input
6240     __ orrw(r14, r10, r11);
6241     __ orrw(r15, r12, r13);
6242     __ orrw(r14, r14, r15);
6243     __ tbnz(r14, 7, Exit);
6244     // recover the data
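         // in C, approximately (d0..d3 are the decoded 6-bit values in r10..r13):
         //   dst[0] = (d0 << 2) | (d1 >> 4);
         //   dst[1] = ((d1 & 0xf) << 4) | (d2 >> 2);
         //   dst[2] = ((d2 & 0x3) << 6) | d3;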
6245     __ lslw(r14, r10, 10);
6246     __ bfiw(r14, r11, 4, 6);
6247     __ bfmw(r14, r12, 2, 5);
6248     __ rev16w(r14, r14);
6249     __ bfiw(r13, r12, 6, 2);
6250     __ strh(r14, __ post(dst, 2));
6251     __ strb(r13, __ post(dst, 1));
6252     // non-simd loop
6253     __ subsw(rscratch1, rscratch1, 4);
6254     __ br(Assembler::GT, Process4B);
6255 
6256     // If we are exiting from the PreProcess80B pass (rscratch1 started at 79),
6257     // rscratch1 == -1 here; otherwise, rscratch1 == 0.
6258     __ cbzw(rscratch1, Exit);
6259     __ sub(length, length, 80);
6260 
6261     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6262     __ cbz(isURL, SIMDEnter);
6263     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6264 
6265     __ BIND(SIMDEnter);
6266     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6267     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6268     __ mov(rscratch1, 63);
6269     __ dup(v27, __ T16B, rscratch1);
6270 
6271     __ BIND(Process64B);
6272     __ cmp(length, (u1)64);
6273     __ br(Assembler::LT, Process32B);
6274     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6275     __ sub(length, length, 64);
6276     __ b(Process64B);
6277 
6278     __ BIND(Process32B);
6279     __ cmp(length, (u1)32);
6280     __ br(Assembler::LT, SIMDExit);
6281     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6282     __ sub(length, length, 32);
6283     __ b(Process32B);
6284 
6285     __ BIND(SIMDExit);
6286     __ cbz(length, Exit);
6287     __ movw(rscratch1, length);
6288     __ b(Process4B);
6289 
6290     __ BIND(Exit);
6291     __ sub(c_rarg0, dst, doff);
6292 
6293     __ leave();
6294     __ ret(lr);
6295 
6296     return start;
6297   }
6298 
6299   // Support for spin waits.
6300   address generate_spin_wait() {
6301     __ align(CodeEntryAlignment);
6302     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6303     address start = __ pc();
6304 
6305     __ spin_wait();
6306     __ ret(lr);
6307 
6308     return start;
6309   }
6310 
6311 #ifdef LINUX
6312 
6313   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6314   //
6315   // If LSE is in use, generate LSE versions of all the stubs. The
6316   // non-LSE versions are in atomic_aarch64.S.
6317 
6318   // class AtomicStubMark records the entry point of a stub and the
6319   // stub pointer which will point to it. The stub pointer is set to
6320   // the entry point when ~AtomicStubMark() is called, which must be
6321   // after ICache::invalidate_range. This ensures safe publication of
6322   // the generated code.
6323   class AtomicStubMark {
6324     address _entry_point;
6325     aarch64_atomic_stub_t *_stub;
6326     MacroAssembler *_masm;
6327   public:
6328     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6329       _masm = masm;
6330       __ align(32);
6331       _entry_point = __ pc();
6332       _stub = stub;
6333     }
6334     ~AtomicStubMark() {
6335       *_stub = (aarch64_atomic_stub_t)_entry_point;
6336     }
6337   };
6338 
6339   // NB: For memory_order_conservative we need a trailing membar after
6340   // LSE atomic operations but not a leading membar.
6341   //
6342   // We don't need a leading membar because a clause in the Arm ARM
6343   // says:
6344   //
6345   //   Barrier-ordered-before
6346   //
6347   //   Barrier instructions order prior Memory effects before subsequent
6348   //   Memory effects generated by the same Observer. A read or a write
6349   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
6350   //   Observer if and only if RW1 appears in program order before RW 2
6351   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
6352   //   instruction with both Acquire and Release semantics.
6353   //
6354   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6355   // and Release semantics, therefore we don't need a leading
6356   // barrier. However, there is no corresponding Barrier-ordered-after
6357   // relationship, therefore we need a trailing membar to prevent a
6358   // later store or load from being reordered with the store in an
6359   // atomic instruction.
6360   //
6361   // This was checked by using the herd7 consistency model simulator
6362   // (http://diy.inria.fr/) with this test case:
6363   //
6364   // AArch64 LseCas
6365   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6366   // P0 | P1;
6367   // LDR W4, [X2] | MOV W3, #0;
6368   // DMB LD       | MOV W4, #1;
6369   // LDR W3, [X1] | CASAL W3, W4, [X1];
6370   //              | DMB ISH;
6371   //              | STR W4, [X2];
6372   // exists
6373   // (0:X3=0 /\ 0:X4=1)
6374   //
6375   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6376   // with the store to x in P1. Without the DMB in P1 this may happen.
6377   //
6378   // At the time of writing we don't know of any AArch64 hardware that
6379   // reorders stores in this way, but the Reference Manual permits it.
6380 
6381   void gen_cas_entry(Assembler::operand_size size,
6382                      atomic_memory_order order) {
6383     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6384       exchange_val = c_rarg2;
6385     bool acquire, release;
6386     switch (order) {
6387       case memory_order_relaxed:
6388         acquire = false;
6389         release = false;
6390         break;
6391       case memory_order_release:
6392         acquire = false;
6393         release = true;
6394         break;
6395       default:
6396         acquire = true;
6397         release = true;
6398         break;
6399     }
6400     __ mov(prev, compare_val);
6401     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6402     if (order == memory_order_conservative) {
6403       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6404     }
6405     if (size == Assembler::xword) {
6406       __ mov(r0, prev);
6407     } else {
6408       __ movw(r0, prev);
6409     }
6410     __ ret(lr);
6411   }
6412 
6413   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6414     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6415     // If not relaxed, then default to conservative.  Relaxed is the only
6416     // case we use enough to be worth specializing.
6417     if (order == memory_order_relaxed) {
6418       __ ldadd(size, incr, prev, addr);
6419     } else {
6420       __ ldaddal(size, incr, prev, addr);
6421       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6422     }
6423     if (size == Assembler::xword) {
6424       __ mov(r0, prev);
6425     } else {
6426       __ movw(r0, prev);
6427     }
6428     __ ret(lr);
6429   }
6430 
6431   void gen_swpal_entry(Assembler::operand_size size) {
6432     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6433     __ swpal(size, incr, prev, addr);
6434     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6435     if (size == Assembler::xword) {
6436       __ mov(r0, prev);
6437     } else {
6438       __ movw(r0, prev);
6439     }
6440     __ ret(lr);
6441   }
6442 
6443   void generate_atomic_entry_points() {
6444     if (! UseLSE) {
6445       return;
6446     }
6447 
6448     __ align(CodeEntryAlignment);
6449     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6450     address first_entry = __ pc();
6451 
6452     // ADD, memory_order_conservative
6453     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6454     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6455     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6456     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6457 
6458     // ADD, memory_order_relaxed
6459     AtomicStubMark mark_fetch_add_4_relaxed
6460       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6461     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6462     AtomicStubMark mark_fetch_add_8_relaxed
6463       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6464     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6465 
6466     // XCHG, memory_order_conservative
6467     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6468     gen_swpal_entry(Assembler::word);
6469     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6470     gen_swpal_entry(Assembler::xword);
6471 
6472     // CAS, memory_order_conservative
6473     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6474     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6475     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6476     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6477     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6478     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6479 
6480     // CAS, memory_order_relaxed
6481     AtomicStubMark mark_cmpxchg_1_relaxed
6482       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6483     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6484     AtomicStubMark mark_cmpxchg_4_relaxed
6485       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6486     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6487     AtomicStubMark mark_cmpxchg_8_relaxed
6488       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6489     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6490 
6491     AtomicStubMark mark_cmpxchg_4_release
6492       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6493     gen_cas_entry(MacroAssembler::word, memory_order_release);
6494     AtomicStubMark mark_cmpxchg_8_release
6495       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6496     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6497 
6498     AtomicStubMark mark_cmpxchg_4_seq_cst
6499       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6500     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6501     AtomicStubMark mark_cmpxchg_8_seq_cst
6502       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6503     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6504 
6505     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6506   }
6507 #endif // LINUX
6508 
6509   // Continuation point for throwing of implicit exceptions that are
6510   // not handled in the current activation. Fabricates an exception
6511   // oop and initiates normal exception dispatching in this
6512   // frame. Since we need to preserve callee-saved values (currently
6513   // only for C2, but done for C1 as well) we need a callee-saved oop
6514   // map and therefore have to make these stubs into RuntimeStubs
6515   // rather than BufferBlobs.  If the compiler needs all registers to
6516   // be preserved between the fault point and the exception handler
6517   // then it must assume responsibility for that in
6518   // AbstractCompiler::continuation_for_implicit_null_exception or
6519   // continuation_for_implicit_division_by_zero_exception. All other
6520   // implicit exceptions (e.g., NullPointerException or
6521   // AbstractMethodError on entry) are either at call sites or
6522   // otherwise assume that stack unwinding will be initiated, so
6523   // caller saved registers were assumed volatile in the compiler.
6524 
6525 #undef __
6526 #define __ masm->
6527 
6528   address generate_throw_exception(const char* name,
6529                                    address runtime_entry,
6530                                    Register arg1 = noreg,
6531                                    Register arg2 = noreg) {
6532     // Information about frame layout at time of blocking runtime call.
6533     // Note that we only have to preserve callee-saved registers since
6534     // the compilers are responsible for supplying a continuation point
6535     // if they expect all registers to be preserved.
6536     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6537     enum layout {
6538       rfp_off = 0,
6539       rfp_off2,
6540       return_off,
6541       return_off2,
6542       framesize // inclusive of return address
6543     };
6544 
6545     int insts_size = 512;
6546     int locs_size  = 64;
6547 
6548     CodeBuffer code(name, insts_size, locs_size);
6549     OopMapSet* oop_maps  = new OopMapSet();
6550     MacroAssembler* masm = new MacroAssembler(&code);
6551 
6552     address start = __ pc();
6553 
6554     // This is an inlined and slightly modified version of call_VM
6555     // which has the ability to fetch the return PC out of
6556     // thread-local storage and also sets up last_Java_sp slightly
6557     // differently than the real call_VM
6558 
6559     __ enter(); // Save FP and LR before call
6560 
6561     assert(is_even(framesize/2), "sp not 16-byte aligned");
6562 
6563     // lr and fp are already in place
6564     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6565 
6566     int frame_complete = __ pc() - start;
6567 
6568     // Set up last_Java_sp and last_Java_fp
6569     address the_pc = __ pc();
6570     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6571 
6572     // Call runtime
6573     if (arg1 != noreg) {
6574       assert(arg2 != c_rarg1, "clobbered");
6575       __ mov(c_rarg1, arg1);
6576     }
6577     if (arg2 != noreg) {
6578       __ mov(c_rarg2, arg2);
6579     }
6580     __ mov(c_rarg0, rthread);
6581     BLOCK_COMMENT("call runtime_entry");
6582     __ mov(rscratch1, runtime_entry);
6583     __ blr(rscratch1);
6584 
6585     // Generate oop map
6586     OopMap* map = new OopMap(framesize, 0);
6587 
6588     oop_maps->add_gc_map(the_pc - start, map);
6589 
6590     __ reset_last_Java_frame(true);
6591 
6592     // Reinitialize the ptrue predicate register, in case the external runtime
6593     // call clobbers ptrue reg, as we may return to SVE compiled code.
6594     __ reinitialize_ptrue();
6595 
6596     __ leave();
6597 
6598     // check for pending exceptions
6599 #ifdef ASSERT
6600     Label L;
6601     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6602     __ cbnz(rscratch1, L);
6603     __ should_not_reach_here();
6604     __ bind(L);
6605 #endif // ASSERT
6606     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6607 
6608 
6609     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6610     RuntimeStub* stub =
6611       RuntimeStub::new_runtime_stub(name,
6612                                     &code,
6613                                     frame_complete,
6614                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6615                                     oop_maps, false);
6616     return stub->entry_point();
6617   }
6618 
6619   class MontgomeryMultiplyGenerator : public MacroAssembler {
6620 
6621     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6622       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6623 
6624     RegSet _toSave;
6625     bool _squaring;
6626 
6627   public:
6628     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6629       : MacroAssembler(as->code()), _squaring(squaring) {
6630 
6631       // Register allocation
6632 
6633       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6634       Pa_base = *regs;       // Argument registers
6635       if (squaring)
6636         Pb_base = Pa_base;
6637       else
6638         Pb_base = *++regs;
6639       Pn_base = *++regs;
6640       Rlen= *++regs;
6641       inv = *++regs;
6642       Pm_base = *++regs;
6643 
6644                           // Working registers:
6645       Ra =  *++regs;        // The current digit of a, b, n, and m.
6646       Rb =  *++regs;
6647       Rm =  *++regs;
6648       Rn =  *++regs;
6649 
6650       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6651       Pb =  *++regs;
6652       Pm =  *++regs;
6653       Pn =  *++regs;
6654 
6655       t0 =  *++regs;        // Three registers which form a
6656       t1 =  *++regs;        // triple-precision accumulator.
6657       t2 =  *++regs;
6658 
6659       Ri =  *++regs;        // Inner and outer loop indexes.
6660       Rj =  *++regs;
6661 
6662       Rhi_ab = *++regs;     // Product registers: low and high parts
6663       Rlo_ab = *++regs;     // of a*b and m*n.
6664       Rhi_mn = *++regs;
6665       Rlo_mn = *++regs;
6666 
6667       // r19 and up are callee-saved.
6668       _toSave = RegSet::range(r19, *regs) + Pm_base;
6669     }
6670 
6671   private:
6672     void save_regs() {
6673       push(_toSave, sp);
6674     }
6675 
6676     void restore_regs() {
6677       pop(_toSave, sp);
6678     }
6679 
6680     template <typename T>
6681     void unroll_2(Register count, T block) {
6682       Label loop, end, odd;
6683       tbnz(count, 0, odd);
6684       cbz(count, end);
6685       align(16);
6686       bind(loop);
6687       (this->*block)();
6688       bind(odd);
6689       (this->*block)();
6690       subs(count, count, 2);
6691       br(Assembler::GT, loop);
6692       bind(end);
6693     }
6694 
6695     template <typename T>
6696     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6697       Label loop, end, odd;
6698       tbnz(count, 0, odd);
6699       cbz(count, end);
6700       align(16);
6701       bind(loop);
6702       (this->*block)(d, s, tmp);
6703       bind(odd);
6704       (this->*block)(d, s, tmp);
6705       subs(count, count, 2);
6706       br(Assembler::GT, loop);
6707       bind(end);
6708     }
6709 
6710     void pre1(RegisterOrConstant i) {
6711       block_comment("pre1");
6712       // Pa = Pa_base;
6713       // Pb = Pb_base + i;
6714       // Pm = Pm_base;
6715       // Pn = Pn_base + i;
6716       // Ra = *Pa;
6717       // Rb = *Pb;
6718       // Rm = *Pm;
6719       // Rn = *Pn;
6720       ldr(Ra, Address(Pa_base));
6721       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6722       ldr(Rm, Address(Pm_base));
6723       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6724       lea(Pa, Address(Pa_base));
6725       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6726       lea(Pm, Address(Pm_base));
6727       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6728 
6729       // Zero the m*n result.
6730       mov(Rhi_mn, zr);
6731       mov(Rlo_mn, zr);
6732     }
6733 
6734     // The core multiply-accumulate step of a Montgomery
6735     // multiplication.  The idea is to schedule operations as a
6736     // pipeline so that instructions with long latencies (loads and
6737     // multiplies) have time to complete before their results are
6738     // used.  This most benefits in-order implementations of the
6739     // architecture but out-of-order ones also benefit.
6740     void step() {
6741       block_comment("step");
6742       // MACC(Ra, Rb, t0, t1, t2);
6743       // Ra = *++Pa;
6744       // Rb = *--Pb;
6745       umulh(Rhi_ab, Ra, Rb);
6746       mul(Rlo_ab, Ra, Rb);
6747       ldr(Ra, pre(Pa, wordSize));
6748       ldr(Rb, pre(Pb, -wordSize));
6749       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6750                                        // previous iteration.
6751       // MACC(Rm, Rn, t0, t1, t2);
6752       // Rm = *++Pm;
6753       // Rn = *--Pn;
6754       umulh(Rhi_mn, Rm, Rn);
6755       mul(Rlo_mn, Rm, Rn);
6756       ldr(Rm, pre(Pm, wordSize));
6757       ldr(Rn, pre(Pn, -wordSize));
6758       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6759     }
6760 
6761     void post1() {
6762       block_comment("post1");
6763 
6764       // MACC(Ra, Rb, t0, t1, t2);
6765       // Ra = *++Pa;
6766       // Rb = *--Pb;
6767       umulh(Rhi_ab, Ra, Rb);
6768       mul(Rlo_ab, Ra, Rb);
6769       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6770       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6771 
6772       // *Pm = Rm = t0 * inv;
6773       mul(Rm, t0, inv);
6774       str(Rm, Address(Pm));
6775 
6776       // MACC(Rm, Rn, t0, t1, t2);
6777       // t0 = t1; t1 = t2; t2 = 0;
6778       umulh(Rhi_mn, Rm, Rn);
6779 
6780 #ifndef PRODUCT
6781       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6782       {
6783         mul(Rlo_mn, Rm, Rn);
6784         add(Rlo_mn, t0, Rlo_mn);
6785         Label ok;
6786         cbz(Rlo_mn, ok); {
6787           stop("broken Montgomery multiply");
6788         } bind(ok);
6789       }
6790 #endif
6791       // We have very carefully set things up so that
6792       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6793       // the lower half of Rm * Rn because we know the result already:
6794       // it must be -t0.  t0 + (-t0) must generate a carry iff
6795       // t0 != 0.  So, rather than do a mul and an adds we just set
6796       // the carry flag iff t0 is nonzero.
6797       //
6798       // mul(Rlo_mn, Rm, Rn);
6799       // adds(zr, t0, Rlo_mn);
6800       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6801       adcs(t0, t1, Rhi_mn);
6802       adc(t1, t2, zr);
6803       mov(t2, zr);
6804     }
6805 
6806     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6807       block_comment("pre2");
6808       // Pa = Pa_base + i-len;
6809       // Pb = Pb_base + len;
6810       // Pm = Pm_base + i-len;
6811       // Pn = Pn_base + len;
6812 
6813       if (i.is_register()) {
6814         sub(Rj, i.as_register(), len);
6815       } else {
6816         mov(Rj, i.as_constant());
6817         sub(Rj, Rj, len);
6818       }
6819       // Rj == i-len
6820 
6821       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6822       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6823       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6824       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6825 
6826       // Ra = *++Pa;
6827       // Rb = *--Pb;
6828       // Rm = *++Pm;
6829       // Rn = *--Pn;
6830       ldr(Ra, pre(Pa, wordSize));
6831       ldr(Rb, pre(Pb, -wordSize));
6832       ldr(Rm, pre(Pm, wordSize));
6833       ldr(Rn, pre(Pn, -wordSize));
6834 
6835       mov(Rhi_mn, zr);
6836       mov(Rlo_mn, zr);
6837     }
6838 
6839     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6840       block_comment("post2");
6841       if (i.is_constant()) {
6842         mov(Rj, i.as_constant()-len.as_constant());
6843       } else {
6844         sub(Rj, i.as_register(), len);
6845       }
6846 
6847       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6848 
6849       // As soon as we know the least significant digit of our result,
6850       // store it.
6851       // Pm_base[i-len] = t0;
6852       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6853 
6854       // t0 = t1; t1 = t2; t2 = 0;
6855       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6856       adc(t1, t2, zr);
6857       mov(t2, zr);
6858     }
6859 
6860     // A carry in t0 after Montgomery multiplication means that we
6861     // should subtract multiples of n from our result in m.  We'll
6862     // keep doing that until there is no carry.
6863     void normalize(RegisterOrConstant len) {
6864       block_comment("normalize");
6865       // while (t0)
6866       //   t0 = sub(Pm_base, Pn_base, t0, len);
6867       Label loop, post, again;
6868       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6869       cbz(t0, post); {
6870         bind(again); {
6871           mov(i, zr);
6872           mov(cnt, len);
6873           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6874           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6875           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6876           align(16);
6877           bind(loop); {
6878             sbcs(Rm, Rm, Rn);
6879             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6880             add(i, i, 1);
6881             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6882             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6883             sub(cnt, cnt, 1);
6884           } cbnz(cnt, loop);
6885           sbc(t0, t0, zr);
6886         } cbnz(t0, again);
6887       } bind(post);
6888     }
6889 
6890     // Move memory at s to d, reversing words.
6891     //    Increments d to end of copied memory
6892     //    Destroys tmp1, tmp2
6893     //    Preserves len
6894     //    Leaves s pointing to the address which was in d at start
6895     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6896       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6897 
6898       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6899       mov(tmp1, len);
6900       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6901       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6902     }
6903     // where reverse1 copies one 64-bit word, swapping its 32-bit halves:
6904     void reverse1(Register d, Register s, Register tmp) {
6905       ldr(tmp, pre(s, -wordSize));
6906       ror(tmp, tmp, 32);
6907       str(tmp, post(d, wordSize));
6908     }
6909 
6910     void step_squaring() {
6911       // An extra ACC
6912       step();
6913       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6914     }
6915 
6916     void last_squaring(RegisterOrConstant i) {
6917       Label dont;
6918       // if ((i & 1) == 0) {
6919       tbnz(i.as_register(), 0, dont); {
6920         // MACC(Ra, Rb, t0, t1, t2);
6921         // Ra = *++Pa;
6922         // Rb = *--Pb;
6923         umulh(Rhi_ab, Ra, Rb);
6924         mul(Rlo_ab, Ra, Rb);
6925         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6926       } bind(dont);
6927     }
6928 
6929     void extra_step_squaring() {
6930       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6931 
6932       // MACC(Rm, Rn, t0, t1, t2);
6933       // Rm = *++Pm;
6934       // Rn = *--Pn;
6935       umulh(Rhi_mn, Rm, Rn);
6936       mul(Rlo_mn, Rm, Rn);
6937       ldr(Rm, pre(Pm, wordSize));
6938       ldr(Rn, pre(Pn, -wordSize));
6939     }
6940 
6941     void post1_squaring() {
6942       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6943 
6944       // *Pm = Rm = t0 * inv;
6945       mul(Rm, t0, inv);
6946       str(Rm, Address(Pm));
6947 
6948       // MACC(Rm, Rn, t0, t1, t2);
6949       // t0 = t1; t1 = t2; t2 = 0;
6950       umulh(Rhi_mn, Rm, Rn);
6951 
6952 #ifndef PRODUCT
6953       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6954       {
6955         mul(Rlo_mn, Rm, Rn);
6956         add(Rlo_mn, t0, Rlo_mn);
6957         Label ok;
6958         cbz(Rlo_mn, ok); {
6959           stop("broken Montgomery multiply");
6960         } bind(ok);
6961       }
6962 #endif
6963       // We have very carefully set things up so that
6964       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6965       // the lower half of Rm * Rn because we know the result already:
6966       // it must be -t0.  t0 + (-t0) must generate a carry iff
6967       // t0 != 0.  So, rather than do a mul and an adds we just set
6968       // the carry flag iff t0 is nonzero.
6969       //
6970       // mul(Rlo_mn, Rm, Rn);
6971       // adds(zr, t0, Rlo_mn);
6972       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6973       adcs(t0, t1, Rhi_mn);
6974       adc(t1, t2, zr);
6975       mov(t2, zr);
6976     }
6977 
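         // t2:t1:t0 += Rhi:Rlo -- add a 128-bit product into the
         // triple-precision accumulator.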
6978     void acc(Register Rhi, Register Rlo,
6979              Register t0, Register t1, Register t2) {
6980       adds(t0, t0, Rlo);
6981       adcs(t1, t1, Rhi);
6982       adc(t2, t2, zr);
6983     }
6984 
6985   public:
6986     /**
6987      * Fast Montgomery multiplication.  The derivation of the
6988      * algorithm is in A Cryptographic Library for the Motorola
6989      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6990      *
6991      * Arguments:
6992      *
6993      * Inputs for multiplication:
6994      *   c_rarg0   - int array elements a
6995      *   c_rarg1   - int array elements b
6996      *   c_rarg2   - int array elements n (the modulus)
6997      *   c_rarg3   - int length
6998      *   c_rarg4   - int inv
6999      *   c_rarg5   - int array elements m (the result)
7000      *
7001      * Inputs for squaring:
7002      *   c_rarg0   - int array elements a
7003      *   c_rarg1   - int array elements n (the modulus)
7004      *   c_rarg2   - int length
7005      *   c_rarg3   - int inv
7006      *   c_rarg4   - int array elements m (the result)
7007      *
7008      */
7009     address generate_multiply() {
7010       Label argh, nothing;
7011       bind(argh);
7012       stop("MontgomeryMultiply total_allocation must be <= 8192");
7013 
7014       align(CodeEntryAlignment);
7015       address entry = pc();
7016 
7017       cbzw(Rlen, nothing);
7018 
7019       enter();
7020 
7021       // Make room.
7022       cmpw(Rlen, 512);
7023       br(Assembler::HI, argh);
7024       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7025       andr(sp, Ra, -2 * wordSize);
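
           // A worked check of the bound (illustrative): the scratch area is
           // Rlen << exact_log2(4 * sizeof (jint)) == Rlen * 16 bytes, and the
           // branch above guarantees Rlen <= 512, so the total allocation is
           // at most 512 * 16 == 8192 bytes, matching the
           // "total_allocation must be <= 8192" check.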
7026 
7027       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7028 
7029       {
7030         // Copy input args, reversing as we go.  We use Ra as a
7031         // temporary variable.
7032         reverse(Ra, Pa_base, Rlen, t0, t1);
7033         if (!_squaring)
7034           reverse(Ra, Pb_base, Rlen, t0, t1);
7035         reverse(Ra, Pn_base, Rlen, t0, t1);
7036       }
7037 
7038       // Push all call-saved registers and also Pm_base, which we'll need
7039       // at the end.
7040       save_regs();
7041 
7042 #ifndef PRODUCT
7043       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7044       {
7045         ldr(Rn, Address(Pn_base, 0));
7046         mul(Rlo_mn, Rn, inv);
7047         subs(zr, Rlo_mn, -1);
7048         Label ok;
7049         br(EQ, ok); {
7050           stop("broken inverse in Montgomery multiply");
7051         } bind(ok);
7052       }
7053 #endif
7054 
7055       mov(Pm_base, Ra);
7056 
7057       mov(t0, zr);
7058       mov(t1, zr);
7059       mov(t2, zr);
7060 
7061       block_comment("for (int i = 0; i < len; i++) {");
7062       mov(Ri, zr); {
7063         Label loop, end;
7064         cmpw(Ri, Rlen);
7065         br(Assembler::GE, end);
7066 
7067         bind(loop);
7068         pre1(Ri);
7069 
7070         block_comment("  for (j = i; j; j--) {"); {
7071           movw(Rj, Ri);
7072           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7073         } block_comment("  } // j");
7074 
7075         post1();
7076         addw(Ri, Ri, 1);
7077         cmpw(Ri, Rlen);
7078         br(Assembler::LT, loop);
7079         bind(end);
7080         block_comment("} // i");
7081       }
7082 
7083       block_comment("for (int i = len; i < 2*len; i++) {");
7084       mov(Ri, Rlen); {
7085         Label loop, end;
7086         cmpw(Ri, Rlen, Assembler::LSL, 1);
7087         br(Assembler::GE, end);
7088 
7089         bind(loop);
7090         pre2(Ri, Rlen);
7091 
7092         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7093           lslw(Rj, Rlen, 1);
7094           subw(Rj, Rj, Ri);
7095           subw(Rj, Rj, 1);
7096           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7097         } block_comment("  } // j");
7098 
7099         post2(Ri, Rlen);
7100         addw(Ri, Ri, 1);
7101         cmpw(Ri, Rlen, Assembler::LSL, 1);
7102         br(Assembler::LT, loop);
7103         bind(end);
7104       }
7105       block_comment("} // i");
7106 
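           // Final reduction: this corresponds to the trailing
           //   while (t0) t0 = sub(Pm_base, Pn_base, t0, len);
           // loop in the C sketch below.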
7107       normalize(Rlen);
7108 
7109       mov(Ra, Pm_base);  // Save Pm_base in Ra
7110       restore_regs();  // Restore caller's Pm_base
7111 
7112       // Copy our result into caller's Pm_base
7113       reverse(Pm_base, Ra, Rlen, t0, t1);
7114 
7115       leave();
7116       bind(nothing);
7117       ret(lr);
7118 
7119       return entry;
7120     }
7121     // In C, approximately:
7122 
7123     // void
7124     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7125     //                     julong Pn_base[], julong Pm_base[],
7126     //                     julong inv, int len) {
7127     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7128     //   julong *Pa, *Pb, *Pn, *Pm;
7129     //   julong Ra, Rb, Rn, Rm;
7130 
7131     //   int i;
7132 
7133     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7134 
7135     //   for (i = 0; i < len; i++) {
7136     //     int j;
7137 
7138     //     Pa = Pa_base;
7139     //     Pb = Pb_base + i;
7140     //     Pm = Pm_base;
7141     //     Pn = Pn_base + i;
7142 
7143     //     Ra = *Pa;
7144     //     Rb = *Pb;
7145     //     Rm = *Pm;
7146     //     Rn = *Pn;
7147 
7148     //     int iters = i;
7149     //     for (j = 0; iters--; j++) {
7150     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7151     //       MACC(Ra, Rb, t0, t1, t2);
7152     //       Ra = *++Pa;
7153     //       Rb = *--Pb;
7154     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7155     //       MACC(Rm, Rn, t0, t1, t2);
7156     //       Rm = *++Pm;
7157     //       Rn = *--Pn;
7158     //     }
7159 
7160     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7161     //     MACC(Ra, Rb, t0, t1, t2);
7162     //     *Pm = Rm = t0 * inv;
7163     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7164     //     MACC(Rm, Rn, t0, t1, t2);
7165 
7166     //     assert(t0 == 0, "broken Montgomery multiply");
7167 
7168     //     t0 = t1; t1 = t2; t2 = 0;
7169     //   }
7170 
7171     //   for (i = len; i < 2*len; i++) {
7172     //     int j;
7173 
7174     //     Pa = Pa_base + i-len;
7175     //     Pb = Pb_base + len;
7176     //     Pm = Pm_base + i-len;
7177     //     Pn = Pn_base + len;
7178 
7179     //     Ra = *++Pa;
7180     //     Rb = *--Pb;
7181     //     Rm = *++Pm;
7182     //     Rn = *--Pn;
7183 
7184     //     int iters = len*2-i-1;
7185     //     for (j = i-len+1; iters--; j++) {
7186     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7187     //       MACC(Ra, Rb, t0, t1, t2);
7188     //       Ra = *++Pa;
7189     //       Rb = *--Pb;
7190     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7191     //       MACC(Rm, Rn, t0, t1, t2);
7192     //       Rm = *++Pm;
7193     //       Rn = *--Pn;
7194     //     }
7195 
7196     //     Pm_base[i-len] = t0;
7197     //     t0 = t1; t1 = t2; t2 = 0;
7198     //   }
7199 
7200     //   while (t0)
7201     //     t0 = sub(Pm_base, Pn_base, t0, len);
7202     // }
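         //
         // MACC above appears only in this pseudocode.  As a hedged sketch of
         // the intended semantics (assuming a 128-bit unsigned type is
         // available), it accumulates the full 128-bit product A*B into the
         // triple-precision accumulator t0:t1:t2, roughly:
         //
         //   #define MACC(A, B, t0, t1, t2)                                   \
         //     do {                                                           \
         //       unsigned __int128 p = (unsigned __int128)(A) * (B);          \
         //       unsigned __int128 s = (unsigned __int128)(t0) + (julong)p;   \
         //       (t0) = (julong)s;                                            \
         //       s = (unsigned __int128)(t1) + (julong)(p >> 64)              \
         //           + (julong)(s >> 64);                                     \
         //       (t1) = (julong)s;                                            \
         //       (t2) += (julong)(s >> 64);                                   \
         //     } while (0)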
7203 
7204     /**
7205      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7206      * multiplies than Montgomery multiplication, so it should be up to
7207      * 25% faster.  However, its loop control is more complex, and it
7208      * may actually run slower on some machines.
7209      *
7210      * Arguments:
7211      *
7212      * Inputs:
7213      *   c_rarg0   - int array elements a
7214      *   c_rarg1   - int array elements n (the modulus)
7215      *   c_rarg2   - int length
7216      *   c_rarg3   - int inv
7217      *   c_rarg4   - int array elements m (the result)
7218      *
7219      */
7220     address generate_square() {
7221       Label argh;
7222       bind(argh);
7223       stop("MontgomeryMultiply total_allocation must be <= 8192");
7224 
7225       align(CodeEntryAlignment);
7226       address entry = pc();
7227 
7228       enter();
7229 
7230       // Make room.
7231       cmpw(Rlen, 512);
7232       br(Assembler::HI, argh);
7233       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7234       andr(sp, Ra, -2 * wordSize);
7235 
7236       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7237 
7238       {
7239         // Copy input args, reversing as we go.  We use Ra as a
7240         // temporary variable.
7241         reverse(Ra, Pa_base, Rlen, t0, t1);
7242         reverse(Ra, Pn_base, Rlen, t0, t1);
7243       }
7244 
7245       // Push all call-saved registers and also Pm_base, which we'll need
7246       // at the end.
7247       save_regs();
7248 
7249       mov(Pm_base, Ra);
7250 
7251       mov(t0, zr);
7252       mov(t1, zr);
7253       mov(t2, zr);
7254 
7255       block_comment("for (int i = 0; i < len; i++) {");
7256       mov(Ri, zr); {
7257         Label loop, end;
7258         bind(loop);
7259         cmp(Ri, Rlen);
7260         br(Assembler::GE, end);
7261 
7262         pre1(Ri);
7263 
7264         block_comment("for (j = (i+1)/2; j; j--) {"); {
7265           add(Rj, Ri, 1);
7266           lsr(Rj, Rj, 1);
7267           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7268         } block_comment("  } // j");
7269 
7270         last_squaring(Ri);
7271 
7272         block_comment("  for (j = i/2; j; j--) {"); {
7273           lsr(Rj, Ri, 1);
7274           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7275         } block_comment("  } // j");
7276 
7277         post1_squaring();
7278         add(Ri, Ri, 1);
7279         cmp(Ri, Rlen);
7280         br(Assembler::LT, loop);
7281 
7282         bind(end);
7283         block_comment("} // i");
7284       }
7285 
7286       block_comment("for (int i = len; i < 2*len; i++) {");
7287       mov(Ri, Rlen); {
7288         Label loop, end;
7289         bind(loop);
7290         cmp(Ri, Rlen, Assembler::LSL, 1);
7291         br(Assembler::GE, end);
7292 
7293         pre2(Ri, Rlen);
7294 
7295         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7296           lsl(Rj, Rlen, 1);
7297           sub(Rj, Rj, Ri);
7298           sub(Rj, Rj, 1);
7299           lsr(Rj, Rj, 1);
7300           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7301         } block_comment("  } // j");
7302 
7303         last_squaring(Ri);
7304 
7305         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7306           lsl(Rj, Rlen, 1);
7307           sub(Rj, Rj, Ri);
7308           lsr(Rj, Rj, 1);
7309           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7310         } block_comment("  } // j");
7311 
7312         post2(Ri, Rlen);
7313         add(Ri, Ri, 1);
7314         cmp(Ri, Rlen, Assembler::LSL, 1);
7315 
7316         br(Assembler::LT, loop);
7317         bind(end);
7318         block_comment("} // i");
7319       }
7320 
7321       normalize(Rlen);
7322 
7323       mov(Ra, Pm_base);  // Save Pm_base in Ra
7324       restore_regs();  // Restore caller's Pm_base
7325 
7326       // Copy our result into caller's Pm_base
7327       reverse(Pm_base, Ra, Rlen, t0, t1);
7328 
7329       leave();
7330       ret(lr);
7331 
7332       return entry;
7333     }
7334     // In C, approximately:
7335 
7336     // void
7337     // montgomery_square(julong Pa_base[], julong Pn_base[],
7338     //                   julong Pm_base[], julong inv, int len) {
7339     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7340     //   julong *Pa, *Pb, *Pn, *Pm;
7341     //   julong Ra, Rb, Rn, Rm;
7342 
7343     //   int i;
7344 
7345     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7346 
7347     //   for (i = 0; i < len; i++) {
7348     //     int j;
7349 
7350     //     Pa = Pa_base;
7351     //     Pb = Pa_base + i;
7352     //     Pm = Pm_base;
7353     //     Pn = Pn_base + i;
7354 
7355     //     Ra = *Pa;
7356     //     Rb = *Pb;
7357     //     Rm = *Pm;
7358     //     Rn = *Pn;
7359 
7360     //     int iters = (i+1)/2;
7361     //     for (j = 0; iters--; j++) {
7362     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7363     //       MACC2(Ra, Rb, t0, t1, t2);
7364     //       Ra = *++Pa;
7365     //       Rb = *--Pb;
7366     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7367     //       MACC(Rm, Rn, t0, t1, t2);
7368     //       Rm = *++Pm;
7369     //       Rn = *--Pn;
7370     //     }
7371     //     if ((i & 1) == 0) {
7372     //       assert(Ra == Pa_base[j], "must be");
7373     //       MACC(Ra, Ra, t0, t1, t2);
7374     //     }
7375     //     iters = i/2;
7376     //     assert(iters == i-j, "must be");
7377     //     for (; iters--; j++) {
7378     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7379     //       MACC(Rm, Rn, t0, t1, t2);
7380     //       Rm = *++Pm;
7381     //       Rn = *--Pn;
7382     //     }
7383 
7384     //     *Pm = Rm = t0 * inv;
7385     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7386     //     MACC(Rm, Rn, t0, t1, t2);
7387 
7388     //     assert(t0 == 0, "broken Montgomery multiply");
7389 
7390     //     t0 = t1; t1 = t2; t2 = 0;
7391     //   }
7392 
7393     //   for (i = len; i < 2*len; i++) {
7394     //     int start = i-len+1;
7395     //     int end = start + (len - start)/2;
7396     //     int j;
7397 
7398     //     Pa = Pa_base + i-len;
7399     //     Pb = Pa_base + len;
7400     //     Pm = Pm_base + i-len;
7401     //     Pn = Pn_base + len;
7402 
7403     //     Ra = *++Pa;
7404     //     Rb = *--Pb;
7405     //     Rm = *++Pm;
7406     //     Rn = *--Pn;
7407 
7408     //     int iters = (2*len-i-1)/2;
7409     //     assert(iters == end-start, "must be");
7410     //     for (j = start; iters--; j++) {
7411     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7412     //       MACC2(Ra, Rb, t0, t1, t2);
7413     //       Ra = *++Pa;
7414     //       Rb = *--Pb;
7415     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7416     //       MACC(Rm, Rn, t0, t1, t2);
7417     //       Rm = *++Pm;
7418     //       Rn = *--Pn;
7419     //     }
7420     //     if ((i & 1) == 0) {
7421     //       assert(Ra == Pa_base[j], "must be");
7422     //       MACC(Ra, Ra, t0, t1, t2);
7423     //     }
7424     //     iters =  (2*len-i)/2;
7425     //     assert(iters == len-j, "must be");
7426     //     for (; iters--; j++) {
7427     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7428     //       MACC(Rm, Rn, t0, t1, t2);
7429     //       Rm = *++Pm;
7430     //       Rn = *--Pn;
7431     //     }
7432     //     Pm_base[i-len] = t0;
7433     //     t0 = t1; t1 = t2; t2 = 0;
7434     //   }
7435 
7436     //   while (t0)
7437     //     t0 = sub(Pm_base, Pn_base, t0, len);
7438     // }
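         //
         // MACC2 above, also used only in this pseudocode, accumulates the
         // product twice: it covers the two symmetric cross terms
         // a[j]*a[i-j] and a[i-j]*a[j] of the square.  A hedged sketch, in
         // terms of the MACC sketch given after montgomery_multiply:
         //
         //   #define MACC2(A, B, t0, t1, t2)      \
         //     do {                               \
         //       MACC(A, B, t0, t1, t2);          \
         //       MACC(A, B, t0, t1, t2);          \
         //     } while (0)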
7439   };
7440 
7441 
7442   // Initialization
7443   void generate_initial() {
7444     // Generates the initial stubs and initializes the entry points.
7445 
7446     // Entry points that exist on all platforms.  Note: this is code
7447     // that could be shared among different platforms; however, the
7448     // benefit seems to be smaller than the disadvantage of having a
7449     // much more complicated generator structure.  See also the comment
7450     // in stubRoutines.hpp.
7451 
7452     StubRoutines::_forward_exception_entry = generate_forward_exception();
7453 
7454     StubRoutines::_call_stub_entry =
7455       generate_call_stub(StubRoutines::_call_stub_return_address);
7456 
7457     // This entry is referenced by megamorphic calls.
7458     StubRoutines::_catch_exception_entry = generate_catch_exception();
7459 
7460     // Build this early so it's available for the interpreter.
7461     StubRoutines::_throw_StackOverflowError_entry =
7462       generate_throw_exception("StackOverflowError throw_exception",
7463                                CAST_FROM_FN_PTR(address,
7464                                                 SharedRuntime::throw_StackOverflowError));
7465     StubRoutines::_throw_delayed_StackOverflowError_entry =
7466       generate_throw_exception("delayed StackOverflowError throw_exception",
7467                                CAST_FROM_FN_PTR(address,
7468                                                 SharedRuntime::throw_delayed_StackOverflowError));
7469     if (UseCRC32Intrinsics) {
7470       // Set the CRC table address before generating the stubs that use it.
7471       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7472       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7473     }
7474 
7475     if (UseCRC32CIntrinsics) {
7476       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7477     }
7478 
7479     // Disabled until JDK-8210858 is fixed
7480     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7481     //   StubRoutines::_dlog = generate_dlog();
7482     // }
7483 
7484     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7485       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7486     }
7487 
7488     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7489       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7490     }
7491 
7492     // Safefetch stubs.
7493     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7494                                                        &StubRoutines::_safefetch32_fault_pc,
7495                                                        &StubRoutines::_safefetch32_continuation_pc);
7496     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7497                                                        &StubRoutines::_safefetchN_fault_pc,
7498                                                        &StubRoutines::_safefetchN_continuation_pc);
7499   }
7500 
7501   void generate_all() {
7502     // support for verify_oop (must happen after universe_init)
7503     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7504     StubRoutines::_throw_AbstractMethodError_entry =
7505       generate_throw_exception("AbstractMethodError throw_exception",
7506                                CAST_FROM_FN_PTR(address,
7507                                                 SharedRuntime::
7508                                                 throw_AbstractMethodError));
7509 
7510     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7511       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7512                                CAST_FROM_FN_PTR(address,
7513                                                 SharedRuntime::
7514                                                 throw_IncompatibleClassChangeError));
7515 
7516     StubRoutines::_throw_NullPointerException_at_call_entry =
7517       generate_throw_exception("NullPointerException at call throw_exception",
7518                                CAST_FROM_FN_PTR(address,
7519                                                 SharedRuntime::
7520                                                 throw_NullPointerException_at_call));
7521 
7522     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7523 
7524     // arraycopy stubs used by compilers
7525     generate_arraycopy_stubs();
7526 
7527     // countPositives stub for large arrays.
7528     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7529 
7530     // array equals stub for large arrays.
7531     if (!UseSimpleArrayEquals) {
7532       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7533     }
7534 
7535     generate_compare_long_strings();
7536 
7537     generate_string_indexof_stubs();
7538 
7539     // byte_array_inflate stub for large arrays.
7540     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7541 
7542     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7543     if (bs_nm != NULL) {
7544       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7545     }
7546 #ifdef COMPILER2
7547     if (UseMultiplyToLenIntrinsic) {
7548       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7549     }
7550 
7551     if (UseSquareToLenIntrinsic) {
7552       StubRoutines::_squareToLen = generate_squareToLen();
7553     }
7554 
7555     if (UseMulAddIntrinsic) {
7556       StubRoutines::_mulAdd = generate_mulAdd();
7557     }
7558 
7559     if (UseSIMDForBigIntegerShiftIntrinsics) {
7560       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7561       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7562     }
7563 
7564     if (UseMontgomeryMultiplyIntrinsic) {
7565       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7566       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7567       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7568     }
7569 
7570     if (UseMontgomerySquareIntrinsic) {
7571       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7572       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7573       // We use generate_multiply() rather than generate_square()
7574       // because it's faster for the sizes of modulus we care about.
7575       StubRoutines::_montgomerySquare = g.generate_multiply();
7576     }
7577 #endif // COMPILER2
7578 
7579     if (UseBASE64Intrinsics) {
7580         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7581         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7582     }
7583 
7584     // data cache line writeback
7585     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7586     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7587 
7588     if (UseAESIntrinsics) {
7589       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7590       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7591       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7592       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7593       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7594     }
7595     if (UseGHASHIntrinsics) {
7596       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7597       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7598     }
7599     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7600       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7601     }
7602 
7603     if (UseMD5Intrinsics) {
7604       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7605       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7606     }
7607     if (UseSHA1Intrinsics) {
7608       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7609       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7610     }
7611     if (UseSHA256Intrinsics) {
7612       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7613       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7614     }
7615     if (UseSHA512Intrinsics) {
7616       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7617       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7618     }
7619     if (UseSHA3Intrinsics) {
7620       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7621       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7622     }
7623 
7624     // generate Adler32 intrinsics code
7625     if (UseAdler32Intrinsics) {
7626       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7627     }
7628 
7629     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7630 
7631 #ifdef LINUX
7632 
7633     generate_atomic_entry_points();
7634 
7635 #endif // LINUX
7636 
7637     StubRoutines::aarch64::set_completed();
7638   }
7639 
7640  public:
7641   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7642     if (all) {
7643       generate_all();
7644     } else {
7645       generate_initial();
7646     }
7647   }
7648 }; // end class declaration
7649 
7650 #define UCM_TABLE_MAX_ENTRIES 8
7651 void StubGenerator_generate(CodeBuffer* code, bool all) {
7652   if (UnsafeCopyMemory::_table == NULL) {
7653     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7654   }
7655   StubGenerator g(code, all);
7656 }
7657 
7658 
7659 #ifdef LINUX
7660 
7661 // Define pointers to atomic stubs and initialize them to point to the
7662 // code in atomic_aarch64.S.
7663 
7664 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7665   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7666     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7667   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7668     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
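
     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;
     //
     // i.e. each invocation declares the default implementation from
     // atomic_aarch64.S and defines the corresponding stub pointer,
     // initially bound to that default.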
7669 
7670 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7671 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7672 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7673 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7674 DEFAULT_ATOMIC_OP(xchg, 4, )
7675 DEFAULT_ATOMIC_OP(xchg, 8, )
7676 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7677 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7678 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7679 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7680 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7681 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7682 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7683 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7684 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7685 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7686 
7687 #undef DEFAULT_ATOMIC_OP
7688 
7689 #endif // LINUX