1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
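// TIMES_OOP scales an index by the size of a heap oop: the shift amount is
// exact_log2(4) == 2 with compressed oops and exact_log2(8) == 3 otherwise,
// applied with sign extension (sxtw) so signed 32-bit indices work directly.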
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
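// For illustration, in a non-PRODUCT build a use such as
//   __ BIND(parameters_done);
// expands to
//   __ bind(parameters_done); __ block_comment("parameters_done" ":");
// so every bound label also leaves a readable marker in the disassembly.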
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
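
  // Illustrative use (assuming `counter` is some static int the VM exports):
  //   inc_counter_np(counter);
  // This compiles away in PRODUCT builds and otherwise loads, increments and
  // stores the counter word through rscratch1/rscratch2.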
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
 108   //
 109   // we save r30 (lr) as the return PC at the base of the frame and
 110   // link r29 (fp) below it as the frame pointer installing sp (r31)
 111   // into fp.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
 126   // we don't need to save r16-18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
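
  // A worked example of the layout above (wordSize == 8): the saved thread
  // slot sits at rfp + thread_off * wordSize == rfp - 8, the call wrapper at
  // rfp - 64, and sp_after_call == rfp + sp_after_call_off * wordSize ==
  // rfp - 208, which is where the stub parks sp while Java code runs.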
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     address aarch64_entry = __ pc();
 227 
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (u1)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // we do this here because the notify will already have been done
 299     // if we get to the next instruction via an exception
 300     //
 301     // n.b. adding this instruction here affects the calculation of
 302     // whether or not a routine returns to the call stub (used when
 303     // doing stack walks) since the normal test is to check the return
 304     // pc against the address saved below. so we may need to allow for
 305     // this extra instruction in the check.
 306 
 307     // save current address for use by exception handling code
 308 
 309     return_address = __ pc();
 310 
 311     // store result depending on type (everything that is not
 312     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 313     // n.b. this assumes Java returns an integral result in r0
 314     // and a floating result in j_farg0
 315     __ ldr(j_rarg2, result);
 316     Label is_long, is_float, is_double, exit;
 317     __ ldr(j_rarg1, result_type);
 318     __ cmp(j_rarg1, (u1)T_OBJECT);
 319     __ br(Assembler::EQ, is_long);
 320     __ cmp(j_rarg1, (u1)T_LONG);
 321     __ br(Assembler::EQ, is_long);
 322     __ cmp(j_rarg1, (u1)T_FLOAT);
 323     __ br(Assembler::EQ, is_float);
 324     __ cmp(j_rarg1, (u1)T_DOUBLE);
 325     __ br(Assembler::EQ, is_double);
 326 
 327     // handle T_INT case
 328     __ strw(r0, Address(j_rarg2));
 329 
 330     __ BIND(exit);
 331 
 332     // pop parameters
 333     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 334 
 335 #ifdef ASSERT
 336     // verify that threads correspond
 337     {
 338       Label L, S;
 339       __ ldr(rscratch1, thread);
 340       __ cmp(rthread, rscratch1);
 341       __ br(Assembler::NE, S);
 342       __ get_thread(rscratch1);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::EQ, L);
 345       __ BIND(S);
 346       __ stop("StubRoutines::call_stub: threads must correspond");
 347       __ BIND(L);
 348     }
 349 #endif
 350 
 351     // restore callee-save registers
 352     __ ldpd(v15, v14,  d15_save);
 353     __ ldpd(v13, v12,  d13_save);
 354     __ ldpd(v11, v10,  d11_save);
 355     __ ldpd(v9,  v8,   d9_save);
 356 
 357     __ ldp(r28, r27,   r28_save);
 358     __ ldp(r26, r25,   r26_save);
 359     __ ldp(r24, r23,   r24_save);
 360     __ ldp(r22, r21,   r22_save);
 361     __ ldp(r20, r19,   r20_save);
 362 
 363     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 364     __ ldrw(c_rarg2, result_type);
 365     __ ldr(c_rarg3,  method);
 366     __ ldp(c_rarg4, c_rarg5,  entry_point);
 367     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 368 
 369     // leave frame and return to caller
 370     __ leave();
 371     __ ret(lr);
 372 
 373     // handle return types different from T_INT
 374 
 375     __ BIND(is_long);
 376     __ str(r0, Address(j_rarg2, 0));
 377     __ br(Assembler::AL, exit);
 378 
 379     __ BIND(is_float);
 380     __ strs(j_farg0, Address(j_rarg2, 0));
 381     __ br(Assembler::AL, exit);
 382 
 383     __ BIND(is_double);
 384     __ strd(j_farg0, Address(j_rarg2, 0));
 385     __ br(Assembler::AL, exit);
 386 
 387     return start;
 388   }
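
  // For orientation only: the VM reaches the stub above through the CallStub
  // function pointer type declared in stubRoutines.hpp, roughly as
  //
  //   StubRoutines::call_stub()((address)&link, result_val_address,
  //                             result_type, method(), entry_point,
  //                             args->parameters(), args->size_of_parameters(),
  //                             CHECK);
  //
  // (see JavaCalls::call_helper in share/runtime/javaCalls.cpp, from which
  // this sketch is paraphrased); the eight arguments arrive in
  // c_rarg0..c_rarg7 exactly as documented before generate_call_stub.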
 389 
 390   // Return point for a Java call if there's an exception thrown in
 391   // Java code.  The exception is caught and transformed into a
 392   // pending exception stored in JavaThread that can be tested from
 393   // within the VM.
 394   //
 395   // Note: Usually the parameters are removed by the callee. In case
 396   // of an exception crossing an activation frame boundary, that is
 397   // not the case if the callee is compiled code => need to setup the
 398   // rsp.
 399   //
 400   // r0: exception oop
 401 
 402   address generate_catch_exception() {
 403     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 404     address start = __ pc();
 405 
 406     // same as in generate_call_stub():
 407     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 408     const Address thread        (rfp, thread_off         * wordSize);
 409 
 410 #ifdef ASSERT
 411     // verify that threads correspond
 412     {
 413       Label L, S;
 414       __ ldr(rscratch1, thread);
 415       __ cmp(rthread, rscratch1);
 416       __ br(Assembler::NE, S);
 417       __ get_thread(rscratch1);
 418       __ cmp(rthread, rscratch1);
 419       __ br(Assembler::EQ, L);
 420       __ bind(S);
 421       __ stop("StubRoutines::catch_exception: threads must correspond");
 422       __ bind(L);
 423     }
 424 #endif
 425 
 426     // set pending exception
 427     __ verify_oop(r0);
 428 
 429     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 430     __ mov(rscratch1, (address)__FILE__);
 431     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 432     __ movw(rscratch1, (int)__LINE__);
 433     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 434 
 435     // complete return to VM
 436     assert(StubRoutines::_call_stub_return_address != NULL,
 437            "_call_stub_return_address must have been generated before");
 438     __ b(StubRoutines::_call_stub_return_address);
 439 
 440     return start;
 441   }
 442 
 443   // Continuation point for runtime calls returning with a pending
 444   // exception.  The pending exception check happened in the runtime
 445   // or native call stub.  The pending exception in Thread is
 446   // converted into a Java-level exception.
 447   //
 448   // Contract with Java-level exception handlers:
 449   // r0: exception
 450   // r3: throwing pc
 451   //
 452   // NOTE: At entry of this stub, exception-pc must be in LR !!
 453 
 454   // NOTE: this is always used as a jump target within generated code
 455   // so it just needs to be generated code with no prolog
 456 
 457   address generate_forward_exception() {
 458     StubCodeMark mark(this, "StubRoutines", "forward exception");
 459     address start = __ pc();
 460 
 461     // Upon entry, LR points to the return address returning into
 462     // Java (interpreted or compiled) code; i.e., the return address
 463     // becomes the throwing pc.
 464     //
 465     // Arguments pushed before the runtime call are still on the stack
 466     // but the exception handler will reset the stack pointer ->
 467     // ignore them.  A potential result in registers can be ignored as
 468     // well.
 469 
 470 #ifdef ASSERT
 471     // make sure this code is only executed if there is a pending exception
 472     {
 473       Label L;
 474       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 475       __ cbnz(rscratch1, L);
 476       __ stop("StubRoutines::forward exception: no pending exception (1)");
 477       __ bind(L);
 478     }
 479 #endif
 480 
 481     // compute exception handler into r19
 482 
 483     // call the VM to find the handler address associated with the
 484     // caller address. pass thread in r0 and caller pc (ret address)
 485     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 486     // the stack.
 487     __ mov(c_rarg1, lr);
 488     // lr will be trashed by the VM call so we move it to R19
 489     // (callee-saved) because we also need to pass it to the handler
 490     // returned by this call.
 491     __ mov(r19, lr);
 492     BLOCK_COMMENT("call exception_handler_for_return_address");
 493     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 494                          SharedRuntime::exception_handler_for_return_address),
 495                     rthread, c_rarg1);
 496     // Reinitialize the ptrue predicate register, in case the external runtime
 497     // call clobbers ptrue reg, as we may return to SVE compiled code.
 498     __ reinitialize_ptrue();
 499 
 500     // we should not really care that lr is no longer the callee
 501     // address. we saved the value the handler needs in r19 so we can
 502     // just copy it to r3. however, the C2 handler will push its own
 503     // frame and then call into the VM, and the VM code asserts that
 504     // the PC for the frame above the handler belongs to a compiled
 505     // Java method. So, we restore lr here to satisfy that assert.
 506     __ mov(lr, r19);
 507     // setup r0 & r3 & clear pending exception
 508     __ mov(r3, r19);
 509     __ mov(r19, r0);
 510     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 511     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 512 
 513 #ifdef ASSERT
 514     // make sure exception is set
 515     {
 516       Label L;
 517       __ cbnz(r0, L);
 518       __ stop("StubRoutines::forward exception: no pending exception (2)");
 519       __ bind(L);
 520     }
 521 #endif
 522 
 523     // continue at exception handler
 524     // r0: exception
 525     // r3: throwing pc
 526     // r19: exception handler
 527     __ verify_oop(r0);
 528     __ br(r19);
 529 
 530     return start;
 531   }
 532 
 533   // Non-destructive plausibility checks for oops
 534   //
 535   // Arguments:
 536   //    r0: oop to verify
 537   //    rscratch1: error message
 538   //
 539   // Stack after saving c_rarg3:
 540   //    [tos + 0]: saved c_rarg3
 541   //    [tos + 1]: saved c_rarg2
 542   //    [tos + 2]: saved lr
 543   //    [tos + 3]: saved rscratch2
 544   //    [tos + 4]: saved r0
 545   //    [tos + 5]: saved rscratch1
 546   address generate_verify_oop() {
 547 
 548     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 549     address start = __ pc();
 550 
 551     Label exit, error;
 552 
 553     // save c_rarg2 and c_rarg3
 554     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 555 
 556     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 557     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ ldr(c_rarg3, Address(c_rarg2));
 559     __ add(c_rarg3, c_rarg3, 1);
 560     __ str(c_rarg3, Address(c_rarg2));
 561 
 562     // object is in r0
 563     // make sure object is 'reasonable'
 564     __ cbz(r0, exit); // if obj is NULL it is OK
 565 
 566 #if INCLUDE_ZGC
 567     if (UseZGC) {
 568       // Check if mask is good.
 569       // verifies that ZAddressBadMask & r0 == 0
 570       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 571       __ andr(c_rarg2, r0, c_rarg3);
 572       __ cbnz(c_rarg2, error);
 573     }
 574 #endif
 575 
 576     // Check if the oop is in the right area of memory
 577     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 578     __ andr(c_rarg2, r0, c_rarg3);
 579     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 580 
 581     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 582     // instruction here because the flags register is live.
 583     __ eor(c_rarg2, c_rarg2, c_rarg3);
 584     __ cbnz(c_rarg2, error);
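
    // In C terms the test above is (illustrative only):
    //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) goto error;
    // done with eor + cbnz rather than cmp/br so the condition flags, which
    // are live here, stay untouched.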
 585 
 586     // make sure klass is 'reasonable', i.e. not zero.
 587     __ load_klass(r0, r0);  // get klass
 588     __ cbz(r0, error);      // if klass is NULL it is broken
 589 
 590     // return if everything seems ok
 591     __ bind(exit);
 592 
 593     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 594     __ ret(lr);
 595 
 596     // handle errors
 597     __ bind(error);
 598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 599 
 600     __ push(RegSet::range(r0, r29), sp);
 601     // debug(char* msg, int64_t pc, int64_t regs[])
 602     __ mov(c_rarg0, rscratch1);      // pass address of error message
 603     __ mov(c_rarg1, lr);             // pass return address
 604     __ mov(c_rarg2, sp);             // pass address of regs on stack
 605 #ifndef PRODUCT
 606     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 607 #endif
 608     BLOCK_COMMENT("call MacroAssembler::debug");
 609     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 610     __ blr(rscratch1);
 611     __ hlt(0);
 612 
 613     return start;
 614   }
 615 
 616   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 617 
 618   // Generate indices for iota vector.
 619   address generate_iota_indices(const char *stub_name) {
 620     __ align(CodeEntryAlignment);
 621     StubCodeMark mark(this, "StubRoutines", stub_name);
 622     address start = __ pc();
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
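    // On little-endian AArch64 these two 64-bit words lay down the bytes
    // 0x00, 0x01, ..., 0x0F in memory, i.e. a 16-byte vector whose lane i
    // holds the value i -- the "iota" index pattern this stub provides.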
 625     return start;
 626   }
 627 
 628   // The inner part of zero_words().  This is the bulk operation,
 629   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 630   // caller is responsible for zeroing the last few words.
 631   //
 632   // Inputs:
 633   // r10: the HeapWord-aligned base address of an array to zero.
 634   // r11: the count in HeapWords, r11 > 0.
 635   //
 636   // Returns r10 and r11, adjusted for the caller to clear.
 637   // r10: the base address of the tail of words left to clear.
 638   // r11: the number of words in the tail.
 639   //      r11 < MacroAssembler::zero_words_block_size.
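  //
  // Illustrative trace (ignoring the DC ZVA path, and assuming
  // MacroAssembler::zero_words_block_size == 8): entering with r11 == 21,
  // the unrolled stp loop below clears 16 words and returns with r11 == 5
  // and r10 advanced past the cleared region, leaving the caller to store
  // the last five words itself.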
 640 
 641   address generate_zero_blocks() {
 642     Label done;
 643     Label base_aligned;
 644 
 645     Register base = r10, cnt = r11;
 646 
 647     __ align(CodeEntryAlignment);
 648     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 649     address start = __ pc();
 650 
 651     if (UseBlockZeroing) {
 652       int zva_length = VM_Version::zva_length();
 653 
 654       // Ensure ZVA length can be divided by 16. This is required by
 655       // the subsequent operations.
 656       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 657 
 658       __ tbz(base, 3, base_aligned);
 659       __ str(zr, Address(__ post(base, 8)));
 660       __ sub(cnt, cnt, 1);
 661       __ bind(base_aligned);
 662 
 663       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 664       // alignment.
 665       Label small;
 666       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 667       __ subs(rscratch1, cnt, low_limit >> 3);
 668       __ br(Assembler::LT, small);
 669       __ zero_dcache_blocks(base, cnt);
 670       __ bind(small);
 671     }
 672 
 673     {
 674       // Number of stp instructions we'll unroll
 675       const int unroll =
 676         MacroAssembler::zero_words_block_size / 2;
 677       // Clear the remaining blocks.
 678       Label loop;
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::LT, done);
 681       __ bind(loop);
 682       for (int i = 0; i < unroll; i++)
 683         __ stp(zr, zr, __ post(base, 16));
 684       __ subs(cnt, cnt, unroll * 2);
 685       __ br(Assembler::GE, loop);
 686       __ bind(done);
 687       __ add(cnt, cnt, unroll * 2);
 688     }
 689 
 690     __ ret(lr);
 691 
 692     return start;
 693   }
 694 
 695 
 696   typedef enum {
 697     copy_forwards = 1,
 698     copy_backwards = -1
 699   } copy_direction;
 700 
 701   // Bulk copy of blocks of 8 words.
 702   //
 703   // count is a count of words.
 704   //
 705   // Precondition: count >= 8
 706   //
 707   // Postconditions:
 708   //
 709   // The least significant bit of count contains the remaining count
 710   // of words to copy.  The rest of count is trash.
 711   //
 712   // s and d are adjusted to point to the remaining words to copy
 713   //
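  // Illustrative trace: called with count == 13, the main loop below copies
  // 8 words, the four-word sub-block test (bit 2 of count) copies 4 more,
  // the two-word test (bit 1) copies none, and the caller is left to move
  // the final word signalled by bit 0.
  //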
 714   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 715                            copy_direction direction) {
 716     int unit = wordSize * direction;
 717     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 718 
 719     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 720       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 721     const Register stride = r13;
 722 
 723     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 724     assert_different_registers(s, d, count, rscratch1);
 725 
 726     Label again, drain;
 727     const char *stub_name;
 728     if (direction == copy_forwards)
 729       stub_name = "forward_copy_longs";
 730     else
 731       stub_name = "backward_copy_longs";
 732 
 733     __ align(CodeEntryAlignment);
 734 
 735     StubCodeMark mark(this, "StubRoutines", stub_name);
 736 
 737     __ bind(start);
 738 
 739     Label unaligned_copy_long;
 740     if (AvoidUnalignedAccesses) {
 741       __ tbnz(d, 3, unaligned_copy_long);
 742     }
 743 
 744     if (direction == copy_forwards) {
 745       __ sub(s, s, bias);
 746       __ sub(d, d, bias);
 747     }
 748 
 749 #ifdef ASSERT
 750     // Make sure we are never given < 8 words
 751     {
 752       Label L;
 753       __ cmp(count, (u1)8);
 754       __ br(Assembler::GE, L);
 755       __ stop("generate_copy_longs called with < 8 words");
 756       __ bind(L);
 757     }
 758 #endif
 759 
 760     // Fill 8 registers
 761     if (UseSIMDForMemoryOps) {
 762       __ ldpq(v0, v1, Address(s, 4 * unit));
 763       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 764     } else {
 765       __ ldp(t0, t1, Address(s, 2 * unit));
 766       __ ldp(t2, t3, Address(s, 4 * unit));
 767       __ ldp(t4, t5, Address(s, 6 * unit));
 768       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 769     }
 770 
 771     __ subs(count, count, 16);
 772     __ br(Assembler::LO, drain);
 773 
 774     int prefetch = PrefetchCopyIntervalInBytes;
 775     bool use_stride = false;
 776     if (direction == copy_backwards) {
 777        use_stride = prefetch > 256;
 778        prefetch = -prefetch;
 779        if (use_stride) __ mov(stride, prefetch);
 780     }
 781 
 782     __ bind(again);
 783 
 784     if (PrefetchCopyIntervalInBytes > 0)
 785       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 786 
 787     if (UseSIMDForMemoryOps) {
 788       __ stpq(v0, v1, Address(d, 4 * unit));
 789       __ ldpq(v0, v1, Address(s, 4 * unit));
 790       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 791       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ ldp(t0, t1, Address(s, 2 * unit));
 795       __ stp(t2, t3, Address(d, 4 * unit));
 796       __ ldp(t2, t3, Address(s, 4 * unit));
 797       __ stp(t4, t5, Address(d, 6 * unit));
 798       __ ldp(t4, t5, Address(s, 6 * unit));
 799       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 800       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 801     }
 802 
 803     __ subs(count, count, 8);
 804     __ br(Assembler::HS, again);
 805 
 806     // Drain
 807     __ bind(drain);
 808     if (UseSIMDForMemoryOps) {
 809       __ stpq(v0, v1, Address(d, 4 * unit));
 810       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ stp(t2, t3, Address(d, 4 * unit));
 814       __ stp(t4, t5, Address(d, 6 * unit));
 815       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 816     }
 817 
 818     {
 819       Label L1, L2;
 820       __ tbz(count, exact_log2(4), L1);
 821       if (UseSIMDForMemoryOps) {
 822         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 823         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 824       } else {
 825         __ ldp(t0, t1, Address(s, 2 * unit));
 826         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 827         __ stp(t0, t1, Address(d, 2 * unit));
 828         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 829       }
 830       __ bind(L1);
 831 
 832       if (direction == copy_forwards) {
 833         __ add(s, s, bias);
 834         __ add(d, d, bias);
 835       }
 836 
 837       __ tbz(count, 1, L2);
 838       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 839       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 840       __ bind(L2);
 841     }
 842 
 843     __ ret(lr);
 844 
 845     if (AvoidUnalignedAccesses) {
 846       Label drain, again;
 847       // Register order for storing. Order is different for backward copy.
 848 
 849       __ bind(unaligned_copy_long);
 850 
 851       // source address is even aligned, target odd aligned
 852       //
 853       // when forward copying word pairs we read long pairs at offsets
 854       // {0, 2, 4, 6} (in long words). when backwards copying we read
 855       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 856       // address by -2 in the forwards case so we can compute the
 857       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 858       // or -1.
 859       //
 860       // when forward copying we need to store 1 word, 3 pairs and
 861       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 862       // zero offset we adjust the destination by -1 which means we
 863       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 864       //
 865       // When backwards copying we need to store 1 word, 3 pairs and
 866       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 867       // offsets {1, 3, 5, 7, 8} * unit.
 868 
 869       if (direction == copy_forwards) {
 870         __ sub(s, s, 16);
 871         __ sub(d, d, 8);
 872       }
 873 
 874       // Fill 8 registers
 875       //
 876       // for forwards copy s was offset by -16 from the original input
 877       // value of s so the register contents are at these offsets
 878       // relative to the 64 byte block addressed by that original input
 879       // and so on for each successive 64 byte block when s is updated
 880       //
 881       // t0 at offset 0,  t1 at offset 8
 882       // t2 at offset 16, t3 at offset 24
 883       // t4 at offset 32, t5 at offset 40
 884       // t6 at offset 48, t7 at offset 56
 885 
 886       // for backwards copy s was not offset so the register contents
 887       // are at these offsets into the preceding 64 byte block
 888       // relative to that original input and so on for each successive
 889       // preceding 64 byte block when s is updated. this explains the
 890       // slightly counter-intuitive looking pattern of register usage
 891       // in the stp instructions for backwards copy.
 892       //
 893       // t0 at offset -16, t1 at offset -8
 894       // t2 at offset -32, t3 at offset -24
 895       // t4 at offset -48, t5 at offset -40
 896       // t6 at offset -64, t7 at offset -56
 897 
 898       __ ldp(t0, t1, Address(s, 2 * unit));
 899       __ ldp(t2, t3, Address(s, 4 * unit));
 900       __ ldp(t4, t5, Address(s, 6 * unit));
 901       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 902 
 903       __ subs(count, count, 16);
 904       __ br(Assembler::LO, drain);
 905 
 906       int prefetch = PrefetchCopyIntervalInBytes;
 907       bool use_stride = false;
 908       if (direction == copy_backwards) {
 909          use_stride = prefetch > 256;
 910          prefetch = -prefetch;
 911          if (use_stride) __ mov(stride, prefetch);
 912       }
 913 
 914       __ bind(again);
 915 
 916       if (PrefetchCopyIntervalInBytes > 0)
 917         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 918 
 919       if (direction == copy_forwards) {
 920        // allowing for the offset of -8 the store instructions place
 921        // registers into the target 64 byte block at the following
 922        // offsets
 923        //
 924        // t0 at offset 0
 925        // t1 at offset 8,  t2 at offset 16
 926        // t3 at offset 24, t4 at offset 32
 927        // t5 at offset 40, t6 at offset 48
 928        // t7 at offset 56
 929 
 930         __ str(t0, Address(d, 1 * unit));
 931         __ stp(t1, t2, Address(d, 2 * unit));
 932         __ ldp(t0, t1, Address(s, 2 * unit));
 933         __ stp(t3, t4, Address(d, 4 * unit));
 934         __ ldp(t2, t3, Address(s, 4 * unit));
 935         __ stp(t5, t6, Address(d, 6 * unit));
 936         __ ldp(t4, t5, Address(s, 6 * unit));
 937         __ str(t7, Address(__ pre(d, 8 * unit)));
 938         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 939       } else {
 940        // d was not offset when we started so the registers are
 941        // written into the 64 byte block preceding d with the following
 942        // offsets
 943        //
 944        // t1 at offset -8
 945        // t3 at offset -24, t0 at offset -16
 946        // t5 at offset -40, t2 at offset -32
 947        // t7 at offset -56, t4 at offset -48
 948        //                   t6 at offset -64
 949        //
 950        // note that this matches the offsets previously noted for the
 951        // loads
 952 
 953         __ str(t1, Address(d, 1 * unit));
 954         __ stp(t3, t0, Address(d, 3 * unit));
 955         __ ldp(t0, t1, Address(s, 2 * unit));
 956         __ stp(t5, t2, Address(d, 5 * unit));
 957         __ ldp(t2, t3, Address(s, 4 * unit));
 958         __ stp(t7, t4, Address(d, 7 * unit));
 959         __ ldp(t4, t5, Address(s, 6 * unit));
 960         __ str(t6, Address(__ pre(d, 8 * unit)));
 961         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 962       }
 963 
 964       __ subs(count, count, 8);
 965       __ br(Assembler::HS, again);
 966 
 967       // Drain
 968       //
 969       // this uses the same pattern of offsets and register arguments
 970       // as above
 971       __ bind(drain);
 972       if (direction == copy_forwards) {
 973         __ str(t0, Address(d, 1 * unit));
 974         __ stp(t1, t2, Address(d, 2 * unit));
 975         __ stp(t3, t4, Address(d, 4 * unit));
 976         __ stp(t5, t6, Address(d, 6 * unit));
 977         __ str(t7, Address(__ pre(d, 8 * unit)));
 978       } else {
 979         __ str(t1, Address(d, 1 * unit));
 980         __ stp(t3, t0, Address(d, 3 * unit));
 981         __ stp(t5, t2, Address(d, 5 * unit));
 982         __ stp(t7, t4, Address(d, 7 * unit));
 983         __ str(t6, Address(__ pre(d, 8 * unit)));
 984       }
 985       // now we need to copy any remaining part block which may
 986       // include a 4 word subblock and/or a 2 word subblock.
 987       // bits 2 and 1 in the count are the tell-tale for whether we
 988       // have each such subblock
 989       {
 990         Label L1, L2;
 991         __ tbz(count, exact_log2(4), L1);
 992        // this is the same as above but copying only 4 longs hence
 993        // with only one intervening stp between the str instructions
 994        // but note that the offsets and registers still follow the
 995        // same pattern
 996         __ ldp(t0, t1, Address(s, 2 * unit));
 997         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 998         if (direction == copy_forwards) {
 999           __ str(t0, Address(d, 1 * unit));
1000           __ stp(t1, t2, Address(d, 2 * unit));
1001           __ str(t3, Address(__ pre(d, 4 * unit)));
1002         } else {
1003           __ str(t1, Address(d, 1 * unit));
1004           __ stp(t3, t0, Address(d, 3 * unit));
1005           __ str(t2, Address(__ pre(d, 4 * unit)));
1006         }
1007         __ bind(L1);
1008 
1009         __ tbz(count, 1, L2);
1010        // this is the same as above but copying only 2 longs hence
1011        // there is no intervening stp between the str instructions
1012        // but note that the offset and register patterns are still
1013        // the same
1014         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1015         if (direction == copy_forwards) {
1016           __ str(t0, Address(d, 1 * unit));
1017           __ str(t1, Address(__ pre(d, 2 * unit)));
1018         } else {
1019           __ str(t1, Address(d, 1 * unit));
1020           __ str(t0, Address(__ pre(d, 2 * unit)));
1021         }
1022         __ bind(L2);
1023 
1024        // for forwards copy we need to re-adjust the offsets we
1025        // applied so that s and d follow the last words written
1026 
1027        if (direction == copy_forwards) {
1028          __ add(s, s, 16);
1029          __ add(d, d, 8);
1030        }
1031 
1032       }
1033 
1034       __ ret(lr);
1035       }
1036   }
1037 
1038   // Small copy: less than 16 bytes.
1039   //
1040   // NB: Ignores all of the bits of count which represent more than 15
1041   // bytes, so a caller doesn't have to mask them.
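  //
  // Illustrative trace for a byte copy (step == 1) with count == 13: the
  // tests below move 8 bytes (bit 3 of count), then 4 bytes (bit 2), skip
  // the 2-byte move (bit 1 is clear) and finish with the single byte
  // signalled by bit 0 -- 13 bytes in at most four load/store pairs.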
1042 
1043   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1044     bool is_backwards = step < 0;
1045     size_t granularity = uabs(step);
1046     int direction = is_backwards ? -1 : 1;
1047     int unit = wordSize * direction;
1048 
1049     Label Lword, Lint, Lshort, Lbyte;
1050 
1051     assert(granularity
1052            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1053 
1054     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1055 
1056     // ??? I don't know if this bit-test-and-branch is the right thing
1057     // to do.  It does a lot of jumping, resulting in several
1058     // mispredicted branches.  It might make more sense to do this
1059     // with something like Duff's device with a single computed branch.
1060 
1061     __ tbz(count, 3 - exact_log2(granularity), Lword);
1062     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1063     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1064     __ bind(Lword);
1065 
1066     if (granularity <= sizeof (jint)) {
1067       __ tbz(count, 2 - exact_log2(granularity), Lint);
1068       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1069       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1070       __ bind(Lint);
1071     }
1072 
1073     if (granularity <= sizeof (jshort)) {
1074       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1075       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1076       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1077       __ bind(Lshort);
1078     }
1079 
1080     if (granularity <= sizeof (jbyte)) {
1081       __ tbz(count, 0, Lbyte);
1082       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1083       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1084       __ bind(Lbyte);
1085     }
1086   }
1087 
1088   Label copy_f, copy_b;
1089 
1090   // All-singing all-dancing memory copy.
1091   //
1092   // Copy count units of memory from s to d.  The size of a unit is
1093   // step, which can be positive or negative depending on the direction
1094   // of copy.  If is_aligned is false, we align the source address.
1095   //
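  // For example, a forward byte copy (step == 1, is_aligned == false) of 200
  // elements takes the copy_big path below: copy_memory_small first aligns
  // the source to a 16-byte boundary, the bulk of the data then goes through
  // the copy_f blob eight words at a time, and a final copy_memory_small
  // call finishes the remaining tail bytes.
  //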
1096 
1097   void copy_memory(bool is_aligned, Register s, Register d,
1098                    Register count, Register tmp, int step) {
1099     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1100     bool is_backwards = step < 0;
1101     unsigned int granularity = uabs(step);
1102     const Register t0 = r3, t1 = r4;
1103 
1104     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't matter because we always
1105     // load all the data before writing anything
1106     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1107     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1108     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1109     const Register send = r17, dend = r16;
1110 
1111     if (PrefetchCopyIntervalInBytes > 0)
1112       __ prfm(Address(s, 0), PLDL1KEEP);
1113     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1114     __ br(Assembler::HI, copy_big);
1115 
1116     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1117     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1118 
1119     __ cmp(count, u1(16/granularity));
1120     __ br(Assembler::LS, copy16);
1121 
1122     __ cmp(count, u1(64/granularity));
1123     __ br(Assembler::HI, copy80);
1124 
1125     __ cmp(count, u1(32/granularity));
1126     __ br(Assembler::LS, copy32);
1127 
1128     // 33..64 bytes
1129     if (UseSIMDForMemoryOps) {
1130       __ ldpq(v0, v1, Address(s, 0));
1131       __ ldpq(v2, v3, Address(send, -32));
1132       __ stpq(v0, v1, Address(d, 0));
1133       __ stpq(v2, v3, Address(dend, -32));
1134     } else {
1135       __ ldp(t0, t1, Address(s, 0));
1136       __ ldp(t2, t3, Address(s, 16));
1137       __ ldp(t4, t5, Address(send, -32));
1138       __ ldp(t6, t7, Address(send, -16));
1139 
1140       __ stp(t0, t1, Address(d, 0));
1141       __ stp(t2, t3, Address(d, 16));
1142       __ stp(t4, t5, Address(dend, -32));
1143       __ stp(t6, t7, Address(dend, -16));
1144     }
1145     __ b(finish);
1146 
1147     // 17..32 bytes
1148     __ bind(copy32);
1149     __ ldp(t0, t1, Address(s, 0));
1150     __ ldp(t2, t3, Address(send, -16));
1151     __ stp(t0, t1, Address(d, 0));
1152     __ stp(t2, t3, Address(dend, -16));
1153     __ b(finish);
1154 
1155     // 65..80/96 bytes
1156     // (96 bytes if SIMD because we do 32 bytes per instruction)
1157     __ bind(copy80);
1158     if (UseSIMDForMemoryOps) {
1159       __ ldpq(v0, v1, Address(s, 0));
1160       __ ldpq(v2, v3, Address(s, 32));
1161       // Unaligned pointers can be an issue for copying.
1162       // The issue has more chances to happen when granularity of data is
1163       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1164       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1165       // The most performance drop has been seen for the range 65-80 bytes.
1166       // For such cases using the pair of ldp/stp instead of the third pair of
1167       // ldpq/stpq fixes the performance issue.
1168       if (granularity < sizeof (jint)) {
1169         Label copy96;
1170         __ cmp(count, u1(80/granularity));
1171         __ br(Assembler::HI, copy96);
1172         __ ldp(t0, t1, Address(send, -16));
1173 
1174         __ stpq(v0, v1, Address(d, 0));
1175         __ stpq(v2, v3, Address(d, 32));
1176         __ stp(t0, t1, Address(dend, -16));
1177         __ b(finish);
1178 
1179         __ bind(copy96);
1180       }
1181       __ ldpq(v4, v5, Address(send, -32));
1182 
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(d, 32));
1185       __ stpq(v4, v5, Address(dend, -32));
1186     } else {
1187       __ ldp(t0, t1, Address(s, 0));
1188       __ ldp(t2, t3, Address(s, 16));
1189       __ ldp(t4, t5, Address(s, 32));
1190       __ ldp(t6, t7, Address(s, 48));
1191       __ ldp(t8, t9, Address(send, -16));
1192 
1193       __ stp(t0, t1, Address(d, 0));
1194       __ stp(t2, t3, Address(d, 16));
1195       __ stp(t4, t5, Address(d, 32));
1196       __ stp(t6, t7, Address(d, 48));
1197       __ stp(t8, t9, Address(dend, -16));
1198     }
1199     __ b(finish);
1200 
1201     // 0..16 bytes
1202     __ bind(copy16);
1203     __ cmp(count, u1(8/granularity));
1204     __ br(Assembler::LO, copy8);
1205 
1206     // 8..16 bytes
1207     __ ldr(t0, Address(s, 0));
1208     __ ldr(t1, Address(send, -8));
1209     __ str(t0, Address(d, 0));
1210     __ str(t1, Address(dend, -8));
1211     __ b(finish);
1212 
1213     if (granularity < 8) {
1214       // 4..7 bytes
1215       __ bind(copy8);
1216       __ tbz(count, 2 - exact_log2(granularity), copy4);
1217       __ ldrw(t0, Address(s, 0));
1218       __ ldrw(t1, Address(send, -4));
1219       __ strw(t0, Address(d, 0));
1220       __ strw(t1, Address(dend, -4));
1221       __ b(finish);
1222       if (granularity < 4) {
1223         // 0..3 bytes
1224         __ bind(copy4);
1225         __ cbz(count, finish); // get rid of 0 case
1226         if (granularity == 2) {
1227           __ ldrh(t0, Address(s, 0));
1228           __ strh(t0, Address(d, 0));
1229         } else { // granularity == 1
1230           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1231           // the first and last byte.
1232           // Handle the 3 byte case by loading and storing base + count/2
1233           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1234           // This does mean in the 1 byte case we load/store the same
1235           // byte 3 times.
1236           __ lsr(count, count, 1);
1237           __ ldrb(t0, Address(s, 0));
1238           __ ldrb(t1, Address(send, -1));
1239           __ ldrb(t2, Address(s, count));
1240           __ strb(t0, Address(d, 0));
1241           __ strb(t1, Address(dend, -1));
1242           __ strb(t2, Address(d, count));
1243         }
1244         __ b(finish);
1245       }
1246     }
1247 
1248     __ bind(copy_big);
1249     if (is_backwards) {
1250       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1251       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1252     }
1253 
1254     // Now that we've got the small case out of the way we can align the
1255     // source address on a 2-word boundary.
1256 
1257     Label aligned;
1258 
1259     if (is_aligned) {
1260       // We may have to adjust by 1 word to get s 2-word-aligned.
1261       __ tbz(s, exact_log2(wordSize), aligned);
1262       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1263       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1264       __ sub(count, count, wordSize/granularity);
1265     } else {
1266       if (is_backwards) {
1267         __ andr(rscratch2, s, 2 * wordSize - 1);
1268       } else {
1269         __ neg(rscratch2, s);
1270         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1271       }
1272       // rscratch2 is the byte adjustment needed to align s.
1273       __ cbz(rscratch2, aligned);
1274       int shift = exact_log2(granularity);
1275       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1276       __ sub(count, count, rscratch2);
1277 
1278 #if 0
1279       // ?? This code is only correct for a disjoint copy.  It may or
1280       // may not make sense to use it in that case.
1281 
1282       // Copy the first pair; s and d may not be aligned.
1283       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1284       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1285 
1286       // Align s and d, adjust count
1287       if (is_backwards) {
1288         __ sub(s, s, rscratch2);
1289         __ sub(d, d, rscratch2);
1290       } else {
1291         __ add(s, s, rscratch2);
1292         __ add(d, d, rscratch2);
1293       }
1294 #else
1295       copy_memory_small(s, d, rscratch2, rscratch1, step);
1296 #endif
1297     }
1298 
1299     __ bind(aligned);
1300 
1301     // s is now 2-word-aligned.
1302 
1303     // We have a count of units and some trailing bytes.  Adjust the
1304     // count and do a bulk copy of words.
1305     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1306     if (direction == copy_forwards)
1307       __ bl(copy_f);
1308     else
1309       __ bl(copy_b);
1310 
1311     // And the tail.
1312     copy_memory_small(s, d, count, tmp, step);
1313 
1314     if (granularity >= 8) __ bind(copy8);
1315     if (granularity >= 4) __ bind(copy4);
1316     __ bind(finish);
1317   }
1318 
1319 
1320   void clobber_registers() {
1321 #ifdef ASSERT
1322     RegSet clobbered
1323       = MacroAssembler::call_clobbered_registers() - rscratch1;
1324     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1325     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
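    // rscratch1 now holds the 64-bit poison pattern 0xdeadbeefdeadbeef,
    // which is copied into every call-clobbered register below so stale
    // values are easy to spot in a debugger.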
1326     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1327       __ mov(*it, rscratch1);
1328     }
1329 #endif
1330 
1331   }
1332 
1333   // Scan over array at a for count oops, verifying each one.
1334   // Preserves a and count, clobbers rscratch1 and rscratch2.
1335   void verify_oop_array (int size, Register a, Register count, Register temp) {
1336     Label loop, end;
1337     __ mov(rscratch1, a);
1338     __ mov(rscratch2, zr);
1339     __ bind(loop);
1340     __ cmp(rscratch2, count);
1341     __ br(Assembler::HS, end);
1342     if (size == wordSize) {
1343       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1344       __ verify_oop(temp);
1345     } else {
1346       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1347       __ decode_heap_oop(temp); // calls verify_oop
1348     }
1349     __ add(rscratch2, rscratch2, 1);
1350     __ b(loop);
1351     __ bind(end);
1352   }
1353 
1354   // Arguments:
1355   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1356   //             ignored
1357   //   is_oop  - true => oop array, so generate store check code
1358   //   name    - stub name string
1359   //
1360   // Inputs:
1361   //   c_rarg0   - source array address
1362   //   c_rarg1   - destination array address
1363   //   c_rarg2   - element count, treated as ssize_t, can be zero
1364   //
1365   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1366   // the hardware handle it.  The two dwords within qwords that span
1367   // cache line boundaries will still be loaded and stored atomically.
1368   //
1369   // Side Effects:
1370   //   disjoint_int_copy_entry is set to the no-overlap entry point
1371   //   used by generate_conjoint_int_oop_copy().
1372   //
1373   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1374                                   const char *name, bool dest_uninitialized = false) {
1375     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1376     RegSet saved_reg = RegSet::of(s, d, count);
1377     __ align(CodeEntryAlignment);
1378     StubCodeMark mark(this, "StubRoutines", name);
1379     address start = __ pc();
1380     __ enter();
1381 
1382     if (entry != NULL) {
1383       *entry = __ pc();
1384       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1385       BLOCK_COMMENT("Entry:");
1386     }
1387 
1388     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1389     if (dest_uninitialized) {
1390       decorators |= IS_DEST_UNINITIALIZED;
1391     }
1392     if (aligned) {
1393       decorators |= ARRAYCOPY_ALIGNED;
1394     }
1395 
1396     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1397     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1398 
1399     if (is_oop) {
1400       // save regs before copy_memory
1401       __ push(RegSet::of(d, count), sp);
1402     }
1403     {
1404       // UnsafeCopyMemory page error: continue after ucm
1405       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1406       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1407       copy_memory(aligned, s, d, count, rscratch1, size);
1408     }
1409 
1410     if (is_oop) {
1411       __ pop(RegSet::of(d, count), sp);
1412       if (VerifyOops)
1413         verify_oop_array(size, d, count, r16);
1414     }
1415 
1416     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1417 
1418     __ leave();
1419     __ mov(r0, zr); // return 0
1420     __ ret(lr);
1421     return start;
1422   }
1423 
1424   // Arguments:
1425   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1426   //             ignored
1427   //   is_oop  - true => oop array, so generate store check code
1428   //   name    - stub name string
1429   //
1430   // Inputs:
1431   //   c_rarg0   - source array address
1432   //   c_rarg1   - destination array address
1433   //   c_rarg2   - element count, treated as ssize_t, can be zero
1434   //
1435   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1436   // the hardware handle it.  The two dwords within qwords that span
1437   // cache line boundaries will still be loaded and stored atomically.
1438   //
1439   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1440                                  address *entry, const char *name,
1441                                  bool dest_uninitialized = false) {
1442     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1443     RegSet saved_regs = RegSet::of(s, d, count);
1444     StubCodeMark mark(this, "StubRoutines", name);
1445     address start = __ pc();
1446     __ enter();
1447 
1448     if (entry != NULL) {
1449       *entry = __ pc();
1450       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1451       BLOCK_COMMENT("Entry:");
1452     }
1453 
1454     // use fwd copy when (d-s) above_equal (count*size)
1455     __ sub(rscratch1, d, s);
1456     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1457     __ br(Assembler::HS, nooverlap_target);
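
    // Worked example: with size == 4, s == 0x1000, d == 0x1010 and
    // count == 10, d - s == 16 is below count * size == 40, so the regions
    // overlap and we fall through to the backward copy.  If d were below s,
    // d - s would wrap to a huge unsigned value and the HS branch would
    // safely take the forward (disjoint) path instead.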
1458 
1459     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1460     if (dest_uninitialized) {
1461       decorators |= IS_DEST_UNINITIALIZED;
1462     }
1463     if (aligned) {
1464       decorators |= ARRAYCOPY_ALIGNED;
1465     }
1466 
1467     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1468     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1469 
1470     if (is_oop) {
1471       // save regs before copy_memory
1472       __ push(RegSet::of(d, count), sp);
1473     }
1474     {
1475       // UnsafeCopyMemory page error: continue after ucm
1476       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1477       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1478       copy_memory(aligned, s, d, count, rscratch1, -size);
1479     }
1480     if (is_oop) {
1481       __ pop(RegSet::of(d, count), sp);
1482       if (VerifyOops)
1483         verify_oop_array(size, d, count, r16);
1484     }
1485     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1486     __ leave();
1487     __ mov(r0, zr); // return 0
1488     __ ret(lr);
1489     return start;
1490   }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1507   // Side Effects:
1515   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_byte_copy().
1517   //
1518   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1534   // we let the hardware handle it.  The one to eight bytes within words,
1535   // dwords or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1539                                       address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_short_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_short_copy().
1562   //
1563   address generate_disjoint_short_copy(bool aligned,
1564                                        address* entry, const char *name) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1580   // let the hardware handle it.  The two or four words within dwords
1581   // or qwords that span cache line boundaries will still be loaded
1582   // and stored atomically.
1583   //
1584   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1585                                        address *entry, const char *name) {
1586     const bool not_oop = false;
1587     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1588   }
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1601   // the hardware handle it.  The two dwords within qwords that span
1602   // cache line boundaries will still be loaded and stored atomically.
1603   //
1604   // Side Effects:
1605   //   disjoint_int_copy_entry is set to the no-overlap entry point
1606   //   used by generate_conjoint_int_copy().
1607   //
1608   address generate_disjoint_int_copy(bool aligned, address *entry,
1609                                          const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as ssize_t, can be zero
1623   //
1624   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1625   // the hardware handle it.  The two dwords within qwords that span
1626   // cache line boundaries will still be loaded and stored atomically.
1627   //
1628   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1629                                      address *entry, const char *name,
1630                                      bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1633   }
1634 
1635 
1636   // Arguments:
1637   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638   //             ignored
1639   //   name    - stub name string
1640   //
1641   // Inputs:
1642   //   c_rarg0   - source array address
1643   //   c_rarg1   - destination array address
1644   //   c_rarg2   - element count, treated as size_t, can be zero
1645   //
1646   // Side Effects:
1647   //   disjoint_long_copy_entry is set to the no-overlap entry point
1648   //   used by generate_conjoint_long_copy().
1649   //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651                                           const char *name, bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654   }
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   address generate_conjoint_long_copy(bool aligned,
1667                                       address nooverlap_target, address *entry,
1668                                       const char *name, bool dest_uninitialized = false) {
1669     const bool not_oop = false;
1670     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671   }
1672 
1673   // Arguments:
1674   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675   //             ignored
1676   //   name    - stub name string
1677   //
1678   // Inputs:
1679   //   c_rarg0   - source array address
1680   //   c_rarg1   - destination array address
1681   //   c_rarg2   - element count, treated as size_t, can be zero
1682   //
1683   // Side Effects:
1684   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1685   //   used by generate_conjoint_oop_copy().
1686   //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688                                      const char *name, bool dest_uninitialized) {
1689     const bool is_oop = true;
1690     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   address generate_conjoint_oop_copy(bool aligned,
1705                                      address nooverlap_target, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710                                   name, dest_uninitialized);
1711   }
1712 
1713 
1714   // Helper for generating a dynamic type check.
1715   // Smashes rscratch1, rscratch2.
1716   void generate_type_check(Register sub_klass,
1717                            Register super_check_offset,
1718                            Register super_klass,
1719                            Label& L_success) {
1720     assert_different_registers(sub_klass, super_check_offset, super_klass);
1721 
1722     BLOCK_COMMENT("type_check:");
1723 
1724     Label L_miss;
1725 
1726     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1727                                      super_check_offset);
1728     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1729 
1730     // Fall through on failure!
1731     __ BIND(L_miss);
1732   }
1733 
1734   //
1735   //  Generate checkcasting array copy stub
1736   //
1737   //  Input:
1738   //    c_rarg0   - source array address
1739   //    c_rarg1   - destination array address
1740   //    c_rarg2   - element count, treated as ssize_t, can be zero
1741   //    c_rarg3   - size_t ckoff (super_check_offset)
1742   //    c_rarg4   - oop ckval (super_klass)
1743   //
1744   //  Output:
1745   //    r0 ==  0  -  success
1746   //    r0 == -1^K - failure, where K is partial transfer count
1747   //
1748   address generate_checkcast_copy(const char *name, address *entry,
1749                                   bool dest_uninitialized = false) {
1750 
1751     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752 
1753     // Input registers (after setup_arg_regs)
1754     const Register from        = c_rarg0;   // source array address
1755     const Register to          = c_rarg1;   // destination array address
1756     const Register count       = c_rarg2;   // elements count
1757     const Register ckoff       = c_rarg3;   // super_check_offset
1758     const Register ckval       = c_rarg4;   // super_klass
1759 
1760     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761     RegSet wb_post_saved_regs = RegSet::of(count);
1762 
1763     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1764     const Register copied_oop  = r22;       // actual oop copied
1765     const Register count_save  = r21;       // orig elements count
1766     const Register start_to    = r20;       // destination array start address
1767     const Register r19_klass   = r19;       // oop._klass
1768 
1769     //---------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the two arrays are subtypes of Object[] but the
1772     // destination array type is not equal to or a supertype
1773     // of the source type.  Each element must be separately
1774     // checked.
1775 
1776     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777                                copied_oop, r19_klass, count_save);
1778 
1779     __ align(CodeEntryAlignment);
1780     StubCodeMark mark(this, "StubRoutines", name);
1781     address start = __ pc();
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785 #ifdef ASSERT
1786     // caller guarantees that the arrays really are different
1787     // otherwise, we would have to make conjoint checks
1788     { Label L;
1789       array_overlap_test(L, TIMES_OOP);
1790       __ stop("checkcast_copy within a single array");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     // Caller of this entry point must set up the argument registers.
1796     if (entry != NULL) {
1797       *entry = __ pc();
1798       BLOCK_COMMENT("Entry:");
1799     }
1800 
1801      // Empty array:  Nothing to do.
1802     __ cbz(count, L_done);
1803     __ push(RegSet::of(r19, r20, r21, r22), sp);
1804 
1805 #ifdef ASSERT
1806     BLOCK_COMMENT("assert consistent ckoff/ckval");
1807     // The ckoff and ckval must be mutually consistent,
1808     // even though caller generates both.
1809     { Label L;
1810       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1811       __ ldrw(start_to, Address(ckval, sco_offset));
1812       __ cmpw(ckoff, start_to);
1813       __ br(Assembler::EQ, L);
1814       __ stop("super_check_offset inconsistent");
1815       __ bind(L);
1816     }
1817 #endif //ASSERT
1818 
1819     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1820     bool is_oop = true;
1821     if (dest_uninitialized) {
1822       decorators |= IS_DEST_UNINITIALIZED;
1823     }
1824 
1825     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1826     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1827 
1828     // save the original count
1829     __ mov(count_save, count);
1830 
1831     // Copy from low to high addresses
1832     __ mov(start_to, to);              // Save destination array start address
1833     __ b(L_load_element);
1834 
1835     // ======== begin loop ========
1836     // (Loop is rotated; its entry is L_load_element.)
1837     // Loop control:
1838     //   for (; count != 0; count--) {
1839     //     copied_oop = load_heap_oop(from++);
1840     //     ... generate_type_check ...;
1841     //     store_heap_oop(to++, copied_oop);
1842     //   }
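         // Rotating the loop (branching to L_load_element first) means the
         // null check and the type check run before anything is stored, and
         // count is only decremented after an element has been stored, so
         // 'count' always holds the number of elements still to copy.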
1843     __ align(OptoLoopAlignment);
1844 
1845     __ BIND(L_store_element);
1846     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1847     __ sub(count, count, 1);
1848     __ cbz(count, L_do_card_marks);
1849 
1850     // ======== loop entry is here ========
1851     __ BIND(L_load_element);
1852     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1853     __ cbz(copied_oop, L_store_element);
1854 
1855     __ load_klass(r19_klass, copied_oop);// query the object klass
1856     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1857     // ======== end loop ========
1858 
1859     // It was a real error; we must depend on the caller to finish the job.
1860     // Register count = remaining oops, count_orig = total oops.
1861     // Emit GC store barriers for the oops we have copied and report
1862     // their number to the caller.
1863 
1864     __ subs(count, count_save, count);     // K = partially copied oop count
1865     __ eon(count, count, zr);                   // report (-1^K) to caller
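         // eon with zr computes bitwise NOT, so the value reported is ~K == -1 - K;
         // e.g. if 3 oops were copied before the failing element, r0 ends up as -4
         // and the caller recovers K as ~r0.  A fully successful copy reaches
         // L_do_card_marks via the cbz in the loop and skips this encoding.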
1866     __ br(Assembler::EQ, L_done_pop);
1867 
1868     __ BIND(L_do_card_marks);
1869     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1870 
1871     __ bind(L_done_pop);
1872     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1873     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1874 
1875     __ bind(L_done);
1876     __ mov(r0, count);
1877     __ leave();
1878     __ ret(lr);
1879 
1880     return start;
1881   }
1882 
1883   // Perform range checks on the proposed arraycopy.
1884   // Kills temp, but nothing else.
1885   // Also, clean the sign bits of src_pos and dst_pos.
1886   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1887                               Register src_pos, // source position (c_rarg1)
1888                               Register dst,     // destination array oop (c_rarg2)
1889                               Register dst_pos, // destination position (c_rarg3)
1890                               Register length,
1891                               Register temp,
1892                               Label& L_failed) {
1893     BLOCK_COMMENT("arraycopy_range_checks:");
1894 
1895     assert_different_registers(rscratch1, temp);
1896 
1897     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1898     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1899     __ addw(temp, length, src_pos);
1900     __ cmpw(temp, rscratch1);
1901     __ br(Assembler::HI, L_failed);
1902 
1903     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1904     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1905     __ addw(temp, length, dst_pos);
1906     __ cmpw(temp, rscratch1);
1907     __ br(Assembler::HI, L_failed);
1908 
1909     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1910     __ movw(src_pos, src_pos);
1911     __ movw(dst_pos, dst_pos);
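         // (A 32-bit register move zero-extends into the 64-bit register, so any
         // stale upper bits left by the caller are cleared before src_pos/dst_pos
         // are used in 64-bit address arithmetic below.)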
1912 
1913     BLOCK_COMMENT("arraycopy_range_checks done");
1914   }
1915 
1916   // These stubs get called from some dumb test routine.
1917   // I'll write them properly when they're called from
1918   // something that's actually doing something.
1919   static void fake_arraycopy_stub(address src, address dst, int count) {
1920     assert(count == 0, "huh?");
1921   }
1922 
1923 
1924   //
1925   //  Generate 'unsafe' array copy stub
1926   //  Though just as safe as the other stubs, it takes an unscaled
1927   //  size_t argument instead of an element count.
1928   //
1929   //  Input:
1930   //    c_rarg0   - source array address
1931   //    c_rarg1   - destination array address
1932   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1933   //
1934   // Examines the alignment of the operands and dispatches
1935   // to a long, int, short, or byte copy loop.
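       // For example (values chosen purely for illustration): if src, dst and
       // the byte count are all multiples of 4 but not of 8, the OR of their
       // low bits ends in 0b100, so the stub shifts the byte count right by 2
       // and tail-calls the int copy entry.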
1936   //
1937   address generate_unsafe_copy(const char *name,
1938                                address byte_copy_entry,
1939                                address short_copy_entry,
1940                                address int_copy_entry,
1941                                address long_copy_entry) {
1942     Label L_long_aligned, L_int_aligned, L_short_aligned;
1943     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1944 
1945     __ align(CodeEntryAlignment);
1946     StubCodeMark mark(this, "StubRoutines", name);
1947     address start = __ pc();
1948     __ enter(); // required for proper stackwalking of RuntimeStub frame
1949 
1950     // bump this on entry, not on exit:
1951     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1952 
1953     __ orr(rscratch1, s, d);
1954     __ orr(rscratch1, rscratch1, count);
1955 
1956     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1957     __ cbz(rscratch1, L_long_aligned);
1958     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1959     __ cbz(rscratch1, L_int_aligned);
1960     __ tbz(rscratch1, 0, L_short_aligned);
1961     __ b(RuntimeAddress(byte_copy_entry));
1962 
1963     __ BIND(L_short_aligned);
1964     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1965     __ b(RuntimeAddress(short_copy_entry));
1966     __ BIND(L_int_aligned);
1967     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1968     __ b(RuntimeAddress(int_copy_entry));
1969     __ BIND(L_long_aligned);
1970     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1971     __ b(RuntimeAddress(long_copy_entry));
1972 
1973     return start;
1974   }
1975 
1976   //
1977   //  Generate generic array copy stubs
1978   //
1979   //  Input:
1980   //    c_rarg0    -  src oop
1981   //    c_rarg1    -  src_pos (32-bits)
1982   //    c_rarg2    -  dst oop
1983   //    c_rarg3    -  dst_pos (32-bits)
1984   //    c_rarg4    -  element count (32-bits)
1985   //
1986   //  Output:
1987   //    r0 ==  0  -  success
1988   //    r0 == -1^K - failure, where K is partial transfer count
1989   //
1990   address generate_generic_copy(const char *name,
1991                                 address byte_copy_entry, address short_copy_entry,
1992                                 address int_copy_entry, address oop_copy_entry,
1993                                 address long_copy_entry, address checkcast_copy_entry) {
1994 
1995     Label L_failed, L_objArray;
1996     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1997 
1998     // Input registers
1999     const Register src        = c_rarg0;  // source array oop
2000     const Register src_pos    = c_rarg1;  // source position
2001     const Register dst        = c_rarg2;  // destination array oop
2002     const Register dst_pos    = c_rarg3;  // destination position
2003     const Register length     = c_rarg4;
2004 
2005 
2006     // Registers used as temps
2007     const Register dst_klass  = c_rarg5;
2008 
2009     __ align(CodeEntryAlignment);
2010 
2011     StubCodeMark mark(this, "StubRoutines", name);
2012 
2013     address start = __ pc();
2014 
2015     __ enter(); // required for proper stackwalking of RuntimeStub frame
2016 
2017     // bump this on entry, not on exit:
2018     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2019 
2020     //-----------------------------------------------------------------------
2021     // Assembler stub will be used for this call to arraycopy
2022     // if the following conditions are met:
2023     //
2024     // (1) src and dst must not be null.
2025     // (2) src_pos must not be negative.
2026     // (3) dst_pos must not be negative.
2027     // (4) length  must not be negative.
2028     // (5) src klass and dst klass should be the same and not NULL.
2029     // (6) src and dst should be arrays.
2030     // (7) src_pos + length must not exceed length of src.
2031     // (8) dst_pos + length must not exceed length of dst.
2032     //
2033 
2034     //  if (src == NULL) return -1;
2035     __ cbz(src, L_failed);
2036 
2037     //  if (src_pos < 0) return -1;
2038     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2039 
2040     //  if (dst == NULL) return -1;
2041     __ cbz(dst, L_failed);
2042 
2043     //  if (dst_pos < 0) return -1;
2044     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2045 
2046     // registers used as temp
2047     const Register scratch_length    = r16; // elements count to copy
2048     const Register scratch_src_klass = r17; // array klass
2049     const Register lh                = r15; // layout helper
2050 
2051     //  if (length < 0) return -1;
2052     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2053     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2054 
2055     __ load_klass(scratch_src_klass, src);
2056 #ifdef ASSERT
2057     //  assert(src->klass() != NULL);
2058     {
2059       BLOCK_COMMENT("assert klasses not null {");
2060       Label L1, L2;
2061       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2062       __ bind(L1);
2063       __ stop("broken null klass");
2064       __ bind(L2);
2065       __ load_klass(rscratch1, dst);
2066       __ cbz(rscratch1, L1);     // this would be broken also
2067       BLOCK_COMMENT("} assert klasses not null done");
2068     }
2069 #endif
2070 
2071     // Load layout helper (32-bits)
2072     //
2073     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2074     // 32        30    24            16              8     2                 0
2075     //
2076     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
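         //   For example, a jint[] klass has array_tag 0x3 (so lh is negative as
         //   a 32-bit int), element_type T_INT and log2_element_size 2; an
         //   Object[] klass has array_tag 0x2 and log2_element_size LogBytesPerHeapOop.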
2077     //
2078 
2079     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2080 
2081     // Handle objArrays completely differently...
2082     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2083     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2084     __ movw(rscratch1, objArray_lh);
2085     __ eorw(rscratch2, lh, rscratch1);
2086     __ cbzw(rscratch2, L_objArray);
2087 
2088     //  if (src->klass() != dst->klass()) return -1;
2089     __ load_klass(rscratch2, dst);
2090     __ eor(rscratch2, rscratch2, scratch_src_klass);
2091     __ cbnz(rscratch2, L_failed);
2092 
2093     //  if (!src->is_Array()) return -1;
2094     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2095 
2096     // At this point, it is known to be a typeArray (array_tag 0x3).
2097 #ifdef ASSERT
2098     {
2099       BLOCK_COMMENT("assert primitive array {");
2100       Label L;
2101       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2102       __ cmpw(lh, rscratch2);
2103       __ br(Assembler::GE, L);
2104       __ stop("must be a primitive array");
2105       __ bind(L);
2106       BLOCK_COMMENT("} assert primitive array done");
2107     }
2108 #endif
2109 
2110     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2111                            rscratch2, L_failed);
2112 
2113     // TypeArrayKlass
2114     //
2115     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2116     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2117     //
2118 
2119     const Register rscratch1_offset = rscratch1;    // array offset
2120     const Register r15_elsize = lh; // element size
2121 
2122     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2123            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2124     __ add(src, src, rscratch1_offset);           // src array offset
2125     __ add(dst, dst, rscratch1_offset);           // dst array offset
2126     BLOCK_COMMENT("choose copy loop based on element size");
2127 
2128     // next registers should be set before the jump to corresponding stub
2129     const Register from     = c_rarg0;  // source array address
2130     const Register to       = c_rarg1;  // destination array address
2131     const Register count    = c_rarg2;  // elements count
2132 
2133     // 'from', 'to' and 'count' must be set in this order, because they
2134     // alias the incoming 'src', 'src_pos' and 'dst' registers respectively.
2135 
2136     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2137 
2138     // The possible values of elsize are 0-3, i.e. exact_log2(element
2139     // size in bytes).  We do a simple bitwise binary search.
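         // Bit 1 of elsize selects the {int,long} half and bit 0 selects within
         // each half: e.g. a jint array has elsize 2 (binary 10), so the first
         // tbnz branches to L_copy_ints and the tbnz there falls through.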
2140   __ BIND(L_copy_bytes);
2141     __ tbnz(r15_elsize, 1, L_copy_ints);
2142     __ tbnz(r15_elsize, 0, L_copy_shorts);
2143     __ lea(from, Address(src, src_pos));// src_addr
2144     __ lea(to,   Address(dst, dst_pos));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(byte_copy_entry));
2147 
2148   __ BIND(L_copy_shorts);
2149     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2150     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2151     __ movw(count, scratch_length); // length
2152     __ b(RuntimeAddress(short_copy_entry));
2153 
2154   __ BIND(L_copy_ints);
2155     __ tbnz(r15_elsize, 0, L_copy_longs);
2156     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(int_copy_entry));
2160 
2161   __ BIND(L_copy_longs);
2162 #ifdef ASSERT
2163     {
2164       BLOCK_COMMENT("assert long copy {");
2165       Label L;
2166       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2167       __ cmpw(r15_elsize, LogBytesPerLong);
2168       __ br(Assembler::EQ, L);
2169       __ stop("must be long copy, but elsize is wrong");
2170       __ bind(L);
2171       BLOCK_COMMENT("} assert long copy done");
2172     }
2173 #endif
2174     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2175     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2176     __ movw(count, scratch_length); // length
2177     __ b(RuntimeAddress(long_copy_entry));
2178 
2179     // ObjArrayKlass
2180   __ BIND(L_objArray);
2181     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2182 
2183     Label L_plain_copy, L_checkcast_copy;
2184     //  test array classes for subtyping
2185     __ load_klass(r15, dst);
2186     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2187     __ br(Assembler::NE, L_checkcast_copy);
2188 
2189     // Identically typed arrays can be copied without element-wise checks.
2190     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191                            rscratch2, L_failed);
2192 
2193     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197     __ movw(count, scratch_length); // length
2198   __ BIND(L_plain_copy);
2199     __ b(RuntimeAddress(oop_copy_entry));
2200 
2201   __ BIND(L_checkcast_copy);
2202     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2203     {
2204       // Before looking at dst.length, make sure dst is also an objArray.
2205       __ ldrw(rscratch1, Address(r15, lh_offset));
2206       __ movw(rscratch2, objArray_lh);
2207       __ eorw(rscratch1, rscratch1, rscratch2);
2208       __ cbnzw(rscratch1, L_failed);
2209 
2210       // It is safe to examine both src.length and dst.length.
2211       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212                              r15, L_failed);
2213 
2214       __ load_klass(dst_klass, dst); // reload
2215 
2216       // Marshal the base address arguments now, freeing registers.
2217       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2218       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2220       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221       __ movw(count, length);           // length (reloaded)
2222       Register sco_temp = c_rarg3;      // this register is free now
2223       assert_different_registers(from, to, count, sco_temp,
2224                                  dst_klass, scratch_src_klass);
2225       // assert_clean_int(count, sco_temp);
2226 
2227       // Generate the type check.
2228       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2229       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2230 
2231       // Smashes rscratch1, rscratch2
2232       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2233 
2234       // Fetch destination element klass from the ObjArrayKlass header.
2235       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2236       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2237       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2238 
2239       // the checkcast_copy loop needs two extra arguments:
2240       assert(c_rarg3 == sco_temp, "#3 already in place");
2241       // Set up arguments for checkcast_copy_entry.
2242       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2243       __ b(RuntimeAddress(checkcast_copy_entry));
2244     }
2245 
2246   __ BIND(L_failed);
2247     __ mov(r0, -1);
2248     __ leave();   // required for proper stackwalking of RuntimeStub frame
2249     __ ret(lr);
2250 
2251     return start;
2252   }
2253 
2254   //
2255   // Generate stub for array fill. If "aligned" is true, the
2256   // "to" address is assumed to be heapword aligned.
2257   //
2258   // Arguments for generated stub:
2259   //   to:    c_rarg0
2260   //   value: c_rarg1
2261   //   count: c_rarg2 treated as signed
2262   //
2263   address generate_fill(BasicType t, bool aligned, const char *name) {
2264     __ align(CodeEntryAlignment);
2265     StubCodeMark mark(this, "StubRoutines", name);
2266     address start = __ pc();
2267 
2268     BLOCK_COMMENT("Entry:");
2269 
2270     const Register to        = c_rarg0;  // destination array address
2271     const Register value     = c_rarg1;  // value
2272     const Register count     = c_rarg2;  // elements count
2273 
2274     const Register bz_base = r10;        // base for block_zero routine
2275     const Register cnt_words = r11;      // temp register
2276 
2277     __ enter();
2278 
2279     Label L_fill_elements, L_exit1;
2280 
2281     int shift = -1;
2282     switch (t) {
2283       case T_BYTE:
2284         shift = 0;
2285         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2286         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2287         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2288         __ br(Assembler::LO, L_fill_elements);
2289         break;
2290       case T_SHORT:
2291         shift = 1;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2294         __ br(Assembler::LO, L_fill_elements);
2295         break;
2296       case T_INT:
2297         shift = 2;
2298         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2299         __ br(Assembler::LO, L_fill_elements);
2300         break;
2301       default: ShouldNotReachHere();
2302     }
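         // At this point the low 32 bits of 'value' hold the fill pattern
         // replicated to 32 bits (e.g. a T_BYTE value of 0xAB is now 0xABABABAB);
         // the bfi below the alignment code widens it to a full 64-bit pattern.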
2303 
2304     // Align the destination address to an 8-byte boundary.
2305     Label L_skip_align1, L_skip_align2, L_skip_align4;
2306     if (!aligned) {
2307       switch (t) {
2308         case T_BYTE:
2309           // One byte misalignment happens only for byte arrays.
2310           __ tbz(to, 0, L_skip_align1);
2311           __ strb(value, Address(__ post(to, 1)));
2312           __ subw(count, count, 1);
2313           __ bind(L_skip_align1);
2314           // Fallthrough
2315         case T_SHORT:
2316           // Two bytes misalignment happens only for byte and short (char) arrays.
2317           __ tbz(to, 1, L_skip_align2);
2318           __ strh(value, Address(__ post(to, 2)));
2319           __ subw(count, count, 2 >> shift);
2320           __ bind(L_skip_align2);
2321           // Fallthrough
2322         case T_INT:
2323           // Align to 8 bytes, we know we are 4 byte aligned to start.
2324           __ tbz(to, 2, L_skip_align4);
2325           __ strw(value, Address(__ post(to, 4)));
2326           __ subw(count, count, 4 >> shift);
2327           __ bind(L_skip_align4);
2328           break;
2329         default: ShouldNotReachHere();
2330       }
2331     }
2332 
2333     //
2334     //  Fill large chunks
2335     //
2336     __ lsrw(cnt_words, count, 3 - shift); // number of words
2337     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2338     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
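         // e.g. a T_SHORT fill of 37 elements (74 bytes, shift == 1): cnt_words
         // = 37 >> 2 = 9 eight-byte words are filled below, and count becomes
         // 37 - 9*4 = 1 trailing element for the code further down.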
2339     if (UseBlockZeroing) {
2340       Label non_block_zeroing, rest;
2341       // If the fill value is zero we can use the fast zero_words().
2342       __ cbnz(value, non_block_zeroing);
2343       __ mov(bz_base, to);
2344       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2345       __ zero_words(bz_base, cnt_words);
2346       __ b(rest);
2347       __ bind(non_block_zeroing);
2348       __ fill_words(to, cnt_words, value);
2349       __ bind(rest);
2350     } else {
2351       __ fill_words(to, cnt_words, value);
2352     }
2353 
2354     // Remaining count is less than 8 bytes. Fill it by a single store.
2355     // Note that the total length is no less than 8 bytes.
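         // The 8-byte store below is positioned to end exactly at the last
         // element, so it may rewrite a few already-filled bytes with the same
         // value, which is harmless: e.g. for a 13-byte T_BYTE fill the word
         // loop covers bytes 0..7 and this store covers bytes 5..12.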
2356     if (t == T_BYTE || t == T_SHORT) {
2357       Label L_exit1;
2358       __ cbzw(count, L_exit1);
2359       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2360       __ str(value, Address(to, -8));    // overwrite some elements
2361       __ bind(L_exit1);
2362       __ leave();
2363       __ ret(lr);
2364     }
2365 
2366     // Handle fills of less than 8 bytes.
2367     Label L_fill_2, L_fill_4, L_exit2;
2368     __ bind(L_fill_elements);
2369     switch (t) {
2370       case T_BYTE:
2371         __ tbz(count, 0, L_fill_2);
2372         __ strb(value, Address(__ post(to, 1)));
2373         __ bind(L_fill_2);
2374         __ tbz(count, 1, L_fill_4);
2375         __ strh(value, Address(__ post(to, 2)));
2376         __ bind(L_fill_4);
2377         __ tbz(count, 2, L_exit2);
2378         __ strw(value, Address(to));
2379         break;
2380       case T_SHORT:
2381         __ tbz(count, 0, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 1, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_INT:
2388         __ cbzw(count, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       default: ShouldNotReachHere();
2392     }
2393     __ bind(L_exit2);
2394     __ leave();
2395     __ ret(lr);
2396     return start;
2397   }
2398 
2399   address generate_data_cache_writeback() {
2400     const Register line        = c_rarg0;  // address of line to write back
2401 
2402     __ align(CodeEntryAlignment);
2403 
2404     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2405 
2406     address start = __ pc();
2407     __ enter();
2408     __ cache_wb(Address(line, 0));
2409     __ leave();
2410     __ ret(lr);
2411 
2412     return start;
2413   }
2414 
2415   address generate_data_cache_writeback_sync() {
2416     const Register is_pre     = c_rarg0;  // pre or post sync
2417 
2418     __ align(CodeEntryAlignment);
2419 
2420     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2421 
2422     // pre wbsync is a no-op
2423     // post wbsync translates to a memory barrier (an sfence on x86)
2424 
2425     Label skip;
2426     address start = __ pc();
2427     __ enter();
2428     __ cbnz(is_pre, skip);
2429     __ cache_wbsync(false);
2430     __ bind(skip);
2431     __ leave();
2432     __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   void generate_arraycopy_stubs() {
2438     address entry;
2439     address entry_jbyte_arraycopy;
2440     address entry_jshort_arraycopy;
2441     address entry_jint_arraycopy;
2442     address entry_oop_arraycopy;
2443     address entry_jlong_arraycopy;
2444     address entry_checkcast_arraycopy;
2445 
2446     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2447     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2448 
2449     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2450 
2451     //*** jbyte
2452     // Always need aligned and unaligned versions
2453     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2454                                                                                   "jbyte_disjoint_arraycopy");
2455     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2456                                                                                   &entry_jbyte_arraycopy,
2457                                                                                   "jbyte_arraycopy");
2458     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2459                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2460     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2461                                                                                   "arrayof_jbyte_arraycopy");
2462 
2463     //*** jshort
2464     // Always need aligned and unaligned versions
2465     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2466                                                                                     "jshort_disjoint_arraycopy");
2467     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2468                                                                                     &entry_jshort_arraycopy,
2469                                                                                     "jshort_arraycopy");
2470     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2471                                                                                     "arrayof_jshort_disjoint_arraycopy");
2472     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2473                                                                                     "arrayof_jshort_arraycopy");
2474 
2475     //*** jint
2476     // Aligned versions
2477     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2478                                                                                 "arrayof_jint_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2480                                                                                 "arrayof_jint_arraycopy");
2481     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2482     // entry_jint_arraycopy always points to the unaligned version
2483     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2484                                                                                 "jint_disjoint_arraycopy");
2485     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2486                                                                                 &entry_jint_arraycopy,
2487                                                                                 "jint_arraycopy");
2488 
2489     //*** jlong
2490     // It is always aligned
2491     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2492                                                                                   "arrayof_jlong_disjoint_arraycopy");
2493     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2494                                                                                   "arrayof_jlong_arraycopy");
2495     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2496     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2497 
2498     //*** oops
2499     {
2500       // With compressed oops we need unaligned versions; notice that
2501       // we overwrite entry_oop_arraycopy.
2502       bool aligned = !UseCompressedOops;
2503 
2504       StubRoutines::_arrayof_oop_disjoint_arraycopy
2505         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2506                                      /*dest_uninitialized*/false);
2507       StubRoutines::_arrayof_oop_arraycopy
2508         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2509                                      /*dest_uninitialized*/false);
2510       // Aligned versions without pre-barriers
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2513                                      /*dest_uninitialized*/true);
2514       StubRoutines::_arrayof_oop_arraycopy_uninit
2515         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2516                                      /*dest_uninitialized*/true);
2517     }
2518 
2519     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2520     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2521     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2522     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2523 
2524     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2525     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2526                                                                         /*dest_uninitialized*/true);
2527 
2528     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2529                                                               entry_jbyte_arraycopy,
2530                                                               entry_jshort_arraycopy,
2531                                                               entry_jint_arraycopy,
2532                                                               entry_jlong_arraycopy);
2533 
2534     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2535                                                                entry_jbyte_arraycopy,
2536                                                                entry_jshort_arraycopy,
2537                                                                entry_jint_arraycopy,
2538                                                                entry_oop_arraycopy,
2539                                                                entry_jlong_arraycopy,
2540                                                                entry_checkcast_arraycopy);
2541 
2542     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2543     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2544     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2545     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2546     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2547     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2548   }
2549 
2550   void generate_math_stubs() { Unimplemented(); }
2551 
2552   // Arguments:
2553   //
2554   // Inputs:
2555   //   c_rarg0   - source byte array address
2556   //   c_rarg1   - destination byte array address
2557   //   c_rarg2   - K (key) in little endian int array
2558   //
2559   address generate_aescrypt_encryptBlock() {
2560     __ align(CodeEntryAlignment);
2561     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2562 
2563     Label L_doLast;
2564 
2565     const Register from        = c_rarg0;  // source array address
2566     const Register to          = c_rarg1;  // destination array address
2567     const Register key         = c_rarg2;  // key array address
2568     const Register keylen      = rscratch1;
2569 
2570     address start = __ pc();
2571     __ enter();
2572 
2573     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
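         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 respectively, i.e. 4 * (rounds + 1).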
2574 
2575     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2576 
2577     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2578     __ rev32(v1, __ T16B, v1);
2579     __ rev32(v2, __ T16B, v2);
2580     __ rev32(v3, __ T16B, v3);
2581     __ rev32(v4, __ T16B, v4);
2582     __ aese(v0, v1);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v2);
2585     __ aesmc(v0, v0);
2586     __ aese(v0, v3);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v4);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594     __ rev32(v3, __ T16B, v3);
2595     __ rev32(v4, __ T16B, v4);
2596     __ aese(v0, v1);
2597     __ aesmc(v0, v0);
2598     __ aese(v0, v2);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v3);
2601     __ aesmc(v0, v0);
2602     __ aese(v0, v4);
2603     __ aesmc(v0, v0);
2604 
2605     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2606     __ rev32(v1, __ T16B, v1);
2607     __ rev32(v2, __ T16B, v2);
2608 
2609     __ cmpw(keylen, 44);
2610     __ br(Assembler::EQ, L_doLast);
2611 
2612     __ aese(v0, v1);
2613     __ aesmc(v0, v0);
2614     __ aese(v0, v2);
2615     __ aesmc(v0, v0);
2616 
2617     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2618     __ rev32(v1, __ T16B, v1);
2619     __ rev32(v2, __ T16B, v2);
2620 
2621     __ cmpw(keylen, 52);
2622     __ br(Assembler::EQ, L_doLast);
2623 
2624     __ aese(v0, v1);
2625     __ aesmc(v0, v0);
2626     __ aese(v0, v2);
2627     __ aesmc(v0, v0);
2628 
2629     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2630     __ rev32(v1, __ T16B, v1);
2631     __ rev32(v2, __ T16B, v2);
2632 
2633     __ BIND(L_doLast);
2634 
2635     __ aese(v0, v1);
2636     __ aesmc(v0, v0);
2637     __ aese(v0, v2);
2638 
2639     __ ld1(v1, __ T16B, key);
2640     __ rev32(v1, __ T16B, v1);
2641     __ eor(v0, __ T16B, v0, v1);
2642 
2643     __ st1(v0, __ T16B, to);
2644 
2645     __ mov(r0, 0);
2646 
2647     __ leave();
2648     __ ret(lr);
2649 
2650     return start;
2651   }
2652 
2653   // Arguments:
2654   //
2655   // Inputs:
2656   //   c_rarg0   - source byte array address
2657   //   c_rarg1   - destination byte array address
2658   //   c_rarg2   - K (key) in little endian int array
2659   //
2660   address generate_aescrypt_decryptBlock() {
2661     assert(UseAES, "need AES cryptographic extension support");
2662     __ align(CodeEntryAlignment);
2663     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2664     Label L_doLast;
2665 
2666     const Register from        = c_rarg0;  // source array address
2667     const Register to          = c_rarg1;  // destination array address
2668     const Register key         = c_rarg2;  // key array address
2669     const Register keylen      = rscratch1;
2670 
2671     address start = __ pc();
2672     __ enter(); // required for proper stackwalking of RuntimeStub frame
2673 
2674     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
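         // As in the encrypt stub, keylen is 44, 52 or 60 ints for
         // AES-128, AES-192 or AES-256.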
2675 
2676     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2677 
2678     __ ld1(v5, __ T16B, __ post(key, 16));
2679     __ rev32(v5, __ T16B, v5);
2680 
2681     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2682     __ rev32(v1, __ T16B, v1);
2683     __ rev32(v2, __ T16B, v2);
2684     __ rev32(v3, __ T16B, v3);
2685     __ rev32(v4, __ T16B, v4);
2686     __ aesd(v0, v1);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v2);
2689     __ aesimc(v0, v0);
2690     __ aesd(v0, v3);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v4);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698     __ rev32(v3, __ T16B, v3);
2699     __ rev32(v4, __ T16B, v4);
2700     __ aesd(v0, v1);
2701     __ aesimc(v0, v0);
2702     __ aesd(v0, v2);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v3);
2705     __ aesimc(v0, v0);
2706     __ aesd(v0, v4);
2707     __ aesimc(v0, v0);
2708 
2709     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2710     __ rev32(v1, __ T16B, v1);
2711     __ rev32(v2, __ T16B, v2);
2712 
2713     __ cmpw(keylen, 44);
2714     __ br(Assembler::EQ, L_doLast);
2715 
2716     __ aesd(v0, v1);
2717     __ aesimc(v0, v0);
2718     __ aesd(v0, v2);
2719     __ aesimc(v0, v0);
2720 
2721     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2722     __ rev32(v1, __ T16B, v1);
2723     __ rev32(v2, __ T16B, v2);
2724 
2725     __ cmpw(keylen, 52);
2726     __ br(Assembler::EQ, L_doLast);
2727 
2728     __ aesd(v0, v1);
2729     __ aesimc(v0, v0);
2730     __ aesd(v0, v2);
2731     __ aesimc(v0, v0);
2732 
2733     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2734     __ rev32(v1, __ T16B, v1);
2735     __ rev32(v2, __ T16B, v2);
2736 
2737     __ BIND(L_doLast);
2738 
2739     __ aesd(v0, v1);
2740     __ aesimc(v0, v0);
2741     __ aesd(v0, v2);
2742 
2743     __ eor(v0, __ T16B, v0, v5);
2744 
2745     __ st1(v0, __ T16B, to);
2746 
2747     __ mov(r0, 0);
2748 
2749     __ leave();
2750     __ ret(lr);
2751 
2752     return start;
2753   }
2754 
2755   // Arguments:
2756   //
2757   // Inputs:
2758   //   c_rarg0   - source byte array address
2759   //   c_rarg1   - destination byte array address
2760   //   c_rarg2   - K (key) in little endian int array
2761   //   c_rarg3   - r vector byte array address
2762   //   c_rarg4   - input length
2763   //
2764   // Output:
2765   //   r0        - input length
2766   //
2767   address generate_cipherBlockChaining_encryptAESCrypt() {
2768     assert(UseAES, "need AES cryptographic extension support");
2769     __ align(CodeEntryAlignment);
2770     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2771 
2772     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2773 
2774     const Register from        = c_rarg0;  // source array address
2775     const Register to          = c_rarg1;  // destination array address
2776     const Register key         = c_rarg2;  // key array address
2777     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2778                                            // and left with the results of the last encryption block
2779     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2780     const Register keylen      = rscratch1;
2781 
2782     address start = __ pc();
2783 
2784       __ enter();
2785 
2786       __ movw(rscratch2, len_reg);
2787 
2788       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2789 
2790       __ ld1(v0, __ T16B, rvec);
2791 
2792       __ cmpw(keylen, 52);
2793       __ br(Assembler::CC, L_loadkeys_44);
2794       __ br(Assembler::EQ, L_loadkeys_52);
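           // Key loading is a staircase: AES-256 (keylen 60) falls through and
           // loads all round keys into v17..v31, AES-192 enters at L_loadkeys_52
           // and skips v17/v18, and AES-128 enters at L_loadkeys_44 and also
           // skips v19/v20.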
2795 
2796       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2797       __ rev32(v17, __ T16B, v17);
2798       __ rev32(v18, __ T16B, v18);
2799     __ BIND(L_loadkeys_52);
2800       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2801       __ rev32(v19, __ T16B, v19);
2802       __ rev32(v20, __ T16B, v20);
2803     __ BIND(L_loadkeys_44);
2804       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2805       __ rev32(v21, __ T16B, v21);
2806       __ rev32(v22, __ T16B, v22);
2807       __ rev32(v23, __ T16B, v23);
2808       __ rev32(v24, __ T16B, v24);
2809       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2810       __ rev32(v25, __ T16B, v25);
2811       __ rev32(v26, __ T16B, v26);
2812       __ rev32(v27, __ T16B, v27);
2813       __ rev32(v28, __ T16B, v28);
2814       __ ld1(v29, v30, v31, __ T16B, key);
2815       __ rev32(v29, __ T16B, v29);
2816       __ rev32(v30, __ T16B, v30);
2817       __ rev32(v31, __ T16B, v31);
2818 
2819     __ BIND(L_aes_loop);
2820       __ ld1(v1, __ T16B, __ post(from, 16));
2821       __ eor(v0, __ T16B, v0, v1);
2822 
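           // NB: nothing between the cmpw(keylen, 52) before the key-loading
           // code and these branches writes the condition flags (ld1, rev32,
           // eor, aese/aesmc, st1, subw and cbnzw leave NZCV untouched), so the
           // key-length comparison still selects the 10/12/14-round path here
           // on every iteration of the loop.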
2823       __ br(Assembler::CC, L_rounds_44);
2824       __ br(Assembler::EQ, L_rounds_52);
2825 
2826       __ aese(v0, v17); __ aesmc(v0, v0);
2827       __ aese(v0, v18); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_52);
2829       __ aese(v0, v19); __ aesmc(v0, v0);
2830       __ aese(v0, v20); __ aesmc(v0, v0);
2831     __ BIND(L_rounds_44);
2832       __ aese(v0, v21); __ aesmc(v0, v0);
2833       __ aese(v0, v22); __ aesmc(v0, v0);
2834       __ aese(v0, v23); __ aesmc(v0, v0);
2835       __ aese(v0, v24); __ aesmc(v0, v0);
2836       __ aese(v0, v25); __ aesmc(v0, v0);
2837       __ aese(v0, v26); __ aesmc(v0, v0);
2838       __ aese(v0, v27); __ aesmc(v0, v0);
2839       __ aese(v0, v28); __ aesmc(v0, v0);
2840       __ aese(v0, v29); __ aesmc(v0, v0);
2841       __ aese(v0, v30);
2842       __ eor(v0, __ T16B, v0, v31);
2843 
2844       __ st1(v0, __ T16B, __ post(to, 16));
2845 
2846       __ subw(len_reg, len_reg, 16);
2847       __ cbnzw(len_reg, L_aes_loop);
2848 
2849       __ st1(v0, __ T16B, rvec);
2850 
2851       __ mov(r0, rscratch2);
2852 
2853       __ leave();
2854       __ ret(lr);
2855 
2856       return start;
2857   }
2858 
2859   // Arguments:
2860   //
2861   // Inputs:
2862   //   c_rarg0   - source byte array address
2863   //   c_rarg1   - destination byte array address
2864   //   c_rarg2   - K (key) in little endian int array
2865   //   c_rarg3   - r vector byte array address
2866   //   c_rarg4   - input length
2867   //
2868   // Output:
2869   //   r0        - input length
2870   //
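       // For orientation, a minimal scalar sketch of what this stub computes
       // (illustrative names; note that CBC decryption chains on the previous
       // *ciphertext* block, which is why the loop below keeps a copy of each
       // input block in v1/v2 across the decryption):
       //
       //   byte prev[16];                      // loaded from rvec
       //   for (int off = 0; off < len; off += 16) {
       //     byte c[16], p[16];
       //     copy16(c, from + off);            // keep the ciphertext block
       //     aes_decrypt_block(p, c, key);     // raw AES block decryption
       //     xor16(p, prev);                   // plaintext = D(c) ^ prev
       //     copy16(to + off, p);
       //     copy16(prev, c);                  // chain on the ciphertext
       //   }
       //   copy16(rvec, prev);                 // leave the last ciphertext in rvec
       //   return len;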
2871   address generate_cipherBlockChaining_decryptAESCrypt() {
2872     assert(UseAES, "need AES cryptographic extension support");
2873     __ align(CodeEntryAlignment);
2874     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2875 
2876     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2877 
2878     const Register from        = c_rarg0;  // source array address
2879     const Register to          = c_rarg1;  // destination array address
2880     const Register key         = c_rarg2;  // key array address
2881     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2882                                            // and left with the last input (ciphertext) block
2883     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2884     const Register keylen      = rscratch1;
2885 
2886     address start = __ pc();
2887 
2888       __ enter();
2889 
2890       __ movw(rscratch2, len_reg);
2891 
2892       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2893 
2894       __ ld1(v2, __ T16B, rvec);
2895 
2896       __ ld1(v31, __ T16B, __ post(key, 16));
2897       __ rev32(v31, __ T16B, v31);
2898 
2899       __ cmpw(keylen, 52);
2900       __ br(Assembler::CC, L_loadkeys_44);
2901       __ br(Assembler::EQ, L_loadkeys_52);
2902 
2903       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2904       __ rev32(v17, __ T16B, v17);
2905       __ rev32(v18, __ T16B, v18);
2906     __ BIND(L_loadkeys_52);
2907       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2908       __ rev32(v19, __ T16B, v19);
2909       __ rev32(v20, __ T16B, v20);
2910     __ BIND(L_loadkeys_44);
2911       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2912       __ rev32(v21, __ T16B, v21);
2913       __ rev32(v22, __ T16B, v22);
2914       __ rev32(v23, __ T16B, v23);
2915       __ rev32(v24, __ T16B, v24);
2916       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2917       __ rev32(v25, __ T16B, v25);
2918       __ rev32(v26, __ T16B, v26);
2919       __ rev32(v27, __ T16B, v27);
2920       __ rev32(v28, __ T16B, v28);
2921       __ ld1(v29, v30, __ T16B, key);
2922       __ rev32(v29, __ T16B, v29);
2923       __ rev32(v30, __ T16B, v30);
2924 
2925     __ BIND(L_aes_loop);
2926       __ ld1(v0, __ T16B, __ post(from, 16));
2927       __ orr(v1, __ T16B, v0, v0);
2928 
2929       __ br(Assembler::CC, L_rounds_44);
2930       __ br(Assembler::EQ, L_rounds_52);
2931 
2932       __ aesd(v0, v17); __ aesimc(v0, v0);
2933       __ aesd(v0, v18); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_52);
2935       __ aesd(v0, v19); __ aesimc(v0, v0);
2936       __ aesd(v0, v20); __ aesimc(v0, v0);
2937     __ BIND(L_rounds_44);
2938       __ aesd(v0, v21); __ aesimc(v0, v0);
2939       __ aesd(v0, v22); __ aesimc(v0, v0);
2940       __ aesd(v0, v23); __ aesimc(v0, v0);
2941       __ aesd(v0, v24); __ aesimc(v0, v0);
2942       __ aesd(v0, v25); __ aesimc(v0, v0);
2943       __ aesd(v0, v26); __ aesimc(v0, v0);
2944       __ aesd(v0, v27); __ aesimc(v0, v0);
2945       __ aesd(v0, v28); __ aesimc(v0, v0);
2946       __ aesd(v0, v29); __ aesimc(v0, v0);
2947       __ aesd(v0, v30);
2948       __ eor(v0, __ T16B, v0, v31);
2949       __ eor(v0, __ T16B, v0, v2);
2950 
2951       __ st1(v0, __ T16B, __ post(to, 16));
2952       __ orr(v2, __ T16B, v1, v1);
2953 
2954       __ subw(len_reg, len_reg, 16);
2955       __ cbnzw(len_reg, L_aes_loop);
2956 
2957       __ st1(v2, __ T16B, rvec);
2958 
2959       __ mov(r0, rscratch2);
2960 
2961       __ leave();
2962       __ ret(lr);
2963 
2964     return start;
2965   }
2966 
2967   // CTR AES crypt.
2968   // Arguments:
2969   //
2970   // Inputs:
2971   //   c_rarg0   - source byte array address
2972   //   c_rarg1   - destination byte array address
2973   //   c_rarg2   - K (key) in little endian int array
2974   //   c_rarg3   - counter vector byte array address
2975   //   c_rarg4   - input length
2976   //   c_rarg5   - saved encryptedCounter start
2977   //   c_rarg6   - saved used length
2978   //
2979   // Output:
2980   //   r0       - input length
2981   //
2982   address generate_counterMode_AESCrypt() {
2983     const Register in = c_rarg0;
2984     const Register out = c_rarg1;
2985     const Register key = c_rarg2;
2986     const Register counter = c_rarg3;
2987     const Register saved_len = c_rarg4, len = r10;
2988     const Register saved_encrypted_ctr = c_rarg5;
2989     const Register used_ptr = c_rarg6, used = r12;
2990 
2991     const Register offset = r7;
2992     const Register keylen = r11;
2993 
2994     const unsigned char block_size = 16;
2995     const int bulk_width = 4;
2996     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2997     // performance with larger data sizes, but it also means that the
2998     // fast path isn't used until you have at least 8 blocks, and up
2999     // to 127 bytes of data will be processed on the slow path. For
3000     // that reason, and also so as not to blow away too much icache, 4
3001     // blocks seems like a sensible compromise.
3002 
3003     // Algorithm:
3004     //
3005     //    if (len == 0) {
3006     //        goto DONE;
3007     //    }
3008     //    int result = len;
3009     //    do {
3010     //        if (used >= blockSize) {
3011     //            if (len >= bulk_width * blockSize) {
3012     //                CTR_large_block();
3013     //                if (len == 0)
3014     //                    goto DONE;
3015     //            }
3016     //            for (;;) {
3017     //                16ByteVector v0 = counter;
3018     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3019     //                used = 0;
3020     //                if (len < blockSize)
3021     //                    break;    /* goto NEXT */
3022     //                16ByteVector v1 = load16Bytes(in, offset);
3023     //                v1 = v1 ^ encryptedCounter;
3024     //                store16Bytes(v1, out, offset);
3025     //                used = blockSize;
3026     //                offset += blockSize;
3027     //                len -= blockSize;
3028     //                if (len == 0)
3029     //                    goto DONE;
3030     //            }
3031     //        }
3032     //      NEXT:
3033     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3034     //        len--;
3035     //    } while (len != 0);
3036     //  DONE:
3037     //    return result;
3038     //
3039     // CTR_large_block()
3040     //    Wide bulk encryption of whole blocks.
3041 
3042     __ align(CodeEntryAlignment);
3043     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3044     const address start = __ pc();
3045     __ enter();
3046 
3047     Label DONE, CTR_large_block, large_block_return;
3048     __ ldrw(used, Address(used_ptr));
3049     __ cbzw(saved_len, DONE);
3050 
3051     __ mov(len, saved_len);
3052     __ mov(offset, 0);
3053 
3054     // Compute #rounds for AES based on the length of the key array
3055     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3056 
3057     __ aesenc_loadkeys(key, keylen);
3058 
3059     {
3060       Label L_CTR_loop, NEXT;
3061 
3062       __ bind(L_CTR_loop);
3063 
3064       __ cmp(used, block_size);
3065       __ br(__ LO, NEXT);
3066 
3067       // Maybe we have a lot of data
3068       __ subsw(rscratch1, len, bulk_width * block_size);
3069       __ br(__ HS, CTR_large_block);
3070       __ BIND(large_block_return);
3071       __ cbzw(len, DONE);
3072 
3073       // Setup the counter
3074       __ movi(v4, __ T4S, 0);
3075       __ movi(v5, __ T4S, 1);
3076       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
3077 
3078       __ ld1(v0, __ T16B, counter); // Load the counter into v0
3079       __ rev32(v16, __ T16B, v0);
3080       __ addv(v16, __ T4S, v16, v4);
3081       __ rev32(v16, __ T16B, v16);
3082       __ st1(v16, __ T16B, counter); // Save the incremented counter back
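           // The counter lives in memory in big-endian byte order, so the
           // rev32/addv/rev32 sequence above amounts to this sketch (increment
           // the last 32-bit word of the counter, with no carry propagated
           // into the higher words):
           //
           //   uint32_t c3 = load_be32(counter + 12);
           //   store_be32(counter + 12, c3 + 1);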
3083 
3084       {
3085         // We have fewer than bulk_width blocks of data left. Encrypt
3086         // them one by one until there is less than a full block
3087         // remaining, being careful to save both the encrypted counter
3088         // and the counter.
3089 
3090         Label inner_loop;
3091         __ bind(inner_loop);
3092         // Counter to encrypt is in v0
3093         __ aesecb_encrypt(noreg, noreg, keylen);
3094         __ st1(v0, __ T16B, saved_encrypted_ctr);
3095 
3096         // Do we have a remaining full block?
3097 
3098         __ mov(used, 0);
3099         __ cmp(len, block_size);
3100         __ br(__ LO, NEXT);
3101 
3102         // Yes, we have a full block
3103         __ ldrq(v1, Address(in, offset));
3104         __ eor(v1, __ T16B, v1, v0);
3105         __ strq(v1, Address(out, offset));
3106         __ mov(used, block_size);
3107         __ add(offset, offset, block_size);
3108 
3109         __ subw(len, len, block_size);
3110         __ cbzw(len, DONE);
3111 
3112         // Increment the counter, store it back
3113         __ orr(v0, __ T16B, v16, v16);
3114         __ rev32(v16, __ T16B, v16);
3115         __ addv(v16, __ T4S, v16, v4);
3116         __ rev32(v16, __ T16B, v16);
3117         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3118 
3119         __ b(inner_loop);
3120       }
3121 
3122       __ BIND(NEXT);
3123 
3124       // Encrypt a single byte, and loop.
3125       // We expect this to be a rare event.
3126       __ ldrb(rscratch1, Address(in, offset));
3127       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3128       __ eor(rscratch1, rscratch1, rscratch2);
3129       __ strb(rscratch1, Address(out, offset));
3130       __ add(offset, offset, 1);
3131       __ add(used, used, 1);
3132       __ subw(len, len, 1);
3133       __ cbnzw(len, L_CTR_loop);
3134     }
3135 
3136     __ bind(DONE);
3137     __ strw(used, Address(used_ptr));
3138     __ mov(r0, saved_len);
3139 
3140     __ leave(); // required for proper stackwalking of RuntimeStub frame
3141     __ ret(lr);
3142 
3143     // Bulk encryption
3144 
3145     __ BIND(CTR_large_block);
3146     assert(bulk_width == 4 || bulk_width == 8, "must be");
3147 
3148     if (bulk_width == 8) {
3149       __ sub(sp, sp, 4 * 16);
3150       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3151     }
3152     __ sub(sp, sp, 4 * 16);
3153     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3154     RegSet saved_regs = (RegSet::of(in, out, offset)
3155                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3156     __ push(saved_regs, sp);
3157     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3158     __ add(in, in, offset);
3159     __ add(out, out, offset);
3160 
3161     // Keys should already be loaded into the correct registers
3162 
3163     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3164     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3165 
3166     // AES/CTR loop
3167     {
3168       Label L_CTR_loop;
3169       __ BIND(L_CTR_loop);
3170 
3171       // Setup the counters
3172       __ movi(v8, __ T4S, 0);
3173       __ movi(v9, __ T4S, 1);
3174       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3175 
3176       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3177         __ rev32(f, __ T16B, v16);
3178         __ addv(v16, __ T4S, v16, v8);
3179       }
3180 
3181       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3182 
3183       // Encrypt the counters
3184       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3185 
3186       if (bulk_width == 8) {
3187         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3188       }
3189 
3190       // XOR the encrypted counters with the inputs
3191       for (int i = 0; i < bulk_width; i++) {
3192         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3193       }
3194 
3195       // Write the encrypted data
3196       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3197       if (bulk_width == 8) {
3198         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3199       }
3200 
3201       __ subw(len, len, 16 * bulk_width);
3202       __ cbnzw(len, L_CTR_loop);
3203     }
3204 
3205     // Save the counter back where it goes
3206     __ rev32(v16, __ T16B, v16);
3207     __ st1(v16, __ T16B, counter);
3208 
3209     __ pop(saved_regs, sp);
3210 
3211     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3212     if (bulk_width == 8) {
3213       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3214     }
3215 
3216     __ andr(rscratch1, len, -16 * bulk_width);
3217     __ sub(len, len, rscratch1);
3218     __ add(offset, offset, rscratch1);
3219     __ mov(used, 16);
3220     __ strw(used, Address(used_ptr));
3221     __ b(large_block_return);
3222 
3223     return start;
3224   }
3225 
3226   // Arguments:
3227   //
3228   // Inputs:
3229   //   c_rarg0   - byte[]  source+offset
3230   //   c_rarg1   - int[]   MD5.state
3231   //   c_rarg2   - int     offset
3232   //   c_rarg3   - int     limit
3233   //
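       // When multi_block is true the stub consumes consecutive 64-byte blocks
       // until ofs passes limit, returning the updated offset; an illustrative
       // sketch of that driver loop:
       //
       //   do {
       //     md5_compress(state, buf);   // the four rounds generated below
       //     buf += 64;
       //     ofs += 64;
       //   } while (ofs <= limit);
       //   return ofs;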
3234   address generate_md5_implCompress(bool multi_block, const char *name) {
3235     __ align(CodeEntryAlignment);
3236     StubCodeMark mark(this, "StubRoutines", name);
3237     address start = __ pc();
3238 
3239     Register buf       = c_rarg0;
3240     Register state     = c_rarg1;
3241     Register ofs       = c_rarg2;
3242     Register limit     = c_rarg3;
3243     Register a         = r4;
3244     Register b         = r5;
3245     Register c         = r6;
3246     Register d         = r7;
3247     Register rscratch3 = r10;
3248     Register rscratch4 = r11;
3249 
3250     Label keys;
3251     Label md5_loop;
3252 
3253     __ BIND(md5_loop);
3254 
3255     // Save hash values for addition after rounds
3256     __ ldrw(a, Address(state,  0));
3257     __ ldrw(b, Address(state,  4));
3258     __ ldrw(c, Address(state,  8));
3259     __ ldrw(d, Address(state, 12));
3260 
3261 #define FF(r1, r2, r3, r4, k, s, t)              \
3262     __ eorw(rscratch3, r3, r4);                  \
3263     __ movw(rscratch2, t);                       \
3264     __ andw(rscratch3, rscratch3, r2);           \
3265     __ addw(rscratch4, r1, rscratch2);           \
3266     __ ldrw(rscratch1, Address(buf, k*4));       \
3267     __ eorw(rscratch3, rscratch3, r4);           \
3268     __ addw(rscratch3, rscratch3, rscratch1);    \
3269     __ addw(rscratch3, rscratch3, rscratch4);    \
3270     __ rorw(rscratch2, rscratch3, 32 - s);       \
3271     __ addw(r1, rscratch2, r2);
3272 
3273 #define GG(r1, r2, r3, r4, k, s, t)              \
3274     __ eorw(rscratch2, r2, r3);                  \
3275     __ ldrw(rscratch1, Address(buf, k*4));       \
3276     __ andw(rscratch3, rscratch2, r4);           \
3277     __ movw(rscratch2, t);                       \
3278     __ eorw(rscratch3, rscratch3, r3);           \
3279     __ addw(rscratch4, r1, rscratch2);           \
3280     __ addw(rscratch3, rscratch3, rscratch1);    \
3281     __ addw(rscratch3, rscratch3, rscratch4);    \
3282     __ rorw(rscratch2, rscratch3, 32 - s);       \
3283     __ addw(r1, rscratch2, r2);
3284 
3285 #define HH(r1, r2, r3, r4, k, s, t)              \
3286     __ eorw(rscratch3, r3, r4);                  \
3287     __ movw(rscratch2, t);                       \
3288     __ addw(rscratch4, r1, rscratch2);           \
3289     __ ldrw(rscratch1, Address(buf, k*4));       \
3290     __ eorw(rscratch3, rscratch3, r2);           \
3291     __ addw(rscratch3, rscratch3, rscratch1);    \
3292     __ addw(rscratch3, rscratch3, rscratch4);    \
3293     __ rorw(rscratch2, rscratch3, 32 - s);       \
3294     __ addw(r1, rscratch2, r2);
3295 
3296 #define II(r1, r2, r3, r4, k, s, t)              \
3297     __ movw(rscratch3, t);                       \
3298     __ ornw(rscratch2, r2, r4);                  \
3299     __ addw(rscratch4, r1, rscratch3);           \
3300     __ ldrw(rscratch1, Address(buf, k*4));       \
3301     __ eorw(rscratch3, rscratch2, r3);           \
3302     __ addw(rscratch3, rscratch3, rscratch1);    \
3303     __ addw(rscratch3, rscratch3, rscratch4);    \
3304     __ rorw(rscratch2, rscratch3, 32 - s);       \
3305     __ addw(r1, rscratch2, r2);
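         // The four macros above implement the RFC 1321 round operations, with
         // r1..r4 playing the roles of a..d and rotl32 a 32-bit left rotation:
         //
         //   FF: a = b + rotl32(a + ((b & c) | (~b & d)) + x[k] + t, s)
         //   GG: a = b + rotl32(a + ((b & d) | (c & ~d)) + x[k] + t, s)
         //   HH: a = b + rotl32(a + (b ^ c ^ d)          + x[k] + t, s)
         //   II: a = b + rotl32(a + (c ^ (b | ~d))       + x[k] + t, s)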
3306 
3307     // Round 1
3308     FF(a, b, c, d,  0,  7, 0xd76aa478)
3309     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3310     FF(c, d, a, b,  2, 17, 0x242070db)
3311     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3312     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3313     FF(d, a, b, c,  5, 12, 0x4787c62a)
3314     FF(c, d, a, b,  6, 17, 0xa8304613)
3315     FF(b, c, d, a,  7, 22, 0xfd469501)
3316     FF(a, b, c, d,  8,  7, 0x698098d8)
3317     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3318     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3319     FF(b, c, d, a, 11, 22, 0x895cd7be)
3320     FF(a, b, c, d, 12,  7, 0x6b901122)
3321     FF(d, a, b, c, 13, 12, 0xfd987193)
3322     FF(c, d, a, b, 14, 17, 0xa679438e)
3323     FF(b, c, d, a, 15, 22, 0x49b40821)
3324 
3325     // Round 2
3326     GG(a, b, c, d,  1,  5, 0xf61e2562)
3327     GG(d, a, b, c,  6,  9, 0xc040b340)
3328     GG(c, d, a, b, 11, 14, 0x265e5a51)
3329     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3330     GG(a, b, c, d,  5,  5, 0xd62f105d)
3331     GG(d, a, b, c, 10,  9, 0x02441453)
3332     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3333     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3334     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3335     GG(d, a, b, c, 14,  9, 0xc33707d6)
3336     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3337     GG(b, c, d, a,  8, 20, 0x455a14ed)
3338     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3339     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3340     GG(c, d, a, b,  7, 14, 0x676f02d9)
3341     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3342 
3343     // Round 3
3344     HH(a, b, c, d,  5,  4, 0xfffa3942)
3345     HH(d, a, b, c,  8, 11, 0x8771f681)
3346     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3347     HH(b, c, d, a, 14, 23, 0xfde5380c)
3348     HH(a, b, c, d,  1,  4, 0xa4beea44)
3349     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3350     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3351     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3352     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3353     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3354     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3355     HH(b, c, d, a,  6, 23, 0x04881d05)
3356     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3357     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3358     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3359     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3360 
3361     // Round 4
3362     II(a, b, c, d,  0,  6, 0xf4292244)
3363     II(d, a, b, c,  7, 10, 0x432aff97)
3364     II(c, d, a, b, 14, 15, 0xab9423a7)
3365     II(b, c, d, a,  5, 21, 0xfc93a039)
3366     II(a, b, c, d, 12,  6, 0x655b59c3)
3367     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3368     II(c, d, a, b, 10, 15, 0xffeff47d)
3369     II(b, c, d, a,  1, 21, 0x85845dd1)
3370     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3371     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3372     II(c, d, a, b,  6, 15, 0xa3014314)
3373     II(b, c, d, a, 13, 21, 0x4e0811a1)
3374     II(a, b, c, d,  4,  6, 0xf7537e82)
3375     II(d, a, b, c, 11, 10, 0xbd3af235)
3376     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3377     II(b, c, d, a,  9, 21, 0xeb86d391)
3378 
3379 #undef FF
3380 #undef GG
3381 #undef HH
3382 #undef II
3383 
3384     // write hash values back in the correct order
3385     __ ldrw(rscratch1, Address(state,  0));
3386     __ addw(rscratch1, rscratch1, a);
3387     __ strw(rscratch1, Address(state,  0));
3388 
3389     __ ldrw(rscratch2, Address(state,  4));
3390     __ addw(rscratch2, rscratch2, b);
3391     __ strw(rscratch2, Address(state,  4));
3392 
3393     __ ldrw(rscratch3, Address(state,  8));
3394     __ addw(rscratch3, rscratch3, c);
3395     __ strw(rscratch3, Address(state,  8));
3396 
3397     __ ldrw(rscratch4, Address(state, 12));
3398     __ addw(rscratch4, rscratch4, d);
3399     __ strw(rscratch4, Address(state, 12));
3400 
3401     if (multi_block) {
3402       __ add(buf, buf, 64);
3403       __ add(ofs, ofs, 64);
3404       __ cmp(ofs, limit);
3405       __ br(Assembler::LE, md5_loop);
3406       __ mov(c_rarg0, ofs); // return ofs
3407     }
3408 
3409     __ ret(lr);
3410 
3411     return start;
3412   }
3413 
3414   // Arguments:
3415   //
3416   // Inputs:
3417   //   c_rarg0   - byte[]  source+offset
3418   //   c_rarg1   - int[]   SHA.state
3419   //   c_rarg2   - int     offset
3420   //   c_rarg3   - int     limit
3421   //
3422   address generate_sha1_implCompress(bool multi_block, const char *name) {
3423     __ align(CodeEntryAlignment);
3424     StubCodeMark mark(this, "StubRoutines", name);
3425     address start = __ pc();
3426 
3427     Register buf   = c_rarg0;
3428     Register state = c_rarg1;
3429     Register ofs   = c_rarg2;
3430     Register limit = c_rarg3;
3431 
3432     Label keys;
3433     Label sha1_loop;
3434 
3435     // load the keys into v0..v3
3436     __ adr(rscratch1, keys);
3437     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3438     // load the 5-word state into v6, v7
3439     __ ldrq(v6, Address(state, 0));
3440     __ ldrs(v7, Address(state, 16));
3441 
3442 
3443     __ BIND(sha1_loop);
3444     // load 64 bytes of data into v16..v19
3445     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3446     __ rev32(v16, __ T16B, v16);
3447     __ rev32(v17, __ T16B, v17);
3448     __ rev32(v18, __ T16B, v18);
3449     __ rev32(v19, __ T16B, v19);
3450 
3451     // do the sha1
3452     __ addv(v4, __ T4S, v16, v0);
3453     __ orr(v20, __ T16B, v6, v6);
3454 
3455     FloatRegister d0 = v16;
3456     FloatRegister d1 = v17;
3457     FloatRegister d2 = v18;
3458     FloatRegister d3 = v19;
3459 
3460     for (int round = 0; round < 20; round++) {
3461       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3462       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3463       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3464       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3465       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3466 
3467       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3468       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3469       __ sha1h(tmp2, __ T4S, v20);
3470       if (round < 5)
3471         __ sha1c(v20, __ T4S, tmp3, tmp4);
3472       else if (round < 10 || round >= 15)
3473         __ sha1p(v20, __ T4S, tmp3, tmp4);
3474       else
3475         __ sha1m(v20, __ T4S, tmp3, tmp4);
3476       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3477 
3478       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3479     }
3480 
3481     __ addv(v7, __ T2S, v7, v21);
3482     __ addv(v6, __ T4S, v6, v20);
3483 
3484     if (multi_block) {
3485       __ add(ofs, ofs, 64);
3486       __ cmp(ofs, limit);
3487       __ br(Assembler::LE, sha1_loop);
3488       __ mov(c_rarg0, ofs); // return ofs
3489     }
3490 
3491     __ strq(v6, Address(state, 0));
3492     __ strs(v7, Address(state, 16));
3493 
3494     __ ret(lr);
3495 
3496     __ bind(keys);
3497     __ emit_int32(0x5a827999);
3498     __ emit_int32(0x6ed9eba1);
3499     __ emit_int32(0x8f1bbcdc);
3500     __ emit_int32(0xca62c1d6);
3501 
3502     return start;
3503   }
3504 
3505 
3506   // Arguments:
3507   //
3508   // Inputs:
3509   //   c_rarg0   - byte[]  source+offset
3510   //   c_rarg1   - int[]   SHA.state
3511   //   c_rarg2   - int     offset
3512   //   c_rarg3   - int     limit
3513   //
3514   address generate_sha256_implCompress(bool multi_block, const char *name) {
3515     static const uint32_t round_consts[64] = {
3516       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3517       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3518       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3519       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3520       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3521       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3522       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3523       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3524       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3525       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3526       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3527       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3528       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3529       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3530       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3531       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3532     };
3533     __ align(CodeEntryAlignment);
3534     StubCodeMark mark(this, "StubRoutines", name);
3535     address start = __ pc();
3536 
3537     Register buf   = c_rarg0;
3538     Register state = c_rarg1;
3539     Register ofs   = c_rarg2;
3540     Register limit = c_rarg3;
3541 
3542     Label sha256_loop;
3543 
3544     __ stpd(v8, v9, __ pre(sp, -32));
3545     __ stpd(v10, v11, Address(sp, 16));
3546 
3547 // dga == v0
3548 // dgb == v1
3549 // dg0 == v2
3550 // dg1 == v3
3551 // dg2 == v4
3552 // t0 == v6
3553 // t1 == v7
3554 
3555     // load the 64 round constants (16 vectors of four) into v16..v31
3556     __ lea(rscratch1, ExternalAddress((address)round_consts));
3557     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3558     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3559     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3560     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3561 
3562     // load 8 words (256 bits) state
3563     __ ldpq(v0, v1, state);
3564 
3565     __ BIND(sha256_loop);
3566     // load 64 bytes of data into v8..v11
3567     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3568     __ rev32(v8, __ T16B, v8);
3569     __ rev32(v9, __ T16B, v9);
3570     __ rev32(v10, __ T16B, v10);
3571     __ rev32(v11, __ T16B, v11);
3572 
3573     __ addv(v6, __ T4S, v8, v16);
3574     __ orr(v2, __ T16B, v0, v0);
3575     __ orr(v3, __ T16B, v1, v1);
3576 
3577     FloatRegister d0 = v8;
3578     FloatRegister d1 = v9;
3579     FloatRegister d2 = v10;
3580     FloatRegister d3 = v11;
3581 
3582 
3583     for (int round = 0; round < 16; round++) {
3584       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3585       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3586       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3587       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3588 
3589       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3590        __ orr(v4, __ T16B, v2, v2);
3591       if (round < 15)
3592         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3593       __ sha256h(v2, __ T4S, v3, tmp2);
3594       __ sha256h2(v3, __ T4S, v4, tmp2);
3595       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3596 
3597       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3598     }
3599 
3600     __ addv(v0, __ T4S, v0, v2);
3601     __ addv(v1, __ T4S, v1, v3);
3602 
3603     if (multi_block) {
3604       __ add(ofs, ofs, 64);
3605       __ cmp(ofs, limit);
3606       __ br(Assembler::LE, sha256_loop);
3607       __ mov(c_rarg0, ofs); // return ofs
3608     }
3609 
3610     __ ldpd(v10, v11, Address(sp, 16));
3611     __ ldpd(v8, v9, __ post(sp, 32));
3612 
3613     __ stpq(v0, v1, state);
3614 
3615     __ ret(lr);
3616 
3617     return start;
3618   }
3619 
3620   // Arguments:
3621   //
3622   // Inputs:
3623   //   c_rarg0   - byte[]  source+offset
3624   //   c_rarg1   - int[]   SHA.state
3625   //   c_rarg2   - int     offset
3626   //   c_rarg3   - int     limit
3627   //
3628   address generate_sha512_implCompress(bool multi_block, const char *name) {
3629     static const uint64_t round_consts[80] = {
3630       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3631       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3632       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3633       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3634       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3635       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3636       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3637       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3638       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3639       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3640       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3641       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3642       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3643       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3644       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3645       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3646       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3647       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3648       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3649       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3650       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3651       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3652       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3653       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3654       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3655       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3656       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3657     };
3658 
3659     // Double rounds for sha512.
3660     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3661       if (dr < 36)                                                                   \
3662         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3663       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3664       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3665       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3666       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3667       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3668       if (dr < 32) {                                                                 \
3669         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3670         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3671       }                                                                              \
3672       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3673       if (dr < 32)                                                                   \
3674         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3675       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3676       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3677 
3678     __ align(CodeEntryAlignment);
3679     StubCodeMark mark(this, "StubRoutines", name);
3680     address start = __ pc();
3681 
3682     Register buf   = c_rarg0;
3683     Register state = c_rarg1;
3684     Register ofs   = c_rarg2;
3685     Register limit = c_rarg3;
3686 
3687     __ stpd(v8, v9, __ pre(sp, -64));
3688     __ stpd(v10, v11, Address(sp, 16));
3689     __ stpd(v12, v13, Address(sp, 32));
3690     __ stpd(v14, v15, Address(sp, 48));
3691 
3692     Label sha512_loop;
3693 
3694     // load state
3695     __ ld1(v8, v9, v10, v11, __ T2D, state);
3696 
3697     // load the first eight round constants into v20..v23 (two per vector)
3698     __ lea(rscratch1, ExternalAddress((address)round_consts));
3699     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3700 
3701     __ BIND(sha512_loop);
3702     // load 128B of data into v12..v19
3703     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3704     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3705     __ rev64(v12, __ T16B, v12);
3706     __ rev64(v13, __ T16B, v13);
3707     __ rev64(v14, __ T16B, v14);
3708     __ rev64(v15, __ T16B, v15);
3709     __ rev64(v16, __ T16B, v16);
3710     __ rev64(v17, __ T16B, v17);
3711     __ rev64(v18, __ T16B, v18);
3712     __ rev64(v19, __ T16B, v19);
3713 
3714     __ mov(rscratch2, rscratch1);
3715 
3716     __ mov(v0, __ T16B, v8);
3717     __ mov(v1, __ T16B, v9);
3718     __ mov(v2, __ T16B, v10);
3719     __ mov(v3, __ T16B, v11);
3720 
3721     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3722     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3723     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3724     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3725     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3726     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3727     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3728     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3729     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3730     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3731     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3732     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3733     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3734     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3735     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3736     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3737     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3738     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3739     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3740     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3741     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3742     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3743     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3744     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3745     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3746     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3747     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3748     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3749     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3750     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3751     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3752     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3753     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3754     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3755     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3756     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3757     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3758     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3759     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3760     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3761 
3762     __ addv(v8, __ T2D, v8, v0);
3763     __ addv(v9, __ T2D, v9, v1);
3764     __ addv(v10, __ T2D, v10, v2);
3765     __ addv(v11, __ T2D, v11, v3);
3766 
3767     if (multi_block) {
3768       __ add(ofs, ofs, 128);
3769       __ cmp(ofs, limit);
3770       __ br(Assembler::LE, sha512_loop);
3771       __ mov(c_rarg0, ofs); // return ofs
3772     }
3773 
3774     __ st1(v8, v9, v10, v11, __ T2D, state);
3775 
3776     __ ldpd(v14, v15, Address(sp, 48));
3777     __ ldpd(v12, v13, Address(sp, 32));
3778     __ ldpd(v10, v11, Address(sp, 16));
3779     __ ldpd(v8, v9, __ post(sp, 64));
3780 
3781     __ ret(lr);
3782 
3783     return start;
3784   }
3785 
3786   // Arguments:
3787   //
3788   // Inputs:
3789   //   c_rarg0   - byte[]  source+offset
3790   //   c_rarg1   - byte[]   SHA.state
3791   //   c_rarg2   - int     digest_length
3792   //   c_rarg3   - int     offset
3793   //   c_rarg4   - int     limit
3794   //
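       // The sponge rate is block_size = 200 - 2 * digest_length bytes, so
       // block_size / 8 of the 25 state lanes absorb input each block:
       //   SHA3-224: 144 bytes (18 lanes)   SHA3-256: 136 bytes (17 lanes)
       //   SHA3-384: 104 bytes (13 lanes)   SHA3-512:  72 bytes ( 9 lanes)
       // which is what the digest_length-based dispatch below implements.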
3795   address generate_sha3_implCompress(bool multi_block, const char *name) {
3796     static const uint64_t round_consts[24] = {
3797       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3798       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3799       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3800       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3801       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3802       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3803       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3804       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3805     };
3806 
3807     __ align(CodeEntryAlignment);
3808     StubCodeMark mark(this, "StubRoutines", name);
3809     address start = __ pc();
3810 
3811     Register buf           = c_rarg0;
3812     Register state         = c_rarg1;
3813     Register digest_length = c_rarg2;
3814     Register ofs           = c_rarg3;
3815     Register limit         = c_rarg4;
3816 
3817     Label sha3_loop, rounds24_loop;
3818     Label sha3_512, sha3_384_or_224, sha3_256;
3819 
3820     __ stpd(v8, v9, __ pre(sp, -64));
3821     __ stpd(v10, v11, Address(sp, 16));
3822     __ stpd(v12, v13, Address(sp, 32));
3823     __ stpd(v14, v15, Address(sp, 48));
3824 
3825     // load state
3826     __ add(rscratch1, state, 32);
3827     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3828     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3829     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3830     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3831     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3832     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3833     __ ld1(v24, __ T1D, rscratch1);
3834 
3835     __ BIND(sha3_loop);
3836 
3837     // 24 keccak rounds
3838     __ movw(rscratch2, 24);
3839 
3840     // load round_constants base
3841     __ lea(rscratch1, ExternalAddress((address) round_consts));
3842 
3843     // load input
3844     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3845     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3846     __ eor(v0, __ T8B, v0, v25);
3847     __ eor(v1, __ T8B, v1, v26);
3848     __ eor(v2, __ T8B, v2, v27);
3849     __ eor(v3, __ T8B, v3, v28);
3850     __ eor(v4, __ T8B, v4, v29);
3851     __ eor(v5, __ T8B, v5, v30);
3852     __ eor(v6, __ T8B, v6, v31);
3853 
3854     // digest_length == 64, SHA3-512
3855     __ tbnz(digest_length, 6, sha3_512);
3856 
3857     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3858     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3859     __ eor(v7, __ T8B, v7, v25);
3860     __ eor(v8, __ T8B, v8, v26);
3861     __ eor(v9, __ T8B, v9, v27);
3862     __ eor(v10, __ T8B, v10, v28);
3863     __ eor(v11, __ T8B, v11, v29);
3864     __ eor(v12, __ T8B, v12, v30);
3865 
3866     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3867     __ tbnz(digest_length, 4, sha3_384_or_224);
3868 
3869     // SHA3-256
3870     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3871     __ eor(v13, __ T8B, v13, v25);
3872     __ eor(v14, __ T8B, v14, v26);
3873     __ eor(v15, __ T8B, v15, v27);
3874     __ eor(v16, __ T8B, v16, v28);
3875     __ b(rounds24_loop);
3876 
3877     __ BIND(sha3_384_or_224);
3878     __ tbz(digest_length, 2, rounds24_loop); // bit 2 clear => 48 (SHA3-384), set => 28 (SHA3-224)
3879 
3880     // SHA3-224
3881     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3882     __ ld1(v29, __ T8B, __ post(buf, 8));
3883     __ eor(v13, __ T8B, v13, v25);
3884     __ eor(v14, __ T8B, v14, v26);
3885     __ eor(v15, __ T8B, v15, v27);
3886     __ eor(v16, __ T8B, v16, v28);
3887     __ eor(v17, __ T8B, v17, v29);
3888     __ b(rounds24_loop);
3889 
3890     __ BIND(sha3_512);
3891     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3892     __ eor(v7, __ T8B, v7, v25);
3893     __ eor(v8, __ T8B, v8, v26);
3894 
3895     __ BIND(rounds24_loop);
3896     __ subw(rscratch2, rscratch2, 1);
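         // One Keccak-f[1600] round follows, written with the SHA-3 EOR3/RAX1/
         // XAR/BCAX instructions: the eor3/rax1 group computes the theta column
         // parities and their rotate-xor combination, the xar group applies the
         // theta xor together with the rho rotations (pi is realized by the
         // choice of destination registers), the bcax group is chi, and the
         // final eor with the loaded round constant is iota.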
3897 
3898     __ eor3(v29, __ T16B, v4, v9, v14);
3899     __ eor3(v26, __ T16B, v1, v6, v11);
3900     __ eor3(v28, __ T16B, v3, v8, v13);
3901     __ eor3(v25, __ T16B, v0, v5, v10);
3902     __ eor3(v27, __ T16B, v2, v7, v12);
3903     __ eor3(v29, __ T16B, v29, v19, v24);
3904     __ eor3(v26, __ T16B, v26, v16, v21);
3905     __ eor3(v28, __ T16B, v28, v18, v23);
3906     __ eor3(v25, __ T16B, v25, v15, v20);
3907     __ eor3(v27, __ T16B, v27, v17, v22);
3908 
3909     __ rax1(v30, __ T2D, v29, v26);
3910     __ rax1(v26, __ T2D, v26, v28);
3911     __ rax1(v28, __ T2D, v28, v25);
3912     __ rax1(v25, __ T2D, v25, v27);
3913     __ rax1(v27, __ T2D, v27, v29);
3914 
3915     __ eor(v0, __ T16B, v0, v30);
3916     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3917     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3918     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3919     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3920     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3921     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3922     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3923     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3924     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3925     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3926     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3927     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3928     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3929     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3930     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3931     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3932     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3933     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3934     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3935     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3936     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3937     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3938     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3939     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3940 
3941     __ bcax(v20, __ T16B, v31, v22, v8);
3942     __ bcax(v21, __ T16B, v8,  v23, v22);
3943     __ bcax(v22, __ T16B, v22, v24, v23);
3944     __ bcax(v23, __ T16B, v23, v31, v24);
3945     __ bcax(v24, __ T16B, v24, v8,  v31);
3946 
3947     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3948 
3949     __ bcax(v17, __ T16B, v25, v19, v3);
3950     __ bcax(v18, __ T16B, v3,  v15, v19);
3951     __ bcax(v19, __ T16B, v19, v16, v15);
3952     __ bcax(v15, __ T16B, v15, v25, v16);
3953     __ bcax(v16, __ T16B, v16, v3,  v25);
3954 
3955     __ bcax(v10, __ T16B, v29, v12, v26);
3956     __ bcax(v11, __ T16B, v26, v13, v12);
3957     __ bcax(v12, __ T16B, v12, v14, v13);
3958     __ bcax(v13, __ T16B, v13, v29, v14);
3959     __ bcax(v14, __ T16B, v14, v26, v29);
3960 
3961     __ bcax(v7, __ T16B, v30, v9,  v4);
3962     __ bcax(v8, __ T16B, v4,  v5,  v9);
3963     __ bcax(v9, __ T16B, v9,  v6,  v5);
3964     __ bcax(v5, __ T16B, v5,  v30, v6);
3965     __ bcax(v6, __ T16B, v6,  v4,  v30);
3966 
3967     __ bcax(v3, __ T16B, v27, v0,  v28);
3968     __ bcax(v4, __ T16B, v28, v1,  v0);
3969     __ bcax(v0, __ T16B, v0,  v2,  v1);
3970     __ bcax(v1, __ T16B, v1,  v27, v2);
3971     __ bcax(v2, __ T16B, v2,  v28, v27);
3972 
3973     __ eor(v0, __ T16B, v0, v31);
3974 
3975     __ cbnzw(rscratch2, rounds24_loop);
3976 
3977     if (multi_block) {
3978       // block_size =  200 - 2 * digest_length, ofs += block_size
3979       __ add(ofs, ofs, 200);
3980       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3981 
3982       __ cmp(ofs, limit);
3983       __ br(Assembler::LE, sha3_loop);
3984       __ mov(c_rarg0, ofs); // return ofs
3985     }
3986 
3987     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3988     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3989     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3990     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3991     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3992     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3993     __ st1(v24, __ T1D, state);
3994 
3995     __ ldpd(v14, v15, Address(sp, 48));
3996     __ ldpd(v12, v13, Address(sp, 32));
3997     __ ldpd(v10, v11, Address(sp, 16));
3998     __ ldpd(v8, v9, __ post(sp, 64));
3999 
4000     __ ret(lr);
4001 
4002     return start;
4003   }
4004 
4005   // Safefetch stubs.
4006   void generate_safefetch(const char* name, int size, address* entry,
4007                           address* fault_pc, address* continuation_pc) {
4008     // safefetch signatures:
4009     //   int      SafeFetch32(int*      adr, int      errValue);
4010     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
4011     //
4012     // arguments:
4013     //   c_rarg0 = adr
4014     //   c_rarg1 = errValue
4015     //
4016     // result:
4017     //   r0 = *adr or errValue
4018 
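         // Typical (illustrative) use is probing memory that may be unmapped
         // without crashing the VM: if the load at *fault_pc faults, the signal
         // handler resumes execution at *continuation_pc and the stub returns
         // errValue instead of the loaded value, e.g.
         //
         //   int v = SafeFetch32((int*)addr, 0xBADBAD); // 0xBADBAD if addr faults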
4019     StubCodeMark mark(this, "StubRoutines", name);
4020 
4021     // Entry point, pc or function descriptor.
4022     *entry = __ pc();
4023 
4024     // Load *adr into c_rarg1, may fault.
4025     *fault_pc = __ pc();
4026     switch (size) {
4027       case 4:
4028         // int32_t
4029         __ ldrw(c_rarg1, Address(c_rarg0, 0));
4030         break;
4031       case 8:
4032         // int64_t
4033         __ ldr(c_rarg1, Address(c_rarg0, 0));
4034         break;
4035       default:
4036         ShouldNotReachHere();
4037     }
4038 
4039     // return errValue or *adr
4040     *continuation_pc = __ pc();
4041     __ mov(r0, c_rarg1);
4042     __ ret(lr);
4043   }
4044 
4045   /**
4046    *  Arguments:
4047    *
4048    * Inputs:
4049    *   c_rarg0   - int crc
4050    *   c_rarg1   - byte* buf
4051    *   c_rarg2   - int length
4052    *
4053    * Output:
4054    *       r0    - int crc result
4055    */
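       // A minimal byte-at-a-time sketch of the CRC-32 this stub computes
       // (reflected polynomial 0xEDB88320; the pre/post inversion is handled
       // inside kernel_crc32, which itself uses wider table lookups and/or the
       // hardware CRC32 instructions). crc is treated as uint32_t:
       //
       //   crc = ~crc;
       //   for (int i = 0; i < length; i++)
       //     crc = (crc >> 8) ^ table[(crc ^ buf[i]) & 0xff];
       //   return ~crc;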
4056   address generate_updateBytesCRC32() {
4057     assert(UseCRC32Intrinsics, "what are we doing here?");
4058 
4059     __ align(CodeEntryAlignment);
4060     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4061 
4062     address start = __ pc();
4063 
4064     const Register crc   = c_rarg0;  // crc
4065     const Register buf   = c_rarg1;  // source java byte array address
4066     const Register len   = c_rarg2;  // length
4067     const Register table0 = c_rarg3; // crc_table address
4068     const Register table1 = c_rarg4;
4069     const Register table2 = c_rarg5;
4070     const Register table3 = c_rarg6;
4071     const Register tmp3 = c_rarg7;
4072 
4073     BLOCK_COMMENT("Entry:");
4074     __ enter(); // required for proper stackwalking of RuntimeStub frame
4075 
4076     __ kernel_crc32(crc, buf, len,
4077               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4078 
4079     __ leave(); // required for proper stackwalking of RuntimeStub frame
4080     __ ret(lr);
4081 
4082     return start;
4083   }
4084 
4085   /**
4086    *  Arguments:
4087    *
4088    * Inputs:
4089    *   c_rarg0   - int crc
4090    *   c_rarg1   - byte* buf
4091    *   c_rarg2   - int length
4092    *   c_rarg3   - int* table
4093    *
4094    * Output:
4095    *       r0   - int crc result
4096    */
4097   address generate_updateBytesCRC32C() {
4098     assert(UseCRC32CIntrinsics, "what are we doing here?");
4099 
4100     __ align(CodeEntryAlignment);
4101     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4102 
4103     address start = __ pc();
4104 
4105     const Register crc   = c_rarg0;  // crc
4106     const Register buf   = c_rarg1;  // source java byte array address
4107     const Register len   = c_rarg2;  // length
4108     const Register table0 = c_rarg3; // crc_table address
4109     const Register table1 = c_rarg4;
4110     const Register table2 = c_rarg5;
4111     const Register table3 = c_rarg6;
4112     const Register tmp3 = c_rarg7;
4113 
4114     BLOCK_COMMENT("Entry:");
4115     __ enter(); // required for proper stackwalking of RuntimeStub frame
4116 
4117     __ kernel_crc32c(crc, buf, len,
4118               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4119 
4120     __ leave(); // required for proper stackwalking of RuntimeStub frame
4121     __ ret(lr);
4122 
4123     return start;
4124   }
4125 
4126   /***
4127    *  Arguments:
4128    *
4129    *  Inputs:
4130    *   c_rarg0   - int   adler
4131    *   c_rarg1   - byte* buff
4132    *   c_rarg2   - int   len
4133    *
4134    * Output:
4135    *   c_rarg0   - int adler result
4136    */
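       // A scalar reference of what this stub computes: Adler-32 continued from
       // the incoming adler value, with BASE == 65521. The vectorized code below
       // keeps s1/s2 unreduced for up to NMAX bytes before taking the modulus:
       //
       //   uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
       //   for (int i = 0; i < len; i++) {
       //     s1 = (s1 + buff[i]) % 65521;
       //     s2 = (s2 + s1)      % 65521;
       //   }
       //   return (s2 << 16) | s1;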
4137   address generate_updateBytesAdler32() {
4138     __ align(CodeEntryAlignment);
4139     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4140     address start = __ pc();
4141 
4142     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4143 
4144     // Aliases
4145     Register adler  = c_rarg0;
4146     Register s1     = c_rarg0;
4147     Register s2     = c_rarg3;
4148     Register buff   = c_rarg1;
4149     Register len    = c_rarg2;
4150     Register nmax  = r4;
4151     Register base  = r5;
4152     Register count = r6;
4153     Register temp0 = rscratch1;
4154     Register temp1 = rscratch2;
4155     FloatRegister vbytes = v0;
4156     FloatRegister vs1acc = v1;
4157     FloatRegister vs2acc = v2;
4158     FloatRegister vtable = v3;
4159 
4160     // Max number of bytes we can process before having to take the mod
4161     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4162     uint64_t BASE = 0xfff1;
4163     uint64_t NMAX = 0x15B0;
4164 
4165     __ mov(base, BASE);
4166     __ mov(nmax, NMAX);
4167 
4168     // Load accumulation coefficients for the upper 16 bits
4169     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4170     __ ld1(vtable, __ T16B, Address(temp0));
4171 
4172     // s1 is initialized to the lower 16 bits of adler
4173     // s2 is initialized to the upper 16 bits of adler
4174     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4175     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4176 
4177     // The pipelined loop needs at least 16 elements per iteration; it checks
4178     // this itself, but for short inputs it is cheaper to go straight to the by-1 cleanup loop.
4179     __ cmp(len, (u1)16);
4180     __ br(Assembler::HS, L_nmax);
4181     __ cbz(len, L_combine);
4182 
4183     __ bind(L_simple_by1_loop);
4184     __ ldrb(temp0, Address(__ post(buff, 1)));
4185     __ add(s1, s1, temp0);
4186     __ add(s2, s2, s1);
4187     __ subs(len, len, 1);
4188     __ br(Assembler::HI, L_simple_by1_loop);
4189 
4190     // s1 = s1 % BASE
4191     __ subs(temp0, s1, base);
4192     __ csel(s1, temp0, s1, Assembler::HS);
4193 
4194     // s2 = s2 % BASE
4195     __ lsr(temp0, s2, 16);
4196     __ lsl(temp1, temp0, 4);
4197     __ sub(temp1, temp1, temp0);
4198     __ add(s2, temp1, s2, ext::uxth);
4199 
4200     __ subs(temp0, s2, base);
4201     __ csel(s2, temp0, s2, Assembler::HS);
4202 
4203     __ b(L_combine);
4204 
4205     __ bind(L_nmax);
4206     __ subs(len, len, nmax);
4207     __ sub(count, nmax, 16);
4208     __ br(Assembler::LO, L_by16);
4209 
4210     __ bind(L_nmax_loop);
4211 
4212     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4213                                       vbytes, vs1acc, vs2acc, vtable);
4214 
4215     __ subs(count, count, 16);
4216     __ br(Assembler::HS, L_nmax_loop);
4217 
4218     // s1 = s1 % BASE
4219     __ lsr(temp0, s1, 16);
4220     __ lsl(temp1, temp0, 4);
4221     __ sub(temp1, temp1, temp0);
4222     __ add(temp1, temp1, s1, ext::uxth);
4223 
4224     __ lsr(temp0, temp1, 16);
4225     __ lsl(s1, temp0, 4);
4226     __ sub(s1, s1, temp0);
4227     __ add(s1, s1, temp1, ext::uxth);
4228 
4229     __ subs(temp0, s1, base);
4230     __ csel(s1, temp0, s1, Assembler::HS);
4231 
4232     // s2 = s2 % BASE
4233     __ lsr(temp0, s2, 16);
4234     __ lsl(temp1, temp0, 4);
4235     __ sub(temp1, temp1, temp0);
4236     __ add(temp1, temp1, s2, ext::uxth);
4237 
4238     __ lsr(temp0, temp1, 16);
4239     __ lsl(s2, temp0, 4);
4240     __ sub(s2, s2, temp0);
4241     __ add(s2, s2, temp1, ext::uxth);
4242 
4243     __ subs(temp0, s2, base);
4244     __ csel(s2, temp0, s2, Assembler::HS);
4245 
4246     __ subs(len, len, nmax);
4247     __ sub(count, nmax, 16);
4248     __ br(Assembler::HS, L_nmax_loop);
4249 
4250     __ bind(L_by16);
4251     __ adds(len, len, count);
4252     __ br(Assembler::LO, L_by1);
4253 
4254     __ bind(L_by16_loop);
4255 
4256     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4257                                       vbytes, vs1acc, vs2acc, vtable);
4258 
4259     __ subs(len, len, 16);
4260     __ br(Assembler::HS, L_by16_loop);
4261 
4262     __ bind(L_by1);
4263     __ adds(len, len, 15);
4264     __ br(Assembler::LO, L_do_mod);
4265 
4266     __ bind(L_by1_loop);
4267     __ ldrb(temp0, Address(__ post(buff, 1)));
4268     __ add(s1, temp0, s1);
4269     __ add(s2, s2, s1);
4270     __ subs(len, len, 1);
4271     __ br(Assembler::HS, L_by1_loop);
4272 
4273     __ bind(L_do_mod);
4274     // s1 = s1 % BASE
4275     __ lsr(temp0, s1, 16);
4276     __ lsl(temp1, temp0, 4);
4277     __ sub(temp1, temp1, temp0);
4278     __ add(temp1, temp1, s1, ext::uxth);
4279 
4280     __ lsr(temp0, temp1, 16);
4281     __ lsl(s1, temp0, 4);
4282     __ sub(s1, s1, temp0);
4283     __ add(s1, s1, temp1, ext::uxth);
4284 
4285     __ subs(temp0, s1, base);
4286     __ csel(s1, temp0, s1, Assembler::HS);
4287 
4288     // s2 = s2 % BASE
4289     __ lsr(temp0, s2, 16);
4290     __ lsl(temp1, temp0, 4);
4291     __ sub(temp1, temp1, temp0);
4292     __ add(temp1, temp1, s2, ext::uxth);
4293 
4294     __ lsr(temp0, temp1, 16);
4295     __ lsl(s2, temp0, 4);
4296     __ sub(s2, s2, temp0);
4297     __ add(s2, s2, temp1, ext::uxth);
4298 
4299     __ subs(temp0, s2, base);
4300     __ csel(s2, temp0, s2, Assembler::HS);
4301 
4302     // Combine lower bits and higher bits
4303     __ bind(L_combine);
4304     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4305 
4306     __ ret(lr);
4307 
4308     return start;
4309   }
4310 
4311   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4312           Register temp0, Register temp1, FloatRegister vbytes,
4313           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4314     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4315     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4316     // In non-vectorized code, we update s1 and s2 as:
4317     //   s1 <- s1 + b1
4318     //   s2 <- s2 + s1
4319     //   s1 <- s1 + b2
4320     //   s2 <- s2 + s1
4321     //   ...
4322     //   s1 <- s1 + b16
4323     //   s2 <- s2 + s1
4324     // Putting above assignments together, we have:
4325     //   s1_new = s1 + b1 + b2 + ... + b16
4326     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4327     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4328     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
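         // For reference only (not generated code), the equivalent scalar C for
         // one 16-byte block would be roughly:
         //
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) {
         //     s1 += b[i];
         //     s2 += (16 - i) * b[i];
         //   }
         //
         // The vector code below computes both sums in one pass against the
         // constant table vtable (expected to hold 16, 15, ..., 1).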
4329     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4330 
4331     // s2 = s2 + s1 * 16
4332     __ add(s2, s2, s1, Assembler::LSL, 4);
4333 
4334     // vs1acc = b1 + b2 + b3 + ... + b16
4335     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4336     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4337     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4338     __ uaddlv(vs1acc, __ T16B, vbytes);
4339     __ uaddlv(vs2acc, __ T8H, vs2acc);
4340 
4341     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4342     __ fmovd(temp0, vs1acc);
4343     __ fmovd(temp1, vs2acc);
4344     __ add(s1, s1, temp0);
4345     __ add(s2, s2, temp1);
4346   }
4347 
4348   /**
4349    *  Arguments:
4350    *
4351    *  Input:
4352    *    c_rarg0   - x address
4353    *    c_rarg1   - x length
4354    *    c_rarg2   - y address
4355    *    c_rarg3   - y length
4356    *    c_rarg4   - z address
4357    *    c_rarg5   - z length
4358    */
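       // The actual work is done by MacroAssembler::multiply_to_len; this stub only
       // sets up the frame. For reference (not generated code), the semantics are the
       // usual schoolbook multiplication over 32-bit limbs stored most significant
       // first, roughly (assuming z is zero-initialized; the real code handles the
       // first pass specially):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
       //       z[k] = (uint32_t)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (uint32_t)carry;
       //   }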
4359   address generate_multiplyToLen() {
4360     __ align(CodeEntryAlignment);
4361     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4362 
4363     address start = __ pc();
4364     const Register x     = r0;
4365     const Register xlen  = r1;
4366     const Register y     = r2;
4367     const Register ylen  = r3;
4368     const Register z     = r4;
4369     const Register zlen  = r5;
4370 
4371     const Register tmp1  = r10;
4372     const Register tmp2  = r11;
4373     const Register tmp3  = r12;
4374     const Register tmp4  = r13;
4375     const Register tmp5  = r14;
4376     const Register tmp6  = r15;
4377     const Register tmp7  = r16;
4378 
4379     BLOCK_COMMENT("Entry:");
4380     __ enter(); // required for proper stackwalking of RuntimeStub frame
4381     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4382     __ leave(); // required for proper stackwalking of RuntimeStub frame
4383     __ ret(lr);
4384 
4385     return start;
4386   }
4387 
4388   address generate_squareToLen() {
4389     // The squareToLen algorithm for sizes 1..127 described in the Java code works
4390     // faster than multiply_to_len on some CPUs and slower on others, but
4391     // multiply_to_len shows slightly better overall results.
4392     __ align(CodeEntryAlignment);
4393     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4394     address start = __ pc();
4395 
4396     const Register x     = r0;
4397     const Register xlen  = r1;
4398     const Register z     = r2;
4399     const Register zlen  = r3;
4400     const Register y     = r4; // == x
4401     const Register ylen  = r5; // == xlen
4402 
4403     const Register tmp1  = r10;
4404     const Register tmp2  = r11;
4405     const Register tmp3  = r12;
4406     const Register tmp4  = r13;
4407     const Register tmp5  = r14;
4408     const Register tmp6  = r15;
4409     const Register tmp7  = r16;
4410 
4411     RegSet spilled_regs = RegSet::of(y, ylen);
4412     BLOCK_COMMENT("Entry:");
4413     __ enter();
4414     __ push(spilled_regs, sp);
4415     __ mov(y, x);
4416     __ mov(ylen, xlen);
4417     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4418     __ pop(spilled_regs, sp);
4419     __ leave();
4420     __ ret(lr);
4421     return start;
4422   }
4423 
4424   address generate_mulAdd() {
4425     __ align(CodeEntryAlignment);
4426     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4427 
4428     address start = __ pc();
4429 
4430     const Register out     = r0;
4431     const Register in      = r1;
4432     const Register offset  = r2;
4433     const Register len     = r3;
4434     const Register k       = r4;
4435 
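         // For reference (not generated code): MacroAssembler::mul_add implements
         // the BigInteger.implMulAdd primitive, i.e. it multiplies in[0..len) by the
         // 32-bit value k, adds the products into out starting at the given offset
         // (working from the least significant limb upwards) and returns the final
         // carry in r0. The exact offset convention is handled inside the macro
         // assembler.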
4436     BLOCK_COMMENT("Entry:");
4437     __ enter();
4438     __ mul_add(out, in, offset, len, k);
4439     __ leave();
4440     __ ret(lr);
4441 
4442     return start;
4443   }
4444 
4445   // Arguments:
4446   //
4447   // Input:
4448   //   c_rarg0   - newArr address
4449   //   c_rarg1   - oldArr address
4450   //   c_rarg2   - newIdx
4451   //   c_rarg3   - shiftCount
4452   //   c_rarg4   - numIter
4453   //
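       // For reference (not generated code), the scalar equivalent of the loops
       // below is roughly the following, assuming unsigned 32-bit words and
       // 0 < shiftCount < 32 (which appears to be guaranteed by the Java caller):
       //
       //   for (int i = numIter - 1; i >= 0; i--) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
       //                        | (oldArr[i]     << (32 - shiftCount));
       //   }
       //
       // The SIMD loop processes 4 words per iteration from the top index down,
       // with 2-word and 1..3-word tails handled after it.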
4454   address generate_bigIntegerRightShift() {
4455     __ align(CodeEntryAlignment);
4456     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4457     address start = __ pc();
4458 
4459     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4460 
4461     Register newArr        = c_rarg0;
4462     Register oldArr        = c_rarg1;
4463     Register newIdx        = c_rarg2;
4464     Register shiftCount    = c_rarg3;
4465     Register numIter       = c_rarg4;
4466     Register idx           = numIter;
4467 
4468     Register newArrCur     = rscratch1;
4469     Register shiftRevCount = rscratch2;
4470     Register oldArrCur     = r13;
4471     Register oldArrNext    = r14;
4472 
4473     FloatRegister oldElem0        = v0;
4474     FloatRegister oldElem1        = v1;
4475     FloatRegister newElem         = v2;
4476     FloatRegister shiftVCount     = v3;
4477     FloatRegister shiftVRevCount  = v4;
4478 
4479     __ cbz(idx, Exit);
4480 
4481     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4482 
4483     // left shift count
4484     __ movw(shiftRevCount, 32);
4485     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4486 
4487     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar tail
4488     __ cmp(numIter, (u1)4);
4489     __ br(Assembler::LT, ShiftThree);
4490 
4491     __ dup(shiftVCount,    __ T4S, shiftCount);
4492     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4493     __ negr(shiftVCount,   __ T4S, shiftVCount);
4494 
4495     __ BIND(ShiftSIMDLoop);
4496 
4497     // Calculate the load addresses
4498     __ sub(idx, idx, 4);
4499     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4500     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4501     __ add(oldArrCur,  oldArrNext, 4);
4502 
4503     // Load 4 words and process
4504     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4505     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4506     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4507     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4508     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4509     __ st1(newElem,   __ T4S,  Address(newArrCur));
4510 
4511     __ cmp(idx, (u1)4);
4512     __ br(Assembler::LT, ShiftTwoLoop);
4513     __ b(ShiftSIMDLoop);
4514 
4515     __ BIND(ShiftTwoLoop);
4516     __ cbz(idx, Exit);
4517     __ cmp(idx, (u1)1);
4518     __ br(Assembler::EQ, ShiftOne);
4519 
4520     // Calculate the load addresses
4521     __ sub(idx, idx, 2);
4522     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4523     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4524     __ add(oldArrCur,  oldArrNext, 4);
4525 
4526     // Load 2 words and process
4527     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4528     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4529     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4530     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4531     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4532     __ st1(newElem,   __ T2S, Address(newArrCur));
4533     __ b(ShiftTwoLoop);
4534 
4535     __ BIND(ShiftThree);
4536     __ tbz(idx, 1, ShiftOne);
4537     __ tbz(idx, 0, ShiftTwo);
4538     __ ldrw(r10,  Address(oldArr, 12));
4539     __ ldrw(r11,  Address(oldArr, 8));
4540     __ lsrvw(r10, r10, shiftCount);
4541     __ lslvw(r11, r11, shiftRevCount);
4542     __ orrw(r12,  r10, r11);
4543     __ strw(r12,  Address(newArr, 8));
4544 
4545     __ BIND(ShiftTwo);
4546     __ ldrw(r10,  Address(oldArr, 8));
4547     __ ldrw(r11,  Address(oldArr, 4));
4548     __ lsrvw(r10, r10, shiftCount);
4549     __ lslvw(r11, r11, shiftRevCount);
4550     __ orrw(r12,  r10, r11);
4551     __ strw(r12,  Address(newArr, 4));
4552 
4553     __ BIND(ShiftOne);
4554     __ ldrw(r10,  Address(oldArr, 4));
4555     __ ldrw(r11,  Address(oldArr));
4556     __ lsrvw(r10, r10, shiftCount);
4557     __ lslvw(r11, r11, shiftRevCount);
4558     __ orrw(r12,  r10, r11);
4559     __ strw(r12,  Address(newArr));
4560 
4561     __ BIND(Exit);
4562     __ ret(lr);
4563 
4564     return start;
4565   }
4566 
4567   // Arguments:
4568   //
4569   // Input:
4570   //   c_rarg0   - newArr address
4571   //   c_rarg1   - oldArr address
4572   //   c_rarg2   - newIdx
4573   //   c_rarg3   - shiftCount
4574   //   c_rarg4   - numIter
4575   //
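       // For reference (not generated code), the scalar equivalent of the loops
       // below is roughly the following, under the same assumptions as for the
       // right-shift worker above:
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i]     << shiftCount)
       //                        | (oldArr[i + 1] >> (32 - shiftCount));
       //   }
       //
       // Note that this reads oldArr[numIter], which the Java caller is expected
       // to keep valid.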
4576   address generate_bigIntegerLeftShift() {
4577     __ align(CodeEntryAlignment);
4578     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4579     address start = __ pc();
4580 
4581     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4582 
4583     Register newArr        = c_rarg0;
4584     Register oldArr        = c_rarg1;
4585     Register newIdx        = c_rarg2;
4586     Register shiftCount    = c_rarg3;
4587     Register numIter       = c_rarg4;
4588 
4589     Register shiftRevCount = rscratch1;
4590     Register oldArrNext    = rscratch2;
4591 
4592     FloatRegister oldElem0        = v0;
4593     FloatRegister oldElem1        = v1;
4594     FloatRegister newElem         = v2;
4595     FloatRegister shiftVCount     = v3;
4596     FloatRegister shiftVRevCount  = v4;
4597 
4598     __ cbz(numIter, Exit);
4599 
4600     __ add(oldArrNext, oldArr, 4);
4601     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4602 
4603     // right shift count
4604     __ movw(shiftRevCount, 32);
4605     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4606 
4607     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar tail
4608     __ cmp(numIter, (u1)4);
4609     __ br(Assembler::LT, ShiftThree);
4610 
4611     __ dup(shiftVCount,     __ T4S, shiftCount);
4612     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4613     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4614 
4615     __ BIND(ShiftSIMDLoop);
4616 
4617     // load 4 words and process
4618     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4619     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4620     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4621     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4622     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4623     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4624     __ sub(numIter,   numIter, 4);
4625 
4626     __ cmp(numIter, (u1)4);
4627     __ br(Assembler::LT, ShiftTwoLoop);
4628     __ b(ShiftSIMDLoop);
4629 
4630     __ BIND(ShiftTwoLoop);
4631     __ cbz(numIter, Exit);
4632     __ cmp(numIter, (u1)1);
4633     __ br(Assembler::EQ, ShiftOne);
4634 
4635     // load 2 words and process
4636     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4637     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4638     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4639     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4640     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4641     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4642     __ sub(numIter,   numIter, 2);
4643     __ b(ShiftTwoLoop);
4644 
4645     __ BIND(ShiftThree);
4646     __ ldrw(r10,  __ post(oldArr, 4));
4647     __ ldrw(r11,  __ post(oldArrNext, 4));
4648     __ lslvw(r10, r10, shiftCount);
4649     __ lsrvw(r11, r11, shiftRevCount);
4650     __ orrw(r12,  r10, r11);
4651     __ strw(r12,  __ post(newArr, 4));
4652     __ tbz(numIter, 1, Exit);
4653     __ tbz(numIter, 0, ShiftOne);
4654 
4655     __ BIND(ShiftTwo);
4656     __ ldrw(r10,  __ post(oldArr, 4));
4657     __ ldrw(r11,  __ post(oldArrNext, 4));
4658     __ lslvw(r10, r10, shiftCount);
4659     __ lsrvw(r11, r11, shiftRevCount);
4660     __ orrw(r12,  r10, r11);
4661     __ strw(r12,  __ post(newArr, 4));
4662 
4663     __ BIND(ShiftOne);
4664     __ ldrw(r10,  Address(oldArr));
4665     __ ldrw(r11,  Address(oldArrNext));
4666     __ lslvw(r10, r10, shiftCount);
4667     __ lsrvw(r11, r11, shiftRevCount);
4668     __ orrw(r12,  r10, r11);
4669     __ strw(r12,  Address(newArr));
4670 
4671     __ BIND(Exit);
4672     __ ret(lr);
4673 
4674     return start;
4675   }
4676 
4677   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
4678                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
4679                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
4680     // Karatsuba multiplication performs a 128*128 -> 256-bit
4681     // multiplication in three 128-bit multiplications and a few
4682     // additions.
4683     //
4684     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
4685     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
4686     //
4687     // Inputs:
4688     //
4689     // A0 in a.d[0]     (subkey)
4690     // A1 in a.d[1]
4691     // (A1+A0) in a1_xor_a0.d[0]
4692     //
4693     // B0 in b.d[0]     (state)
4694     // B1 in b.d[1]
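         //
         // Since E = (A1+A0)(B1+B0) = A1*B1 + A1*B0 + A0*B1 + A0*B0 over GF(2),
         // the middle term A1*B0 + A0*B1 equals E ^ C ^ D (additions are XORs).
         // The ext/eor sequence below folds that middle term, together with the
         // inner words C0 and D1, into the two middle 64-bit words of the result.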
4695 
4696     __ ext(tmp1, __ T16B, b, b, 0x08);
4697     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
4698     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
4699     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
4700     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
4701 
4702     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
4703     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
4704     __ eor(tmp2, __ T16B, tmp2, tmp4);
4705     __ eor(tmp2, __ T16B, tmp2, tmp3);
4706 
4707     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
4708     __ ins(result_hi, __ D, tmp2, 0, 1);
4709     __ ins(result_lo, __ D, tmp2, 1, 0);
4710   }
4711 
4712   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
4713                     FloatRegister p, FloatRegister z, FloatRegister t1) {
4714     const FloatRegister t0 = result;
4715 
4716     // The GCM field polynomial f is z^128 + p(z), where p =
4717     // z^7+z^2+z+1.
4718     //
4719     //    z^128 === -p(z)  (mod (z^128 + p(z)))
4720     //
4721     // so, given that the product we're reducing is
4722     //    a == lo + hi * z^128
4723     // substituting,
4724     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
4725     //
4726     // we reduce by multiplying hi by p(z) and subtracting the result
4727     // from (i.e. XORing it with) lo.  Because p has no nonzero high
4728     // bits we can do this with two 64-bit multiplications, lo*p and
4729     // hi*p.
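         //
         // Roughly: the first pmull2 multiplies the upper half of hi by p, and the
         // ext/eor pairs fold that 128-bit product back into hi and lo; the second
         // pmull then multiplies what remains of hi by p and XORs it into lo,
         // leaving the fully reduced 128-bit result.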
4730 
4731     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
4732     __ ext(t1, __ T16B, t0, z, 8);
4733     __ eor(hi, __ T16B, hi, t1);
4734     __ ext(t1, __ T16B, z, t0, 8);
4735     __ eor(lo, __ T16B, lo, t1);
4736     __ pmull(t0, __ T1Q, hi, p, __ T1D);
4737     __ eor(result, __ T16B, lo, t0);
4738   }
4739 
4740   address generate_has_negatives(address &has_negatives_long) {
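         // Returns 1 in r0 if any byte in ary1[0, len) has its sign bit set, and 0
         // otherwise. For reference (not generated code), the scalar equivalent is
         // roughly:
         //
         //   for (size_t i = 0; i < len; i++) {
         //     if (ary1[i] & 0x80) return 1;
         //   }
         //   return 0;
         //
         // The generated code tests 8 or 64 bytes at a time by AND-ing the loaded
         // words against UPPER_BIT_MASK (0x8080808080808080).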
4741     const u1 large_loop_size = 64;
4742     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4743     int dcache_line = VM_Version::dcache_line_size();
4744 
4745     Register ary1 = r1, len = r2, result = r0;
4746 
4747     __ align(CodeEntryAlignment);
4748 
4749     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4750 
4751     address entry = __ pc();
4752 
4753     __ enter();
4754 
4755   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16,
4756         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4757 
4758   __ cmp(len, (u1)15);
4759   __ br(Assembler::GT, LEN_OVER_15);
4760   // The only case when execution falls into this code is when the pointer is near
4761   // the end of a memory page and we have to avoid reading the next page
4762   __ add(ary1, ary1, len);
4763   __ subs(len, len, 8);
4764   __ br(Assembler::GT, LEN_OVER_8);
4765   __ ldr(rscratch2, Address(ary1, -8));
4766   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4767   __ lsrv(rscratch2, rscratch2, rscratch1);
4768   __ tst(rscratch2, UPPER_BIT_MASK);
4769   __ cset(result, Assembler::NE);
4770   __ leave();
4771   __ ret(lr);
4772   __ bind(LEN_OVER_8);
4773   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4774   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
4775   __ tst(rscratch2, UPPER_BIT_MASK);
4776   __ br(Assembler::NE, RET_TRUE_NO_POP);
4777   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4778   __ lsrv(rscratch1, rscratch1, rscratch2);
4779   __ tst(rscratch1, UPPER_BIT_MASK);
4780   __ cset(result, Assembler::NE);
4781   __ leave();
4782   __ ret(lr);
4783 
4784   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4785   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4786 
4787   has_negatives_long = __ pc(); // 2nd entry point
4788 
4789   __ enter();
4790 
4791   __ bind(LEN_OVER_15);
4792     __ push(spilled_regs, sp);
4793     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4794     __ cbz(rscratch2, ALIGNED);
4795     __ ldp(tmp6, tmp1, Address(ary1));
4796     __ mov(tmp5, 16);
4797     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4798     __ add(ary1, ary1, rscratch1);
4799     __ sub(len, len, rscratch1);
4800     __ orr(tmp6, tmp6, tmp1);
4801     __ tst(tmp6, UPPER_BIT_MASK);
4802     __ br(Assembler::NE, RET_TRUE);
4803 
4804   __ bind(ALIGNED);
4805     __ cmp(len, large_loop_size);
4806     __ br(Assembler::LT, CHECK_16);
4807     // Perform a 16-byte load as an early return in the pre-loop to handle the
4808     // situation when an initially aligned large array has negative values in its
4809     // starting bytes, where LARGE_LOOP would otherwise do 4 reads instead of 1
4810     // (in the worst case), which is slower. Cases with negative bytes further
4811     // ahead won't be affected much; in fact, they'll be faster due to early
4812     // loads, fewer instructions and fewer branches in LARGE_LOOP.
4813     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4814     __ sub(len, len, 16);
4815     __ orr(tmp6, tmp6, tmp1);
4816     __ tst(tmp6, UPPER_BIT_MASK);
4817     __ br(Assembler::NE, RET_TRUE);
4818     __ cmp(len, large_loop_size);
4819     __ br(Assembler::LT, CHECK_16);
4820 
4821     if (SoftwarePrefetchHintDistance >= 0
4822         && SoftwarePrefetchHintDistance >= dcache_line) {
4823       // initial prefetch
4824       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4825     }
4826   __ bind(LARGE_LOOP);
4827     if (SoftwarePrefetchHintDistance >= 0) {
4828       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4829     }
4830     // Issue the load instructions first, since that can save a few CPU/MEM cycles;
4831     // also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
4832     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
4833     // 3 instructions and has fewer branches. The downside is that this approach
4834     // disables early return, so all 64 bytes are loaded and checked every time.
4835     __ ldp(tmp2, tmp3, Address(ary1));
4836     __ ldp(tmp4, tmp5, Address(ary1, 16));
4837     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4838     __ ldp(tmp6, tmp1, Address(ary1, 48));
4839     __ add(ary1, ary1, large_loop_size);
4840     __ sub(len, len, large_loop_size);
4841     __ orr(tmp2, tmp2, tmp3);
4842     __ orr(tmp4, tmp4, tmp5);
4843     __ orr(rscratch1, rscratch1, rscratch2);
4844     __ orr(tmp6, tmp6, tmp1);
4845     __ orr(tmp2, tmp2, tmp4);
4846     __ orr(rscratch1, rscratch1, tmp6);
4847     __ orr(tmp2, tmp2, rscratch1);
4848     __ tst(tmp2, UPPER_BIT_MASK);
4849     __ br(Assembler::NE, RET_TRUE);
4850     __ cmp(len, large_loop_size);
4851     __ br(Assembler::GE, LARGE_LOOP);
4852 
4853   __ bind(CHECK_16); // small 16-byte load pre-loop
4854     __ cmp(len, (u1)16);
4855     __ br(Assembler::LT, POST_LOOP16);
4856 
4857   __ bind(LOOP16); // small 16-byte load loop
4858     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4859     __ sub(len, len, 16);
4860     __ orr(tmp2, tmp2, tmp3);
4861     __ tst(tmp2, UPPER_BIT_MASK);
4862     __ br(Assembler::NE, RET_TRUE);
4863     __ cmp(len, (u1)16);
4864     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4865 
4866   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4867     __ cmp(len, (u1)8);
4868     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4869     __ ldr(tmp3, Address(__ post(ary1, 8)));
4870     __ sub(len, len, 8);
4871     __ tst(tmp3, UPPER_BIT_MASK);
4872     __ br(Assembler::NE, RET_TRUE);
4873 
4874   __ bind(POST_LOOP16_LOAD_TAIL);
4875     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4876     __ ldr(tmp1, Address(ary1));
4877     __ mov(tmp2, 64);
4878     __ sub(tmp4, tmp2, len, __ LSL, 3);
4879     __ lslv(tmp1, tmp1, tmp4);
4880     __ tst(tmp1, UPPER_BIT_MASK);
4881     __ br(Assembler::NE, RET_TRUE);
4882     // Fallthrough
4883 
4884   __ bind(RET_FALSE);
4885     __ pop(spilled_regs, sp);
4886     __ leave();
4887     __ mov(result, zr);
4888     __ ret(lr);
4889 
4890   __ bind(RET_TRUE);
4891     __ pop(spilled_regs, sp);
4892   __ bind(RET_TRUE_NO_POP);
4893     __ leave();
4894     __ mov(result, 1);
4895     __ ret(lr);
4896 
4897     return entry;
4898   }
4899 
4900   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4901         bool usePrefetch, Label &NOT_EQUAL) {
4902     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4903         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4904         tmp7 = r12, tmp8 = r13;
4905     Label LOOP;
4906 
4907     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4908     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4909     __ bind(LOOP);
4910     if (usePrefetch) {
4911       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4912       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4913     }
4914     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4915     __ eor(tmp1, tmp1, tmp2);
4916     __ eor(tmp3, tmp3, tmp4);
4917     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4918     __ orr(tmp1, tmp1, tmp3);
4919     __ cbnz(tmp1, NOT_EQUAL);
4920     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4921     __ eor(tmp5, tmp5, tmp6);
4922     __ eor(tmp7, tmp7, tmp8);
4923     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4924     __ orr(tmp5, tmp5, tmp7);
4925     __ cbnz(tmp5, NOT_EQUAL);
4926     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4927     __ eor(tmp1, tmp1, tmp2);
4928     __ eor(tmp3, tmp3, tmp4);
4929     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4930     __ orr(tmp1, tmp1, tmp3);
4931     __ cbnz(tmp1, NOT_EQUAL);
4932     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4933     __ eor(tmp5, tmp5, tmp6);
4934     __ sub(cnt1, cnt1, 8 * wordSize);
4935     __ eor(tmp7, tmp7, tmp8);
4936     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4937     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4938     // cmp) because subs allows an unlimited range of immediate operands.
4939     __ subs(tmp6, cnt1, loopThreshold);
4940     __ orr(tmp5, tmp5, tmp7);
4941     __ cbnz(tmp5, NOT_EQUAL);
4942     __ br(__ GE, LOOP);
4943     // post-loop
4944     __ eor(tmp1, tmp1, tmp2);
4945     __ eor(tmp3, tmp3, tmp4);
4946     __ orr(tmp1, tmp1, tmp3);
4947     __ sub(cnt1, cnt1, 2 * wordSize);
4948     __ cbnz(tmp1, NOT_EQUAL);
4949   }
4950 
4951   void generate_large_array_equals_loop_simd(int loopThreshold,
4952         bool usePrefetch, Label &NOT_EQUAL) {
4953     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4954         tmp2 = rscratch2;
4955     Label LOOP;
4956 
4957     __ bind(LOOP);
4958     if (usePrefetch) {
4959       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4960       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4961     }
4962     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4963     __ sub(cnt1, cnt1, 8 * wordSize);
4964     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4965     __ subs(tmp1, cnt1, loopThreshold);
4966     __ eor(v0, __ T16B, v0, v4);
4967     __ eor(v1, __ T16B, v1, v5);
4968     __ eor(v2, __ T16B, v2, v6);
4969     __ eor(v3, __ T16B, v3, v7);
4970     __ orr(v0, __ T16B, v0, v1);
4971     __ orr(v1, __ T16B, v2, v3);
4972     __ orr(v0, __ T16B, v0, v1);
4973     __ umov(tmp1, v0, __ D, 0);
4974     __ umov(tmp2, v0, __ D, 1);
4975     __ orr(tmp1, tmp1, tmp2);
4976     __ cbnz(tmp1, NOT_EQUAL);
4977     __ br(__ GE, LOOP);
4978   }
4979 
4980   // a1 = r1 - array1 address
4981   // a2 = r2 - array2 address
4982   // result = r0 - return value. Already contains "false"
4983   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4984   // r3-r5 are reserved temporary registers
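       //
       // For reference (not generated code), the scalar equivalent is roughly a
       // word-wise memcmp whose first 8 bytes were already compared by the caller:
       //
       //   while (cnt1 > 0) {                 // cnt1 counts remaining bytes
       //     if (*(uint64_t*)a1 != *(uint64_t*)a2) return false;
       //     a1 += 8; a2 += 8; cnt1 -= 8;
       //   }
       //   return true;
       //
       // The generated code unrolls this 64 bytes at a time (SIMD or ldp form),
       // optionally prefetching, and finishes with an 8-byte tail loop plus one
       // final, possibly overlapping, 8-byte compare.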
4985   address generate_large_array_equals() {
4986     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4987         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4988         tmp7 = r12, tmp8 = r13;
4989     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4990         SMALL_LOOP, POST_LOOP;
4991     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4992     // calculate if at least 32 prefetched bytes are used
4993     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4994     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4995     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4996     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4997         tmp5, tmp6, tmp7, tmp8);
4998 
4999     __ align(CodeEntryAlignment);
5000 
5001     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5002 
5003     address entry = __ pc();
5004     __ enter();
5005     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5006     // also advance pointers to use post-increment instead of pre-increment
5007     __ add(a1, a1, wordSize);
5008     __ add(a2, a2, wordSize);
5009     if (AvoidUnalignedAccesses) {
5010       // Both implementations (SIMD/non-SIMD) use relatively large load
5011       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
5012       // on some CPUs when the address is not at least 16-byte aligned.
5013       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
5014       // load if needed, at least for the 1st address, to make it 16-byte aligned.
5015       Label ALIGNED16;
5016       __ tbz(a1, 3, ALIGNED16);
5017       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5018       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5019       __ sub(cnt1, cnt1, wordSize);
5020       __ eor(tmp1, tmp1, tmp2);
5021       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5022       __ bind(ALIGNED16);
5023     }
5024     if (UseSIMDForArrayEquals) {
5025       if (SoftwarePrefetchHintDistance >= 0) {
5026         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5027         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5028         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5029             /* prfm = */ true, NOT_EQUAL);
5030         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5031         __ br(__ LT, TAIL);
5032       }
5033       __ bind(NO_PREFETCH_LARGE_LOOP);
5034       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5035           /* prfm = */ false, NOT_EQUAL);
5036     } else {
5037       __ push(spilled_regs, sp);
5038       if (SoftwarePrefetchHintDistance >= 0) {
5039         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5040         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5041         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5042             /* prfm = */ true, NOT_EQUAL);
5043         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5044         __ br(__ LT, TAIL);
5045       }
5046       __ bind(NO_PREFETCH_LARGE_LOOP);
5047       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5048           /* prfm = */ false, NOT_EQUAL);
5049     }
5050     __ bind(TAIL);
5051       __ cbz(cnt1, EQUAL);
5052       __ subs(cnt1, cnt1, wordSize);
5053       __ br(__ LE, POST_LOOP);
5054     __ bind(SMALL_LOOP);
5055       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5056       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5057       __ subs(cnt1, cnt1, wordSize);
5058       __ eor(tmp1, tmp1, tmp2);
5059       __ cbnz(tmp1, NOT_EQUAL);
5060       __ br(__ GT, SMALL_LOOP);
5061     __ bind(POST_LOOP);
5062       __ ldr(tmp1, Address(a1, cnt1));
5063       __ ldr(tmp2, Address(a2, cnt1));
5064       __ eor(tmp1, tmp1, tmp2);
5065       __ cbnz(tmp1, NOT_EQUAL);
5066     __ bind(EQUAL);
5067       __ mov(result, true);
5068     __ bind(NOT_EQUAL);
5069       if (!UseSIMDForArrayEquals) {
5070         __ pop(spilled_regs, sp);
5071       }
5072     __ bind(NOT_EQUAL_NO_POP);
5073     __ leave();
5074     __ ret(lr);
5075     return entry;
5076   }
5077 
5078   address generate_dsin_dcos(bool isCos) {
5079     __ align(CodeEntryAlignment);
5080     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5081     address start = __ pc();
5082     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5083         (address)StubRoutines::aarch64::_two_over_pi,
5084         (address)StubRoutines::aarch64::_pio2,
5085         (address)StubRoutines::aarch64::_dsin_coef,
5086         (address)StubRoutines::aarch64::_dcos_coef);
5087     return start;
5088   }
5089 
5090   address generate_dlog() {
5091     __ align(CodeEntryAlignment);
5092     StubCodeMark mark(this, "StubRoutines", "dlog");
5093     address entry = __ pc();
5094     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5095         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5096     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5097     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5098         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5099     return entry;
5100   }
5101 
5102 
5103   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5104   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5105       Label &DIFF2) {
5106     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5107     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5108 
5109     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5110     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5111     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5112     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5113 
5114     __ fmovd(tmpL, vtmp3);
5115     __ eor(rscratch2, tmp3, tmpL);
5116     __ cbnz(rscratch2, DIFF2);
5117 
5118     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5119     __ umov(tmpL, vtmp3, __ D, 1);
5120     __ eor(rscratch2, tmpU, tmpL);
5121     __ cbnz(rscratch2, DIFF1);
5122 
5123     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5124     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5125     __ fmovd(tmpL, vtmp);
5126     __ eor(rscratch2, tmp3, tmpL);
5127     __ cbnz(rscratch2, DIFF2);
5128 
5129     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5130     __ umov(tmpL, vtmp, __ D, 1);
5131     __ eor(rscratch2, tmpU, tmpL);
5132     __ cbnz(rscratch2, DIFF1);
5133   }
5134 
5135   // r0  = result
5136   // r1  = str1
5137   // r2  = cnt1
5138   // r3  = str2
5139   // r4  = cnt2
5140   // r10 = tmp1
5141   // r11 = tmp2
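       //
       // isLU == true means str1 is Latin1 and str2 is UTF-16; isLU == false means
       // the opposite. Latin1 bytes are zero-extended (zip1 with a zero register)
       // to 16-bit characters before comparison. The result is the difference of
       // the first pair of differing characters; when no difference is found the
       // stub returns with result unchanged.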
5142   address generate_compare_long_string_different_encoding(bool isLU) {
5143     __ align(CodeEntryAlignment);
5144     StubCodeMark mark(this, "StubRoutines", isLU
5145         ? "compare_long_string_different_encoding LU"
5146         : "compare_long_string_different_encoding UL");
5147     address entry = __ pc();
5148     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5149         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5150         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5151     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5152         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5153     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5154     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5155 
5156     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5157 
5158     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5159     // cnt2 == number of characters left to compare
5160     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
5161     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5162     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5163     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5164     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5165     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5166     __ eor(rscratch2, tmp1, tmp2);
5167     __ mov(rscratch1, tmp2);
5168     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5169     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5170              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5171     __ push(spilled_regs, sp);
5172     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5173     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5174 
5175     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5176 
5177     if (SoftwarePrefetchHintDistance >= 0) {
5178       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5179       __ br(__ LT, NO_PREFETCH);
5180       __ bind(LARGE_LOOP_PREFETCH);
5181         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5182         __ mov(tmp4, 2);
5183         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5184         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5185           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5186           __ subs(tmp4, tmp4, 1);
5187           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5188           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5189           __ mov(tmp4, 2);
5190         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5191           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5192           __ subs(tmp4, tmp4, 1);
5193           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5194           __ sub(cnt2, cnt2, 64);
5195           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5196           __ br(__ GE, LARGE_LOOP_PREFETCH);
5197     }
5198     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5199     __ bind(NO_PREFETCH);
5200     __ subs(cnt2, cnt2, 16);
5201     __ br(__ LT, TAIL);
5202     __ align(OptoLoopAlignment);
5203     __ bind(SMALL_LOOP); // smaller loop
5204       __ subs(cnt2, cnt2, 16);
5205       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5206       __ br(__ GE, SMALL_LOOP);
5207       __ cmn(cnt2, (u1)16);
5208       __ br(__ EQ, LOAD_LAST);
5209     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5210       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5211       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5212       __ ldr(tmp3, Address(cnt1, -8));
5213       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5214       __ b(LOAD_LAST);
5215     __ bind(DIFF2);
5216       __ mov(tmpU, tmp3);
5217     __ bind(DIFF1);
5218       __ pop(spilled_regs, sp);
5219       __ b(CALCULATE_DIFFERENCE);
5220     __ bind(LOAD_LAST);
5221       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5222       // No need to load it again
5223       __ mov(tmpU, tmp3);
5224       __ pop(spilled_regs, sp);
5225 
5226       // tmp2 points to the address of the last 4 Latin1 characters right now
5227       __ ldrs(vtmp, Address(tmp2));
5228       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5229       __ fmovd(tmpL, vtmp);
5230 
5231       __ eor(rscratch2, tmpU, tmpL);
5232       __ cbz(rscratch2, DONE);
5233 
5234     // Find the first different characters in the longwords and
5235     // compute their difference.
5236     __ bind(CALCULATE_DIFFERENCE);
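           // rscratch2 holds the XOR of the two loaded 64-bit chunks, so its lowest
           // non-zero byte marks the first differing character. rev + clz gives the
           // bit offset of that byte counted from the least significant end; AND-ing
           // with -16 rounds it down to a 16-bit character boundary, and the
           // lsrv/uxthw pairs extract the two differing characters whose difference
           // becomes the result.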
5237       __ rev(rscratch2, rscratch2);
5238       __ clz(rscratch2, rscratch2);
5239       __ andr(rscratch2, rscratch2, -16);
5240       __ lsrv(tmp1, tmp1, rscratch2);
5241       __ uxthw(tmp1, tmp1);
5242       __ lsrv(rscratch1, rscratch1, rscratch2);
5243       __ uxthw(rscratch1, rscratch1);
5244       __ subw(result, tmp1, rscratch1);
5245     __ bind(DONE);
5246       __ ret(lr);
5247     return entry;
5248   }
5249 
5250   address generate_method_entry_barrier() {
5251     __ align(CodeEntryAlignment);
5252     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5253 
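         // Called on entry to an nmethod. The address of the caller's saved lr is
         // passed to BarrierSetNMethod::nmethod_stub_entry_barrier; a non-zero return
         // value means the nmethod must not be entered, in which case the runtime is
         // expected to have filled in the four {sp, fp, lr, pc} words reserved below,
         // and the stub restores that state and branches to the supplied pc instead
         // of returning normally.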
5254     Label deoptimize_label;
5255 
5256     address start = __ pc();
5257 
5258     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5259 
5260     __ enter();
5261     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5262 
5263     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5264 
5265     __ push_call_clobbered_registers();
5266 
5267     __ mov(c_rarg0, rscratch2);
5268     __ call_VM_leaf
5269          (CAST_FROM_FN_PTR
5270           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5271 
5272     __ reset_last_Java_frame(true);
5273 
5274     __ mov(rscratch1, r0);
5275 
5276     __ pop_call_clobbered_registers();
5277 
5278     __ cbnz(rscratch1, deoptimize_label);
5279 
5280     __ leave();
5281     __ ret(lr);
5282 
5283     __ BIND(deoptimize_label);
5284 
5285     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5286     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5287 
5288     __ mov(sp, rscratch1);
5289     __ br(rscratch2);
5290 
5291     return start;
5292   }
5293 
5294   // r0  = result
5295   // r1  = str1
5296   // r2  = cnt1
5297   // r3  = str2
5298   // r4  = cnt2
5299   // r10 = tmp1
5300   // r11 = tmp2
5301   address generate_compare_long_string_same_encoding(bool isLL) {
5302     __ align(CodeEntryAlignment);
5303     StubCodeMark mark(this, "StubRoutines", isLL
5304         ? "compare_long_string_same_encoding LL"
5305         : "compare_long_string_same_encoding UU");
5306     address entry = __ pc();
5307     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5308         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5309 
5310     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5311 
5312     // exit from the large loop when fewer than 64 bytes are left to read or we're
5313     // about to prefetch memory past the array border
5314     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5315 
5316     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
5317     __ eor(rscratch2, tmp1, tmp2);
5318     __ cbnz(rscratch2, CAL_DIFFERENCE);
5319 
5320     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5321     // update pointers, because of previous read
5322     __ add(str1, str1, wordSize);
5323     __ add(str2, str2, wordSize);
5324     if (SoftwarePrefetchHintDistance >= 0) {
5325       __ align(OptoLoopAlignment);
5326       __ bind(LARGE_LOOP_PREFETCH);
5327         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5328         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5329 
5330         for (int i = 0; i < 4; i++) {
5331           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5332           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5333           __ cmp(tmp1, tmp2);
5334           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5335           __ br(Assembler::NE, DIFF);
5336         }
5337         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5338         __ add(str1, str1, 64);
5339         __ add(str2, str2, 64);
5340         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5341         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5342         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5343     }
5344 
5345     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5346     __ br(Assembler::LE, LESS16);
5347     __ align(OptoLoopAlignment);
5348     __ bind(LOOP_COMPARE16);
5349       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5350       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5351       __ cmp(tmp1, tmp2);
5352       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5353       __ br(Assembler::NE, DIFF);
5354       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5355       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5356       __ br(Assembler::LT, LESS16);
5357 
5358       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5359       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5360       __ cmp(tmp1, tmp2);
5361       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5362       __ br(Assembler::NE, DIFF);
5363       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5364       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5365       __ br(Assembler::GE, LOOP_COMPARE16);
5366       __ cbz(cnt2, LENGTH_DIFF);
5367 
5368     __ bind(LESS16);
5369       // each 8 compare
5370       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5371       __ br(Assembler::LE, LESS8);
5372       __ ldr(tmp1, Address(__ post(str1, 8)));
5373       __ ldr(tmp2, Address(__ post(str2, 8)));
5374       __ eor(rscratch2, tmp1, tmp2);
5375       __ cbnz(rscratch2, CAL_DIFFERENCE);
5376       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5377 
5378     __ bind(LESS8); // directly load last 8 bytes
5379       if (!isLL) {
5380         __ add(cnt2, cnt2, cnt2);
5381       }
5382       __ ldr(tmp1, Address(str1, cnt2));
5383       __ ldr(tmp2, Address(str2, cnt2));
5384       __ eor(rscratch2, tmp1, tmp2);
5385       __ cbz(rscratch2, LENGTH_DIFF);
5386       __ b(CAL_DIFFERENCE);
5387 
5388     __ bind(DIFF);
5389       __ cmp(tmp1, tmp2);
5390       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5391       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5392       // reuse rscratch2 register for the result of eor instruction
5393       __ eor(rscratch2, tmp1, tmp2);
5394 
5395     __ bind(CAL_DIFFERENCE);
5396       __ rev(rscratch2, rscratch2);
5397       __ clz(rscratch2, rscratch2);
5398       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5399       __ lsrv(tmp1, tmp1, rscratch2);
5400       __ lsrv(tmp2, tmp2, rscratch2);
5401       if (isLL) {
5402         __ uxtbw(tmp1, tmp1);
5403         __ uxtbw(tmp2, tmp2);
5404       } else {
5405         __ uxthw(tmp1, tmp1);
5406         __ uxthw(tmp2, tmp2);
5407       }
5408       __ subw(result, tmp1, tmp2);
5409 
5410     __ bind(LENGTH_DIFF);
5411       __ ret(lr);
5412     return entry;
5413   }
5414 
5415   void generate_compare_long_strings() {
5416       StubRoutines::aarch64::_compare_long_string_LL
5417           = generate_compare_long_string_same_encoding(true);
5418       StubRoutines::aarch64::_compare_long_string_UU
5419           = generate_compare_long_string_same_encoding(false);
5420       StubRoutines::aarch64::_compare_long_string_LU
5421           = generate_compare_long_string_different_encoding(true);
5422       StubRoutines::aarch64::_compare_long_string_UL
5423           = generate_compare_long_string_different_encoding(false);
5424   }
5425 
5426   // R0 = result
5427   // R1 = str2
5428   // R2 = cnt1
5429   // R3 = str1
5430   // R4 = cnt2
5431   // This generic linear code uses a few additional ideas, which make it faster:
5432   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5433   // in order to skip the initial loading (helps on systems with 1 ld pipeline)
5434   // 2) we can use a "fast" algorithm to search for the first symbol, with fewer
5435   // branches (1 branch per loaded register instead of a branch per symbol); this
5436   // is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f,
5437   // 0x7fff7fff...7fff come from (see the reference sketch below)
5438   // 3) after loading and analyzing the 1st register of the source string, it can
5439   // be reused to search for every 1st-character entry, saving a few loads compared
5440   // with a "simpler-but-slower" implementation
5441   // 4) in order to avoid lots of push/pop operations, the code below heavily
5442   // re-uses/re-initializes/compresses register values, which makes the code larger
5443   // and a bit less readable; however, most of the extra operations are issued
5444   // during loads or branches, so the penalty is minimal
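       //
       // The "fast" search mentioned in 2) is the classic zero-in-word bit trick.
       // For reference (not generated code; chunk/first_char are illustrative
       // names), the byte-sized case is roughly:
       //
       //   uint64_t x   = chunk ^ (first_char * 0x0101010101010101ULL); // matching bytes become 0
       //   uint64_t hit = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
       //
       // A set top bit in hit marks a candidate match; positions above the lowest
       // real match can be false positives, which is why every candidate is verified
       // by the compare loops below. The UTF-16 variants use the 0x0001... and
       // 0x7fff... constants with 16-bit lanes, and the generated code uses
       // orr + bics instead of the plain AND-NOT form, which is equivalent and sets
       // the flags for free.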
5445   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5446     const char* stubName = str1_isL
5447         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5448         : "indexof_linear_uu";
5449     __ align(CodeEntryAlignment);
5450     StubCodeMark mark(this, "StubRoutines", stubName);
5451     address entry = __ pc();
5452 
5453     int str1_chr_size = str1_isL ? 1 : 2;
5454     int str2_chr_size = str2_isL ? 1 : 2;
5455     int str1_chr_shift = str1_isL ? 0 : 1;
5456     int str2_chr_shift = str2_isL ? 0 : 1;
5457     bool isL = str1_isL && str2_isL;
5458    // parameters
5459     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5460     // temporary registers
5461     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5462     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5463     // redefinitions
5464     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5465 
5466     __ push(spilled_regs, sp);
5467     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5468         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5469         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5470         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5471         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5472         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5473     // Read whole register from str1. It is safe, because length >=8 here
5474     __ ldr(ch1, Address(str1));
5475     // Read whole register from str2. It is safe, because length >=8 here
5476     __ ldr(ch2, Address(str2));
5477     __ sub(cnt2, cnt2, cnt1);
5478     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5479     if (str1_isL != str2_isL) {
5480       __ eor(v0, __ T16B, v0, v0);
5481     }
5482     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5483     __ mul(first, first, tmp1);
5484     // check if we have less than 1 register to check
5485     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5486     if (str1_isL != str2_isL) {
5487       __ fmovd(v1, ch1);
5488     }
5489     __ br(__ LE, L_SMALL);
5490     __ eor(ch2, first, ch2);
5491     if (str1_isL != str2_isL) {
5492       __ zip1(v1, __ T16B, v1, v0);
5493     }
5494     __ sub(tmp2, ch2, tmp1);
5495     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5496     __ bics(tmp2, tmp2, ch2);
5497     if (str1_isL != str2_isL) {
5498       __ fmovd(ch1, v1);
5499     }
5500     __ br(__ NE, L_HAS_ZERO);
5501     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5502     __ add(result, result, wordSize/str2_chr_size);
5503     __ add(str2, str2, wordSize);
5504     __ br(__ LT, L_POST_LOOP);
5505     __ BIND(L_LOOP);
5506       __ ldr(ch2, Address(str2));
5507       __ eor(ch2, first, ch2);
5508       __ sub(tmp2, ch2, tmp1);
5509       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5510       __ bics(tmp2, tmp2, ch2);
5511       __ br(__ NE, L_HAS_ZERO);
5512     __ BIND(L_LOOP_PROCEED);
5513       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5514       __ add(str2, str2, wordSize);
5515       __ add(result, result, wordSize/str2_chr_size);
5516       __ br(__ GE, L_LOOP);
5517     __ BIND(L_POST_LOOP);
5518       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5519       __ br(__ LE, NOMATCH);
5520       __ ldr(ch2, Address(str2));
5521       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5522       __ eor(ch2, first, ch2);
5523       __ sub(tmp2, ch2, tmp1);
5524       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5525       __ mov(tmp4, -1); // all bits set
5526       __ b(L_SMALL_PROCEED);
5527     __ align(OptoLoopAlignment);
5528     __ BIND(L_SMALL);
5529       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5530       __ eor(ch2, first, ch2);
5531       if (str1_isL != str2_isL) {
5532         __ zip1(v1, __ T16B, v1, v0);
5533       }
5534       __ sub(tmp2, ch2, tmp1);
5535       __ mov(tmp4, -1); // all bits set
5536       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5537       if (str1_isL != str2_isL) {
5538         __ fmovd(ch1, v1); // move converted 4 symbols
5539       }
5540     __ BIND(L_SMALL_PROCEED);
5541       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5542       __ bic(tmp2, tmp2, ch2);
5543       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5544       __ rbit(tmp2, tmp2);
5545       __ br(__ EQ, NOMATCH);
5546     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5547       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5548       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5549       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5550       if (str2_isL) { // LL
5551         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5552         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5553         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5554         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5555         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5556       } else {
5557         __ mov(ch2, 0xE); // all bits in byte set except last one
5558         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5559         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5560         __ lslv(tmp2, tmp2, tmp4);
5561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5562         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5563         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5564         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5565       }
5566       __ cmp(ch1, ch2);
5567       __ mov(tmp4, wordSize/str2_chr_size);
5568       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5569     __ BIND(L_SMALL_CMP_LOOP);
5570       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5571                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5572       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5573                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5574       __ add(tmp4, tmp4, 1);
5575       __ cmp(tmp4, cnt1);
5576       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5577       __ cmp(first, ch2);
5578       __ br(__ EQ, L_SMALL_CMP_LOOP);
5579     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5580       __ cbz(tmp2, NOMATCH); // no more matches. exit
5581       __ clz(tmp4, tmp2);
5582       __ add(result, result, 1); // advance index
5583       __ add(str2, str2, str2_chr_size); // advance pointer
5584       __ b(L_SMALL_HAS_ZERO_LOOP);
5585     __ align(OptoLoopAlignment);
5586     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5587       __ cmp(first, ch2);
5588       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5589       __ b(DONE);
5590     __ align(OptoLoopAlignment);
5591     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5592       if (str2_isL) { // LL
5593         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5594         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5595         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5596         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5597         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5598       } else {
5599         __ mov(ch2, 0xE); // all bits in byte set except last one
5600         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5601         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5602         __ lslv(tmp2, tmp2, tmp4);
5603         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5604         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5605         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5606         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5607       }
5608       __ cmp(ch1, ch2);
5609       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5610       __ b(DONE);
5611     __ align(OptoLoopAlignment);
5612     __ BIND(L_HAS_ZERO);
5613       __ rbit(tmp2, tmp2);
5614       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5615       // Now, compress the counters (cnt2 and cnt1) into one register.
5616       // It's fine because both counters are 32-bit and are not changed in this
5617       // loop. Just restore them on exit. So, cnt1 can be re-used in this loop.
5618       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5619       __ sub(result, result, 1);
5620     __ BIND(L_HAS_ZERO_LOOP);
5621       __ mov(cnt1, wordSize/str2_chr_size);
5622       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5623       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5624       if (str2_isL) {
5625         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5626         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5627         __ lslv(tmp2, tmp2, tmp4);
5628         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5629         __ add(tmp4, tmp4, 1);
5630         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5631         __ lsl(tmp2, tmp2, 1);
5632         __ mov(tmp4, wordSize/str2_chr_size);
5633       } else {
5634         __ mov(ch2, 0xE);
5635         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5636         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5637         __ lslv(tmp2, tmp2, tmp4);
5638         __ add(tmp4, tmp4, 1);
5639         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5640         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5641         __ lsl(tmp2, tmp2, 1);
5642         __ mov(tmp4, wordSize/str2_chr_size);
5643         __ sub(str2, str2, str2_chr_size);
5644       }
5645       __ cmp(ch1, ch2);
5646       __ mov(tmp4, wordSize/str2_chr_size);
5647       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5648     __ BIND(L_CMP_LOOP);
5649       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5650                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5651       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5652                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5653       __ add(tmp4, tmp4, 1);
5654       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5655       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5656       __ cmp(cnt1, ch2);
5657       __ br(__ EQ, L_CMP_LOOP);
5658     __ BIND(L_CMP_LOOP_NOMATCH);
5659       // here we're not matched
5660       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5661       __ clz(tmp4, tmp2);
5662       __ add(str2, str2, str2_chr_size); // advance pointer
5663       __ b(L_HAS_ZERO_LOOP);
5664     __ align(OptoLoopAlignment);
5665     __ BIND(L_CMP_LOOP_LAST_CMP);
5666       __ cmp(cnt1, ch2);
5667       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5668       __ b(DONE);
5669     __ align(OptoLoopAlignment);
5670     __ BIND(L_CMP_LOOP_LAST_CMP2);
5671       if (str2_isL) {
5672         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5673         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5674         __ lslv(tmp2, tmp2, tmp4);
5675         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5676         __ add(tmp4, tmp4, 1);
5677         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5678         __ lsl(tmp2, tmp2, 1);
5679       } else {
5680         __ mov(ch2, 0xE);
5681         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5682         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5683         __ lslv(tmp2, tmp2, tmp4);
5684         __ add(tmp4, tmp4, 1);
5685         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5686         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5687         __ lsl(tmp2, tmp2, 1);
5688         __ sub(str2, str2, str2_chr_size);
5689       }
5690       __ cmp(ch1, ch2);
5691       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5692       __ b(DONE);
5693     __ align(OptoLoopAlignment);
5694     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5695       // 1) Restore the "result" index. Before the L_HAS_ZERO block the index
5696       // was a multiple of wordSize/str2_chr_size. The byte octet analyzed in
5697       // L_HAS_ZERO_LOOP increased result by at most wordSize/str2_chr_size - 1,
5698       // so the higher bits are unchanged. L_LOOP_PROCEED will add the number
5699       // of analyzed characters to result, so we can simply reset the lower
5700       // bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
5701       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5702       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
5703       // (UU/UL) is the index of the last analyzed substring inside the current
5704       // octet, so str2 points at its start address and must be advanced.
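           // A C-like sketch of these three steps (illustrative only):
           //   analyzed = result & (wordSize/str2_chr_size - 1);
           //   cnt1 = cnt2 >> 32;  cnt2 = (uint32_t)cnt2;
           //   result &= ~(str2_isL ? 7 : 3);
           //   str2 -= analyzed << str2_chr_shift;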
5705       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5706       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5707       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5708       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5709       __ movw(cnt2, cnt2);
5710       __ b(L_LOOP_PROCEED);
5711     __ align(OptoLoopAlignment);
5712     __ BIND(NOMATCH);
5713       __ mov(result, -1);
5714     __ BIND(DONE);
5715       __ pop(spilled_regs, sp);
5716       __ ret(lr);
5717     return entry;
5718   }
5719 
5720   void generate_string_indexof_stubs() {
5721     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5722     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5723     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5724   }
5725 
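       // Helper for large_byte_array_inflate below: zip1/zip2 interleave the
       // Latin-1 bytes in src1/src2 with the zero bytes expected in v0, widening
       // each byte to a 16-bit char (e.g. 0x61 'a' becomes 0x0061 on little-endian);
       // the four inflated vectors are then stored with a single st1.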
5726   void inflate_and_store_2_fp_registers(bool generatePrfm,
5727       FloatRegister src1, FloatRegister src2) {
5728     Register dst = r1;
5729     __ zip1(v1, __ T16B, src1, v0);
5730     __ zip2(v2, __ T16B, src1, v0);
5731     if (generatePrfm) {
5732       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5733     }
5734     __ zip1(v3, __ T16B, src2, v0);
5735     __ zip2(v4, __ T16B, src2, v0);
5736     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5737   }
5738 
5739   // R0 = src
5740   // R1 = dst
5741   // R2 = len
5742   // R3 = len >> 3
5743   // V0 = 0
5744   // v1 = loaded 8 bytes
5745   address generate_large_byte_array_inflate() {
5746     __ align(CodeEntryAlignment);
5747     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5748     address entry = __ pc();
5749     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5750     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5751     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5752 
5753     // Do one more 8-byte read so the address is 16-byte aligned in most cases;
5754     // this also lets us use a single store instruction below.
5755     __ ldrd(v2, __ post(src, 8));
5756     __ sub(octetCounter, octetCounter, 2);
5757     __ zip1(v1, __ T16B, v1, v0);
5758     __ zip1(v2, __ T16B, v2, v0);
5759     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5760     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5761     __ subs(rscratch1, octetCounter, large_loop_threshold);
5762     __ br(__ LE, LOOP_START);
5763     __ b(LOOP_PRFM_START);
5764     __ bind(LOOP_PRFM);
5765       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5766     __ bind(LOOP_PRFM_START);
5767       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5768       __ sub(octetCounter, octetCounter, 8);
5769       __ subs(rscratch1, octetCounter, large_loop_threshold);
5770       inflate_and_store_2_fp_registers(true, v3, v4);
5771       inflate_and_store_2_fp_registers(true, v5, v6);
5772       __ br(__ GT, LOOP_PRFM);
5773       __ cmp(octetCounter, (u1)8);
5774       __ br(__ LT, DONE);
5775     __ bind(LOOP);
5776       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5777       __ bind(LOOP_START);
5778       __ sub(octetCounter, octetCounter, 8);
5779       __ cmp(octetCounter, (u1)8);
5780       inflate_and_store_2_fp_registers(false, v3, v4);
5781       inflate_and_store_2_fp_registers(false, v5, v6);
5782       __ br(__ GE, LOOP);
5783     __ bind(DONE);
5784       __ ret(lr);
5785     return entry;
5786   }
5787 
5788   /**
5789    *  Arguments:
5790    *
5791    *  Input:
5792    *  c_rarg0   - current state address
5793    *  c_rarg1   - H key address
5794    *  c_rarg2   - data address
5795    *  c_rarg3   - number of blocks
5796    *
5797    *  Output:
5798    *  Updated state at c_rarg0
5799    */
5800   address generate_ghash_processBlocks() {
5801     // Bafflingly, GCM uses little-endian for the byte order, but
5802     // big-endian for the bit order.  For example, the polynomial 1 is
5803     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5804     //
5805     // So, we must either reverse the bytes in each word and do
5806     // everything big-endian or reverse the bits in each byte and do
5807     // it little-endian.  On AArch64 it's more idiomatic to reverse
5808     // the bits in each byte (we have an instruction, RBIT, to do
5809     // that) and keep the data in little-endian bit order throughout the
5810     // calculation, bit-reversing the inputs and outputs.
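         //
         // For example (illustrative only): GCM represents the polynomial "1" as
         // the byte string 80 00 .. 00; reversing the bits in each byte (RBIT)
         // turns the leading 0x80 into 0x01, i.e. the ordinary little-endian
         // integer 1, which is the form the carry-less multiplies below work on.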
5811 
5812     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5813     __ align(wordSize * 2);
5814     address p = __ pc();
5815     __ emit_int64(0x87);  // The low-order bits of the field
5816                           // polynomial (i.e. p = z^7+z^2+z+1)
5817                           // repeated in the low and high parts of a
5818                           // 128-bit vector
5819     __ emit_int64(0x87);
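         // (0x87 == 0b10000111: the coefficients of z^7, z^2, z^1 and z^0.)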
5820 
5821     __ align(CodeEntryAlignment);
5822     address start = __ pc();
5823 
5824     Register state   = c_rarg0;
5825     Register subkeyH = c_rarg1;
5826     Register data    = c_rarg2;
5827     Register blocks  = c_rarg3;
5828 
5829     FloatRegister vzr = v30;
5830     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5831 
5832     __ ldrq(v0, Address(state));
5833     __ ldrq(v1, Address(subkeyH));
5834 
5835     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5836     __ rbit(v0, __ T16B, v0);
5837     __ rev64(v1, __ T16B, v1);
5838     __ rbit(v1, __ T16B, v1);
5839 
5840     __ ldrq(v26, p);
5841 
5842     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5843     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5844 
5845     {
5846       Label L_ghash_loop;
5847       __ bind(L_ghash_loop);
5848 
5849       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5850                                                  // reversing each byte
5851       __ rbit(v2, __ T16B, v2);
5852       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5853 
5854       // Multiply state in v2 by subkey in v1
5855       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5856                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
5857                      /*temps*/v6, v20, v18, v21);
5858       // Reduce v7:v5 by the field polynomial
5859       ghash_reduce(v0, v5, v7, v26, vzr, v20);
5860 
5861       __ sub(blocks, blocks, 1);
5862       __ cbnz(blocks, L_ghash_loop);
5863     }
5864 
5865     // The bit-reversed result is at this point in v0
5866     __ rev64(v1, __ T16B, v0);
5867     __ rbit(v1, __ T16B, v1);
5868 
5869     __ st1(v1, __ T16B, state);
5870     __ ret(lr);
5871 
5872     return start;
5873   }
5874 
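       // Each SIMD round below turns 3*size input bytes into 4*size Base64
       // characters.  A scalar sketch of the index computation for one 3-byte
       // group (b0, b1, b2), which the vector code performs lane-wise:
       //   ind0 =   b0 >> 2;
       //   ind1 = ((b0 << 4) | (b1 >> 4)) & 0x3f;
       //   ind2 = ((b1 << 2) | (b2 >> 6)) & 0x3f;
       //   ind3 =   b2 & 0x3f;
       // Each index is then mapped through the 64-byte codec table with tbl.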
5875   void generate_base64_encode_simdround(Register src, Register dst,
5876         FloatRegister codec, u8 size) {
5877 
5878     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5879     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5880     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5881 
5882     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5883 
5884     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5885 
5886     __ ushr(ind0, arrangement, in0,  2);
5887 
5888     __ ushr(ind1, arrangement, in1,  2);
5889     __ shl(in0,   arrangement, in0,  6);
5890     __ orr(ind1,  arrangement, ind1, in0);
5891     __ ushr(ind1, arrangement, ind1, 2);
5892 
5893     __ ushr(ind2, arrangement, in2,  4);
5894     __ shl(in1,   arrangement, in1,  4);
5895     __ orr(ind2,  arrangement, in1,  ind2);
5896     __ ushr(ind2, arrangement, ind2, 2);
5897 
5898     __ shl(ind3,  arrangement, in2,  2);
5899     __ ushr(ind3, arrangement, ind3, 2);
5900 
5901     __ tbl(out0,  arrangement, codec,  4, ind0);
5902     __ tbl(out1,  arrangement, codec,  4, ind1);
5903     __ tbl(out2,  arrangement, codec,  4, ind2);
5904     __ tbl(out3,  arrangement, codec,  4, ind3);
5905 
5906     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5907   }
5908 
5909    /**
5910    *  Arguments:
5911    *
5912    *  Input:
5913    *  c_rarg0   - src_start
5914    *  c_rarg1   - src_offset
5915    *  c_rarg2   - src_length
5916    *  c_rarg3   - dest_start
5917    *  c_rarg4   - dest_offset
5918    *  c_rarg5   - isURL
5919    *
5920    */
5921   address generate_base64_encodeBlock() {
5922 
5923     static const char toBase64[64] = {
5924       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5925       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5926       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5927       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5928       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5929     };
5930 
5931     static const char toBase64URL[64] = {
5932       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5933       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5934       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5935       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5936       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5937     };
5938 
5939     __ align(CodeEntryAlignment);
5940     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5941     address start = __ pc();
5942 
5943     Register src   = c_rarg0;  // source array
5944     Register soff  = c_rarg1;  // source start offset
5945     Register send  = c_rarg2;  // source end offset
5946     Register dst   = c_rarg3;  // dest array
5947     Register doff  = c_rarg4;  // position for writing to dest array
5948     Register isURL = c_rarg5;  // Base64 or URL character set
5949 
5950     // c_rarg6 and c_rarg7 are free to use as temps
5951     Register codec  = c_rarg6;
5952     Register length = c_rarg7;
5953 
5954     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5955 
5956     __ add(src, src, soff);
5957     __ add(dst, dst, doff);
5958     __ sub(length, send, soff);
5959 
5960     // load the codec base address
5961     __ lea(codec, ExternalAddress((address) toBase64));
5962     __ cbz(isURL, ProcessData);
5963     __ lea(codec, ExternalAddress((address) toBase64URL));
5964 
5965     __ BIND(ProcessData);
5966 
5967     // too short to form a SIMD loop; fall back to byte-by-byte processing
5968     __ cmp(length, (u1)24);
5969     __ br(Assembler::LT, Process3B);
5970 
5971     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5972 
5973     __ BIND(Process48B);
5974     __ cmp(length, (u1)48);
5975     __ br(Assembler::LT, Process24B);
5976     generate_base64_encode_simdround(src, dst, v0, 16);
5977     __ sub(length, length, 48);
5978     __ b(Process48B);
5979 
5980     __ BIND(Process24B);
5981     __ cmp(length, (u1)24);
5982     __ br(Assembler::LT, SIMDExit);
5983     generate_base64_encode_simdround(src, dst, v0, 8);
5984     __ sub(length, length, 24);
5985 
5986     __ BIND(SIMDExit);
5987     __ cbz(length, Exit);
5988 
5989     __ BIND(Process3B);
5990     //  3 src bytes, 24 bits
5991     __ ldrb(r10, __ post(src, 1));
5992     __ ldrb(r11, __ post(src, 1));
5993     __ ldrb(r12, __ post(src, 1));
5994     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5995     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5996     // codec index
5997     __ ubfmw(r15, r12, 18, 23);
5998     __ ubfmw(r14, r12, 12, 17);
5999     __ ubfmw(r13, r12, 6,  11);
6000     __ andw(r12,  r12, 63);
6001     // get the code based on the codec
6002     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6003     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6004     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6005     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6006     __ strb(r15, __ post(dst, 1));
6007     __ strb(r14, __ post(dst, 1));
6008     __ strb(r13, __ post(dst, 1));
6009     __ strb(r12, __ post(dst, 1));
6010     __ sub(length, length, 3);
6011     __ cbnz(length, Process3B);
6012 
6013     __ BIND(Exit);
6014     __ ret(lr);
6015 
6016     return start;
6017   }
6018 
6019   void generate_base64_decode_simdround(Register src, Register dst,
6020         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6021 
6022     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6023     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6024 
6025     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6026     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6027 
6028     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6029 
6030     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6031 
6032     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6033 
6034     // We need an unsigned saturating subtract here so that all input values in
6035     // the range [0, 63] contribute 0U from the higher-half lookup.
6036     __ uqsubv(decH0, __ T16B, in0, v27);
6037     __ uqsubv(decH1, __ T16B, in1, v27);
6038     __ uqsubv(decH2, __ T16B, in2, v27);
6039     __ uqsubv(decH3, __ T16B, in3, v27);
6040 
6041     // lower half lookup
6042     __ tbl(decL0, arrangement, codecL, 4, in0);
6043     __ tbl(decL1, arrangement, codecL, 4, in1);
6044     __ tbl(decL2, arrangement, codecL, 4, in2);
6045     __ tbl(decL3, arrangement, codecL, 4, in3);
6046 
6047     // higher half lookup
6048     __ tbx(decH0, arrangement, codecH, 4, decH0);
6049     __ tbx(decH1, arrangement, codecH, 4, decH1);
6050     __ tbx(decH2, arrangement, codecH, 4, decH2);
6051     __ tbx(decH3, arrangement, codecH, 4, decH3);
6052 
6053     // combine lower and higher
6054     __ orr(decL0, arrangement, decL0, decH0);
6055     __ orr(decL1, arrangement, decL1, decH1);
6056     __ orr(decL2, arrangement, decL2, decH2);
6057     __ orr(decL3, arrangement, decL3, decH3);
6058 
6059     // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
6060     __ cmhi(decH0, arrangement, decL0, v27);
6061     __ cmhi(decH1, arrangement, decL1, v27);
6062     __ cmhi(decH2, arrangement, decL2, v27);
6063     __ cmhi(decH3, arrangement, decL3, v27);
6064     __ orr(in0, arrangement, decH0, decH1);
6065     __ orr(in1, arrangement, decH2, decH3);
6066     __ orr(in2, arrangement, in0,   in1);
6067     __ umaxv(in3, arrangement, in2);
6068     __ umov(rscratch2, in3, __ B, 0);
6069 
6070     // get the data to output
6071     __ shl(out0,  arrangement, decL0, 2);
6072     __ ushr(out1, arrangement, decL1, 4);
6073     __ orr(out0,  arrangement, out0,  out1);
6074     __ shl(out1,  arrangement, decL1, 4);
6075     __ ushr(out2, arrangement, decL2, 2);
6076     __ orr(out1,  arrangement, out1,  out2);
6077     __ shl(out2,  arrangement, decL2, 6);
6078     __ orr(out2,  arrangement, out2,  decL3);
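         // i.e., per 8-bit lane (a sketch):
         //   out0 = (decL0 << 2) | (decL1 >> 4);
         //   out1 = (decL1 << 4) | (decL2 >> 2);
         //   out2 = (decL2 << 6) |  decL3;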
6079 
6080     __ cbz(rscratch2, NoIllegalData);
6081 
6082     // handle illegal input
6083     __ umov(r10, in2, __ D, 0);
6084     if (size == 16) {
6085       __ cbnz(r10, ErrorInLowerHalf);
6086 
6087       // illegal input is in higher half, store the lower half now.
6088       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6089 
6090       __ umov(r10, in2,  __ D, 1);
6091       __ umov(r11, out0, __ D, 1);
6092       __ umov(r12, out1, __ D, 1);
6093       __ umov(r13, out2, __ D, 1);
6094       __ b(StoreLegalData);
6095 
6096       __ BIND(ErrorInLowerHalf);
6097     }
6098     __ umov(r11, out0, __ D, 0);
6099     __ umov(r12, out1, __ D, 0);
6100     __ umov(r13, out2, __ D, 0);
6101 
6102     __ BIND(StoreLegalData);
6103     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6104     __ strb(r11, __ post(dst, 1));
6105     __ strb(r12, __ post(dst, 1));
6106     __ strb(r13, __ post(dst, 1));
6107     __ lsr(r10, r10, 8);
6108     __ lsr(r11, r11, 8);
6109     __ lsr(r12, r12, 8);
6110     __ lsr(r13, r13, 8);
6111     __ b(StoreLegalData);
6112 
6113     __ BIND(NoIllegalData);
6114     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6115   }
6116 
6117 
6118    /**
6119    *  Arguments:
6120    *
6121    *  Input:
6122    *  c_rarg0   - src_start
6123    *  c_rarg1   - src_offset
6124    *  c_rarg2   - src_length
6125    *  c_rarg3   - dest_start
6126    *  c_rarg4   - dest_offset
6127    *  c_rarg5   - isURL
6128    *  c_rarg6   - isMIME
6129    *
6130    */
6131   address generate_base64_decodeBlock() {
6132 
6133     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6134     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6135     // titled "Base64 decoding".
6136 
6137     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
6138     // java.util.Base64, except that the trailing character '=' is also treated as an
6139     // illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6140     static const uint8_t fromBase64ForNoSIMD[256] = {
6141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6144        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6145       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6146        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6147       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6148        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6149       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6150       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6151       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6152       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6156       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6157     };
6158 
6159     static const uint8_t fromBase64URLForNoSIMD[256] = {
6160       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6162       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6163        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6164       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6165        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6166       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6167        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6168       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6169       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6170       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6171       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6176     };
6177 
6178     // A legal Base64 code value is in the range [0, 127].  We need two lookups
6179     // with tbl/tbx and combine the results to get the decoded data. The 1st table
6180     // vector lookup uses tbl: out-of-range indices are set to 0 in the destination.
6181     // The 2nd table vector lookup uses tbx: out-of-range indices leave the
6182     // destination unchanged. Inputs [64..126] map to indices [65, 127] in the 2nd
6183     // lookup. The value at index 64 is 0, so in-range inputs (already decoded by
6184     // the 1st lookup) pick up 0 from the 2nd lookup and keep their decoded value.
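         // For example (illustrative only): for 'A' (65) the 1st lookup index is out
         // of range and yields 0; the saturating subtract gives 65 - 63 = 2, and the
         // 2nd lookup reads fromBase64ForSIMD[64 + 2] = 0, so 'A' decodes to 0.  For
         // an in-range input such as '0' (48) the subtract saturates to 0, the 2nd
         // lookup reads fromBase64ForSIMD[64] = 0, and the 1st lookup's 52 survives.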
6185     static const uint8_t fromBase64ForSIMD[128] = {
6186       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6187       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6188       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6189        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6190         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6191        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6192       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6193        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6194     };
6195 
6196     static const uint8_t fromBase64URLForSIMD[128] = {
6197       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6198       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6199       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6200        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6201         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6202        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6203        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6204        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6205     };
6206 
6207     __ align(CodeEntryAlignment);
6208     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6209     address start = __ pc();
6210 
6211     Register src    = c_rarg0;  // source array
6212     Register soff   = c_rarg1;  // source start offset
6213     Register send   = c_rarg2;  // source end offset
6214     Register dst    = c_rarg3;  // dest array
6215     Register doff   = c_rarg4;  // position for writing to dest array
6216     Register isURL  = c_rarg5;  // Base64 or URL character set
6217     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6218 
6219     Register length = send;    // reuse send as length of source data to process
6220 
6221     Register simd_codec   = c_rarg6;
6222     Register nosimd_codec = c_rarg7;
6223 
6224     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6225 
6226     __ enter();
6227 
6228     __ add(src, src, soff);
6229     __ add(dst, dst, doff);
6230 
6231     __ mov(doff, dst);
6232 
6233     __ sub(length, send, soff);
6234     __ bfm(length, zr, 0, 1);  // clear the low 2 bits of length (round down to a multiple of 4)
6235 
6236     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6237     __ cbz(isURL, ProcessData);
6238     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6239 
6240     __ BIND(ProcessData);
6241     __ mov(rscratch1, length);
6242     __ cmp(length, (u1)144); // 144 = 80 + 64
6243     __ br(Assembler::LT, Process4B);
6244 
6245     // In the MIME case, the line length cannot be more than 76
6246     // bytes (see RFC 2045). This is too short a block for SIMD
6247     // to be worthwhile, so we use non-SIMD here.
6248     __ movw(rscratch1, 79);
6249 
6250     __ BIND(Process4B);
6251     __ ldrw(r14, __ post(src, 4));
6252     __ ubfxw(r10, r14, 0,  8);
6253     __ ubfxw(r11, r14, 8,  8);
6254     __ ubfxw(r12, r14, 16, 8);
6255     __ ubfxw(r13, r14, 24, 8);
6256     // look up the decoded values
6257     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6258     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6259     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6260     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6261     // error detection, 255u indicates an illegal input
6262     __ orrw(r14, r10, r11);
6263     __ orrw(r15, r12, r13);
6264     __ orrw(r14, r14, r15);
6265     __ tbnz(r14, 7, Exit);
6266     // recover the data
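         // conceptually (r10..r13 hold the four 6-bit values; a sketch):
         //   byte0 = (r10 << 2) | (r11 >> 4);   // packed into r14, byte-swapped
         //   byte1 = (r11 << 4) | (r12 >> 2);   //   by rev16 and stored via strh
         //   byte2 = (r12 << 6) |  r13;         //   stored via bfiw + strb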
6267     __ lslw(r14, r10, 10);
6268     __ bfiw(r14, r11, 4, 6);
6269     __ bfmw(r14, r12, 2, 5);
6270     __ rev16w(r14, r14);
6271     __ bfiw(r13, r12, 6, 2);
6272     __ strh(r14, __ post(dst, 2));
6273     __ strb(r13, __ post(dst, 1));
6274     // non-simd loop
6275     __ subsw(rscratch1, rscratch1, 4);
6276     __ br(Assembler::GT, Process4B);
6277 
6278     // if exiting from the 80-byte pre-processing above (rscratch1 started at 79),
6279     // rscratch1 == -1 here; otherwise, rscratch1 == 0.
6280     __ cbzw(rscratch1, Exit);
6281     __ sub(length, length, 80);
6282 
6283     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6284     __ cbz(isURL, SIMDEnter);
6285     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6286 
6287     __ BIND(SIMDEnter);
6288     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6289     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6290     __ mov(rscratch1, 63);
6291     __ dup(v27, __ T16B, rscratch1);
6292 
6293     __ BIND(Process64B);
6294     __ cmp(length, (u1)64);
6295     __ br(Assembler::LT, Process32B);
6296     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6297     __ sub(length, length, 64);
6298     __ b(Process64B);
6299 
6300     __ BIND(Process32B);
6301     __ cmp(length, (u1)32);
6302     __ br(Assembler::LT, SIMDExit);
6303     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6304     __ sub(length, length, 32);
6305     __ b(Process32B);
6306 
6307     __ BIND(SIMDExit);
6308     __ cbz(length, Exit);
6309     __ movw(rscratch1, length);
6310     __ b(Process4B);
6311 
6312     __ BIND(Exit);
6313     __ sub(c_rarg0, dst, doff);
6314 
6315     __ leave();
6316     __ ret(lr);
6317 
6318     return start;
6319   }
6320 
6321   address generate_ghash_processBlocks_wide() {
6322     address small = generate_ghash_processBlocks();
6323 
6324     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6325     __ align(wordSize * 2);
6326     address p = __ pc();
6327     __ emit_int64(0x87);  // The low-order bits of the field
6328                           // polynomial (i.e. p = z^7+z^2+z+1)
6329                           // repeated in the low and high parts of a
6330                           // 128-bit vector
6331     __ emit_int64(0x87);
6332 
6333     __ align(CodeEntryAlignment);
6334     address start = __ pc();
6335 
6336     Register state   = c_rarg0;
6337     Register subkeyH = c_rarg1;
6338     Register data    = c_rarg2;
6339     Register blocks  = c_rarg3;
6340 
6341     const int unroll = 4;
6342 
6343     __ cmp(blocks, (unsigned char)(unroll * 2));
6344     __ br(__ LT, small);
6345 
6346     if (unroll > 1) {
6347       // Save the callee-saved vector registers before entering the routine
6348       __ sub(sp, sp, 4 * 16);
6349       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6350       __ sub(sp, sp, 4 * 16);
6351       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6352     }
6353 
6354     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6355 
6356     if (unroll > 1) {
6357       // And restore state
6358       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6359       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6360     }
6361 
6362     __ cmp(blocks, zr);
6363     __ br(__ GT, small);
6364 
6365     __ ret(lr);
6366 
6367     return start;
6368   }
6369 
6370   // Support for spin waits.
6371   address generate_spin_wait() {
6372     __ align(CodeEntryAlignment);
6373     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6374     address start = __ pc();
6375 
6376     __ spin_wait();
6377     __ ret(lr);
6378 
6379     return start;
6380   }
6381 
6382 #ifdef LINUX
6383 
6384   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6385   //
6386   // If LSE is in use, generate LSE versions of all the stubs. The
6387   // non-LSE versions are in atomic_aarch64.S.
6388 
6389   // class AtomicStubMark records the entry point of a stub and the
6390   // stub pointer which will point to it. The stub pointer is set to
6391   // the entry point when ~AtomicStubMark() is called, which must be
6392   // after ICache::invalidate_range. This ensures safe publication of
6393   // the generated code.
6394   class AtomicStubMark {
6395     address _entry_point;
6396     aarch64_atomic_stub_t *_stub;
6397     MacroAssembler *_masm;
6398   public:
6399     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6400       _masm = masm;
6401       __ align(32);
6402       _entry_point = __ pc();
6403       _stub = stub;
6404     }
6405     ~AtomicStubMark() {
6406       *_stub = (aarch64_atomic_stub_t)_entry_point;
6407     }
6408   };
6409 
6410   // NB: For memory_order_conservative we need a trailing membar after
6411   // LSE atomic operations but not a leading membar.
6412   //
6413   // We don't need a leading membar because a clause in the Arm ARM
6414   // says:
6415   //
6416   //   Barrier-ordered-before
6417   //
6418   //   Barrier instructions order prior Memory effects before subsequent
6419   //   Memory effects generated by the same Observer. A read or a write
6420   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6421   //   Observer if and only if RW1 appears in program order before RW2
6422   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6423   //   instruction with both Acquire and Release semantics.
6424   //
6425   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6426   // and Release semantics, therefore we don't need a leading
6427   // barrier. However, there is no corresponding Barrier-ordered-after
6428   // relationship, therefore we need a trailing membar to prevent a
6429   // later store or load from being reordered with the store in an
6430   // atomic instruction.
6431   //
6432   // This was checked by using the herd7 consistency model simulator
6433   // (http://diy.inria.fr/) with this test case:
6434   //
6435   // AArch64 LseCas
6436   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6437   // P0 | P1;
6438   // LDR W4, [X2] | MOV W3, #0;
6439   // DMB LD       | MOV W4, #1;
6440   // LDR W3, [X1] | CASAL W3, W4, [X1];
6441   //              | DMB ISH;
6442   //              | STR W4, [X2];
6443   // exists
6444   // (0:X3=0 /\ 0:X4=1)
6445   //
6446   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6447   // with the store to x in P1. Without the DMB in P1 this may happen.
6448   //
6449   // At the time of writing we don't know of any AArch64 hardware that
6450   // reorders stores in this way, but the Reference Manual permits it.
6451 
6452   void gen_cas_entry(Assembler::operand_size size,
6453                      atomic_memory_order order) {
6454     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6455       exchange_val = c_rarg2;
6456     bool acquire, release;
6457     switch (order) {
6458       case memory_order_relaxed:
6459         acquire = false;
6460         release = false;
6461         break;
6462       case memory_order_release:
6463         acquire = false;
6464         release = true;
6465         break;
6466       default:
6467         acquire = true;
6468         release = true;
6469         break;
6470     }
6471     __ mov(prev, compare_val);
6472     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6473     if (order == memory_order_conservative) {
6474       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6475     }
6476     if (size == Assembler::xword) {
6477       __ mov(r0, prev);
6478     } else {
6479       __ movw(r0, prev);
6480     }
6481     __ ret(lr);
6482   }
6483 
6484   void gen_ldaddal_entry(Assembler::operand_size size) {
6485     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6486     __ ldaddal(size, incr, prev, addr);
6487     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6488     if (size == Assembler::xword) {
6489       __ mov(r0, prev);
6490     } else {
6491       __ movw(r0, prev);
6492     }
6493     __ ret(lr);
6494   }
6495 
6496   void gen_swpal_entry(Assembler::operand_size size) {
6497     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6498     __ swpal(size, incr, prev, addr);
6499     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6500     if (size == Assembler::xword) {
6501       __ mov(r0, prev);
6502     } else {
6503       __ movw(r0, prev);
6504     }
6505     __ ret(lr);
6506   }
6507 
6508   void generate_atomic_entry_points() {
6509     if (! UseLSE) {
6510       return;
6511     }
6512 
6513     __ align(CodeEntryAlignment);
6514     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6515     address first_entry = __ pc();
6516 
6517     // All memory_order_conservative
6518     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6519     gen_ldaddal_entry(Assembler::word);
6520     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6521     gen_ldaddal_entry(Assembler::xword);
6522 
6523     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6524     gen_swpal_entry(Assembler::word);
6525     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6526     gen_swpal_entry(Assembler::xword);
6527 
6528     // CAS, memory_order_conservative
6529     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6530     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6531     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6532     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6533     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6534     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6535 
6536     // CAS, memory_order_relaxed
6537     AtomicStubMark mark_cmpxchg_1_relaxed
6538       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6539     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6540     AtomicStubMark mark_cmpxchg_4_relaxed
6541       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6542     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6543     AtomicStubMark mark_cmpxchg_8_relaxed
6544       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6545     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6546 
6547     AtomicStubMark mark_cmpxchg_4_release
6548       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6549     gen_cas_entry(MacroAssembler::word, memory_order_release);
6550     AtomicStubMark mark_cmpxchg_8_release
6551       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6552     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6553 
6554     AtomicStubMark mark_cmpxchg_4_seq_cst
6555       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6556     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6557     AtomicStubMark mark_cmpxchg_8_seq_cst
6558       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6559     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6560 
6561     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6562   }
6563 #endif // LINUX
6564 
6565   // Continuation point for throwing of implicit exceptions that are
6566   // not handled in the current activation. Fabricates an exception
6567   // oop and initiates normal exception dispatching in this
6568   // frame. Since we need to preserve callee-saved values (currently
6569   // only for C2, but done for C1 as well) we need a callee-saved oop
6570   // map and therefore have to make these stubs into RuntimeStubs
6571   // rather than BufferBlobs.  If the compiler needs all registers to
6572   // be preserved between the fault point and the exception handler
6573   // then it must assume responsibility for that in
6574   // AbstractCompiler::continuation_for_implicit_null_exception or
6575   // continuation_for_implicit_division_by_zero_exception. All other
6576   // implicit exceptions (e.g., NullPointerException or
6577   // AbstractMethodError on entry) are either at call sites or
6578   // otherwise assume that stack unwinding will be initiated, so
6579   // caller saved registers were assumed volatile in the compiler.
6580 
6581 #undef __
6582 #define __ masm->
6583 
6584   address generate_throw_exception(const char* name,
6585                                    address runtime_entry,
6586                                    Register arg1 = noreg,
6587                                    Register arg2 = noreg) {
6588     // Information about frame layout at time of blocking runtime call.
6589     // Note that we only have to preserve callee-saved registers since
6590     // the compilers are responsible for supplying a continuation point
6591     // if they expect all registers to be preserved.
6592     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6593     enum layout {
6594       rfp_off = 0,
6595       rfp_off2,
6596       return_off,
6597       return_off2,
6598       framesize // inclusive of return address
6599     };
6600 
6601     int insts_size = 512;
6602     int locs_size  = 64;
6603 
6604     CodeBuffer code(name, insts_size, locs_size);
6605     OopMapSet* oop_maps  = new OopMapSet();
6606     MacroAssembler* masm = new MacroAssembler(&code);
6607 
6608     address start = __ pc();
6609 
6610     // This is an inlined and slightly modified version of call_VM
6611     // which has the ability to fetch the return PC out of
6612     // thread-local storage and also sets up last_Java_sp slightly
6613     // differently than the real call_VM
6614 
6615     __ enter(); // Save FP and LR before call
6616 
6617     assert(is_even(framesize/2), "sp not 16-byte aligned");
6618 
6619     // lr and fp are already in place
6620     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
6621 
6622     int frame_complete = __ pc() - start;
6623 
6624     // Set up last_Java_sp and last_Java_fp
6625     address the_pc = __ pc();
6626     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6627 
6628     // Call runtime
6629     if (arg1 != noreg) {
6630       assert(arg2 != c_rarg1, "clobbered");
6631       __ mov(c_rarg1, arg1);
6632     }
6633     if (arg2 != noreg) {
6634       __ mov(c_rarg2, arg2);
6635     }
6636     __ mov(c_rarg0, rthread);
6637     BLOCK_COMMENT("call runtime_entry");
6638     __ mov(rscratch1, runtime_entry);
6639     __ blr(rscratch1);
6640 
6641     // Generate oop map
6642     OopMap* map = new OopMap(framesize, 0);
6643 
6644     oop_maps->add_gc_map(the_pc - start, map);
6645 
6646     __ reset_last_Java_frame(true);
6647 
6648     // Reinitialize the ptrue predicate register, in case the external runtime
6649     // call clobbers ptrue reg, as we may return to SVE compiled code.
6650     __ reinitialize_ptrue();
6651 
6652     __ leave();
6653 
6654     // check for pending exceptions
6655 #ifdef ASSERT
6656     Label L;
6657     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6658     __ cbnz(rscratch1, L);
6659     __ should_not_reach_here();
6660     __ bind(L);
6661 #endif // ASSERT
6662     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6663 
6664 
6665     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6666     RuntimeStub* stub =
6667       RuntimeStub::new_runtime_stub(name,
6668                                     &code,
6669                                     frame_complete,
6670                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6671                                     oop_maps, false);
6672     return stub->entry_point();
6673   }
6674 
6675   class MontgomeryMultiplyGenerator : public MacroAssembler {
6676 
6677     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6678       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6679 
6680     RegSet _toSave;
6681     bool _squaring;
6682 
6683   public:
6684     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6685       : MacroAssembler(as->code()), _squaring(squaring) {
6686 
6687       // Register allocation
6688 
6689       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6690       Pa_base = *regs;       // Argument registers
6691       if (squaring)
6692         Pb_base = Pa_base;
6693       else
6694         Pb_base = *++regs;
6695       Pn_base = *++regs;
6696       Rlen= *++regs;
6697       inv = *++regs;
6698       Pm_base = *++regs;
6699 
6700                           // Working registers:
6701       Ra =  *++regs;        // The current digit of a, b, n, and m.
6702       Rb =  *++regs;
6703       Rm =  *++regs;
6704       Rn =  *++regs;
6705 
6706       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6707       Pb =  *++regs;
6708       Pm =  *++regs;
6709       Pn =  *++regs;
6710 
6711       t0 =  *++regs;        // Three registers which form a
6712       t1 =  *++regs;        // triple-precision accumulator.
6713       t2 =  *++regs;
6714 
6715       Ri =  *++regs;        // Inner and outer loop indexes.
6716       Rj =  *++regs;
6717 
6718       Rhi_ab = *++regs;     // Product registers: low and high parts
6719       Rlo_ab = *++regs;     // of a*b and m*n.
6720       Rhi_mn = *++regs;
6721       Rlo_mn = *++regs;
6722 
6723       // r19 and up are callee-saved.
6724       _toSave = RegSet::range(r19, *regs) + Pm_base;
6725     }
6726 
6727   private:
6728     void save_regs() {
6729       push(_toSave, sp);
6730     }
6731 
6732     void restore_regs() {
6733       pop(_toSave, sp);
6734     }
6735 
6736     template <typename T>
6737     void unroll_2(Register count, T block) {
6738       Label loop, end, odd;
6739       tbnz(count, 0, odd);
6740       cbz(count, end);
6741       align(16);
6742       bind(loop);
6743       (this->*block)();
6744       bind(odd);
6745       (this->*block)();
6746       subs(count, count, 2);
6747       br(Assembler::GT, loop);
6748       bind(end);
6749     }
6750 
6751     template <typename T>
6752     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6753       Label loop, end, odd;
6754       tbnz(count, 0, odd);
6755       cbz(count, end);
6756       align(16);
6757       bind(loop);
6758       (this->*block)(d, s, tmp);
6759       bind(odd);
6760       (this->*block)(d, s, tmp);
6761       subs(count, count, 2);
6762       br(Assembler::GT, loop);
6763       bind(end);
6764     }
6765 
6766     void pre1(RegisterOrConstant i) {
6767       block_comment("pre1");
6768       // Pa = Pa_base;
6769       // Pb = Pb_base + i;
6770       // Pm = Pm_base;
6771       // Pn = Pn_base + i;
6772       // Ra = *Pa;
6773       // Rb = *Pb;
6774       // Rm = *Pm;
6775       // Rn = *Pn;
6776       ldr(Ra, Address(Pa_base));
6777       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6778       ldr(Rm, Address(Pm_base));
6779       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6780       lea(Pa, Address(Pa_base));
6781       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6782       lea(Pm, Address(Pm_base));
6783       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6784 
6785       // Zero the m*n result.
6786       mov(Rhi_mn, zr);
6787       mov(Rlo_mn, zr);
6788     }
6789 
6790     // The core multiply-accumulate step of a Montgomery
6791     // multiplication.  The idea is to schedule operations as a
6792     // pipeline so that instructions with long latencies (loads and
6793     // multiplies) have time to complete before their results are
6794     // used.  This most benefits in-order implementations of the
6795     // architecture but out-of-order ones also benefit.
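         //
         // In C-like pseudocode, each MACC(x, y, t0, t1, t2) below accumulates the
         // 128-bit product of x and y into the triple-precision accumulator
         // (a sketch):
         //   lo = lower64(x * y);  hi = upper64(x * y);   // mul / umulh
         //   t0 += lo;  t1 += hi + carry;  t2 += carry;   // adds / adcs / adc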
6796     void step() {
6797       block_comment("step");
6798       // MACC(Ra, Rb, t0, t1, t2);
6799       // Ra = *++Pa;
6800       // Rb = *--Pb;
6801       umulh(Rhi_ab, Ra, Rb);
6802       mul(Rlo_ab, Ra, Rb);
6803       ldr(Ra, pre(Pa, wordSize));
6804       ldr(Rb, pre(Pb, -wordSize));
6805       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6806                                        // previous iteration.
6807       // MACC(Rm, Rn, t0, t1, t2);
6808       // Rm = *++Pm;
6809       // Rn = *--Pn;
6810       umulh(Rhi_mn, Rm, Rn);
6811       mul(Rlo_mn, Rm, Rn);
6812       ldr(Rm, pre(Pm, wordSize));
6813       ldr(Rn, pre(Pn, -wordSize));
6814       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6815     }
6816 
6817     void post1() {
6818       block_comment("post1");
6819 
6820       // MACC(Ra, Rb, t0, t1, t2);
6821       // Ra = *++Pa;
6822       // Rb = *--Pb;
6823       umulh(Rhi_ab, Ra, Rb);
6824       mul(Rlo_ab, Ra, Rb);
6825       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6826       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6827 
6828       // *Pm = Rm = t0 * inv;
6829       mul(Rm, t0, inv);
6830       str(Rm, Address(Pm));
6831 
6832       // MACC(Rm, Rn, t0, t1, t2);
6833       // t0 = t1; t1 = t2; t2 = 0;
6834       umulh(Rhi_mn, Rm, Rn);
6835 
6836 #ifndef PRODUCT
6837       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6838       {
6839         mul(Rlo_mn, Rm, Rn);
6840         add(Rlo_mn, t0, Rlo_mn);
6841         Label ok;
6842         cbz(Rlo_mn, ok); {
6843           stop("broken Montgomery multiply");
6844         } bind(ok);
6845       }
6846 #endif
6847       // We have very carefully set things up so that
6848       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6849       // the lower half of Rm * Rn because we know the result already:
6850       // it must be -t0.  t0 + (-t0) must generate a carry iff
6851       // t0 != 0.  So, rather than do a mul and an adds we just set
6852       // the carry flag iff t0 is nonzero.
6853       //
6854       // mul(Rlo_mn, Rm, Rn);
6855       // adds(zr, t0, Rlo_mn);
6856       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6857       adcs(t0, t1, Rhi_mn);
6858       adc(t1, t2, zr);
6859       mov(t2, zr);
6860     }
6861 
6862     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6863       block_comment("pre2");
6864       // Pa = Pa_base + i-len;
6865       // Pb = Pb_base + len;
6866       // Pm = Pm_base + i-len;
6867       // Pn = Pn_base + len;
6868 
6869       if (i.is_register()) {
6870         sub(Rj, i.as_register(), len);
6871       } else {
6872         mov(Rj, i.as_constant());
6873         sub(Rj, Rj, len);
6874       }
6875       // Rj == i-len
6876 
6877       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6878       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6879       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6880       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6881 
6882       // Ra = *++Pa;
6883       // Rb = *--Pb;
6884       // Rm = *++Pm;
6885       // Rn = *--Pn;
6886       ldr(Ra, pre(Pa, wordSize));
6887       ldr(Rb, pre(Pb, -wordSize));
6888       ldr(Rm, pre(Pm, wordSize));
6889       ldr(Rn, pre(Pn, -wordSize));
6890 
6891       mov(Rhi_mn, zr);
6892       mov(Rlo_mn, zr);
6893     }
6894 
6895     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6896       block_comment("post2");
6897       if (i.is_constant()) {
6898         mov(Rj, i.as_constant()-len.as_constant());
6899       } else {
6900         sub(Rj, i.as_register(), len);
6901       }
6902 
6903       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6904 
6905       // As soon as we know the least significant digit of our result,
6906       // store it.
6907       // Pm_base[i-len] = t0;
6908       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6909 
6910       // t0 = t1; t1 = t2; t2 = 0;
6911       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6912       adc(t1, t2, zr);
6913       mov(t2, zr);
6914     }
6915 
6916     // A carry in t0 after Montgomery multiplication means that we
6917     // should subtract multiples of n from our result in m.  We'll
6918     // keep doing that until there is no carry.
6919     void normalize(RegisterOrConstant len) {
6920       block_comment("normalize");
6921       // while (t0)
6922       //   t0 = sub(Pm_base, Pn_base, t0, len);
6923       Label loop, post, again;
6924       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6925       cbz(t0, post); {
6926         bind(again); {
6927           mov(i, zr);
6928           mov(cnt, len);
6929           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6930           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6931           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6932           align(16);
6933           bind(loop); {
6934             sbcs(Rm, Rm, Rn);
6935             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6936             add(i, i, 1);
6937             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6938             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6939             sub(cnt, cnt, 1);
6940           } cbnz(cnt, loop);
6941           sbc(t0, t0, zr);
6942         } cbnz(t0, again);
6943       } bind(post);
6944     }
6945 
6946     // Move memory at s to d, reversing words.
6947     //    Increments d to end of copied memory
6948     //    Destroys tmp1, tmp2
6949     //    Preserves len
6950     //    Leaves s pointing to the address which was in d at start
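         //    For example (illustrative): with a little-endian int array {0,1,2,3}
         //    as input, the 64-bit words are copied in reverse order and ror #32
         //    swaps the two ints inside each word, so d receives {3,2,1,0}.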
6951     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6952       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6953 
6954       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6955       mov(tmp1, len);
6956       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6957       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6958     }
6959     // where
6960     void reverse1(Register d, Register s, Register tmp) {
6961       ldr(tmp, pre(s, -wordSize));
6962       ror(tmp, tmp, 32);
6963       str(tmp, post(d, wordSize));
6964     }
6965 
6966     void step_squaring() {
6967       // An extra ACC
6968       step();
6969       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6970     }
6971 
6972     void last_squaring(RegisterOrConstant i) {
6973       Label dont;
6974       // if ((i & 1) == 0) {
6975       tbnz(i.as_register(), 0, dont); {
6976         // MACC(Ra, Rb, t0, t1, t2);
6977         // Ra = *++Pa;
6978         // Rb = *--Pb;
6979         umulh(Rhi_ab, Ra, Rb);
6980         mul(Rlo_ab, Ra, Rb);
6981         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6982       } bind(dont);
6983     }
6984 
6985     void extra_step_squaring() {
6986       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6987 
6988       // MACC(Rm, Rn, t0, t1, t2);
6989       // Rm = *++Pm;
6990       // Rn = *--Pn;
6991       umulh(Rhi_mn, Rm, Rn);
6992       mul(Rlo_mn, Rm, Rn);
6993       ldr(Rm, pre(Pm, wordSize));
6994       ldr(Rn, pre(Pn, -wordSize));
6995     }
6996 
6997     void post1_squaring() {
6998       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6999 
7000       // *Pm = Rm = t0 * inv;
7001       mul(Rm, t0, inv);
7002       str(Rm, Address(Pm));
7003 
7004       // MACC(Rm, Rn, t0, t1, t2);
7005       // t0 = t1; t1 = t2; t2 = 0;
7006       umulh(Rhi_mn, Rm, Rn);
7007 
7008 #ifndef PRODUCT
7009       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7010       {
7011         mul(Rlo_mn, Rm, Rn);
7012         add(Rlo_mn, t0, Rlo_mn);
7013         Label ok;
7014         cbz(Rlo_mn, ok); {
7015           stop("broken Montgomery multiply");
7016         } bind(ok);
7017       }
7018 #endif
7019       // We have very carefully set things up so that
7020       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7021       // the lower half of Rm * Rn because we know the result already:
7022       // it must be -t0.  t0 + (-t0) must generate a carry iff
7023       // t0 != 0.  So, rather than do a mul and an adds we just set
7024       // the carry flag iff t0 is nonzero.
7025       //
7026       // mul(Rlo_mn, Rm, Rn);
7027       // adds(zr, t0, Rlo_mn);
7028       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7029       adcs(t0, t1, Rhi_mn);
7030       adc(t1, t2, zr);
7031       mov(t2, zr);
7032     }
7033 
7034     void acc(Register Rhi, Register Rlo,
7035              Register t0, Register t1, Register t2) {
7036       adds(t0, t0, Rlo);
7037       adcs(t1, t1, Rhi);
7038       adc(t2, t2, zr);
7039     }
7040 
7041   public:
7042     /**
7043      * Fast Montgomery multiplication.  The derivation of the
7044      * algorithm is in A Cryptographic Library for the Motorola
7045      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7046      *
7047      * Arguments:
7048      *
7049      * Inputs for multiplication:
7050      *   c_rarg0   - int array elements a
7051      *   c_rarg1   - int array elements b
7052      *   c_rarg2   - int array elements n (the modulus)
7053      *   c_rarg3   - int length
7054      *   c_rarg4   - int inv
7055      *   c_rarg5   - int array elements m (the result)
7056      *
7057      * Inputs for squaring:
7058      *   c_rarg0   - int array elements a
7059      *   c_rarg1   - int array elements n (the modulus)
7060      *   c_rarg2   - int length
7061      *   c_rarg3   - int inv
7062      *   c_rarg4   - int array elements m (the result)
7063      *
7064      */
7065     address generate_multiply() {
7066       Label argh, nothing;
7067       bind(argh);
7068       stop("MontgomeryMultiply total_allocation must be <= 8192");
7069 
7070       align(CodeEntryAlignment);
7071       address entry = pc();
7072 
7073       cbzw(Rlen, nothing);
7074 
7075       enter();
7076 
7077       // Make room.
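           // Rlen is the length of the input arrays in ints.  We reserve
           // Rlen * 4 * sizeof (jint) bytes of stack below sp for the
           // reversed copies of the inputs and for the result; the 512-int
           // cap keeps this allocation at or below the 8192 bytes named in
           // the stop() above.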
7078       cmpw(Rlen, 512);
7079       br(Assembler::HI, argh);
7080       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7081       andr(sp, Ra, -2 * wordSize);
7082 
7083       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7084 
7085       {
7086         // Copy input args, reversing as we go.  We use Ra as a
7087         // temporary variable.
7088         reverse(Ra, Pa_base, Rlen, t0, t1);
7089         if (!_squaring)
7090           reverse(Ra, Pb_base, Rlen, t0, t1);
7091         reverse(Ra, Pn_base, Rlen, t0, t1);
7092       }
7093 
7094       // Push all callee-saved registers, and also Pm_base, which we'll
7095       // need at the end.
7096       save_regs();
7097 
7098 #ifndef PRODUCT
7099       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7100       {
7101         ldr(Rn, Address(Pn_base, 0));
7102         mul(Rlo_mn, Rn, inv);
7103         subs(zr, Rlo_mn, -1);
7104         Label ok;
7105         br(EQ, ok); {
7106           stop("broken inverse in Montgomery multiply");
7107         } bind(ok);
7108       }
7109 #endif
7110 
7111       mov(Pm_base, Ra);
7112 
7113       mov(t0, zr);
7114       mov(t1, zr);
7115       mov(t2, zr);
7116 
7117       block_comment("for (int i = 0; i < len; i++) {");
7118       mov(Ri, zr); {
7119         Label loop, end;
7120         cmpw(Ri, Rlen);
7121         br(Assembler::GE, end);
7122 
7123         bind(loop);
7124         pre1(Ri);
7125 
7126         block_comment("  for (j = i; j; j--) {"); {
7127           movw(Rj, Ri);
7128           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7129         } block_comment("  } // j");
7130 
7131         post1();
7132         addw(Ri, Ri, 1);
7133         cmpw(Ri, Rlen);
7134         br(Assembler::LT, loop);
7135         bind(end);
7136         block_comment("} // i");
7137       }
7138 
7139       block_comment("for (int i = len; i < 2*len; i++) {");
7140       mov(Ri, Rlen); {
7141         Label loop, end;
7142         cmpw(Ri, Rlen, Assembler::LSL, 1);
7143         br(Assembler::GE, end);
7144 
7145         bind(loop);
7146         pre2(Ri, Rlen);
7147 
7148         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7149           lslw(Rj, Rlen, 1);
7150           subw(Rj, Rj, Ri);
7151           subw(Rj, Rj, 1);
7152           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7153         } block_comment("  } // j");
7154 
7155         post2(Ri, Rlen);
7156         addw(Ri, Ri, 1);
7157         cmpw(Ri, Rlen, Assembler::LSL, 1);
7158         br(Assembler::LT, loop);
7159         bind(end);
7160       }
7161       block_comment("} // i");
7162 
7163       normalize(Rlen);
7164 
7165       mov(Ra, Pm_base);  // Save Pm_base in Ra
7166       restore_regs();  // Restore caller's Pm_base
7167 
7168       // Copy our result into caller's Pm_base
7169       reverse(Pm_base, Ra, Rlen, t0, t1);
7170 
7171       leave();
7172       bind(nothing);
7173       ret(lr);
7174 
7175       return entry;
7176     }
7177     // In C, approximately:
7178 
7179     // void
7180     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7181     //                     julong Pn_base[], julong Pm_base[],
7182     //                     julong inv, int len) {
7183     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7184     //   julong *Pa, *Pb, *Pn, *Pm;
7185     //   julong Ra, Rb, Rn, Rm;
7186 
7187     //   int i;
7188 
7189     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7190 
7191     //   for (i = 0; i < len; i++) {
7192     //     int j;
7193 
7194     //     Pa = Pa_base;
7195     //     Pb = Pb_base + i;
7196     //     Pm = Pm_base;
7197     //     Pn = Pn_base + i;
7198 
7199     //     Ra = *Pa;
7200     //     Rb = *Pb;
7201     //     Rm = *Pm;
7202     //     Rn = *Pn;
7203 
7204     //     int iters = i;
7205     //     for (j = 0; iters--; j++) {
7206     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7207     //       MACC(Ra, Rb, t0, t1, t2);
7208     //       Ra = *++Pa;
7209     //       Rb = *--Pb;
7210     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7211     //       MACC(Rm, Rn, t0, t1, t2);
7212     //       Rm = *++Pm;
7213     //       Rn = *--Pn;
7214     //     }
7215 
7216     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7217     //     MACC(Ra, Rb, t0, t1, t2);
7218     //     *Pm = Rm = t0 * inv;
7219     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7220     //     MACC(Rm, Rn, t0, t1, t2);
7221 
7222     //     assert(t0 == 0, "broken Montgomery multiply");
7223 
7224     //     t0 = t1; t1 = t2; t2 = 0;
7225     //   }
7226 
7227     //   for (i = len; i < 2*len; i++) {
7228     //     int j;
7229 
7230     //     Pa = Pa_base + i-len;
7231     //     Pb = Pb_base + len;
7232     //     Pm = Pm_base + i-len;
7233     //     Pn = Pn_base + len;
7234 
7235     //     Ra = *++Pa;
7236     //     Rb = *--Pb;
7237     //     Rm = *++Pm;
7238     //     Rn = *--Pn;
7239 
7240     //     int iters = len*2-i-1;
7241     //     for (j = i-len+1; iters--; j++) {
7242     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7243     //       MACC(Ra, Rb, t0, t1, t2);
7244     //       Ra = *++Pa;
7245     //       Rb = *--Pb;
7246     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7247     //       MACC(Rm, Rn, t0, t1, t2);
7248     //       Rm = *++Pm;
7249     //       Rn = *--Pn;
7250     //     }
7251 
7252     //     Pm_base[i-len] = t0;
7253     //     t0 = t1; t1 = t2; t2 = 0;
7254     //   }
7255 
7256     //   while (t0)
7257     //     t0 = sub(Pm_base, Pn_base, t0, len);
7258     // }
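
         // The MACC used in the pseudocode above is a multiply-accumulate
         // into the triple-precision accumulator t0:t1:t2, matching the
         // umulh/mul/acc sequence generated by step().  A minimal C++
         // sketch of its semantics, assuming a 128-bit unsigned type is
         // available, would be:
         //
         // static void MACC(julong a, julong b,
         //                  julong &t0, julong &t1, julong &t2) {
         //   unsigned __int128 prod = (unsigned __int128)a * b;  // full 128-bit product
         //   unsigned __int128 sum = (unsigned __int128)t0 + (julong)prod;
         //   t0 = (julong)sum;                                   // t0 += lo(a*b)
         //   sum = (unsigned __int128)t1 + (julong)(prod >> 64) + (julong)(sum >> 64);
         //   t1 = (julong)sum;                                   // t1 += hi(a*b) + carry
         //   t2 += (julong)(sum >> 64);                          // t2 += carry
         // }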
7259 
7260     /**
7261      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7262      * multiplies than Montgomery multiplication so it should be up to
7263      * 25% faster.  However, its loop control is more complex and it
7264      * may actually run slower on some machines.
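          * (Roughly: the a*a half of the work computes each cross product
          * a[i]*a[j], i != j, once and doubles it, so it needs about
          * n^2/2 word multiplies instead of n^2, while the m*n reduction
          * still needs n^2, for a total of about 1.5*n^2 rather than
          * 2*n^2.)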
7265      *
7266      * Arguments:
7267      *
7268      * Inputs:
7269      *   c_rarg0   - int array elements a
7270      *   c_rarg1   - int array elements n (the modulus)
7271      *   c_rarg2   - int length
7272      *   c_rarg3   - int inv
7273      *   c_rarg4   - int array elements m (the result)
7274      *
7275      */
7276     address generate_square() {
7277       Label argh;
7278       bind(argh);
7279       stop("MontgomeryMultiply total_allocation must be <= 8192");
7280 
7281       align(CodeEntryAlignment);
7282       address entry = pc();
7283 
7284       enter();
7285 
7286       // Make room.
7287       cmpw(Rlen, 512);
7288       br(Assembler::HI, argh);
7289       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7290       andr(sp, Ra, -2 * wordSize);
7291 
7292       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7293 
7294       {
7295         // Copy input args, reversing as we go.  We use Ra as a
7296         // temporary variable.
7297         reverse(Ra, Pa_base, Rlen, t0, t1);
7298         reverse(Ra, Pn_base, Rlen, t0, t1);
7299       }
7300 
7301       // Push all callee-saved registers, and also Pm_base, which we'll
7302       // need at the end.
7303       save_regs();
7304 
7305       mov(Pm_base, Ra);
7306 
7307       mov(t0, zr);
7308       mov(t1, zr);
7309       mov(t2, zr);
7310 
7311       block_comment("for (int i = 0; i < len; i++) {");
7312       mov(Ri, zr); {
7313         Label loop, end;
7314         bind(loop);
7315         cmp(Ri, Rlen);
7316         br(Assembler::GE, end);
7317 
7318         pre1(Ri);
7319 
7320         block_comment("for (j = (i+1)/2; j; j--) {"); {
7321           add(Rj, Ri, 1);
7322           lsr(Rj, Rj, 1);
7323           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7324         } block_comment("  } // j");
7325 
7326         last_squaring(Ri);
7327 
7328         block_comment("  for (j = i/2; j; j--) {"); {
7329           lsr(Rj, Ri, 1);
7330           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7331         } block_comment("  } // j");
7332 
7333         post1_squaring();
7334         add(Ri, Ri, 1);
7335         cmp(Ri, Rlen);
7336         br(Assembler::LT, loop);
7337 
7338         bind(end);
7339         block_comment("} // i");
7340       }
7341 
7342       block_comment("for (int i = len; i < 2*len; i++) {");
7343       mov(Ri, Rlen); {
7344         Label loop, end;
7345         bind(loop);
7346         cmp(Ri, Rlen, Assembler::LSL, 1);
7347         br(Assembler::GE, end);
7348 
7349         pre2(Ri, Rlen);
7350 
7351         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7352           lsl(Rj, Rlen, 1);
7353           sub(Rj, Rj, Ri);
7354           sub(Rj, Rj, 1);
7355           lsr(Rj, Rj, 1);
7356           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7357         } block_comment("  } // j");
7358 
7359         last_squaring(Ri);
7360 
7361         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7362           lsl(Rj, Rlen, 1);
7363           sub(Rj, Rj, Ri);
7364           lsr(Rj, Rj, 1);
7365           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7366         } block_comment("  } // j");
7367 
7368         post2(Ri, Rlen);
7369         add(Ri, Ri, 1);
7370         cmp(Ri, Rlen, Assembler::LSL, 1);
7371 
7372         br(Assembler::LT, loop);
7373         bind(end);
7374         block_comment("} // i");
7375       }
7376 
7377       normalize(Rlen);
7378 
7379       mov(Ra, Pm_base);  // Save Pm_base in Ra
7380       restore_regs();  // Restore caller's Pm_base
7381 
7382       // Copy our result into caller's Pm_base
7383       reverse(Pm_base, Ra, Rlen, t0, t1);
7384 
7385       leave();
7386       ret(lr);
7387 
7388       return entry;
7389     }
7390     // In C, approximately:
7391 
7392     // void
7393     // montgomery_square(julong Pa_base[], julong Pn_base[],
7394     //                   julong Pm_base[], julong inv, int len) {
7395     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7396     //   julong *Pa, *Pb, *Pn, *Pm;
7397     //   julong Ra, Rb, Rn, Rm;
7398 
7399     //   int i;
7400 
7401     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7402 
7403     //   for (i = 0; i < len; i++) {
7404     //     int j;
7405 
7406     //     Pa = Pa_base;
7407     //     Pb = Pa_base + i;
7408     //     Pm = Pm_base;
7409     //     Pn = Pn_base + i;
7410 
7411     //     Ra = *Pa;
7412     //     Rb = *Pb;
7413     //     Rm = *Pm;
7414     //     Rn = *Pn;
7415 
7416     //     int iters = (i+1)/2;
7417     //     for (j = 0; iters--; j++) {
7418     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7419     //       MACC2(Ra, Rb, t0, t1, t2);
7420     //       Ra = *++Pa;
7421     //       Rb = *--Pb;
7422     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7423     //       MACC(Rm, Rn, t0, t1, t2);
7424     //       Rm = *++Pm;
7425     //       Rn = *--Pn;
7426     //     }
7427     //     if ((i & 1) == 0) {
7428     //       assert(Ra == Pa_base[j], "must be");
7429     //       MACC(Ra, Ra, t0, t1, t2);
7430     //     }
7431     //     iters = i/2;
7432     //     assert(iters == i-j, "must be");
7433     //     for (; iters--; j++) {
7434     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7435     //       MACC(Rm, Rn, t0, t1, t2);
7436     //       Rm = *++Pm;
7437     //       Rn = *--Pn;
7438     //     }
7439 
7440     //     *Pm = Rm = t0 * inv;
7441     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7442     //     MACC(Rm, Rn, t0, t1, t2);
7443 
7444     //     assert(t0 == 0, "broken Montgomery multiply");
7445 
7446     //     t0 = t1; t1 = t2; t2 = 0;
7447     //   }
7448 
7449     //   for (i = len; i < 2*len; i++) {
7450     //     int start = i-len+1;
7451     //     int end = start + (len - start)/2;
7452     //     int j;
7453 
7454     //     Pa = Pa_base + i-len;
7455     //     Pb = Pa_base + len;
7456     //     Pm = Pm_base + i-len;
7457     //     Pn = Pn_base + len;
7458 
7459     //     Ra = *++Pa;
7460     //     Rb = *--Pb;
7461     //     Rm = *++Pm;
7462     //     Rn = *--Pn;
7463 
7464     //     int iters = (2*len-i-1)/2;
7465     //     assert(iters == end-start, "must be");
7466     //     for (j = start; iters--; j++) {
7467     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7468     //       MACC2(Ra, Rb, t0, t1, t2);
7469     //       Ra = *++Pa;
7470     //       Rb = *--Pb;
7471     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7472     //       MACC(Rm, Rn, t0, t1, t2);
7473     //       Rm = *++Pm;
7474     //       Rn = *--Pn;
7475     //     }
7476     //     if ((i & 1) == 0) {
7477     //       assert(Ra == Pa_base[j], "must be");
7478     //       MACC(Ra, Ra, t0, t1, t2);
7479     //     }
7480     //     iters =  (2*len-i)/2;
7481     //     assert(iters == len-j, "must be");
7482     //     for (; iters--; j++) {
7483     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7484     //       MACC(Rm, Rn, t0, t1, t2);
7485     //       Rm = *++Pm;
7486     //       Rn = *--Pn;
7487     //     }
7488     //     Pm_base[i-len] = t0;
7489     //     t0 = t1; t1 = t2; t2 = 0;
7490     //   }
7491 
7492     //   while (t0)
7493     //     t0 = sub(Pm_base, Pn_base, t0, len);
7494     // }
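
         // MACC2 in the pseudocode above accumulates the product twice:
         // when squaring, each cross product a[i]*a[j] with i != j appears
         // twice in the result, so it is computed once and added twice.
         // In terms of the MACC sketch given earlier, a plausible
         // definition would be:
         //
         // static void MACC2(julong a, julong b,
         //                   julong &t0, julong &t1, julong &t2) {
         //   MACC(a, b, t0, t1, t2);   // add a*b once ...
         //   MACC(a, b, t0, t1, t2);   // ... and once more
         // }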
7495   };
7496 
7497 
7498   // Initialization
7499   void generate_initial() {
7500     // Generate the initial stubs and initialize the entry points.
7501 
7502     // Entry points that exist on all platforms.  Note: this is code
7503     // that could be shared among different platforms; however, the
7504     // benefit seems to be smaller than the disadvantage of having a
7505     // much more complicated generator structure.  See also the comment
7506     // in stubRoutines.hpp.
7507 
7508     StubRoutines::_forward_exception_entry = generate_forward_exception();
7509 
7510     StubRoutines::_call_stub_entry =
7511       generate_call_stub(StubRoutines::_call_stub_return_address);
7512 
7513     // Referenced by megamorphic calls.
7514     StubRoutines::_catch_exception_entry = generate_catch_exception();
7515 
7516     // Build this early so it's available for the interpreter.
7517     StubRoutines::_throw_StackOverflowError_entry =
7518       generate_throw_exception("StackOverflowError throw_exception",
7519                                CAST_FROM_FN_PTR(address,
7520                                                 SharedRuntime::throw_StackOverflowError));
7521     StubRoutines::_throw_delayed_StackOverflowError_entry =
7522       generate_throw_exception("delayed StackOverflowError throw_exception",
7523                                CAST_FROM_FN_PTR(address,
7524                                                 SharedRuntime::throw_delayed_StackOverflowError));
7525     if (UseCRC32Intrinsics) {
7526       // Set the table address before generating the stubs that use it.
7527       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7528       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7529     }
7530 
7531     if (UseCRC32CIntrinsics) {
7532       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7533     }
7534 
7535     // Disabled until JDK-8210858 is fixed
7536     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7537     //   StubRoutines::_dlog = generate_dlog();
7538     // }
7539 
7540     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7541       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7542     }
7543 
7544     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7545       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7546     }
7547 
7548     // Safefetch stubs.
7549     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7550                                                        &StubRoutines::_safefetch32_fault_pc,
7551                                                        &StubRoutines::_safefetch32_continuation_pc);
7552     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7553                                                        &StubRoutines::_safefetchN_fault_pc,
7554                                                        &StubRoutines::_safefetchN_continuation_pc);
7555   }
7556 
7557   void generate_all() {
7558     // support for verify_oop (must happen after universe_init)
7559     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7560     StubRoutines::_throw_AbstractMethodError_entry =
7561       generate_throw_exception("AbstractMethodError throw_exception",
7562                                CAST_FROM_FN_PTR(address,
7563                                                 SharedRuntime::
7564                                                 throw_AbstractMethodError));
7565 
7566     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7567       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7568                                CAST_FROM_FN_PTR(address,
7569                                                 SharedRuntime::
7570                                                 throw_IncompatibleClassChangeError));
7571 
7572     StubRoutines::_throw_NullPointerException_at_call_entry =
7573       generate_throw_exception("NullPointerException at call throw_exception",
7574                                CAST_FROM_FN_PTR(address,
7575                                                 SharedRuntime::
7576                                                 throw_NullPointerException_at_call));
7577 
7578     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7579 
7580     // arraycopy stubs used by compilers
7581     generate_arraycopy_stubs();
7582 
7583     // has negatives stub for large arrays.
7584     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7585 
7586     // array equals stub for large arrays.
7587     if (!UseSimpleArrayEquals) {
7588       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7589     }
7590 
7591     generate_compare_long_strings();
7592 
7593     generate_string_indexof_stubs();
7594 
7595     // byte_array_inflate stub for large arrays.
7596     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7597 
7598     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7599     if (bs_nm != NULL) {
7600       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7601     }
7602 #ifdef COMPILER2
7603     if (UseMultiplyToLenIntrinsic) {
7604       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7605     }
7606 
7607     if (UseSquareToLenIntrinsic) {
7608       StubRoutines::_squareToLen = generate_squareToLen();
7609     }
7610 
7611     if (UseMulAddIntrinsic) {
7612       StubRoutines::_mulAdd = generate_mulAdd();
7613     }
7614 
7615     if (UseSIMDForBigIntegerShiftIntrinsics) {
7616       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7617       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7618     }
7619 
7620     if (UseMontgomeryMultiplyIntrinsic) {
7621       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7622       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7623       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7624     }
7625 
7626     if (UseMontgomerySquareIntrinsic) {
7627       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7628       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7629       // We use generate_multiply() rather than generate_square()
7630       // because it's faster for the sizes of modulus we care about.
7631       StubRoutines::_montgomerySquare = g.generate_multiply();
7632     }
7633 #endif // COMPILER2
7634 
7635     // generate GHASH intrinsics code
7636     if (UseGHASHIntrinsics) {
7637       if (UseAESCTRIntrinsics) {
7638         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7639       } else {
7640         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7641       }
7642     }
7643 
7644     if (UseBASE64Intrinsics) {
7645         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7646         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7647     }
7648 
7649     // data cache line writeback
7650     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7651     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7652 
7653     if (UseAESIntrinsics) {
7654       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7655       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7656       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7657       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7658     }
7659 
7660     if (UseAESCTRIntrinsics) {
7661       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7662     }
7663 
7664     if (UseMD5Intrinsics) {
7665       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7666       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7667     }
7668     if (UseSHA1Intrinsics) {
7669       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7670       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7671     }
7672     if (UseSHA256Intrinsics) {
7673       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7674       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7675     }
7676     if (UseSHA512Intrinsics) {
7677       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7678       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7679     }
7680     if (UseSHA3Intrinsics) {
7681       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7682       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7683     }
7684 
7685     // generate Adler32 intrinsics code
7686     if (UseAdler32Intrinsics) {
7687       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7688     }
7689 
7690     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7691 
7692 #ifdef LINUX
7693 
7694     generate_atomic_entry_points();
7695 
7696 #endif // LINUX
7697 
7698     StubRoutines::aarch64::set_completed();
7699   }
7700 
7701  public:
7702   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7703     if (all) {
7704       generate_all();
7705     } else {
7706       generate_initial();
7707     }
7708   }
7709 }; // end class declaration
7710 
7711 #define UCM_TABLE_MAX_ENTRIES 8
7712 void StubGenerator_generate(CodeBuffer* code, bool all) {
7713   if (UnsafeCopyMemory::_table == NULL) {
7714     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7715   }
7716   StubGenerator g(code, all);
7717 }
7718 
7719 
7720 #ifdef LINUX
7721 
7722 // Define pointers to atomic stubs and initialize them to point to the
7723 // code in atomic_aarch64.S.
7724 
7725 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7726   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7727     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7728   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7729     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
7730 
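     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;
     //
     // and the _relaxed/_release/_seq_cst variants get the memory-order
     // suffix folded into both names (e.g. aarch64_atomic_cmpxchg_4_relaxed_impl).
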
7731 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7732 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7733 DEFAULT_ATOMIC_OP(xchg, 4, )
7734 DEFAULT_ATOMIC_OP(xchg, 8, )
7735 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7736 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7737 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7738 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7739 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7740 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7741 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7742 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7743 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7744 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7745 
7746 #undef DEFAULT_ATOMIC_OP
7747 
7748 #endif // LINUX