1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
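// TIMES_OOP scales an index register by the in-heap oop size: a shift of
// 2 (4-byte narrow oops) when UseCompressedOops is set, otherwise 3
// (8-byte oops), applied with a sign-extended-word (sxtw) extend.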
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
 108   //
 109   // we save r30 (lr) as the return PC at the base of the frame and
 110   // link r29 (fp) below it as the frame pointer installing sp (r31)
 111   // into fp.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
 126   // we don't need to save r16-18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
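  // For concreteness: wordSize is 8 on AArch64, so the word offsets above
  // become byte offsets from rfp such as
  //   call_wrapper  = rfp + call_wrapper_off  * wordSize = rfp -  64
  //   thread        = rfp + thread_off        * wordSize = rfp -   8
  //   sp_after_call = rfp + sp_after_call_off * wordSize = rfp - 208
  // which is exactly how the Address constants in generate_call_stub()
  // below are formed.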
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     address aarch64_entry = __ pc();
 227 
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (u1)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
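    // rounding sp down to a 2-word (16-byte) boundary keeps it aligned as
    // the AArch64 ABI requires, whatever the parameter count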
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method*, and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // we do this here because the notify will already have been done
 299     // if we get to the next instruction via an exception
 300     //
 301     // n.b. adding this instruction here affects the calculation of
 302     // whether or not a routine returns to the call stub (used when
 303     // doing stack walks) since the normal test is to check the return
 304     // pc against the address saved below. so we may need to allow for
 305     // this extra instruction in the check.
 306 
 307     // save current address for use by exception handling code
 308 
 309     return_address = __ pc();
 310 
 311     // store result depending on type (everything that is not
 312     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 313     // n.b. this assumes Java returns an integral result in r0
 314     // and a floating result in j_farg0
 315     __ ldr(j_rarg2, result);
 316     Label is_long, is_float, is_double, exit;
 317     __ ldr(j_rarg1, result_type);
 318     __ cmp(j_rarg1, (u1)T_OBJECT);
 319     __ br(Assembler::EQ, is_long);
 320     __ cmp(j_rarg1, (u1)T_LONG);
 321     __ br(Assembler::EQ, is_long);
 322     __ cmp(j_rarg1, (u1)T_FLOAT);
 323     __ br(Assembler::EQ, is_float);
 324     __ cmp(j_rarg1, (u1)T_DOUBLE);
 325     __ br(Assembler::EQ, is_double);
 326 
 327     // handle T_INT case
 328     __ strw(r0, Address(j_rarg2));
 329 
 330     __ BIND(exit);
 331 
 332     // pop parameters
 333     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 334 
 335 #ifdef ASSERT
 336     // verify that threads correspond
 337     {
 338       Label L, S;
 339       __ ldr(rscratch1, thread);
 340       __ cmp(rthread, rscratch1);
 341       __ br(Assembler::NE, S);
 342       __ get_thread(rscratch1);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::EQ, L);
 345       __ BIND(S);
 346       __ stop("StubRoutines::call_stub: threads must correspond");
 347       __ BIND(L);
 348     }
 349 #endif
 350 
 351     // restore callee-save registers
 352     __ ldpd(v15, v14,  d15_save);
 353     __ ldpd(v13, v12,  d13_save);
 354     __ ldpd(v11, v10,  d11_save);
 355     __ ldpd(v9,  v8,   d9_save);
 356 
 357     __ ldp(r28, r27,   r28_save);
 358     __ ldp(r26, r25,   r26_save);
 359     __ ldp(r24, r23,   r24_save);
 360     __ ldp(r22, r21,   r22_save);
 361     __ ldp(r20, r19,   r20_save);
 362 
 363     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 364     __ ldrw(c_rarg2, result_type);
 365     __ ldr(c_rarg3,  method);
 366     __ ldp(c_rarg4, c_rarg5,  entry_point);
 367     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 368 
 369     // leave frame and return to caller
 370     __ leave();
 371     __ ret(lr);
 372 
 373     // handle return types different from T_INT
 374 
 375     __ BIND(is_long);
 376     __ str(r0, Address(j_rarg2, 0));
 377     __ br(Assembler::AL, exit);
 378 
 379     __ BIND(is_float);
 380     __ strs(j_farg0, Address(j_rarg2, 0));
 381     __ br(Assembler::AL, exit);
 382 
 383     __ BIND(is_double);
 384     __ strd(j_farg0, Address(j_rarg2, 0));
 385     __ br(Assembler::AL, exit);
 386 
 387     return start;
 388   }
 389 
 390   // Return point for a Java call if there's an exception thrown in
 391   // Java code.  The exception is caught and transformed into a
 392   // pending exception stored in JavaThread that can be tested from
 393   // within the VM.
 394   //
 395   // Note: Usually the parameters are removed by the callee. In case
 396   // of an exception crossing an activation frame boundary, that is
 397   // not the case if the callee is compiled code => need to set up
 398   // sp.
 399   //
 400   // r0: exception oop
 401 
 402   address generate_catch_exception() {
 403     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 404     address start = __ pc();
 405 
 406     // same as in generate_call_stub():
 407     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 408     const Address thread        (rfp, thread_off         * wordSize);
 409 
 410 #ifdef ASSERT
 411     // verify that threads correspond
 412     {
 413       Label L, S;
 414       __ ldr(rscratch1, thread);
 415       __ cmp(rthread, rscratch1);
 416       __ br(Assembler::NE, S);
 417       __ get_thread(rscratch1);
 418       __ cmp(rthread, rscratch1);
 419       __ br(Assembler::EQ, L);
 420       __ bind(S);
 421       __ stop("StubRoutines::catch_exception: threads must correspond");
 422       __ bind(L);
 423     }
 424 #endif
 425 
 426     // set pending exception
 427     __ verify_oop(r0);
 428 
 429     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 430     __ mov(rscratch1, (address)__FILE__);
 431     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 432     __ movw(rscratch1, (int)__LINE__);
 433     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 434 
 435     // complete return to VM
 436     assert(StubRoutines::_call_stub_return_address != NULL,
 437            "_call_stub_return_address must have been generated before");
 438     __ b(StubRoutines::_call_stub_return_address);
 439 
 440     return start;
 441   }
 442 
 443   // Continuation point for runtime calls returning with a pending
 444   // exception.  The pending exception check happened in the runtime
 445   // or native call stub.  The pending exception in Thread is
 446   // converted into a Java-level exception.
 447   //
 448   // Contract with Java-level exception handlers:
 449   // r0: exception
 450   // r3: throwing pc
 451   //
 452   // NOTE: At entry of this stub, exception-pc must be in LR !!
 453 
 454   // NOTE: this is always used as a jump target within generated code
 455   // so it just needs to be generated code with no prolog
 456 
 457   address generate_forward_exception() {
 458     StubCodeMark mark(this, "StubRoutines", "forward exception");
 459     address start = __ pc();
 460 
 461     // Upon entry, LR points to the return address returning into
 462     // Java (interpreted or compiled) code; i.e., the return address
 463     // becomes the throwing pc.
 464     //
 465     // Arguments pushed before the runtime call are still on the stack
 466     // but the exception handler will reset the stack pointer ->
 467     // ignore them.  A potential result in registers can be ignored as
 468     // well.
 469 
 470 #ifdef ASSERT
 471     // make sure this code is only executed if there is a pending exception
 472     {
 473       Label L;
 474       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 475       __ cbnz(rscratch1, L);
 476       __ stop("StubRoutines::forward exception: no pending exception (1)");
 477       __ bind(L);
 478     }
 479 #endif
 480 
 481     // compute exception handler into r19
 482 
 483     // call the VM to find the handler address associated with the
 484     // caller address. pass thread in r0 and caller pc (ret address)
 485     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 486     // the stack.
 487     __ mov(c_rarg1, lr);
 488     // lr will be trashed by the VM call so we move it to R19
 489     // (callee-saved) because we also need to pass it to the handler
 490     // returned by this call.
 491     __ mov(r19, lr);
 492     BLOCK_COMMENT("call exception_handler_for_return_address");
 493     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 494                          SharedRuntime::exception_handler_for_return_address),
 495                     rthread, c_rarg1);
 496     // Reinitialize the ptrue predicate register, in case the external runtime
 497     // call clobbers ptrue reg, as we may return to SVE compiled code.
 498     __ reinitialize_ptrue();
 499 
 500     // we should not really care that lr is no longer the callee
 501     // address. we saved the value the handler needs in r19 so we can
 502     // just copy it to r3. however, the C2 handler will push its own
 503     // frame and then call into the VM, and the VM code asserts that
 504     // the PC for the frame above the handler belongs to a compiled
 505     // Java method. So, we restore lr here to satisfy that assert.
 506     __ mov(lr, r19);
 507     // setup r0 & r3 & clear pending exception
 508     __ mov(r3, r19);
 509     __ mov(r19, r0);
 510     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 511     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 512 
 513 #ifdef ASSERT
 514     // make sure exception is set
 515     {
 516       Label L;
 517       __ cbnz(r0, L);
 518       __ stop("StubRoutines::forward exception: no pending exception (2)");
 519       __ bind(L);
 520     }
 521 #endif
 522 
 523     // continue at exception handler
 524     // r0: exception
 525     // r3: throwing pc
 526     // r19: exception handler
 527     __ verify_oop(r0);
 528     __ br(r19);
 529 
 530     return start;
 531   }
 532 
 533   // Non-destructive plausibility checks for oops
 534   //
 535   // Arguments:
 536   //    r0: oop to verify
 537   //    rscratch1: error message
 538   //
 539   // Stack after saving c_rarg3:
 540   //    [tos + 0]: saved c_rarg3
 541   //    [tos + 1]: saved c_rarg2
 542   //    [tos + 2]: saved lr
 543   //    [tos + 3]: saved rscratch2
 544   //    [tos + 4]: saved r0
 545   //    [tos + 5]: saved rscratch1
 546   address generate_verify_oop() {
 547 
 548     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 549     address start = __ pc();
 550 
 551     Label exit, error;
 552 
 553     // save c_rarg2 and c_rarg3
 554     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 555 
 556     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 557     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ ldr(c_rarg3, Address(c_rarg2));
 559     __ add(c_rarg3, c_rarg3, 1);
 560     __ str(c_rarg3, Address(c_rarg2));
 561 
 562     // object is in r0
 563     // make sure object is 'reasonable'
 564     __ cbz(r0, exit); // if obj is NULL it is OK
 565 
 566 #if INCLUDE_ZGC
 567     if (UseZGC) {
 568       // Check if mask is good.
 569       // verifies that ZAddressBadMask & r0 == 0
 570       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 571       __ andr(c_rarg2, r0, c_rarg3);
 572       __ cbnz(c_rarg2, error);
 573     }
 574 #endif
 575 
 576     // Check if the oop is in the right area of memory
 577     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 578     __ andr(c_rarg2, r0, c_rarg3);
 579     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 580 
 581     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 582     // instruction here because the flags register is live.
 583     __ eor(c_rarg2, c_rarg2, c_rarg3);
 584     __ cbnz(c_rarg2, error);
 585 
 586     // make sure klass is 'reasonable', which is not zero.
 587     __ load_klass(r0, r0);  // get klass
 588     __ cbz(r0, error);      // if klass is NULL it is broken
 589 
 590     // return if everything seems ok
 591     __ bind(exit);
 592 
 593     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 594     __ ret(lr);
 595 
 596     // handle errors
 597     __ bind(error);
 598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 599 
 600     __ push(RegSet::range(r0, r29), sp);
 601     // debug(char* msg, int64_t pc, int64_t regs[])
 602     __ mov(c_rarg0, rscratch1);      // pass address of error message
 603     __ mov(c_rarg1, lr);             // pass return address
 604     __ mov(c_rarg2, sp);             // pass address of regs on stack
 605 #ifndef PRODUCT
 606     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 607 #endif
 608     BLOCK_COMMENT("call MacroAssembler::debug");
 609     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 610     __ blr(rscratch1);
 611     __ hlt(0);
 612 
 613     return start;
 614   }
 615 
 616   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 617 
 618   // Generate indices for iota vector.
 619   address generate_iota_indices(const char *stub_name) {
 620     __ align(CodeEntryAlignment);
 621     StubCodeMark mark(this, "StubRoutines", stub_name);
 622     address start = __ pc();
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
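    // in little-endian memory these two words form the byte sequence
    // 0, 1, 2, ..., 15, i.e. an ascending (iota) byte index vector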
 625     return start;
 626   }
 627 
 628   // The inner part of zero_words().  This is the bulk operation,
 629   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 630   // caller is responsible for zeroing the last few words.
 631   //
 632   // Inputs:
 633   // r10: the HeapWord-aligned base address of an array to zero.
 634   // r11: the count in HeapWords, r11 > 0.
 635   //
 636   // Returns r10 and r11, adjusted for the caller to clear.
 637   // r10: the base address of the tail of words left to clear.
 638   // r11: the number of words in the tail.
 639   //      r11 < MacroAssembler::zero_words_block_size.
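  //
  // A rough caller-side sketch (illustrative only; the real caller is
  // MacroAssembler::zero_words):
  //
  //   (base, cnt) = zero_blocks(base, cnt);    // bulk part, done here
  //   while (cnt > 0) { *base++ = 0; cnt--; }  // tail, done by the caller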
 640 
 641   address generate_zero_blocks() {
 642     Label done;
 643     Label base_aligned;
 644 
 645     Register base = r10, cnt = r11;
 646 
 647     __ align(CodeEntryAlignment);
 648     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 649     address start = __ pc();
 650 
 651     if (UseBlockZeroing) {
 652       int zva_length = VM_Version::zva_length();
 653 
 654       // Ensure ZVA length can be divided by 16. This is required by
 655       // the subsequent operations.
 656       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 657 
 658       __ tbz(base, 3, base_aligned);
 659       __ str(zr, Address(__ post(base, 8)));
 660       __ sub(cnt, cnt, 1);
 661       __ bind(base_aligned);
 662 
 663       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 664       // alignment.
 665       Label small;
 666       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
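      // low_limit is in bytes while cnt is in 8-byte words, hence the
      // shift right by 3 in the comparison below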
 667       __ subs(rscratch1, cnt, low_limit >> 3);
 668       __ br(Assembler::LT, small);
 669       __ zero_dcache_blocks(base, cnt);
 670       __ bind(small);
 671     }
 672 
 673     {
 674       // Number of stp instructions we'll unroll
 675       const int unroll =
 676         MacroAssembler::zero_words_block_size / 2;
 677       // Clear the remaining blocks.
 678       Label loop;
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::LT, done);
 681       __ bind(loop);
 682       for (int i = 0; i < unroll; i++)
 683         __ stp(zr, zr, __ post(base, 16));
 684       __ subs(cnt, cnt, unroll * 2);
 685       __ br(Assembler::GE, loop);
 686       __ bind(done);
 687       __ add(cnt, cnt, unroll * 2);
 688     }
 689 
 690     __ ret(lr);
 691 
 692     return start;
 693   }
 694 
 695 
 696   typedef enum {
 697     copy_forwards = 1,
 698     copy_backwards = -1
 699   } copy_direction;
 700 
 701   // Bulk copy of blocks of 8 words.
 702   //
 703   // count is a count of words.
 704   //
 705   // Precondition: count >= 8
 706   //
 707   // Postconditions:
 708   //
 709   // The least significant bit of count contains the remaining count
 710   // of words to copy.  The rest of count is trash.
 711   //
 712   // s and d are adjusted to point to the remaining words to copy
 713   //
 714   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 715                            copy_direction direction) {
 716     int unit = wordSize * direction;
 717     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 718 
 719     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 720       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 721     const Register stride = r13;
 722 
 723     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 724     assert_different_registers(s, d, count, rscratch1);
 725 
 726     Label again, drain;
 727     const char *stub_name;
 728     if (direction == copy_forwards)
 729       stub_name = "forward_copy_longs";
 730     else
 731       stub_name = "backward_copy_longs";
 732 
 733     __ align(CodeEntryAlignment);
 734 
 735     StubCodeMark mark(this, "StubRoutines", stub_name);
 736 
 737     __ bind(start);
 738 
 739     Label unaligned_copy_long;
 740     if (AvoidUnalignedAccesses) {
 741       __ tbnz(d, 3, unaligned_copy_long);
 742     }
 743 
 744     if (direction == copy_forwards) {
 745       __ sub(s, s, bias);
 746       __ sub(d, d, bias);
 747     }
 748 
 749 #ifdef ASSERT
 750     // Make sure we are never given < 8 words
 751     {
 752       Label L;
 753       __ cmp(count, (u1)8);
 754       __ br(Assembler::GE, L);
 755       __ stop("generate_copy_longs called with < 8 words");
 756       __ bind(L);
 757     }
 758 #endif
 759 
 760     // Fill 8 registers
 761     if (UseSIMDForMemoryOps) {
 762       __ ldpq(v0, v1, Address(s, 4 * unit));
 763       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 764     } else {
 765       __ ldp(t0, t1, Address(s, 2 * unit));
 766       __ ldp(t2, t3, Address(s, 4 * unit));
 767       __ ldp(t4, t5, Address(s, 6 * unit));
 768       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 769     }
 770 
 771     __ subs(count, count, 16);
 772     __ br(Assembler::LO, drain);
 773 
 774     int prefetch = PrefetchCopyIntervalInBytes;
 775     bool use_stride = false;
 776     if (direction == copy_backwards) {
 777        use_stride = prefetch > 256;
 778        prefetch = -prefetch;
 779        if (use_stride) __ mov(stride, prefetch);
 780     }
 781 
 782     __ bind(again);
 783 
 784     if (PrefetchCopyIntervalInBytes > 0)
 785       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 786 
 787     if (UseSIMDForMemoryOps) {
 788       __ stpq(v0, v1, Address(d, 4 * unit));
 789       __ ldpq(v0, v1, Address(s, 4 * unit));
 790       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 791       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ ldp(t0, t1, Address(s, 2 * unit));
 795       __ stp(t2, t3, Address(d, 4 * unit));
 796       __ ldp(t2, t3, Address(s, 4 * unit));
 797       __ stp(t4, t5, Address(d, 6 * unit));
 798       __ ldp(t4, t5, Address(s, 6 * unit));
 799       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 800       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 801     }
 802 
 803     __ subs(count, count, 8);
 804     __ br(Assembler::HS, again);
 805 
 806     // Drain
 807     __ bind(drain);
 808     if (UseSIMDForMemoryOps) {
 809       __ stpq(v0, v1, Address(d, 4 * unit));
 810       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ stp(t2, t3, Address(d, 4 * unit));
 814       __ stp(t4, t5, Address(d, 6 * unit));
 815       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 816     }
 817 
 818     {
 819       Label L1, L2;
 820       __ tbz(count, exact_log2(4), L1);
 821       if (UseSIMDForMemoryOps) {
 822         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 823         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 824       } else {
 825         __ ldp(t0, t1, Address(s, 2 * unit));
 826         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 827         __ stp(t0, t1, Address(d, 2 * unit));
 828         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 829       }
 830       __ bind(L1);
 831 
 832       if (direction == copy_forwards) {
 833         __ add(s, s, bias);
 834         __ add(d, d, bias);
 835       }
 836 
 837       __ tbz(count, 1, L2);
 838       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 839       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 840       __ bind(L2);
 841     }
 842 
 843     __ ret(lr);
 844 
 845     if (AvoidUnalignedAccesses) {
 846       Label drain, again;
 847       // Register order for storing. Order is different for backward copy.
 848 
 849       __ bind(unaligned_copy_long);
 850 
 851       // source address is even aligned, target odd aligned
 852       //
 853       // when forward copying word pairs we read long pairs at offsets
 854       // {0, 2, 4, 6} (in long words). when backwards copying we read
 855       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 856       // address by -2 in the forwards case so we can compute the
 857       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 858       // or -1.
 859       //
 860       // when forward copying we need to store 1 word, 3 pairs and
 861       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 862       // zero offset we adjust the destination by -1 which means we
 863       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 864       //
 865       // When backwards copying we need to store 1 word, 3 pairs and
 866       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 867       // offsets {1, 3, 5, 7, 8} * unit.
 868 
 869       if (direction == copy_forwards) {
 870         __ sub(s, s, 16);
 871         __ sub(d, d, 8);
 872       }
 873 
 874       // Fill 8 registers
 875       //
 876       // for forwards copy s was offset by -16 from the original input
 877       // value of s so the register contents are at these offsets
 878       // relative to the 64 bit block addressed by that original input
 879       // and so on for each successive 64 byte block when s is updated
 880       //
 881       // t0 at offset 0,  t1 at offset 8
 882       // t2 at offset 16, t3 at offset 24
 883       // t4 at offset 32, t5 at offset 40
 884       // t6 at offset 48, t7 at offset 56
 885 
 886       // for backwards copy s was not offset so the register contents
 887       // are at these offsets into the preceding 64 byte block
 888       // relative to that original input and so on for each successive
 889       // preceding 64 byte block when s is updated. this explains the
 890       // slightly counter-intuitive looking pattern of register usage
 891       // in the stp instructions for backwards copy.
 892       //
 893       // t0 at offset -16, t1 at offset -8
 894       // t2 at offset -32, t3 at offset -24
 895       // t4 at offset -48, t5 at offset -40
 896       // t6 at offset -64, t7 at offset -56
 897 
 898       __ ldp(t0, t1, Address(s, 2 * unit));
 899       __ ldp(t2, t3, Address(s, 4 * unit));
 900       __ ldp(t4, t5, Address(s, 6 * unit));
 901       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 902 
 903       __ subs(count, count, 16);
 904       __ br(Assembler::LO, drain);
 905 
 906       int prefetch = PrefetchCopyIntervalInBytes;
 907       bool use_stride = false;
 908       if (direction == copy_backwards) {
 909          use_stride = prefetch > 256;
 910          prefetch = -prefetch;
 911          if (use_stride) __ mov(stride, prefetch);
 912       }
 913 
 914       __ bind(again);
 915 
 916       if (PrefetchCopyIntervalInBytes > 0)
 917         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 918 
 919       if (direction == copy_forwards) {
 920        // allowing for the offset of -8 the store instructions place
 921        // registers into the target 64 bit block at the following
 922        // offsets
 923        //
 924        // t0 at offset 0
 925        // t1 at offset 8,  t2 at offset 16
 926        // t3 at offset 24, t4 at offset 32
 927        // t5 at offset 40, t6 at offset 48
 928        // t7 at offset 56
 929 
 930         __ str(t0, Address(d, 1 * unit));
 931         __ stp(t1, t2, Address(d, 2 * unit));
 932         __ ldp(t0, t1, Address(s, 2 * unit));
 933         __ stp(t3, t4, Address(d, 4 * unit));
 934         __ ldp(t2, t3, Address(s, 4 * unit));
 935         __ stp(t5, t6, Address(d, 6 * unit));
 936         __ ldp(t4, t5, Address(s, 6 * unit));
 937         __ str(t7, Address(__ pre(d, 8 * unit)));
 938         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 939       } else {
 940        // d was not offset when we started so the registers are
 941        // written into the 64 bit block preceding d with the following
 942        // offsets
 943        //
 944        // t1 at offset -8
 945        // t3 at offset -24, t0 at offset -16
 946        // t5 at offset -40, t2 at offset -32
 947        // t7 at offset -56, t4 at offset -48
 948        //                   t6 at offset -64
 949        //
 950        // note that this matches the offsets previously noted for the
 951        // loads
 952 
 953         __ str(t1, Address(d, 1 * unit));
 954         __ stp(t3, t0, Address(d, 3 * unit));
 955         __ ldp(t0, t1, Address(s, 2 * unit));
 956         __ stp(t5, t2, Address(d, 5 * unit));
 957         __ ldp(t2, t3, Address(s, 4 * unit));
 958         __ stp(t7, t4, Address(d, 7 * unit));
 959         __ ldp(t4, t5, Address(s, 6 * unit));
 960         __ str(t6, Address(__ pre(d, 8 * unit)));
 961         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 962       }
 963 
 964       __ subs(count, count, 8);
 965       __ br(Assembler::HS, again);
 966 
 967       // Drain
 968       //
 969       // this uses the same pattern of offsets and register arguments
 970       // as above
 971       __ bind(drain);
 972       if (direction == copy_forwards) {
 973         __ str(t0, Address(d, 1 * unit));
 974         __ stp(t1, t2, Address(d, 2 * unit));
 975         __ stp(t3, t4, Address(d, 4 * unit));
 976         __ stp(t5, t6, Address(d, 6 * unit));
 977         __ str(t7, Address(__ pre(d, 8 * unit)));
 978       } else {
 979         __ str(t1, Address(d, 1 * unit));
 980         __ stp(t3, t0, Address(d, 3 * unit));
 981         __ stp(t5, t2, Address(d, 5 * unit));
 982         __ stp(t7, t4, Address(d, 7 * unit));
 983         __ str(t6, Address(__ pre(d, 8 * unit)));
 984       }
 985       // now we need to copy any remaining part block which may
 986       // include a 4 word subblock and/or a 2 word subblock.
 987       // bits 2 and 1 in the count are the tell-tale for whether we
 988       // have each such subblock
 989       {
 990         Label L1, L2;
 991         __ tbz(count, exact_log2(4), L1);
 992        // this is the same as above but copying only 4 longs hence
 993        // with only one intervening stp between the str instructions
 994        // but note that the offsets and registers still follow the
 995        // same pattern
 996         __ ldp(t0, t1, Address(s, 2 * unit));
 997         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 998         if (direction == copy_forwards) {
 999           __ str(t0, Address(d, 1 * unit));
1000           __ stp(t1, t2, Address(d, 2 * unit));
1001           __ str(t3, Address(__ pre(d, 4 * unit)));
1002         } else {
1003           __ str(t1, Address(d, 1 * unit));
1004           __ stp(t3, t0, Address(d, 3 * unit));
1005           __ str(t2, Address(__ pre(d, 4 * unit)));
1006         }
1007         __ bind(L1);
1008 
1009         __ tbz(count, 1, L2);
1010        // this is the same as above but copying only 2 longs hence
1011        // there is no intervening stp between the str instructions
1012        // but note that the offset and register patterns are still
1013        // the same
1014         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1015         if (direction == copy_forwards) {
1016           __ str(t0, Address(d, 1 * unit));
1017           __ str(t1, Address(__ pre(d, 2 * unit)));
1018         } else {
1019           __ str(t1, Address(d, 1 * unit));
1020           __ str(t0, Address(__ pre(d, 2 * unit)));
1021         }
1022         __ bind(L2);
1023 
1024        // for forwards copy we need to re-adjust the offsets we
1025        // applied so that s and d follow the last words written
1026 
1027        if (direction == copy_forwards) {
1028          __ add(s, s, 16);
1029          __ add(d, d, 8);
1030        }
1031 
1032       }
1033 
1034       __ ret(lr);
1035     }
1036   }
1037 
1038   // Small copy: less than 16 bytes.
1039   //
1040   // NB: Ignores all of the bits of count which represent more than 15
1041   // bytes, so a caller doesn't have to mask them.
1042 
1043   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1044     bool is_backwards = step < 0;
1045     size_t granularity = uabs(step);
1046     int direction = is_backwards ? -1 : 1;
1047     int unit = wordSize * direction;
1048 
1049     Label Lword, Lint, Lshort, Lbyte;
1050 
1051     assert(granularity
1052            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1053 
1054     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1055 
1056     // ??? I don't know if this bit-test-and-branch is the right thing
1057     // to do.  It does a lot of jumping, resulting in several
1058     // mispredicted branches.  It might make more sense to do this
1059     // with something like Duff's device with a single computed branch.
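    // For reference, what this emits is equivalent to the following C
    // sketch (shown for byte granularity, forward direction; illustrative
    // only):
    //
    //   if (count & 8) { memcpy(d, s, 8); s += 8; d += 8; }
    //   if (count & 4) { memcpy(d, s, 4); s += 4; d += 4; }
    //   if (count & 2) { memcpy(d, s, 2); s += 2; d += 2; }
    //   if (count & 1) { *d++ = *s++; }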
1060 
1061     __ tbz(count, 3 - exact_log2(granularity), Lword);
1062     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1063     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1064     __ bind(Lword);
1065 
1066     if (granularity <= sizeof (jint)) {
1067       __ tbz(count, 2 - exact_log2(granularity), Lint);
1068       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1069       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1070       __ bind(Lint);
1071     }
1072 
1073     if (granularity <= sizeof (jshort)) {
1074       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1075       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1076       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1077       __ bind(Lshort);
1078     }
1079 
1080     if (granularity <= sizeof (jbyte)) {
1081       __ tbz(count, 0, Lbyte);
1082       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1083       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1084       __ bind(Lbyte);
1085     }
1086   }
1087 
1088   Label copy_f, copy_b;
1089 
1090   // All-singing all-dancing memory copy.
1091   //
1092   // Copy count units of memory from s to d.  The size of a unit is
1093   // step, which can be positive or negative depending on the direction
1094   // of copy.  If is_aligned is false, we align the source address.
1095   //
1096 
1097   void copy_memory(bool is_aligned, Register s, Register d,
1098                    Register count, Register tmp, int step) {
1099     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1100     bool is_backwards = step < 0;
1101     unsigned int granularity = uabs(step);
1102     const Register t0 = r3, t1 = r4;
1103 
1104     // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction doesn't
1105     // matter because we always load all the data before writing anything.
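    // e.g. for a jint copy (granularity == 4) with SIMD this covers element
    // counts up to 96 / 4 == 24; anything larger falls through to copy_big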
1106     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1107     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1108     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1109     const Register send = r17, dend = r16;
1110 
1111     if (PrefetchCopyIntervalInBytes > 0)
1112       __ prfm(Address(s, 0), PLDL1KEEP);
1113     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1114     __ br(Assembler::HI, copy_big);
1115 
1116     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1117     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1118 
1119     __ cmp(count, u1(16/granularity));
1120     __ br(Assembler::LS, copy16);
1121 
1122     __ cmp(count, u1(64/granularity));
1123     __ br(Assembler::HI, copy80);
1124 
1125     __ cmp(count, u1(32/granularity));
1126     __ br(Assembler::LS, copy32);
1127 
1128     // 33..64 bytes
1129     if (UseSIMDForMemoryOps) {
1130       __ ldpq(v0, v1, Address(s, 0));
1131       __ ldpq(v2, v3, Address(send, -32));
1132       __ stpq(v0, v1, Address(d, 0));
1133       __ stpq(v2, v3, Address(dend, -32));
1134     } else {
1135       __ ldp(t0, t1, Address(s, 0));
1136       __ ldp(t2, t3, Address(s, 16));
1137       __ ldp(t4, t5, Address(send, -32));
1138       __ ldp(t6, t7, Address(send, -16));
1139 
1140       __ stp(t0, t1, Address(d, 0));
1141       __ stp(t2, t3, Address(d, 16));
1142       __ stp(t4, t5, Address(dend, -32));
1143       __ stp(t6, t7, Address(dend, -16));
1144     }
1145     __ b(finish);
1146 
1147     // 17..32 bytes
1148     __ bind(copy32);
1149     __ ldp(t0, t1, Address(s, 0));
1150     __ ldp(t2, t3, Address(send, -16));
1151     __ stp(t0, t1, Address(d, 0));
1152     __ stp(t2, t3, Address(dend, -16));
1153     __ b(finish);
1154 
1155     // 65..80/96 bytes
1156     // (96 bytes if SIMD because we do 32 bytes per instruction)
1157     __ bind(copy80);
1158     if (UseSIMDForMemoryOps) {
1159       __ ldpq(v0, v1, Address(s, 0));
1160       __ ldpq(v2, v3, Address(s, 32));
1161       // Unaligned pointers can be an issue for copying.
1162       // The issue has more chances to happen when granularity of data is
1163       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1164       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1165       // The most performance drop has been seen for the range 65-80 bytes.
1166       // For such cases using the pair of ldp/stp instead of the third pair of
1167       // ldpq/stpq fixes the performance issue.
1168       if (granularity < sizeof (jint)) {
1169         Label copy96;
1170         __ cmp(count, u1(80/granularity));
1171         __ br(Assembler::HI, copy96);
1172         __ ldp(t0, t1, Address(send, -16));
1173 
1174         __ stpq(v0, v1, Address(d, 0));
1175         __ stpq(v2, v3, Address(d, 32));
1176         __ stp(t0, t1, Address(dend, -16));
1177         __ b(finish);
1178 
1179         __ bind(copy96);
1180       }
1181       __ ldpq(v4, v5, Address(send, -32));
1182 
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(d, 32));
1185       __ stpq(v4, v5, Address(dend, -32));
1186     } else {
1187       __ ldp(t0, t1, Address(s, 0));
1188       __ ldp(t2, t3, Address(s, 16));
1189       __ ldp(t4, t5, Address(s, 32));
1190       __ ldp(t6, t7, Address(s, 48));
1191       __ ldp(t8, t9, Address(send, -16));
1192 
1193       __ stp(t0, t1, Address(d, 0));
1194       __ stp(t2, t3, Address(d, 16));
1195       __ stp(t4, t5, Address(d, 32));
1196       __ stp(t6, t7, Address(d, 48));
1197       __ stp(t8, t9, Address(dend, -16));
1198     }
1199     __ b(finish);
1200 
1201     // 0..16 bytes
1202     __ bind(copy16);
1203     __ cmp(count, u1(8/granularity));
1204     __ br(Assembler::LO, copy8);
1205 
1206     // 8..16 bytes
1207     __ ldr(t0, Address(s, 0));
1208     __ ldr(t1, Address(send, -8));
1209     __ str(t0, Address(d, 0));
1210     __ str(t1, Address(dend, -8));
1211     __ b(finish);
1212 
1213     if (granularity < 8) {
1214       // 4..7 bytes
1215       __ bind(copy8);
1216       __ tbz(count, 2 - exact_log2(granularity), copy4);
1217       __ ldrw(t0, Address(s, 0));
1218       __ ldrw(t1, Address(send, -4));
1219       __ strw(t0, Address(d, 0));
1220       __ strw(t1, Address(dend, -4));
1221       __ b(finish);
1222       if (granularity < 4) {
1223         // 0..3 bytes
1224         __ bind(copy4);
1225         __ cbz(count, finish); // get rid of 0 case
1226         if (granularity == 2) {
1227           __ ldrh(t0, Address(s, 0));
1228           __ strh(t0, Address(d, 0));
1229         } else { // granularity == 1
1230           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1231           // the first and last byte.
1232           // Handle the 3 byte case by loading and storing base + count/2
1233           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1234           // This means in the 1 byte case we load/store the same
1235           // byte 3 times.
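          // An equivalent C sketch (illustrative only; all three bytes are
          // loaded before any store, as below):
          //   uint8_t b0 = s[0], bl = s[count - 1], bh = s[count >> 1];
          //   d[0] = b0;  d[count - 1] = bl;  d[count >> 1] = bh;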
1236           __ lsr(count, count, 1);
1237           __ ldrb(t0, Address(s, 0));
1238           __ ldrb(t1, Address(send, -1));
1239           __ ldrb(t2, Address(s, count));
1240           __ strb(t0, Address(d, 0));
1241           __ strb(t1, Address(dend, -1));
1242           __ strb(t2, Address(d, count));
1243         }
1244         __ b(finish);
1245       }
1246     }
1247 
1248     __ bind(copy_big);
1249     if (is_backwards) {
1250       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1251       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1252     }
1253 
1254     // Now that we've got the small case out of the way, we can align the
1255     // source address on a 2-word boundary.
1256 
1257     Label aligned;
1258 
1259     if (is_aligned) {
1260       // We may have to adjust by 1 word to get s 2-word-aligned.
1261       __ tbz(s, exact_log2(wordSize), aligned);
1262       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1263       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1264       __ sub(count, count, wordSize/granularity);
1265     } else {
1266       if (is_backwards) {
1267         __ andr(rscratch2, s, 2 * wordSize - 1);
1268       } else {
1269         __ neg(rscratch2, s);
1270         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1271       }
1272       // rscratch2 is the byte adjustment needed to align s.
1273       __ cbz(rscratch2, aligned);
1274       int shift = exact_log2(granularity);
1275       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1276       __ sub(count, count, rscratch2);
1277 
1278 #if 0
1279       // ?? This code is only correct for a disjoint copy.  It may or
1280       // may not make sense to use it in that case.
1281 
1282       // Copy the first pair; s and d may not be aligned.
1283       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1284       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1285 
1286       // Align s and d, adjust count
1287       if (is_backwards) {
1288         __ sub(s, s, rscratch2);
1289         __ sub(d, d, rscratch2);
1290       } else {
1291         __ add(s, s, rscratch2);
1292         __ add(d, d, rscratch2);
1293       }
1294 #else
1295       copy_memory_small(s, d, rscratch2, rscratch1, step);
1296 #endif
1297     }
1298 
1299     __ bind(aligned);
1300 
1301     // s is now 2-word-aligned.
1302 
1303     // We have a count of units and some trailing bytes.  Adjust the
1304     // count and do a bulk copy of words.
1305     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1306     if (direction == copy_forwards)
1307       __ bl(copy_f);
1308     else
1309       __ bl(copy_b);
1310 
1311     // And the tail.
1312     copy_memory_small(s, d, count, tmp, step);
1313 
1314     if (granularity >= 8) __ bind(copy8);
1315     if (granularity >= 4) __ bind(copy4);
1316     __ bind(finish);
1317   }
1318 
1319 
1320   void clobber_registers() {
1321 #ifdef ASSERT
1322     RegSet clobbered
1323       = MacroAssembler::call_clobbered_registers() - rscratch1;
1324     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1325     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
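    // rscratch1 now holds the pattern 0xdeadbeefdeadbeef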
1326     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1327       __ mov(*it, rscratch1);
1328     }
1329 #endif
1330 
1331   }
1332 
1333   // Scan over array at a for count oops, verifying each one.
1334   // Preserves a and count, clobbers rscratch1 and rscratch2.
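  // Roughly equivalent to (illustrative only):
  //   for (size_t i = 0; i < count; i++)  verify_oop(a[i]);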
1335   void verify_oop_array (int size, Register a, Register count, Register temp) {
1336     Label loop, end;
1337     __ mov(rscratch1, a);
1338     __ mov(rscratch2, zr);
1339     __ bind(loop);
1340     __ cmp(rscratch2, count);
1341     __ br(Assembler::HS, end);
1342     if (size == wordSize) {
1343       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1344       __ verify_oop(temp);
1345     } else {
1346       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1347       __ decode_heap_oop(temp); // calls verify_oop
1348     }
1349     __ add(rscratch2, rscratch2, 1);
1350     __ b(loop);
1351     __ bind(end);
1352   }
1353 
1354   // Arguments:
1355   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1356   //             ignored
1357   //   is_oop  - true => oop array, so generate store check code
1358   //   name    - stub name string
1359   //
1360   // Inputs:
1361   //   c_rarg0   - source array address
1362   //   c_rarg1   - destination array address
1363   //   c_rarg2   - element count, treated as ssize_t, can be zero
1364   //
1365   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1366   // the hardware handle it.  The two dwords within qwords that span
1367   // cache line boundaries will still be loaded and stored atomically.
1368   //
1369   // Side Effects:
1370   //   disjoint_int_copy_entry is set to the no-overlap entry point
1371   //   used by generate_conjoint_int_oop_copy().
1372   //
1373   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1374                                   const char *name, bool dest_uninitialized = false) {
1375     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1376     RegSet saved_reg = RegSet::of(s, d, count);
1377     __ align(CodeEntryAlignment);
1378     StubCodeMark mark(this, "StubRoutines", name);
1379     address start = __ pc();
1380     __ enter();
1381 
1382     if (entry != NULL) {
1383       *entry = __ pc();
1384       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1385       BLOCK_COMMENT("Entry:");
1386     }
1387 
1388     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1389     if (dest_uninitialized) {
1390       decorators |= IS_DEST_UNINITIALIZED;
1391     }
1392     if (aligned) {
1393       decorators |= ARRAYCOPY_ALIGNED;
1394     }
1395 
1396     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1397     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1398 
1399     if (is_oop) {
1400       // save regs before copy_memory
1401       __ push(RegSet::of(d, count), sp);
1402     }
1403     {
1404       // UnsafeCopyMemory page error: continue after ucm
1405       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1406       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1407       copy_memory(aligned, s, d, count, rscratch1, size);
1408     }
1409 
1410     if (is_oop) {
1411       __ pop(RegSet::of(d, count), sp);
1412       if (VerifyOops)
1413         verify_oop_array(size, d, count, r16);
1414     }
1415 
1416     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1417 
1418     __ leave();
1419     __ mov(r0, zr); // return 0
1420     __ ret(lr);
1421     return start;
1422   }
1423 
1424   // Arguments:
1425   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1426   //             ignored
1427   //   is_oop  - true => oop array, so generate store check code
1428   //   name    - stub name string
1429   //
1430   // Inputs:
1431   //   c_rarg0   - source array address
1432   //   c_rarg1   - destination array address
1433   //   c_rarg2   - element count, treated as ssize_t, can be zero
1434   //
1435   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1436   // the hardware handle it.  The two dwords within qwords that span
1437   // cache line boundaries will still be loaded and stored atomically.
1438   //
1439   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1440                                  address *entry, const char *name,
1441                                  bool dest_uninitialized = false) {
1442     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1443     RegSet saved_regs = RegSet::of(s, d, count);
1444     StubCodeMark mark(this, "StubRoutines", name);
1445     address start = __ pc();
1446     __ enter();
1447 
1448     if (entry != NULL) {
1449       *entry = __ pc();
1450       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1451       BLOCK_COMMENT("Entry:");
1452     }
1453 
1454     // use fwd copy when (d-s) above_equal (count*size)
1455     __ sub(rscratch1, d, s);
1456     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1457     __ br(Assembler::HS, nooverlap_target);
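    // n.b. the single unsigned comparison covers both safe cases: when d is
    // below s the subtraction wraps to a huge unsigned value, and when
    // d >= s + count*size the ranges are disjoint; either way the forward
    // (disjoint) copy is correct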
1458 
1459     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1460     if (dest_uninitialized) {
1461       decorators |= IS_DEST_UNINITIALIZED;
1462     }
1463     if (aligned) {
1464       decorators |= ARRAYCOPY_ALIGNED;
1465     }
1466 
1467     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1468     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1469 
1470     if (is_oop) {
1471       // save regs before copy_memory
1472       __ push(RegSet::of(d, count), sp);
1473     }
1474     {
1475       // UnsafeCopyMemory page error: continue after ucm
1476       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1477       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1478       copy_memory(aligned, s, d, count, rscratch1, -size);
1479     }
1480     if (is_oop) {
1481       __ pop(RegSet::of(d, count), sp);
1482       if (VerifyOops)
1483         verify_oop_array(size, d, count, r16);
1484     }
1485     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1486     __ leave();
1487     __ mov(r0, zr); // return 0
1488     __ ret(lr);
1489     return start;
1490 }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1507   // Side Effects:
1515   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_byte_copy().
1517   //
1518   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1534   // we let the hardware handle it.  The one to eight bytes within words,
1535   // dwords or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1539                                       address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_short_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_short_copy().
1562   //
1563   address generate_disjoint_short_copy(bool aligned,
1564                                        address* entry, const char *name) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1580   // let the hardware handle it.  The two or four words within dwords
1581   // or qwords that span cache line boundaries will still be loaded
1582   // and stored atomically.
1583   //
1584   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1585                                        address *entry, const char *name) {
1586     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1601   // the hardware handle it.  The two dwords within qwords that span
1602   // cache line boundaries will still be loaded and stored atomically.
1603   //
1604   // Side Effects:
1605   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1607   //
1608   address generate_disjoint_int_copy(bool aligned, address *entry,
1609                                          const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as ssize_t, can be zero
1623   //
1624   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1625   // the hardware handle it.  The two dwords within qwords that span
1626   // cache line boundaries will still be loaded and stored atomically.
1627   //
1628   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1629                                      address *entry, const char *name,
1630                                      bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1633   }
1634 
1635 
1636   // Arguments:
1637   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638   //             ignored
1639   //   name    - stub name string
1640   //
1641   // Inputs:
1642   //   c_rarg0   - source array address
1643   //   c_rarg1   - destination array address
1644   //   c_rarg2   - element count, treated as size_t, can be zero
1645   //
1646   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1649   //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651                                           const char *name, bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654   }
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   address generate_conjoint_long_copy(bool aligned,
1667                                       address nooverlap_target, address *entry,
1668                                       const char *name, bool dest_uninitialized = false) {
1669     const bool not_oop = false;
1670     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671   }
1672 
1673   // Arguments:
1674   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675   //             ignored
1676   //   name    - stub name string
1677   //
1678   // Inputs:
1679   //   c_rarg0   - source array address
1680   //   c_rarg1   - destination array address
1681   //   c_rarg2   - element count, treated as size_t, can be zero
1682   //
1683   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1686   //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688                                      const char *name, bool dest_uninitialized) {
1689     const bool is_oop = true;
1690     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   address generate_conjoint_oop_copy(bool aligned,
1705                                      address nooverlap_target, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710                                   name, dest_uninitialized);
1711   }
1712 
1713 
1714   // Helper for generating a dynamic type check.
1715   // Smashes rscratch1, rscratch2.
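  // In outline (a sketch; the heavy lifting is in MacroAssembler):
  //   if (fast path:  sub_klass's word at super_check_offset == super_klass) goto L_success;
  //   if (slow path:  super_klass is in sub_klass's secondary supers list)   goto L_success;
  //   fall through to L_miss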
1716   void generate_type_check(Register sub_klass,
1717                            Register super_check_offset,
1718                            Register super_klass,
1719                            Label& L_success) {
1720     assert_different_registers(sub_klass, super_check_offset, super_klass);
1721 
1722     BLOCK_COMMENT("type_check:");
1723 
1724     Label L_miss;
1725 
1726     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1727                                      super_check_offset);
1728     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1729 
1730     // Fall through on failure!
1731     __ BIND(L_miss);
1732   }
1733 
1734   //
1735   //  Generate checkcasting array copy stub
1736   //
1737   //  Input:
1738   //    c_rarg0   - source array address
1739   //    c_rarg1   - destination array address
1740   //    c_rarg2   - element count, treated as ssize_t, can be zero
1741   //    c_rarg3   - size_t ckoff (super_check_offset)
1742   //    c_rarg4   - oop ckval (super_klass)
1743   //
1744   //  Output:
1745   //    r0 ==  0  -  success
1746   //    r0 == -1^K - failure, where K is partial transfer count
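  //               (-1^K is the bitwise complement ~K, so callers recover K as ~r0)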
1747   //
1748   address generate_checkcast_copy(const char *name, address *entry,
1749                                   bool dest_uninitialized = false) {
1750 
1751     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752 
1753     // Input registers (after setup_arg_regs)
1754     const Register from        = c_rarg0;   // source array address
1755     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1757     const Register ckoff       = c_rarg3;   // super_check_offset
1758     const Register ckval       = c_rarg4;   // super_klass
1759 
1760     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761     RegSet wb_post_saved_regs = RegSet::of(count);
1762 
1763     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1764     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // original elements count
1766     const Register start_to    = r20;       // destination array start address
1767     const Register r19_klass   = r19;       // oop._klass
1768 
1769     //---------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the two arrays are subtypes of Object[] but the
1772     // destination array type is not equal to or a supertype
1773     // of the source type.  Each element must be separately
1774     // checked.
1775 
1776     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777                                copied_oop, r19_klass, count_save);
1778 
1779     __ align(CodeEntryAlignment);
1780     StubCodeMark mark(this, "StubRoutines", name);
1781     address start = __ pc();
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785 #ifdef ASSERT
1786     // caller guarantees that the arrays really are different
1787     // otherwise, we would have to make conjoint checks
1788     { Label L;
1789       array_overlap_test(L, TIMES_OOP);
1790       __ stop("checkcast_copy within a single array");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     // Caller of this entry point must set up the argument registers.
1796     if (entry != NULL) {
1797       *entry = __ pc();
1798       BLOCK_COMMENT("Entry:");
1799     }
1800 
1801      // Empty array:  Nothing to do.
1802     __ cbz(count, L_done);
1803     __ push(RegSet::of(r19, r20, r21, r22), sp);
1804 
1805 #ifdef ASSERT
1806     BLOCK_COMMENT("assert consistent ckoff/ckval");
1807     // The ckoff and ckval must be mutually consistent,
1808     // even though caller generates both.
1809     { Label L;
1810       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1811       __ ldrw(start_to, Address(ckval, sco_offset));
1812       __ cmpw(ckoff, start_to);
1813       __ br(Assembler::EQ, L);
1814       __ stop("super_check_offset inconsistent");
1815       __ bind(L);
1816     }
1817 #endif //ASSERT
1818 
1819     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1820     bool is_oop = true;
1821     if (dest_uninitialized) {
1822       decorators |= IS_DEST_UNINITIALIZED;
1823     }
1824 
1825     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1826     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1827 
1828     // save the original count
1829     __ mov(count_save, count);
1830 
1831     // Copy from low to high addresses
1832     __ mov(start_to, to);              // Save destination array start address
1833     __ b(L_load_element);
1834 
1835     // ======== begin loop ========
1836     // (Loop is rotated; its entry is L_load_element.)
1837     // Loop control:
1838     //   for (; count != 0; count--) {
1839     //     copied_oop = load_heap_oop(from++);
1840     //     ... generate_type_check ...;
1841     //     store_heap_oop(to++, copied_oop);
1842     //   }
1843     __ align(OptoLoopAlignment);
1844 
1845     __ BIND(L_store_element);
1846     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1847     __ sub(count, count, 1);
1848     __ cbz(count, L_do_card_marks);
1849 
1850     // ======== loop entry is here ========
1851     __ BIND(L_load_element);
1852     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1853     __ cbz(copied_oop, L_store_element);
1854 
1855     __ load_klass(r19_klass, copied_oop);// query the object klass
1856     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1857     // ======== end loop ========
1858 
1859     // It was a real error; we must depend on the caller to finish the job.
1860     // Register count = remaining oops, count_orig = total oops.
1861     // Emit GC store barriers for the oops we have copied and report
1862     // their number to the caller.
1863 
1864     __ subs(count, count_save, count);     // K = partially copied oop count
1865     __ eon(count, count, zr);                   // report (-1^K) to caller
1866     __ br(Assembler::EQ, L_done_pop);
1867 
1868     __ BIND(L_do_card_marks);
1869     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1870 
1871     __ bind(L_done_pop);
1872     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1873     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1874 
1875     __ bind(L_done);
1876     __ mov(r0, count);
1877     __ leave();
1878     __ ret(lr);
1879 
1880     return start;
1881   }
1882 
1883   // Perform range checks on the proposed arraycopy.
1884   // Kills temp, but nothing else.
1885   // Also, clean the sign bits of src_pos and dst_pos.
1886   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1887                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1889                               Register dst_pos, // destination position (c_rarg3)
1890                               Register length,
1891                               Register temp,
1892                               Label& L_failed) {
1893     BLOCK_COMMENT("arraycopy_range_checks:");
1894 
1895     assert_different_registers(rscratch1, temp);
1896 
1897     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1898     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1899     __ addw(temp, length, src_pos);
1900     __ cmpw(temp, rscratch1);
1901     __ br(Assembler::HI, L_failed);
1902 
1903     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1904     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1905     __ addw(temp, length, dst_pos);
1906     __ cmpw(temp, rscratch1);
1907     __ br(Assembler::HI, L_failed);
1908 
1909     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1910     __ movw(src_pos, src_pos);
1911     __ movw(dst_pos, dst_pos);
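    // (A 32-bit mov to the same register zero-extends: writing a W register
    // clears the upper 32 bits, so any stale sign/garbage bits are discarded.)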
1912 
1913     BLOCK_COMMENT("arraycopy_range_checks done");
1914   }
1915 
1916   // These stubs get called from some dumb test routine.
1917   // I'll write them properly when they're called from
1918   // something that's actually doing something.
1919   static void fake_arraycopy_stub(address src, address dst, int count) {
1920     assert(count == 0, "huh?");
1921   }
1922 
1923 
1924   //
1925   //  Generate 'unsafe' array copy stub
1926   //  Though just as safe as the other stubs, it takes an unscaled
1927   //  size_t argument instead of an element count.
1928   //
1929   //  Input:
1930   //    c_rarg0   - source array address
1931   //    c_rarg1   - destination array address
1932   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1933   //
1934   // Examines the alignment of the operands and dispatches
1935   // to a long, int, short, or byte copy loop.
1936   //
1937   address generate_unsafe_copy(const char *name,
1938                                address byte_copy_entry,
1939                                address short_copy_entry,
1940                                address int_copy_entry,
1941                                address long_copy_entry) {
1942     Label L_long_aligned, L_int_aligned, L_short_aligned;
1943     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1944 
1945     __ align(CodeEntryAlignment);
1946     StubCodeMark mark(this, "StubRoutines", name);
1947     address start = __ pc();
1948     __ enter(); // required for proper stackwalking of RuntimeStub frame
1949 
1950     // bump this on entry, not on exit:
1951     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1952 
1953     __ orr(rscratch1, s, d);
1954     __ orr(rscratch1, rscratch1, count);
1955 
1956     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1957     __ cbz(rscratch1, L_long_aligned);
1958     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1959     __ cbz(rscratch1, L_int_aligned);
1960     __ tbz(rscratch1, 0, L_short_aligned);
1961     __ b(RuntimeAddress(byte_copy_entry));
1962 
1963     __ BIND(L_short_aligned);
1964     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1965     __ b(RuntimeAddress(short_copy_entry));
1966     __ BIND(L_int_aligned);
1967     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1968     __ b(RuntimeAddress(int_copy_entry));
1969     __ BIND(L_long_aligned);
1970     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1971     __ b(RuntimeAddress(long_copy_entry));
1972 
1973     return start;
1974   }
1975 
1976   //
1977   //  Generate generic array copy stubs
1978   //
1979   //  Input:
1980   //    c_rarg0    -  src oop
1981   //    c_rarg1    -  src_pos (32-bits)
1982   //    c_rarg2    -  dst oop
1983   //    c_rarg3    -  dst_pos (32-bits)
1984   //    c_rarg4    -  element count (32-bits)
1985   //
1986   //  Output:
1987   //    r0 ==  0  -  success
1988   //    r0 == -1^K - failure, where K is partial transfer count
1989   //
1990   address generate_generic_copy(const char *name,
1991                                 address byte_copy_entry, address short_copy_entry,
1992                                 address int_copy_entry, address oop_copy_entry,
1993                                 address long_copy_entry, address checkcast_copy_entry) {
1994 
1995     Label L_failed, L_objArray;
1996     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1997 
1998     // Input registers
1999     const Register src        = c_rarg0;  // source array oop
2000     const Register src_pos    = c_rarg1;  // source position
2001     const Register dst        = c_rarg2;  // destination array oop
2002     const Register dst_pos    = c_rarg3;  // destination position
2003     const Register length     = c_rarg4;
2004 
2005 
2006     // Registers used as temps
2007     const Register dst_klass  = c_rarg5;
2008 
2009     __ align(CodeEntryAlignment);
2010 
2011     StubCodeMark mark(this, "StubRoutines", name);
2012 
2013     address start = __ pc();
2014 
2015     __ enter(); // required for proper stackwalking of RuntimeStub frame
2016 
2017     // bump this on entry, not on exit:
2018     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2019 
2020     //-----------------------------------------------------------------------
2021     // Assembler stub will be used for this call to arraycopy
2022     // if the following conditions are met:
2023     //
2024     // (1) src and dst must not be null.
2025     // (2) src_pos must not be negative.
2026     // (3) dst_pos must not be negative.
2027     // (4) length  must not be negative.
2028     // (5) src klass and dst klass should be the same and not NULL.
2029     // (6) src and dst should be arrays.
2030     // (7) src_pos + length must not exceed length of src.
2031     // (8) dst_pos + length must not exceed length of dst.
2032     //
2033 
2034     //  if (src == NULL) return -1;
2035     __ cbz(src, L_failed);
2036 
2037     //  if (src_pos < 0) return -1;
2038     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2039 
2040     //  if (dst == NULL) return -1;
2041     __ cbz(dst, L_failed);
2042 
2043     //  if (dst_pos < 0) return -1;
2044     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2045 
2046     // registers used as temp
2047     const Register scratch_length    = r16; // elements count to copy
2048     const Register scratch_src_klass = r17; // array klass
2049     const Register lh                = r15; // layout helper
2050 
2051     //  if (length < 0) return -1;
2052     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2053     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2054 
2055     __ load_klass(scratch_src_klass, src);
2056 #ifdef ASSERT
2057     //  assert(src->klass() != NULL);
2058     {
2059       BLOCK_COMMENT("assert klasses not null {");
2060       Label L1, L2;
2061       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2062       __ bind(L1);
2063       __ stop("broken null klass");
2064       __ bind(L2);
2065       __ load_klass(rscratch1, dst);
2066       __ cbz(rscratch1, L1);     // this would be broken also
2067       BLOCK_COMMENT("} assert klasses not null done");
2068     }
2069 #endif
2070 
2071     // Load layout helper (32-bits)
2072     //
2073     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2074     // 32        30    24            16              8     2                 0
2075     //
2076     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2077     //
2078 
2079     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2080 
2081     // Handle objArrays completely differently...
2082     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2083     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2084     __ movw(rscratch1, objArray_lh);
2085     __ eorw(rscratch2, lh, rscratch1);
2086     __ cbzw(rscratch2, L_objArray);
2087 
2088     //  if (src->klass() != dst->klass()) return -1;
2089     __ load_klass(rscratch2, dst);
2090     __ eor(rscratch2, rscratch2, scratch_src_klass);
2091     __ cbnz(rscratch2, L_failed);
2092 
2093     //  if (!src->is_Array()) return -1;
2094     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2095 
2096     // At this point, it is known to be a typeArray (array_tag 0x3).
2097 #ifdef ASSERT
2098     {
2099       BLOCK_COMMENT("assert primitive array {");
2100       Label L;
2101       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2102       __ cmpw(lh, rscratch2);
2103       __ br(Assembler::GE, L);
2104       __ stop("must be a primitive array");
2105       __ bind(L);
2106       BLOCK_COMMENT("} assert primitive array done");
2107     }
2108 #endif
2109 
2110     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2111                            rscratch2, L_failed);
2112 
2113     // TypeArrayKlass
2114     //
2115     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2116     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2117     //
2118 
2119     const Register rscratch1_offset = rscratch1;    // array offset
2120     const Register r15_elsize = lh; // element size
2121 
2122     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2123            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2124     __ add(src, src, rscratch1_offset);           // src array offset
2125     __ add(dst, dst, rscratch1_offset);           // dst array offset
2126     BLOCK_COMMENT("choose copy loop based on element size");
2127 
2128     // next registers should be set before the jump to corresponding stub
2129     const Register from     = c_rarg0;  // source array address
2130     const Register to       = c_rarg1;  // destination array address
2131     const Register count    = c_rarg2;  // elements count
2132 
    // 'from', 'to' and 'count' must be set in this order, since they occupy
    // the same registers as 'src', 'src_pos' and 'dst', which would otherwise
    // be clobbered before they are used.
2135 
2136     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2137 
2138     // The possible values of elsize are 0-3, i.e. exact_log2(element
2139     // size in bytes).  We do a simple bitwise binary search.
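    // A sketch: elsize is log2 of the element size, 0..3.
    //   bit 1 set   -> int (2) or long (3), bit 0 decides at L_copy_ints
    //   bit 1 clear -> byte (0) or short (1), bit 0 decides here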
2140   __ BIND(L_copy_bytes);
2141     __ tbnz(r15_elsize, 1, L_copy_ints);
2142     __ tbnz(r15_elsize, 0, L_copy_shorts);
2143     __ lea(from, Address(src, src_pos));// src_addr
2144     __ lea(to,   Address(dst, dst_pos));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(byte_copy_entry));
2147 
2148   __ BIND(L_copy_shorts);
2149     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2150     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2151     __ movw(count, scratch_length); // length
2152     __ b(RuntimeAddress(short_copy_entry));
2153 
2154   __ BIND(L_copy_ints);
2155     __ tbnz(r15_elsize, 0, L_copy_longs);
2156     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(int_copy_entry));
2160 
2161   __ BIND(L_copy_longs);
2162 #ifdef ASSERT
2163     {
2164       BLOCK_COMMENT("assert long copy {");
2165       Label L;
2166       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2167       __ cmpw(r15_elsize, LogBytesPerLong);
2168       __ br(Assembler::EQ, L);
2169       __ stop("must be long copy, but elsize is wrong");
2170       __ bind(L);
2171       BLOCK_COMMENT("} assert long copy done");
2172     }
2173 #endif
2174     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2175     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2176     __ movw(count, scratch_length); // length
2177     __ b(RuntimeAddress(long_copy_entry));
2178 
2179     // ObjArrayKlass
2180   __ BIND(L_objArray);
2181     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2182 
2183     Label L_plain_copy, L_checkcast_copy;
2184     //  test array classes for subtyping
2185     __ load_klass(r15, dst);
2186     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2187     __ br(Assembler::NE, L_checkcast_copy);
2188 
2189     // Identically typed arrays can be copied without element-wise checks.
2190     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191                            rscratch2, L_failed);
2192 
2193     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197     __ movw(count, scratch_length); // length
2198   __ BIND(L_plain_copy);
2199     __ b(RuntimeAddress(oop_copy_entry));
2200 
2201   __ BIND(L_checkcast_copy);
2202     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2203     {
2204       // Before looking at dst.length, make sure dst is also an objArray.
2205       __ ldrw(rscratch1, Address(r15, lh_offset));
2206       __ movw(rscratch2, objArray_lh);
2207       __ eorw(rscratch1, rscratch1, rscratch2);
2208       __ cbnzw(rscratch1, L_failed);
2209 
2210       // It is safe to examine both src.length and dst.length.
2211       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212                              r15, L_failed);
2213 
2214       __ load_klass(dst_klass, dst); // reload
2215 
2216       // Marshal the base address arguments now, freeing registers.
2217       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2218       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2220       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221       __ movw(count, length);           // length (reloaded)
2222       Register sco_temp = c_rarg3;      // this register is free now
2223       assert_different_registers(from, to, count, sco_temp,
2224                                  dst_klass, scratch_src_klass);
2225       // assert_clean_int(count, sco_temp);
2226 
2227       // Generate the type check.
2228       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2229       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2230 
2231       // Smashes rscratch1, rscratch2
2232       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2233 
2234       // Fetch destination element klass from the ObjArrayKlass header.
2235       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2236       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2237       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2238 
2239       // the checkcast_copy loop needs two extra arguments:
2240       assert(c_rarg3 == sco_temp, "#3 already in place");
2241       // Set up arguments for checkcast_copy_entry.
2242       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2243       __ b(RuntimeAddress(checkcast_copy_entry));
2244     }
2245 
2246   __ BIND(L_failed);
2247     __ mov(r0, -1);
2248     __ leave();   // required for proper stackwalking of RuntimeStub frame
2249     __ ret(lr);
2250 
2251     return start;
2252   }
2253 
2254   //
2255   // Generate stub for array fill. If "aligned" is true, the
2256   // "to" address is assumed to be heapword aligned.
2257   //
2258   // Arguments for generated stub:
2259   //   to:    c_rarg0
2260   //   value: c_rarg1
2261   //   count: c_rarg2 treated as signed
2262   //
2263   address generate_fill(BasicType t, bool aligned, const char *name) {
2264     __ align(CodeEntryAlignment);
2265     StubCodeMark mark(this, "StubRoutines", name);
2266     address start = __ pc();
2267 
2268     BLOCK_COMMENT("Entry:");
2269 
    const Register to        = c_rarg0;  // destination array address
2271     const Register value     = c_rarg1;  // value
2272     const Register count     = c_rarg2;  // elements count
2273 
2274     const Register bz_base = r10;        // base for block_zero routine
2275     const Register cnt_words = r11;      // temp register
2276 
2277     __ enter();
2278 
2279     Label L_fill_elements, L_exit1;
2280 
2281     int shift = -1;
2282     switch (t) {
2283       case T_BYTE:
2284         shift = 0;
2285         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2286         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2287         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2288         __ br(Assembler::LO, L_fill_elements);
2289         break;
2290       case T_SHORT:
2291         shift = 1;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2294         __ br(Assembler::LO, L_fill_elements);
2295         break;
2296       case T_INT:
2297         shift = 2;
2298         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2299         __ br(Assembler::LO, L_fill_elements);
2300         break;
2301       default: ShouldNotReachHere();
2302     }
2303 
2304     // Align source address at 8 bytes address boundary.
2305     Label L_skip_align1, L_skip_align2, L_skip_align4;
2306     if (!aligned) {
2307       switch (t) {
2308         case T_BYTE:
2309           // One byte misalignment happens only for byte arrays.
2310           __ tbz(to, 0, L_skip_align1);
2311           __ strb(value, Address(__ post(to, 1)));
2312           __ subw(count, count, 1);
2313           __ bind(L_skip_align1);
2314           // Fallthrough
2315         case T_SHORT:
2316           // Two bytes misalignment happens only for byte and short (char) arrays.
2317           __ tbz(to, 1, L_skip_align2);
2318           __ strh(value, Address(__ post(to, 2)));
2319           __ subw(count, count, 2 >> shift);
2320           __ bind(L_skip_align2);
2321           // Fallthrough
2322         case T_INT:
2323           // Align to 8 bytes, we know we are 4 byte aligned to start.
2324           __ tbz(to, 2, L_skip_align4);
2325           __ strw(value, Address(__ post(to, 4)));
2326           __ subw(count, count, 4 >> shift);
2327           __ bind(L_skip_align4);
2328           break;
2329         default: ShouldNotReachHere();
2330       }
2331     }
2332 
2333     //
2334     //  Fill large chunks
2335     //
2336     __ lsrw(cnt_words, count, 3 - shift); // number of words
2337     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2338     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2339     if (UseBlockZeroing) {
2340       Label non_block_zeroing, rest;
2341       // If the fill value is zero we can use the fast zero_words().
2342       __ cbnz(value, non_block_zeroing);
2343       __ mov(bz_base, to);
2344       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2345       __ zero_words(bz_base, cnt_words);
2346       __ b(rest);
2347       __ bind(non_block_zeroing);
2348       __ fill_words(to, cnt_words, value);
2349       __ bind(rest);
2350     } else {
2351       __ fill_words(to, cnt_words, value);
2352     }
2353 
2354     // Remaining count is less than 8 bytes. Fill it by a single store.
2355     // Note that the total length is no less than 8 bytes.
2356     if (t == T_BYTE || t == T_SHORT) {
2357       Label L_exit1;
2358       __ cbzw(count, L_exit1);
2359       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2360       __ str(value, Address(to, -8));    // overwrite some elements
2361       __ bind(L_exit1);
2362       __ leave();
2363       __ ret(lr);
2364     }
2365 
2366     // Handle copies less than 8 bytes.
2367     Label L_fill_2, L_fill_4, L_exit2;
2368     __ bind(L_fill_elements);
2369     switch (t) {
2370       case T_BYTE:
2371         __ tbz(count, 0, L_fill_2);
2372         __ strb(value, Address(__ post(to, 1)));
2373         __ bind(L_fill_2);
2374         __ tbz(count, 1, L_fill_4);
2375         __ strh(value, Address(__ post(to, 2)));
2376         __ bind(L_fill_4);
2377         __ tbz(count, 2, L_exit2);
2378         __ strw(value, Address(to));
2379         break;
2380       case T_SHORT:
2381         __ tbz(count, 0, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 1, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_INT:
2388         __ cbzw(count, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       default: ShouldNotReachHere();
2392     }
2393     __ bind(L_exit2);
2394     __ leave();
2395     __ ret(lr);
2396     return start;
2397   }
2398 
2399   address generate_data_cache_writeback() {
2400     const Register line        = c_rarg0;  // address of line to write back
2401 
2402     __ align(CodeEntryAlignment);
2403 
2404     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2405 
2406     address start = __ pc();
2407     __ enter();
2408     __ cache_wb(Address(line, 0));
2409     __ leave();
2410     __ ret(lr);
2411 
2412     return start;
2413   }
2414 
2415   address generate_data_cache_writeback_sync() {
2416     const Register is_pre     = c_rarg0;  // pre or post sync
2417 
2418     __ align(CodeEntryAlignment);
2419 
2420     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2421 
    // pre wbsync is a no-op
    // post wbsync requires a store barrier (the AArch64 analogue of x86's sfence)
2424 
2425     Label skip;
2426     address start = __ pc();
2427     __ enter();
2428     __ cbnz(is_pre, skip);
2429     __ cache_wbsync(false);
2430     __ bind(skip);
2431     __ leave();
2432     __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   void generate_arraycopy_stubs() {
2438     address entry;
2439     address entry_jbyte_arraycopy;
2440     address entry_jshort_arraycopy;
2441     address entry_jint_arraycopy;
2442     address entry_oop_arraycopy;
2443     address entry_jlong_arraycopy;
2444     address entry_checkcast_arraycopy;
2445 
2446     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2447     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2448 
2449     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2450 
2451     //*** jbyte
2452     // Always need aligned and unaligned versions
2453     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2454                                                                                   "jbyte_disjoint_arraycopy");
2455     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2456                                                                                   &entry_jbyte_arraycopy,
2457                                                                                   "jbyte_arraycopy");
2458     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2459                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2460     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2461                                                                                   "arrayof_jbyte_arraycopy");
2462 
2463     //*** jshort
2464     // Always need aligned and unaligned versions
2465     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2466                                                                                     "jshort_disjoint_arraycopy");
2467     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2468                                                                                     &entry_jshort_arraycopy,
2469                                                                                     "jshort_arraycopy");
2470     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2471                                                                                     "arrayof_jshort_disjoint_arraycopy");
2472     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2473                                                                                     "arrayof_jshort_arraycopy");
2474 
2475     //*** jint
2476     // Aligned versions
2477     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2478                                                                                 "arrayof_jint_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2480                                                                                 "arrayof_jint_arraycopy");
2481     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2482     // entry_jint_arraycopy always points to the unaligned version
2483     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2484                                                                                 "jint_disjoint_arraycopy");
2485     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2486                                                                                 &entry_jint_arraycopy,
2487                                                                                 "jint_arraycopy");
2488 
2489     //*** jlong
2490     // It is always aligned
2491     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2492                                                                                   "arrayof_jlong_disjoint_arraycopy");
2493     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2494                                                                                   "arrayof_jlong_arraycopy");
2495     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2496     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2497 
2498     //*** oops
2499     {
2500       // With compressed oops we need unaligned versions; notice that
2501       // we overwrite entry_oop_arraycopy.
2502       bool aligned = !UseCompressedOops;
2503 
2504       StubRoutines::_arrayof_oop_disjoint_arraycopy
2505         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2506                                      /*dest_uninitialized*/false);
2507       StubRoutines::_arrayof_oop_arraycopy
2508         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2509                                      /*dest_uninitialized*/false);
2510       // Aligned versions without pre-barriers
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2513                                      /*dest_uninitialized*/true);
2514       StubRoutines::_arrayof_oop_arraycopy_uninit
2515         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2516                                      /*dest_uninitialized*/true);
2517     }
2518 
2519     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2520     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2521     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2522     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2523 
2524     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2525     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2526                                                                         /*dest_uninitialized*/true);
2527 
2528     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2529                                                               entry_jbyte_arraycopy,
2530                                                               entry_jshort_arraycopy,
2531                                                               entry_jint_arraycopy,
2532                                                               entry_jlong_arraycopy);
2533 
2534     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2535                                                                entry_jbyte_arraycopy,
2536                                                                entry_jshort_arraycopy,
2537                                                                entry_jint_arraycopy,
2538                                                                entry_oop_arraycopy,
2539                                                                entry_jlong_arraycopy,
2540                                                                entry_checkcast_arraycopy);
2541 
2542     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2543     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2544     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2545     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2546     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2547     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2548   }
2549 
2550   void generate_math_stubs() { Unimplemented(); }
2551 
2552   // Arguments:
2553   //
2554   // Inputs:
2555   //   c_rarg0   - source byte array address
2556   //   c_rarg1   - destination byte array address
2557   //   c_rarg2   - K (key) in little endian int array
2558   //
2559   address generate_aescrypt_encryptBlock() {
2560     __ align(CodeEntryAlignment);
2561     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2562 
2563     Label L_doLast;
2564 
2565     const Register from        = c_rarg0;  // source array address
2566     const Register to          = c_rarg1;  // destination array address
2567     const Register key         = c_rarg2;  // key array address
2568     const Register keylen      = rscratch1;
2569 
2570     address start = __ pc();
2571     __ enter();
2572 
2573     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
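    // keylen is the expanded key length in ints: 44 for AES-128, 52 for
    // AES-192 and 60 for AES-256 (4 * (rounds + 1)); the comparisons against
    // 44 and 52 below select how many extra rounds to run.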
2574 
2575     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2576 
2577     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2578     __ rev32(v1, __ T16B, v1);
2579     __ rev32(v2, __ T16B, v2);
2580     __ rev32(v3, __ T16B, v3);
2581     __ rev32(v4, __ T16B, v4);
2582     __ aese(v0, v1);
2583     __ aesmc(v0, v0);
2584     __ aese(v0, v2);
2585     __ aesmc(v0, v0);
2586     __ aese(v0, v3);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v4);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594     __ rev32(v3, __ T16B, v3);
2595     __ rev32(v4, __ T16B, v4);
2596     __ aese(v0, v1);
2597     __ aesmc(v0, v0);
2598     __ aese(v0, v2);
2599     __ aesmc(v0, v0);
2600     __ aese(v0, v3);
2601     __ aesmc(v0, v0);
2602     __ aese(v0, v4);
2603     __ aesmc(v0, v0);
2604 
2605     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2606     __ rev32(v1, __ T16B, v1);
2607     __ rev32(v2, __ T16B, v2);
2608 
2609     __ cmpw(keylen, 44);
2610     __ br(Assembler::EQ, L_doLast);
2611 
2612     __ aese(v0, v1);
2613     __ aesmc(v0, v0);
2614     __ aese(v0, v2);
2615     __ aesmc(v0, v0);
2616 
2617     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2618     __ rev32(v1, __ T16B, v1);
2619     __ rev32(v2, __ T16B, v2);
2620 
2621     __ cmpw(keylen, 52);
2622     __ br(Assembler::EQ, L_doLast);
2623 
2624     __ aese(v0, v1);
2625     __ aesmc(v0, v0);
2626     __ aese(v0, v2);
2627     __ aesmc(v0, v0);
2628 
2629     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2630     __ rev32(v1, __ T16B, v1);
2631     __ rev32(v2, __ T16B, v2);
2632 
2633     __ BIND(L_doLast);
2634 
2635     __ aese(v0, v1);
2636     __ aesmc(v0, v0);
2637     __ aese(v0, v2);
2638 
2639     __ ld1(v1, __ T16B, key);
2640     __ rev32(v1, __ T16B, v1);
2641     __ eor(v0, __ T16B, v0, v1);
2642 
2643     __ st1(v0, __ T16B, to);
2644 
2645     __ mov(r0, 0);
2646 
2647     __ leave();
2648     __ ret(lr);
2649 
2650     return start;
2651   }
2652 
2653   // Arguments:
2654   //
2655   // Inputs:
2656   //   c_rarg0   - source byte array address
2657   //   c_rarg1   - destination byte array address
2658   //   c_rarg2   - K (key) in little endian int array
2659   //
2660   address generate_aescrypt_decryptBlock() {
2661     assert(UseAES, "need AES cryptographic extension support");
2662     __ align(CodeEntryAlignment);
2663     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2664     Label L_doLast;
2665 
2666     const Register from        = c_rarg0;  // source array address
2667     const Register to          = c_rarg1;  // destination array address
2668     const Register key         = c_rarg2;  // key array address
2669     const Register keylen      = rscratch1;
2670 
2671     address start = __ pc();
2672     __ enter(); // required for proper stackwalking of RuntimeStub frame
2673 
2674     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2675 
2676     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2677 
2678     __ ld1(v5, __ T16B, __ post(key, 16));
2679     __ rev32(v5, __ T16B, v5);
2680 
2681     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2682     __ rev32(v1, __ T16B, v1);
2683     __ rev32(v2, __ T16B, v2);
2684     __ rev32(v3, __ T16B, v3);
2685     __ rev32(v4, __ T16B, v4);
2686     __ aesd(v0, v1);
2687     __ aesimc(v0, v0);
2688     __ aesd(v0, v2);
2689     __ aesimc(v0, v0);
2690     __ aesd(v0, v3);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v4);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698     __ rev32(v3, __ T16B, v3);
2699     __ rev32(v4, __ T16B, v4);
2700     __ aesd(v0, v1);
2701     __ aesimc(v0, v0);
2702     __ aesd(v0, v2);
2703     __ aesimc(v0, v0);
2704     __ aesd(v0, v3);
2705     __ aesimc(v0, v0);
2706     __ aesd(v0, v4);
2707     __ aesimc(v0, v0);
2708 
2709     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2710     __ rev32(v1, __ T16B, v1);
2711     __ rev32(v2, __ T16B, v2);
2712 
2713     __ cmpw(keylen, 44);
2714     __ br(Assembler::EQ, L_doLast);
2715 
2716     __ aesd(v0, v1);
2717     __ aesimc(v0, v0);
2718     __ aesd(v0, v2);
2719     __ aesimc(v0, v0);
2720 
2721     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2722     __ rev32(v1, __ T16B, v1);
2723     __ rev32(v2, __ T16B, v2);
2724 
2725     __ cmpw(keylen, 52);
2726     __ br(Assembler::EQ, L_doLast);
2727 
2728     __ aesd(v0, v1);
2729     __ aesimc(v0, v0);
2730     __ aesd(v0, v2);
2731     __ aesimc(v0, v0);
2732 
2733     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2734     __ rev32(v1, __ T16B, v1);
2735     __ rev32(v2, __ T16B, v2);
2736 
2737     __ BIND(L_doLast);
2738 
2739     __ aesd(v0, v1);
2740     __ aesimc(v0, v0);
2741     __ aesd(v0, v2);
2742 
2743     __ eor(v0, __ T16B, v0, v5);
2744 
2745     __ st1(v0, __ T16B, to);
2746 
2747     __ mov(r0, 0);
2748 
2749     __ leave();
2750     __ ret(lr);
2751 
2752     return start;
2753   }
2754 
2755   // Arguments:
2756   //
2757   // Inputs:
2758   //   c_rarg0   - source byte array address
2759   //   c_rarg1   - destination byte array address
2760   //   c_rarg2   - K (key) in little endian int array
2761   //   c_rarg3   - r vector byte array address
2762   //   c_rarg4   - input length
2763   //
2764   // Output:
2765   //   x0        - input length
2766   //
2767   address generate_cipherBlockChaining_encryptAESCrypt() {
2768     assert(UseAES, "need AES cryptographic extension support");
2769     __ align(CodeEntryAlignment);
2770     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2771 
2772     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2773 
2774     const Register from        = c_rarg0;  // source array address
2775     const Register to          = c_rarg1;  // destination array address
2776     const Register key         = c_rarg2;  // key array address
2777     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2778                                            // and left with the results of the last encryption block
2779     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2780     const Register keylen      = rscratch1;
2781 
2782     address start = __ pc();
2783 
2784       __ enter();
2785 
2786       __ movw(rscratch2, len_reg);
2787 
2788       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2789 
2790       __ ld1(v0, __ T16B, rvec);
2791 
2792       __ cmpw(keylen, 52);
2793       __ br(Assembler::CC, L_loadkeys_44);
2794       __ br(Assembler::EQ, L_loadkeys_52);
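      // Branch layout above (a sketch): keylen < 52 (AES-128) enters at
      // L_loadkeys_44, keylen == 52 (AES-192) at L_loadkeys_52, and AES-256
      // falls through to load two extra round keys.  Nothing between here and
      // the CC/EQ branches in L_aes_loop writes the flags, so the same keylen
      // comparison also selects the per-block round count.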
2795 
2796       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2797       __ rev32(v17, __ T16B, v17);
2798       __ rev32(v18, __ T16B, v18);
2799     __ BIND(L_loadkeys_52);
2800       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2801       __ rev32(v19, __ T16B, v19);
2802       __ rev32(v20, __ T16B, v20);
2803     __ BIND(L_loadkeys_44);
2804       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2805       __ rev32(v21, __ T16B, v21);
2806       __ rev32(v22, __ T16B, v22);
2807       __ rev32(v23, __ T16B, v23);
2808       __ rev32(v24, __ T16B, v24);
2809       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2810       __ rev32(v25, __ T16B, v25);
2811       __ rev32(v26, __ T16B, v26);
2812       __ rev32(v27, __ T16B, v27);
2813       __ rev32(v28, __ T16B, v28);
2814       __ ld1(v29, v30, v31, __ T16B, key);
2815       __ rev32(v29, __ T16B, v29);
2816       __ rev32(v30, __ T16B, v30);
2817       __ rev32(v31, __ T16B, v31);
2818 
2819     __ BIND(L_aes_loop);
2820       __ ld1(v1, __ T16B, __ post(from, 16));
2821       __ eor(v0, __ T16B, v0, v1);
2822 
2823       __ br(Assembler::CC, L_rounds_44);
2824       __ br(Assembler::EQ, L_rounds_52);
2825 
2826       __ aese(v0, v17); __ aesmc(v0, v0);
2827       __ aese(v0, v18); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_52);
2829       __ aese(v0, v19); __ aesmc(v0, v0);
2830       __ aese(v0, v20); __ aesmc(v0, v0);
2831     __ BIND(L_rounds_44);
2832       __ aese(v0, v21); __ aesmc(v0, v0);
2833       __ aese(v0, v22); __ aesmc(v0, v0);
2834       __ aese(v0, v23); __ aesmc(v0, v0);
2835       __ aese(v0, v24); __ aesmc(v0, v0);
2836       __ aese(v0, v25); __ aesmc(v0, v0);
2837       __ aese(v0, v26); __ aesmc(v0, v0);
2838       __ aese(v0, v27); __ aesmc(v0, v0);
2839       __ aese(v0, v28); __ aesmc(v0, v0);
2840       __ aese(v0, v29); __ aesmc(v0, v0);
2841       __ aese(v0, v30);
2842       __ eor(v0, __ T16B, v0, v31);
2843 
2844       __ st1(v0, __ T16B, __ post(to, 16));
2845 
2846       __ subw(len_reg, len_reg, 16);
2847       __ cbnzw(len_reg, L_aes_loop);
2848 
2849       __ st1(v0, __ T16B, rvec);
2850 
2851       __ mov(r0, rscratch2);
2852 
2853       __ leave();
2854       __ ret(lr);
2855 
2856       return start;
2857   }
2858 
2859   // Arguments:
2860   //
2861   // Inputs:
2862   //   c_rarg0   - source byte array address
2863   //   c_rarg1   - destination byte array address
2864   //   c_rarg2   - K (key) in little endian int array
2865   //   c_rarg3   - r vector byte array address
2866   //   c_rarg4   - input length
2867   //
2868   // Output:
2869   //   r0        - input length
2870   //
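  // For reference, this stub implements the following per-block CBC
  // decryption loop (a pseudocode sketch, not the generated code;
  // AES_decrypt stands for the hardware-accelerated block cipher):
  //
  //   for (int off = 0; off < len; off += 16) {
  //     cipher = load16(from + off);
  //     store16(to + off, AES_decrypt(cipher, key) ^ rvec);
  //     rvec   = cipher;                    // previous ciphertext chains the next block
  //   }
  //   return len;                           // returned in r0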
2871   address generate_cipherBlockChaining_decryptAESCrypt() {
2872     assert(UseAES, "need AES cryptographic extension support");
2873     __ align(CodeEntryAlignment);
2874     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2875 
2876     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2877 
2878     const Register from        = c_rarg0;  // source array address
2879     const Register to          = c_rarg1;  // destination array address
2880     const Register key         = c_rarg2;  // key array address
2881     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
2882                                            // and left holding the last input ciphertext block
2883     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2884     const Register keylen      = rscratch1;
2885 
2886     address start = __ pc();
2887 
2888       __ enter();
2889 
2890       __ movw(rscratch2, len_reg);
2891 
2892       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2893 
2894       __ ld1(v2, __ T16B, rvec);
2895 
2896       __ ld1(v31, __ T16B, __ post(key, 16));
2897       __ rev32(v31, __ T16B, v31);
2898 
2899       __ cmpw(keylen, 52);
2900       __ br(Assembler::CC, L_loadkeys_44);
2901       __ br(Assembler::EQ, L_loadkeys_52);
2902 
2903       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2904       __ rev32(v17, __ T16B, v17);
2905       __ rev32(v18, __ T16B, v18);
2906     __ BIND(L_loadkeys_52);
2907       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2908       __ rev32(v19, __ T16B, v19);
2909       __ rev32(v20, __ T16B, v20);
2910     __ BIND(L_loadkeys_44);
2911       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2912       __ rev32(v21, __ T16B, v21);
2913       __ rev32(v22, __ T16B, v22);
2914       __ rev32(v23, __ T16B, v23);
2915       __ rev32(v24, __ T16B, v24);
2916       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2917       __ rev32(v25, __ T16B, v25);
2918       __ rev32(v26, __ T16B, v26);
2919       __ rev32(v27, __ T16B, v27);
2920       __ rev32(v28, __ T16B, v28);
2921       __ ld1(v29, v30, __ T16B, key);
2922       __ rev32(v29, __ T16B, v29);
2923       __ rev32(v30, __ T16B, v30);
2924 
2925     __ BIND(L_aes_loop);
2926       __ ld1(v0, __ T16B, __ post(from, 16));
2927       __ orr(v1, __ T16B, v0, v0);
2928 
2929       __ br(Assembler::CC, L_rounds_44);
2930       __ br(Assembler::EQ, L_rounds_52);
2931 
2932       __ aesd(v0, v17); __ aesimc(v0, v0);
2933       __ aesd(v0, v18); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_52);
2935       __ aesd(v0, v19); __ aesimc(v0, v0);
2936       __ aesd(v0, v20); __ aesimc(v0, v0);
2937     __ BIND(L_rounds_44);
2938       __ aesd(v0, v21); __ aesimc(v0, v0);
2939       __ aesd(v0, v22); __ aesimc(v0, v0);
2940       __ aesd(v0, v23); __ aesimc(v0, v0);
2941       __ aesd(v0, v24); __ aesimc(v0, v0);
2942       __ aesd(v0, v25); __ aesimc(v0, v0);
2943       __ aesd(v0, v26); __ aesimc(v0, v0);
2944       __ aesd(v0, v27); __ aesimc(v0, v0);
2945       __ aesd(v0, v28); __ aesimc(v0, v0);
2946       __ aesd(v0, v29); __ aesimc(v0, v0);
2947       __ aesd(v0, v30);
2948       __ eor(v0, __ T16B, v0, v31);
2949       __ eor(v0, __ T16B, v0, v2);
2950 
2951       __ st1(v0, __ T16B, __ post(to, 16));
2952       __ orr(v2, __ T16B, v1, v1);
2953 
2954       __ subw(len_reg, len_reg, 16);
2955       __ cbnzw(len_reg, L_aes_loop);
2956 
2957       __ st1(v2, __ T16B, rvec);
2958 
2959       __ mov(r0, rscratch2);
2960 
2961       __ leave();
2962       __ ret(lr);
2963 
2964     return start;
2965   }
2966 
2967   // CTR AES crypt.
2968   // Arguments:
2969   //
2970   // Inputs:
2971   //   c_rarg0   - source byte array address
2972   //   c_rarg1   - destination byte array address
2973   //   c_rarg2   - K (key) in little endian int array
2974   //   c_rarg3   - counter vector byte array address
2975   //   c_rarg4   - input length
2976   //   c_rarg5   - saved encryptedCounter start
2977   //   c_rarg6   - saved used length
2978   //
2979   // Output:
2980   //   r0       - input length
2981   //
2982   address generate_counterMode_AESCrypt() {
2983     const Register in = c_rarg0;
2984     const Register out = c_rarg1;
2985     const Register key = c_rarg2;
2986     const Register counter = c_rarg3;
2987     const Register saved_len = c_rarg4, len = r10;
2988     const Register saved_encrypted_ctr = c_rarg5;
2989     const Register used_ptr = c_rarg6, used = r12;
2990 
2991     const Register offset = r7;
2992     const Register keylen = r11;
2993 
2994     const unsigned char block_size = 16;
2995     const int bulk_width = 4;
2996     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2997     // performance with larger data sizes, but it also means that the
2998     // fast path isn't used until you have at least 8 blocks, and up
2999     // to 127 bytes of data will be executed on the slow path. For
3000     // that reason, and also so as not to blow away too much icache, 4
3001     // blocks seems like a sensible compromise.
3002 
3003     // Algorithm:
3004     //
3005     //    if (len == 0) {
3006     //        goto DONE;
3007     //    }
3008     //    int result = len;
3009     //    do {
3010     //        if (used >= blockSize) {
3011     //            if (len >= bulk_width * blockSize) {
3012     //                CTR_large_block();
3013     //                if (len == 0)
3014     //                    goto DONE;
3015     //            }
3016     //            for (;;) {
3017     //                16ByteVector v0 = counter;
3018     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3019     //                used = 0;
3020     //                if (len < blockSize)
3021     //                    break;    /* goto NEXT */
3022     //                16ByteVector v1 = load16Bytes(in, offset);
3023     //                v1 = v1 ^ encryptedCounter;
3024     //                store16Bytes(v1, out, offset);
3025     //                used = blockSize;
3026     //                offset += blockSize;
3027     //                len -= blockSize;
3028     //                if (len == 0)
3029     //                    goto DONE;
3030     //            }
3031     //        }
3032     //      NEXT:
3033     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3034     //        len--;
3035     //    } while (len != 0);
3036     //  DONE:
3037     //    return result;
3038     //
3039     // CTR_large_block()
3040     //    Wide bulk encryption of whole blocks.
3041 
3042     __ align(CodeEntryAlignment);
3043     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3044     const address start = __ pc();
3045     __ enter();
3046 
3047     Label DONE, CTR_large_block, large_block_return;
3048     __ ldrw(used, Address(used_ptr));
3049     __ cbzw(saved_len, DONE);
3050 
3051     __ mov(len, saved_len);
3052     __ mov(offset, 0);
3053 
3054     // Compute #rounds for AES based on the length of the key array
3055     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3056 
3057     __ aesenc_loadkeys(key, keylen);
3058 
3059     {
3060       Label L_CTR_loop, NEXT;
3061 
3062       __ bind(L_CTR_loop);
3063 
3064       __ cmp(used, block_size);
3065       __ br(__ LO, NEXT);
3066 
3067       // Maybe we have a lot of data
3068       __ subsw(rscratch1, len, bulk_width * block_size);
3069       __ br(__ HS, CTR_large_block);
3070       __ BIND(large_block_return);
3071       __ cbzw(len, DONE);
3072 
3073       // Set up the counter
3074       __ movi(v4, __ T4S, 0);
3075       __ movi(v5, __ T4S, 1);
3076       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
3077 
3078       __ ld1(v0, __ T16B, counter); // Load the counter into v0
3079       __ rev32(v16, __ T16B, v0);
3080       __ addv(v16, __ T4S, v16, v4);
3081       __ rev32(v16, __ T16B, v16);
3082       __ st1(v16, __ T16B, counter); // Save the incremented counter back
3083 
3084       {
3085         // We have fewer than bulk_width blocks of data left. Encrypt
3086         // them one by one until there is less than a full block
3087         // remaining, being careful to save both the encrypted counter
3088         // and the counter.
3089 
3090         Label inner_loop;
3091         __ bind(inner_loop);
3092         // Counter to encrypt is in v0
3093         __ aesecb_encrypt(noreg, noreg, keylen);
3094         __ st1(v0, __ T16B, saved_encrypted_ctr);
3095 
3096         // Do we have a remaining full block?
3097 
3098         __ mov(used, 0);
3099         __ cmp(len, block_size);
3100         __ br(__ LO, NEXT);
3101 
3102         // Yes, we have a full block
3103         __ ldrq(v1, Address(in, offset));
3104         __ eor(v1, __ T16B, v1, v0);
3105         __ strq(v1, Address(out, offset));
3106         __ mov(used, block_size);
3107         __ add(offset, offset, block_size);
3108 
3109         __ subw(len, len, block_size);
3110         __ cbzw(len, DONE);
3111 
3112         // Increment the counter, store it back
3113         __ orr(v0, __ T16B, v16, v16);
3114         __ rev32(v16, __ T16B, v16);
3115         __ addv(v16, __ T4S, v16, v4);
3116         __ rev32(v16, __ T16B, v16);
3117         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3118 
3119         __ b(inner_loop);
3120       }
3121 
3122       __ BIND(NEXT);
3123 
3124       // Encrypt a single byte, and loop.
3125       // We expect this to be a rare event.
3126       __ ldrb(rscratch1, Address(in, offset));
3127       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3128       __ eor(rscratch1, rscratch1, rscratch2);
3129       __ strb(rscratch1, Address(out, offset));
3130       __ add(offset, offset, 1);
3131       __ add(used, used, 1);
3132       __ subw(len, len, 1);
3133       __ cbnzw(len, L_CTR_loop);
3134     }
3135 
3136     __ bind(DONE);
3137     __ strw(used, Address(used_ptr));
3138     __ mov(r0, saved_len);
3139 
3140     __ leave(); // required for proper stackwalking of RuntimeStub frame
3141     __ ret(lr);
3142 
3143     // Bulk encryption
3144 
3145     __ BIND(CTR_large_block);
3146     assert(bulk_width == 4 || bulk_width == 8, "must be");
3147 
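    // The bulk path encrypts bulk_width counter blocks at a time and XORs
    // them with bulk_width blocks of input, i.e. (a sketch, not the
    // generated code):
    //
    //   while (len >= bulk_width * 16) {
    //     for (i = 0; i < bulk_width; i++) E[i] = AES_encrypt(counter++, key);
    //     for (i = 0; i < bulk_width; i++) out[i] = in[i] ^ E[i];
    //     in += bulk_width * 16; out += bulk_width * 16; len -= bulk_width * 16;
    //   }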
3148     if (bulk_width == 8) {
3149       __ sub(sp, sp, 4 * 16);
3150       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3151     }
3152     __ sub(sp, sp, 4 * 16);
3153     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3154     RegSet saved_regs = (RegSet::of(in, out, offset)
3155                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3156     __ push(saved_regs, sp);
3157     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3158     __ add(in, in, offset);
3159     __ add(out, out, offset);
3160 
3161     // Keys should already be loaded into the correct registers
3162 
3163     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3164     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3165 
3166     // AES/CTR loop
3167     {
3168       Label L_CTR_loop;
3169       __ BIND(L_CTR_loop);
3170 
3171       // Set up the counters
3172       __ movi(v8, __ T4S, 0);
3173       __ movi(v9, __ T4S, 1);
3174       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3175 
3176       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3177         __ rev32(f, __ T16B, v16);
3178         __ addv(v16, __ T4S, v16, v8);
3179       }
3180 
3181       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3182 
3183       // Encrypt the counters
3184       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3185 
3186       if (bulk_width == 8) {
3187         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3188       }
3189 
3190       // XOR the encrypted counters with the inputs
3191       for (int i = 0; i < bulk_width; i++) {
3192         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3193       }
3194 
3195       // Write the encrypted data
3196       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3197       if (bulk_width == 8) {
3198         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3199       }
3200 
3201       __ subw(len, len, 16 * bulk_width);
3202       __ cbnzw(len, L_CTR_loop);
3203     }
3204 
3205     // Save the counter back where it goes
3206     __ rev32(v16, __ T16B, v16);
3207     __ st1(v16, __ T16B, counter);
3208 
3209     __ pop(saved_regs, sp);
3210 
3211     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3212     if (bulk_width == 8) {
3213       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3214     }
3215 
3216     __ andr(rscratch1, len, -16 * bulk_width);
3217     __ sub(len, len, rscratch1);
3218     __ add(offset, offset, rscratch1);
3219     __ mov(used, 16);
3220     __ strw(used, Address(used_ptr));
3221     __ b(large_block_return);
3222 
3223     return start;
3224   }
3225 
3226   // Arguments:
3227   //
3228   // Inputs:
3229   //   c_rarg0   - byte[]  source+offset
3230   //   c_rarg1   - int[]   MD5.state
3231   //   c_rarg2   - int     offset
3232   //   c_rarg3   - int     limit
3233   //
3234   address generate_md5_implCompress(bool multi_block, const char *name) {
3235     __ align(CodeEntryAlignment);
3236     StubCodeMark mark(this, "StubRoutines", name);
3237     address start = __ pc();
3238 
3239     Register buf       = c_rarg0;
3240     Register state     = c_rarg1;
3241     Register ofs       = c_rarg2;
3242     Register limit     = c_rarg3;
3243     Register a         = r4;
3244     Register b         = r5;
3245     Register c         = r6;
3246     Register d         = r7;
3247     Register rscratch3 = r10;
3248     Register rscratch4 = r11;
3249 
3250     Label keys;
3251     Label md5_loop;
3252 
3253     __ BIND(md5_loop);
3254 
3255     // Save hash values for addition after rounds
3256     __ ldrw(a, Address(state,  0));
3257     __ ldrw(b, Address(state,  4));
3258     __ ldrw(c, Address(state,  8));
3259     __ ldrw(d, Address(state, 12));
3260 
3261 #define FF(r1, r2, r3, r4, k, s, t)              \
3262     __ eorw(rscratch3, r3, r4);                  \
3263     __ movw(rscratch2, t);                       \
3264     __ andw(rscratch3, rscratch3, r2);           \
3265     __ addw(rscratch4, r1, rscratch2);           \
3266     __ ldrw(rscratch1, Address(buf, k*4));       \
3267     __ eorw(rscratch3, rscratch3, r4);           \
3268     __ addw(rscratch3, rscratch3, rscratch1);    \
3269     __ addw(rscratch3, rscratch3, rscratch4);    \
3270     __ rorw(rscratch2, rscratch3, 32 - s);       \
3271     __ addw(r1, rscratch2, r2);
3272 
3273 #define GG(r1, r2, r3, r4, k, s, t)              \
3274     __ eorw(rscratch2, r2, r3);                  \
3275     __ ldrw(rscratch1, Address(buf, k*4));       \
3276     __ andw(rscratch3, rscratch2, r4);           \
3277     __ movw(rscratch2, t);                       \
3278     __ eorw(rscratch3, rscratch3, r3);           \
3279     __ addw(rscratch4, r1, rscratch2);           \
3280     __ addw(rscratch3, rscratch3, rscratch1);    \
3281     __ addw(rscratch3, rscratch3, rscratch4);    \
3282     __ rorw(rscratch2, rscratch3, 32 - s);       \
3283     __ addw(r1, rscratch2, r2);
3284 
3285 #define HH(r1, r2, r3, r4, k, s, t)              \
3286     __ eorw(rscratch3, r3, r4);                  \
3287     __ movw(rscratch2, t);                       \
3288     __ addw(rscratch4, r1, rscratch2);           \
3289     __ ldrw(rscratch1, Address(buf, k*4));       \
3290     __ eorw(rscratch3, rscratch3, r2);           \
3291     __ addw(rscratch3, rscratch3, rscratch1);    \
3292     __ addw(rscratch3, rscratch3, rscratch4);    \
3293     __ rorw(rscratch2, rscratch3, 32 - s);       \
3294     __ addw(r1, rscratch2, r2);
3295 
3296 #define II(r1, r2, r3, r4, k, s, t)              \
3297     __ movw(rscratch3, t);                       \
3298     __ ornw(rscratch2, r2, r4);                  \
3299     __ addw(rscratch4, r1, rscratch3);           \
3300     __ ldrw(rscratch1, Address(buf, k*4));       \
3301     __ eorw(rscratch3, rscratch2, r3);           \
3302     __ addw(rscratch3, rscratch3, rscratch1);    \
3303     __ addw(rscratch3, rscratch3, rscratch4);    \
3304     __ rorw(rscratch2, rscratch3, 32 - s);       \
3305     __ addw(r1, rscratch2, r2);
3306 
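    // Each macro above implements one step of the standard MD5 transform,
    //
    //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
    //
    // where f is, respectively,
    //   FF: F(x, y, z) = (x & y) | (~x & z)
    //   GG: G(x, y, z) = (x & z) | (y & ~z)
    //   HH: H(x, y, z) = x ^ y ^ z
    //   II: I(x, y, z) = y ^ (x | ~z)
    // computed with the usual XOR/AND reductions of those expressions.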
3307     // Round 1
3308     FF(a, b, c, d,  0,  7, 0xd76aa478)
3309     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3310     FF(c, d, a, b,  2, 17, 0x242070db)
3311     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3312     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3313     FF(d, a, b, c,  5, 12, 0x4787c62a)
3314     FF(c, d, a, b,  6, 17, 0xa8304613)
3315     FF(b, c, d, a,  7, 22, 0xfd469501)
3316     FF(a, b, c, d,  8,  7, 0x698098d8)
3317     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3318     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3319     FF(b, c, d, a, 11, 22, 0x895cd7be)
3320     FF(a, b, c, d, 12,  7, 0x6b901122)
3321     FF(d, a, b, c, 13, 12, 0xfd987193)
3322     FF(c, d, a, b, 14, 17, 0xa679438e)
3323     FF(b, c, d, a, 15, 22, 0x49b40821)
3324 
3325     // Round 2
3326     GG(a, b, c, d,  1,  5, 0xf61e2562)
3327     GG(d, a, b, c,  6,  9, 0xc040b340)
3328     GG(c, d, a, b, 11, 14, 0x265e5a51)
3329     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3330     GG(a, b, c, d,  5,  5, 0xd62f105d)
3331     GG(d, a, b, c, 10,  9, 0x02441453)
3332     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3333     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3334     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3335     GG(d, a, b, c, 14,  9, 0xc33707d6)
3336     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3337     GG(b, c, d, a,  8, 20, 0x455a14ed)
3338     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3339     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3340     GG(c, d, a, b,  7, 14, 0x676f02d9)
3341     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3342 
3343     // Round 3
3344     HH(a, b, c, d,  5,  4, 0xfffa3942)
3345     HH(d, a, b, c,  8, 11, 0x8771f681)
3346     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3347     HH(b, c, d, a, 14, 23, 0xfde5380c)
3348     HH(a, b, c, d,  1,  4, 0xa4beea44)
3349     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3350     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3351     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3352     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3353     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3354     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3355     HH(b, c, d, a,  6, 23, 0x04881d05)
3356     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3357     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3358     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3359     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3360 
3361     // Round 4
3362     II(a, b, c, d,  0,  6, 0xf4292244)
3363     II(d, a, b, c,  7, 10, 0x432aff97)
3364     II(c, d, a, b, 14, 15, 0xab9423a7)
3365     II(b, c, d, a,  5, 21, 0xfc93a039)
3366     II(a, b, c, d, 12,  6, 0x655b59c3)
3367     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3368     II(c, d, a, b, 10, 15, 0xffeff47d)
3369     II(b, c, d, a,  1, 21, 0x85845dd1)
3370     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3371     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3372     II(c, d, a, b,  6, 15, 0xa3014314)
3373     II(b, c, d, a, 13, 21, 0x4e0811a1)
3374     II(a, b, c, d,  4,  6, 0xf7537e82)
3375     II(d, a, b, c, 11, 10, 0xbd3af235)
3376     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3377     II(b, c, d, a,  9, 21, 0xeb86d391)
3378 
3379 #undef FF
3380 #undef GG
3381 #undef HH
3382 #undef II
3383 
3384     // write hash values back in the correct order
3385     __ ldrw(rscratch1, Address(state,  0));
3386     __ addw(rscratch1, rscratch1, a);
3387     __ strw(rscratch1, Address(state,  0));
3388 
3389     __ ldrw(rscratch2, Address(state,  4));
3390     __ addw(rscratch2, rscratch2, b);
3391     __ strw(rscratch2, Address(state,  4));
3392 
3393     __ ldrw(rscratch3, Address(state,  8));
3394     __ addw(rscratch3, rscratch3, c);
3395     __ strw(rscratch3, Address(state,  8));
3396 
3397     __ ldrw(rscratch4, Address(state, 12));
3398     __ addw(rscratch4, rscratch4, d);
3399     __ strw(rscratch4, Address(state, 12));
3400 
3401     if (multi_block) {
3402       __ add(buf, buf, 64);
3403       __ add(ofs, ofs, 64);
3404       __ cmp(ofs, limit);
3405       __ br(Assembler::LE, md5_loop);
3406       __ mov(c_rarg0, ofs); // return ofs
3407     }
3408 
3409     __ ret(lr);
3410 
3411     return start;
3412   }
3413 
3414   // Arguments:
3415   //
3416   // Inputs:
3417   //   c_rarg0   - byte[]  source+offset
3418   //   c_rarg1   - int[]   SHA.state
3419   //   c_rarg2   - int     offset
3420   //   c_rarg3   - int     limit
3421   //
3422   address generate_sha1_implCompress(bool multi_block, const char *name) {
3423     __ align(CodeEntryAlignment);
3424     StubCodeMark mark(this, "StubRoutines", name);
3425     address start = __ pc();
3426 
3427     Register buf   = c_rarg0;
3428     Register state = c_rarg1;
3429     Register ofs   = c_rarg2;
3430     Register limit = c_rarg3;
3431 
3432     Label keys;
3433     Label sha1_loop;
3434 
3435     // load the keys into v0..v3
3436     __ adr(rscratch1, keys);
3437     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3438     // load the 5-word (160-bit) state into v6, v7
3439     __ ldrq(v6, Address(state, 0));
3440     __ ldrs(v7, Address(state, 16));
3441 
3442 
3443     __ BIND(sha1_loop);
3444     // load 64 bytes of data into v16..v19
3445     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3446     __ rev32(v16, __ T16B, v16);
3447     __ rev32(v17, __ T16B, v17);
3448     __ rev32(v18, __ T16B, v18);
3449     __ rev32(v19, __ T16B, v19);
3450 
3451     // do the sha1
3452     __ addv(v4, __ T4S, v16, v0);
3453     __ orr(v20, __ T16B, v6, v6);
3454 
3455     FloatRegister d0 = v16;
3456     FloatRegister d1 = v17;
3457     FloatRegister d2 = v18;
3458     FloatRegister d3 = v19;
3459 
3460     for (int round = 0; round < 20; round++) {
3461       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3462       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3463       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3464       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3465       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3466 
3467       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3468       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3469       __ sha1h(tmp2, __ T4S, v20);
3470       if (round < 5)
3471         __ sha1c(v20, __ T4S, tmp3, tmp4);
3472       else if (round < 10 || round >= 15)
3473         __ sha1p(v20, __ T4S, tmp3, tmp4);
3474       else
3475         __ sha1m(v20, __ T4S, tmp3, tmp4);
3476       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3477 
3478       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3479     }
3480 
3481     __ addv(v7, __ T2S, v7, v21);
3482     __ addv(v6, __ T4S, v6, v20);
3483 
3484     if (multi_block) {
3485       __ add(ofs, ofs, 64);
3486       __ cmp(ofs, limit);
3487       __ br(Assembler::LE, sha1_loop);
3488       __ mov(c_rarg0, ofs); // return ofs
3489     }
3490 
3491     __ strq(v6, Address(state, 0));
3492     __ strs(v7, Address(state, 16));
3493 
3494     __ ret(lr);
3495 
3496     __ bind(keys);
3497     __ emit_int32(0x5a827999);
3498     __ emit_int32(0x6ed9eba1);
3499     __ emit_int32(0x8f1bbcdc);
3500     __ emit_int32(0xca62c1d6);
3501 
3502     return start;
3503   }
3504 
3505 
3506   // Arguments:
3507   //
3508   // Inputs:
3509   //   c_rarg0   - byte[]  source+offset
3510   //   c_rarg1   - int[]   SHA.state
3511   //   c_rarg2   - int     offset
3512   //   c_rarg3   - int     limit
3513   //
3514   address generate_sha256_implCompress(bool multi_block, const char *name) {
3515     static const uint32_t round_consts[64] = {
3516       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3517       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3518       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3519       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3520       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3521       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3522       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3523       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3524       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3525       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3526       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3527       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3528       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3529       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3530       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3531       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3532     };
3533     __ align(CodeEntryAlignment);
3534     StubCodeMark mark(this, "StubRoutines", name);
3535     address start = __ pc();
3536 
3537     Register buf   = c_rarg0;
3538     Register state = c_rarg1;
3539     Register ofs   = c_rarg2;
3540     Register limit = c_rarg3;
3541 
3542     Label sha256_loop;
3543 
3544     __ stpd(v8, v9, __ pre(sp, -32));
3545     __ stpd(v10, v11, Address(sp, 16));
3546 
3547 // dga == v0
3548 // dgb == v1
3549 // dg0 == v2
3550 // dg1 == v3
3551 // dg2 == v4
3552 // t0 == v6
3553 // t1 == v7
3554 
3555     // load the 64 round constants into v16..v31 (four per register)
3556     __ lea(rscratch1, ExternalAddress((address)round_consts));
3557     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3558     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3559     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3560     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3561 
3562     // load the 8-word (256-bit) state
3563     __ ldpq(v0, v1, state);
3564 
3565     __ BIND(sha256_loop);
3566     // load 64 bytes of data into v8..v11
3567     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3568     __ rev32(v8, __ T16B, v8);
3569     __ rev32(v9, __ T16B, v9);
3570     __ rev32(v10, __ T16B, v10);
3571     __ rev32(v11, __ T16B, v11);
3572 
3573     __ addv(v6, __ T4S, v8, v16);
3574     __ orr(v2, __ T16B, v0, v0);
3575     __ orr(v3, __ T16B, v1, v1);
3576 
3577     FloatRegister d0 = v8;
3578     FloatRegister d1 = v9;
3579     FloatRegister d2 = v10;
3580     FloatRegister d3 = v11;
3581 
3582 
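    // Each loop iteration below performs four of the 64 SHA-256 rounds:
    // sha256h/sha256h2 update the two halves of the working state,
    // sha256su0/sha256su1 extend the message schedule, and the addv
    // pre-adds the round constants for the next iteration.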
3583     for (int round = 0; round < 16; round++) {
3584       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3585       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3586       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3587       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3588 
3589       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3590        __ orr(v4, __ T16B, v2, v2);
3591       if (round < 15)
3592         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3593       __ sha256h(v2, __ T4S, v3, tmp2);
3594       __ sha256h2(v3, __ T4S, v4, tmp2);
3595       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3596 
3597       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3598     }
3599 
3600     __ addv(v0, __ T4S, v0, v2);
3601     __ addv(v1, __ T4S, v1, v3);
3602 
3603     if (multi_block) {
3604       __ add(ofs, ofs, 64);
3605       __ cmp(ofs, limit);
3606       __ br(Assembler::LE, sha256_loop);
3607       __ mov(c_rarg0, ofs); // return ofs
3608     }
3609 
3610     __ ldpd(v10, v11, Address(sp, 16));
3611     __ ldpd(v8, v9, __ post(sp, 32));
3612 
3613     __ stpq(v0, v1, state);
3614 
3615     __ ret(lr);
3616 
3617     return start;
3618   }
3619 
3620   // Arguments:
3621   //
3622   // Inputs:
3623   //   c_rarg0   - byte[]  source+offset
3624   //   c_rarg1   - int[]   SHA.state
3625   //   c_rarg2   - int     offset
3626   //   c_rarg3   - int     limit
3627   //
3628   address generate_sha512_implCompress(bool multi_block, const char *name) {
3629     static const uint64_t round_consts[80] = {
3630       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3631       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3632       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3633       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3634       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3635       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3636       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3637       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3638       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3639       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3640       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3641       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3642       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3643       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3644       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3645       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3646       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3647       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3648       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3649       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3650       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3651       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3652       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3653       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3654       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3655       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3656       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3657     };
3658 
3659     // Double rounds for sha512.
3660     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3661       if (dr < 36)                                                                   \
3662         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3663       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3664       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3665       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3666       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3667       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3668       if (dr < 32) {                                                                 \
3669         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3670         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3671       }                                                                              \
3672       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3673       if (dr < 32)                                                                   \
3674         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3675       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3676       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3677 
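    // Each sha512_dround above performs two of the 80 SHA-512 rounds via
    // sha512h/sha512h2.  The message schedule is extended with
    // sha512su0/sha512su1 only while new schedule words are still needed
    // (dr < 32), and further round-constant pairs are loaded while any
    // remain (dr < 36).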
3678     __ align(CodeEntryAlignment);
3679     StubCodeMark mark(this, "StubRoutines", name);
3680     address start = __ pc();
3681 
3682     Register buf   = c_rarg0;
3683     Register state = c_rarg1;
3684     Register ofs   = c_rarg2;
3685     Register limit = c_rarg3;
3686 
3687     __ stpd(v8, v9, __ pre(sp, -64));
3688     __ stpd(v10, v11, Address(sp, 16));
3689     __ stpd(v12, v13, Address(sp, 32));
3690     __ stpd(v14, v15, Address(sp, 48));
3691 
3692     Label sha512_loop;
3693 
3694     // load state
3695     __ ld1(v8, v9, v10, v11, __ T2D, state);
3696 
3697     // load first 4 round constants
3698     __ lea(rscratch1, ExternalAddress((address)round_consts));
3699     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3700 
3701     __ BIND(sha512_loop);
3702     // load 128B of data into v12..v19
3703     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3704     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3705     __ rev64(v12, __ T16B, v12);
3706     __ rev64(v13, __ T16B, v13);
3707     __ rev64(v14, __ T16B, v14);
3708     __ rev64(v15, __ T16B, v15);
3709     __ rev64(v16, __ T16B, v16);
3710     __ rev64(v17, __ T16B, v17);
3711     __ rev64(v18, __ T16B, v18);
3712     __ rev64(v19, __ T16B, v19);
3713 
3714     __ mov(rscratch2, rscratch1);
3715 
3716     __ mov(v0, __ T16B, v8);
3717     __ mov(v1, __ T16B, v9);
3718     __ mov(v2, __ T16B, v10);
3719     __ mov(v3, __ T16B, v11);
3720 
3721     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3722     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3723     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3724     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3725     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3726     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3727     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3728     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3729     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3730     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3731     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3732     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3733     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3734     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3735     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3736     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3737     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3738     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3739     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3740     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3741     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3742     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3743     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3744     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3745     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3746     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3747     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3748     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3749     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3750     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3751     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3752     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3753     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3754     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3755     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3756     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3757     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3758     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3759     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3760     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3761 
3762     __ addv(v8, __ T2D, v8, v0);
3763     __ addv(v9, __ T2D, v9, v1);
3764     __ addv(v10, __ T2D, v10, v2);
3765     __ addv(v11, __ T2D, v11, v3);
3766 
3767     if (multi_block) {
3768       __ add(ofs, ofs, 128);
3769       __ cmp(ofs, limit);
3770       __ br(Assembler::LE, sha512_loop);
3771       __ mov(c_rarg0, ofs); // return ofs
3772     }
3773 
3774     __ st1(v8, v9, v10, v11, __ T2D, state);
3775 
3776     __ ldpd(v14, v15, Address(sp, 48));
3777     __ ldpd(v12, v13, Address(sp, 32));
3778     __ ldpd(v10, v11, Address(sp, 16));
3779     __ ldpd(v8, v9, __ post(sp, 64));
3780 
3781     __ ret(lr);
3782 
3783     return start;
3784   }
3785 
3786   // Arguments:
3787   //
3788   // Inputs:
3789   //   c_rarg0   - byte[]  source+offset
3790   //   c_rarg1   - byte[]  SHA3.state
3791   //   c_rarg2   - int     digest_length
3792   //   c_rarg3   - int     offset
3793   //   c_rarg4   - int     limit
3794   //
3795   address generate_sha3_implCompress(bool multi_block, const char *name) {
3796     static const uint64_t round_consts[24] = {
3797       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3798       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3799       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3800       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3801       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3802       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3803       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3804       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3805     };
3806 
3807     __ align(CodeEntryAlignment);
3808     StubCodeMark mark(this, "StubRoutines", name);
3809     address start = __ pc();
3810 
3811     Register buf           = c_rarg0;
3812     Register state         = c_rarg1;
3813     Register digest_length = c_rarg2;
3814     Register ofs           = c_rarg3;
3815     Register limit         = c_rarg4;
3816 
3817     Label sha3_loop, rounds24_loop;
3818     Label sha3_512, sha3_384_or_224, sha3_256;
3819 
3820     __ stpd(v8, v9, __ pre(sp, -64));
3821     __ stpd(v10, v11, Address(sp, 16));
3822     __ stpd(v12, v13, Address(sp, 32));
3823     __ stpd(v14, v15, Address(sp, 48));
3824 
3825     // load state
3826     __ add(rscratch1, state, 32);
3827     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3828     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3829     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3830     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3831     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3832     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3833     __ ld1(v24, __ T1D, rscratch1);
3834 
3835     __ BIND(sha3_loop);
3836 
3837     // 24 keccak rounds
3838     __ movw(rscratch2, 24);
3839 
3840     // load round_constants base
3841     __ lea(rscratch1, ExternalAddress((address) round_consts));
3842 
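    // Absorb one block of input into the state.  The rate (block size) is
    // 200 - 2 * digest_length bytes: 72 bytes (9 lanes) for SHA3-512,
    // 104 (13) for SHA3-384, 136 (17) for SHA3-256 and 144 (18) for
    // SHA3-224, which is what the digest_length bit tests below select.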
3843     // load input
3844     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3845     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3846     __ eor(v0, __ T8B, v0, v25);
3847     __ eor(v1, __ T8B, v1, v26);
3848     __ eor(v2, __ T8B, v2, v27);
3849     __ eor(v3, __ T8B, v3, v28);
3850     __ eor(v4, __ T8B, v4, v29);
3851     __ eor(v5, __ T8B, v5, v30);
3852     __ eor(v6, __ T8B, v6, v31);
3853 
3854     // digest_length == 64, SHA3-512
3855     __ tbnz(digest_length, 6, sha3_512);
3856 
3857     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3858     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3859     __ eor(v7, __ T8B, v7, v25);
3860     __ eor(v8, __ T8B, v8, v26);
3861     __ eor(v9, __ T8B, v9, v27);
3862     __ eor(v10, __ T8B, v10, v28);
3863     __ eor(v11, __ T8B, v11, v29);
3864     __ eor(v12, __ T8B, v12, v30);
3865 
3866     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3867     __ tbnz(digest_length, 4, sha3_384_or_224);
3868 
3869     // SHA3-256
3870     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3871     __ eor(v13, __ T8B, v13, v25);
3872     __ eor(v14, __ T8B, v14, v26);
3873     __ eor(v15, __ T8B, v15, v27);
3874     __ eor(v16, __ T8B, v16, v28);
3875     __ b(rounds24_loop);
3876 
3877     __ BIND(sha3_384_or_224);
3878     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3879 
3880     // SHA3-224
3881     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3882     __ ld1(v29, __ T8B, __ post(buf, 8));
3883     __ eor(v13, __ T8B, v13, v25);
3884     __ eor(v14, __ T8B, v14, v26);
3885     __ eor(v15, __ T8B, v15, v27);
3886     __ eor(v16, __ T8B, v16, v28);
3887     __ eor(v17, __ T8B, v17, v29);
3888     __ b(rounds24_loop);
3889 
3890     __ BIND(sha3_512);
3891     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3892     __ eor(v7, __ T8B, v7, v25);
3893     __ eor(v8, __ T8B, v8, v26);
3894 
3895     __ BIND(rounds24_loop);
3896     __ subw(rscratch2, rscratch2, 1);
3897 
3898     __ eor3(v29, __ T16B, v4, v9, v14);
3899     __ eor3(v26, __ T16B, v1, v6, v11);
3900     __ eor3(v28, __ T16B, v3, v8, v13);
3901     __ eor3(v25, __ T16B, v0, v5, v10);
3902     __ eor3(v27, __ T16B, v2, v7, v12);
3903     __ eor3(v29, __ T16B, v29, v19, v24);
3904     __ eor3(v26, __ T16B, v26, v16, v21);
3905     __ eor3(v28, __ T16B, v28, v18, v23);
3906     __ eor3(v25, __ T16B, v25, v15, v20);
3907     __ eor3(v27, __ T16B, v27, v17, v22);
3908 
3909     __ rax1(v30, __ T2D, v29, v26);
3910     __ rax1(v26, __ T2D, v26, v28);
3911     __ rax1(v28, __ T2D, v28, v25);
3912     __ rax1(v25, __ T2D, v25, v27);
3913     __ rax1(v27, __ T2D, v27, v29);
3914 
3915     __ eor(v0, __ T16B, v0, v30);
3916     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3917     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3918     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3919     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3920     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3921     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3922     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3923     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3924     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3925     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3926     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3927     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3928     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3929     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3930     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3931     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3932     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3933     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3934     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3935     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3936     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3937     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3938     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3939     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3940 
3941     __ bcax(v20, __ T16B, v31, v22, v8);
3942     __ bcax(v21, __ T16B, v8,  v23, v22);
3943     __ bcax(v22, __ T16B, v22, v24, v23);
3944     __ bcax(v23, __ T16B, v23, v31, v24);
3945     __ bcax(v24, __ T16B, v24, v8,  v31);
3946 
3947     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3948 
3949     __ bcax(v17, __ T16B, v25, v19, v3);
3950     __ bcax(v18, __ T16B, v3,  v15, v19);
3951     __ bcax(v19, __ T16B, v19, v16, v15);
3952     __ bcax(v15, __ T16B, v15, v25, v16);
3953     __ bcax(v16, __ T16B, v16, v3,  v25);
3954 
3955     __ bcax(v10, __ T16B, v29, v12, v26);
3956     __ bcax(v11, __ T16B, v26, v13, v12);
3957     __ bcax(v12, __ T16B, v12, v14, v13);
3958     __ bcax(v13, __ T16B, v13, v29, v14);
3959     __ bcax(v14, __ T16B, v14, v26, v29);
3960 
3961     __ bcax(v7, __ T16B, v30, v9,  v4);
3962     __ bcax(v8, __ T16B, v4,  v5,  v9);
3963     __ bcax(v9, __ T16B, v9,  v6,  v5);
3964     __ bcax(v5, __ T16B, v5,  v30, v6);
3965     __ bcax(v6, __ T16B, v6,  v4,  v30);
3966 
3967     __ bcax(v3, __ T16B, v27, v0,  v28);
3968     __ bcax(v4, __ T16B, v28, v1,  v0);
3969     __ bcax(v0, __ T16B, v0,  v2,  v1);
3970     __ bcax(v1, __ T16B, v1,  v27, v2);
3971     __ bcax(v2, __ T16B, v2,  v28, v27);
3972 
3973     __ eor(v0, __ T16B, v0, v31);
3974 
3975     __ cbnzw(rscratch2, rounds24_loop);
3976 
3977     if (multi_block) {
3978       // block_size =  200 - 2 * digest_length, ofs += block_size
3979       __ add(ofs, ofs, 200);
3980       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3981 
3982       __ cmp(ofs, limit);
3983       __ br(Assembler::LE, sha3_loop);
3984       __ mov(c_rarg0, ofs); // return ofs
3985     }
3986 
3987     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3988     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3989     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3990     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3991     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3992     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3993     __ st1(v24, __ T1D, state);
3994 
3995     __ ldpd(v14, v15, Address(sp, 48));
3996     __ ldpd(v12, v13, Address(sp, 32));
3997     __ ldpd(v10, v11, Address(sp, 16));
3998     __ ldpd(v8, v9, __ post(sp, 64));
3999 
4000     __ ret(lr);
4001 
4002     return start;
4003   }
4004 
4005   // Safefetch stubs.
4006   void generate_safefetch(const char* name, int size, address* entry,
4007                           address* fault_pc, address* continuation_pc) {
4008     // safefetch signatures:
4009     //   int      SafeFetch32(int*      adr, int      errValue);
4010     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
4011     //
4012     // arguments:
4013     //   c_rarg0 = adr
4014     //   c_rarg1 = errValue
4015     //
4016     // result:
4017     //   r0       = *adr or errValue
4018 
4019     StubCodeMark mark(this, "StubRoutines", name);
4020 
4021     // Entry point (pc).
4022     *entry = __ pc();
4023 
4024     // Load *adr into c_rarg1, may fault.
4025     *fault_pc = __ pc();
4026     switch (size) {
4027       case 4:
4028         // int32_t
4029         __ ldrw(c_rarg1, Address(c_rarg0, 0));
4030         break;
4031       case 8:
4032         // int64_t
4033         __ ldr(c_rarg1, Address(c_rarg0, 0));
4034         break;
4035       default:
4036         ShouldNotReachHere();
4037     }
4038 
4039     // return errValue or *adr
4040     *continuation_pc = __ pc();
4041     __ mov(r0, c_rarg1);
4042     __ ret(lr);
4043   }
4044 
4045   /**
4046    *  Arguments:
4047    *
4048    * Inputs:
4049    *   c_rarg0   - int crc
4050    *   c_rarg1   - byte* buf
4051    *   c_rarg2   - int length
4052    *
4053    * Output:
4054    *       r0   - int crc result
4055    */
4056   address generate_updateBytesCRC32() {
4057     assert(UseCRC32Intrinsics, "what are we doing here?");
4058 
4059     __ align(CodeEntryAlignment);
4060     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4061 
4062     address start = __ pc();
4063 
4064     const Register crc   = c_rarg0;  // crc
4065     const Register buf   = c_rarg1;  // source java byte array address
4066     const Register len   = c_rarg2;  // length
4067     const Register table0 = c_rarg3; // crc_table address
4068     const Register table1 = c_rarg4;
4069     const Register table2 = c_rarg5;
4070     const Register table3 = c_rarg6;
4071     const Register tmp3 = c_rarg7;
4072 
4073     BLOCK_COMMENT("Entry:");
4074     __ enter(); // required for proper stackwalking of RuntimeStub frame
4075 
4076     __ kernel_crc32(crc, buf, len,
4077               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4078 
4079     __ leave(); // required for proper stackwalking of RuntimeStub frame
4080     __ ret(lr);
4081 
4082     return start;
4083   }
4084 
4085   /**
4086    *  Arguments:
4087    *
4088    * Inputs:
4089    *   c_rarg0   - int crc
4090    *   c_rarg1   - byte* buf
4091    *   c_rarg2   - int length
4092    *   c_rarg3   - int* table
4093    *
4094    * Output:
4095    *       r0   - int crc result
4096    */
4097   address generate_updateBytesCRC32C() {
4098     assert(UseCRC32CIntrinsics, "what are we doing here?");
4099 
4100     __ align(CodeEntryAlignment);
4101     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4102 
4103     address start = __ pc();
4104 
4105     const Register crc   = c_rarg0;  // crc
4106     const Register buf   = c_rarg1;  // source java byte array address
4107     const Register len   = c_rarg2;  // length
4108     const Register table0 = c_rarg3; // crc_table address
4109     const Register table1 = c_rarg4;
4110     const Register table2 = c_rarg5;
4111     const Register table3 = c_rarg6;
4112     const Register tmp3 = c_rarg7;
4113 
4114     BLOCK_COMMENT("Entry:");
4115     __ enter(); // required for proper stackwalking of RuntimeStub frame
4116 
4117     __ kernel_crc32c(crc, buf, len,
4118               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4119 
4120     __ leave(); // required for proper stackwalking of RuntimeStub frame
4121     __ ret(lr);
4122 
4123     return start;
4124   }
4125 
4126   /**
4127    *  Arguments:
4128    *
4129    *  Inputs:
4130    *   c_rarg0   - int   adler
4131    *   c_rarg1   - byte* buff
4132    *   c_rarg2   - int   len
4133    *
4134    * Output:
4135    *   c_rarg0   - int adler result
4136    */
4137   address generate_updateBytesAdler32() {
4138     __ align(CodeEntryAlignment);
4139     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4140     address start = __ pc();
4141 
4142     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4143 
4144     // Aliases
4145     Register adler  = c_rarg0;
4146     Register s1     = c_rarg0;
4147     Register s2     = c_rarg3;
4148     Register buff   = c_rarg1;
4149     Register len    = c_rarg2;
4150     Register nmax  = r4;
4151     Register base  = r5;
4152     Register count = r6;
4153     Register temp0 = rscratch1;
4154     Register temp1 = rscratch2;
4155     FloatRegister vbytes = v0;
4156     FloatRegister vs1acc = v1;
4157     FloatRegister vs2acc = v2;
4158     FloatRegister vtable = v3;
4159 
4160     // Max number of bytes we can process before having to take the mod
4161     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
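    // For n = 5552: 255*5552*5553/2 + 5553*(BASE-1) = 4294690200 <= 4294967295,
    // while n = 5553 already exceeds 2^32-1.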
4162     uint64_t BASE = 0xfff1;
4163     uint64_t NMAX = 0x15B0;
4164 
4165     __ mov(base, BASE);
4166     __ mov(nmax, NMAX);
4167 
4168     // Load accumulation coefficients for the upper 16 bits
4169     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4170     __ ld1(vtable, __ T16B, Address(temp0));
4171 
4172     // s1 is initialized to the lower 16 bits of adler
4173     // s2 is initialized to the upper 16 bits of adler
4174     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4175     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4176 
4177     // The pipelined loop needs at least 16 bytes for one iteration.
4178     // The loop checks this itself, but for short inputs it is more effective to skip straight to the cleanup loop.
4179     __ cmp(len, (u1)16);
4180     __ br(Assembler::HS, L_nmax);
4181     __ cbz(len, L_combine);
4182 
4183     __ bind(L_simple_by1_loop);
4184     __ ldrb(temp0, Address(__ post(buff, 1)));
4185     __ add(s1, s1, temp0);
4186     __ add(s2, s2, s1);
4187     __ subs(len, len, 1);
4188     __ br(Assembler::HI, L_simple_by1_loop);
4189 
4190     // s1 = s1 % BASE
4191     __ subs(temp0, s1, base);
4192     __ csel(s1, temp0, s1, Assembler::HS);
4193 
4194     // s2 = s2 % BASE
4195     __ lsr(temp0, s2, 16);
4196     __ lsl(temp1, temp0, 4);
4197     __ sub(temp1, temp1, temp0);
4198     __ add(s2, temp1, s2, ext::uxth);
4199 
4200     __ subs(temp0, s2, base);
4201     __ csel(s2, temp0, s2, Assembler::HS);
4202 
4203     __ b(L_combine);
4204 
4205     __ bind(L_nmax);
4206     __ subs(len, len, nmax);
4207     __ sub(count, nmax, 16);
4208     __ br(Assembler::LO, L_by16);
4209 
4210     __ bind(L_nmax_loop);
4211 
4212     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4213                                       vbytes, vs1acc, vs2acc, vtable);
4214 
4215     __ subs(count, count, 16);
4216     __ br(Assembler::HS, L_nmax_loop);
4217 
4218     // s1 = s1 % BASE
4219     __ lsr(temp0, s1, 16);
4220     __ lsl(temp1, temp0, 4);
4221     __ sub(temp1, temp1, temp0);
4222     __ add(temp1, temp1, s1, ext::uxth);
4223 
4224     __ lsr(temp0, temp1, 16);
4225     __ lsl(s1, temp0, 4);
4226     __ sub(s1, s1, temp0);
4227     __ add(s1, s1, temp1, ext::uxth);
4228 
4229     __ subs(temp0, s1, base);
4230     __ csel(s1, temp0, s1, Assembler::HS);
4231 
4232     // s2 = s2 % BASE
4233     __ lsr(temp0, s2, 16);
4234     __ lsl(temp1, temp0, 4);
4235     __ sub(temp1, temp1, temp0);
4236     __ add(temp1, temp1, s2, ext::uxth);
4237 
4238     __ lsr(temp0, temp1, 16);
4239     __ lsl(s2, temp0, 4);
4240     __ sub(s2, s2, temp0);
4241     __ add(s2, s2, temp1, ext::uxth);
4242 
4243     __ subs(temp0, s2, base);
4244     __ csel(s2, temp0, s2, Assembler::HS);
4245 
4246     __ subs(len, len, nmax);
4247     __ sub(count, nmax, 16);
4248     __ br(Assembler::HS, L_nmax_loop);
4249 
4250     __ bind(L_by16);
4251     __ adds(len, len, count);
4252     __ br(Assembler::LO, L_by1);
4253 
4254     __ bind(L_by16_loop);
4255 
4256     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4257                                       vbytes, vs1acc, vs2acc, vtable);
4258 
4259     __ subs(len, len, 16);
4260     __ br(Assembler::HS, L_by16_loop);
4261 
4262     __ bind(L_by1);
4263     __ adds(len, len, 15);
4264     __ br(Assembler::LO, L_do_mod);
4265 
4266     __ bind(L_by1_loop);
4267     __ ldrb(temp0, Address(__ post(buff, 1)));
4268     __ add(s1, temp0, s1);
4269     __ add(s2, s2, s1);
4270     __ subs(len, len, 1);
4271     __ br(Assembler::HS, L_by1_loop);
4272 
4273     __ bind(L_do_mod);
4274     // s1 = s1 % BASE
4275     __ lsr(temp0, s1, 16);
4276     __ lsl(temp1, temp0, 4);
4277     __ sub(temp1, temp1, temp0);
4278     __ add(temp1, temp1, s1, ext::uxth);
4279 
4280     __ lsr(temp0, temp1, 16);
4281     __ lsl(s1, temp0, 4);
4282     __ sub(s1, s1, temp0);
4283     __ add(s1, s1, temp1, ext::uxth);
4284 
4285     __ subs(temp0, s1, base);
4286     __ csel(s1, temp0, s1, Assembler::HS);
4287 
4288     // s2 = s2 % BASE
4289     __ lsr(temp0, s2, 16);
4290     __ lsl(temp1, temp0, 4);
4291     __ sub(temp1, temp1, temp0);
4292     __ add(temp1, temp1, s2, ext::uxth);
4293 
4294     __ lsr(temp0, temp1, 16);
4295     __ lsl(s2, temp0, 4);
4296     __ sub(s2, s2, temp0);
4297     __ add(s2, s2, temp1, ext::uxth);
4298 
4299     __ subs(temp0, s2, base);
4300     __ csel(s2, temp0, s2, Assembler::HS);
4301 
4302     // Combine lower bits and higher bits
4303     __ bind(L_combine);
4304     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4305 
4306     __ ret(lr);
4307 
4308     return start;
4309   }
4310 
4311   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4312           Register temp0, Register temp1, FloatRegister vbytes,
4313           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4314     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4315     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4316     // In non-vectorized code, we update s1 and s2 as:
4317     //   s1 <- s1 + b1
4318     //   s2 <- s2 + s1
4319     //   s1 <- s1 + b2
4320     //   s2 <- s2 + s1
4321     //   ...
4322     //   s1 <- s1 + b16
4323     //   s2 <- s2 + s1
4324     // Putting the above assignments together, we have:
4325     //   s1_new = s1 + b1 + b2 + ... + b16
4326     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4327     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4328     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
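         //
         // For reference, a scalar sketch of what one 16-byte block contributes,
         // assuming vtable holds the constants (16, 15, ..., 1) (illustrative only,
         // not generated code):
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) {
         //     s1 += b[i];                // b[i] is the (i+1)-th loaded byte
         //     s2 += (16 - i) * b[i];     // dot product with (16, 15, ..., 1)
         //   }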
4329     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4330 
4331     // s2 = s2 + s1 * 16
4332     __ add(s2, s2, s1, Assembler::LSL, 4);
4333 
4334     // vs1acc = b1 + b2 + b3 + ... + b16
4335     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4336     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4337     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4338     __ uaddlv(vs1acc, __ T16B, vbytes);
4339     __ uaddlv(vs2acc, __ T8H, vs2acc);
4340 
4341     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4342     __ fmovd(temp0, vs1acc);
4343     __ fmovd(temp1, vs2acc);
4344     __ add(s1, s1, temp0);
4345     __ add(s2, s2, temp1);
4346   }
4347 
4348   /**
4349    *  Arguments:
4350    *
4351    *  Input:
4352    *    c_rarg0   - x address
4353    *    c_rarg1   - x length
4354    *    c_rarg2   - y address
4355    *    c_rarg3   - y length
4356    *    c_rarg4   - z address
4357    *    c_rarg5   - z length
4358    */
4359   address generate_multiplyToLen() {
4360     __ align(CodeEntryAlignment);
4361     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4362 
4363     address start = __ pc();
4364     const Register x     = r0;
4365     const Register xlen  = r1;
4366     const Register y     = r2;
4367     const Register ylen  = r3;
4368     const Register z     = r4;
4369     const Register zlen  = r5;
4370 
4371     const Register tmp1  = r10;
4372     const Register tmp2  = r11;
4373     const Register tmp3  = r12;
4374     const Register tmp4  = r13;
4375     const Register tmp5  = r14;
4376     const Register tmp6  = r15;
4377     const Register tmp7  = r16;
4378 
4379     BLOCK_COMMENT("Entry:");
4380     __ enter(); // required for proper stackwalking of RuntimeStub frame
4381     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4382     __ leave(); // required for proper stackwalking of RuntimeStub frame
4383     __ ret(lr);
4384 
4385     return start;
4386   }
4387 
4388   address generate_squareToLen() {
4389     // The squareToLen algorithm for sizes 1..127, described in the Java code,
4390     // is faster than multiply_to_len on some CPUs and slower on others, but
4391     // multiply_to_len shows slightly better results overall.
4392     __ align(CodeEntryAlignment);
4393     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4394     address start = __ pc();
4395 
4396     const Register x     = r0;
4397     const Register xlen  = r1;
4398     const Register z     = r2;
4399     const Register zlen  = r3;
4400     const Register y     = r4; // == x
4401     const Register ylen  = r5; // == xlen
4402 
4403     const Register tmp1  = r10;
4404     const Register tmp2  = r11;
4405     const Register tmp3  = r12;
4406     const Register tmp4  = r13;
4407     const Register tmp5  = r14;
4408     const Register tmp6  = r15;
4409     const Register tmp7  = r16;
4410 
4411     RegSet spilled_regs = RegSet::of(y, ylen);
4412     BLOCK_COMMENT("Entry:");
4413     __ enter();
4414     __ push(spilled_regs, sp);
4415     __ mov(y, x);
4416     __ mov(ylen, xlen);
4417     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4418     __ pop(spilled_regs, sp);
4419     __ leave();
4420     __ ret(lr);
4421     return start;
4422   }
4423 
4424   address generate_mulAdd() {
4425     __ align(CodeEntryAlignment);
4426     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4427 
4428     address start = __ pc();
4429 
4430     const Register out     = r0;
4431     const Register in      = r1;
4432     const Register offset  = r2;
4433     const Register len     = r3;
4434     const Register k       = r4;
4435 
4436     BLOCK_COMMENT("Entry:");
4437     __ enter();
4438     __ mul_add(out, in, offset, len, k);
4439     __ leave();
4440     __ ret(lr);
4441 
4442     return start;
4443   }
4444 
4445   // Arguments:
4446   //
4447   // Input:
4448   //   c_rarg0   - newArr address
4449   //   c_rarg1   - oldArr address
4450   //   c_rarg2   - newIdx
4451   //   c_rarg3   - shiftCount
4452   //   c_rarg4   - numIter
4453   //
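       // A scalar sketch of what this stub computes (illustrative only; newArr is
       // first advanced by newIdx words, and >> denotes a logical 32-bit shift):
       //   for (int i = numIter - 1; i >= 0; i--) {
       //     newArr[i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount));
       //   }
       // The SIMD loop below processes 4 words per iteration from the highest index
       // down, with smaller tails for the remaining words.
       //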
4454   address generate_bigIntegerRightShift() {
4455     __ align(CodeEntryAlignment);
4456     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4457     address start = __ pc();
4458 
4459     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4460 
4461     Register newArr        = c_rarg0;
4462     Register oldArr        = c_rarg1;
4463     Register newIdx        = c_rarg2;
4464     Register shiftCount    = c_rarg3;
4465     Register numIter       = c_rarg4;
4466     Register idx           = numIter;
4467 
4468     Register newArrCur     = rscratch1;
4469     Register shiftRevCount = rscratch2;
4470     Register oldArrCur     = r13;
4471     Register oldArrNext    = r14;
4472 
4473     FloatRegister oldElem0        = v0;
4474     FloatRegister oldElem1        = v1;
4475     FloatRegister newElem         = v2;
4476     FloatRegister shiftVCount     = v3;
4477     FloatRegister shiftVRevCount  = v4;
4478 
4479     __ cbz(idx, Exit);
4480 
4481     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4482 
4483     // left shift count
4484     __ movw(shiftRevCount, 32);
4485     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4486 
4487     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4488     __ cmp(numIter, (u1)4);
4489     __ br(Assembler::LT, ShiftThree);
4490 
4491     __ dup(shiftVCount,    __ T4S, shiftCount);
4492     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4493     __ negr(shiftVCount,   __ T4S, shiftVCount);
4494 
4495     __ BIND(ShiftSIMDLoop);
4496 
4497     // Calculate the load addresses
4498     __ sub(idx, idx, 4);
4499     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4500     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4501     __ add(oldArrCur,  oldArrNext, 4);
4502 
4503     // Load 4 words and process
4504     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4505     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4506     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4507     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4508     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4509     __ st1(newElem,   __ T4S,  Address(newArrCur));
4510 
4511     __ cmp(idx, (u1)4);
4512     __ br(Assembler::LT, ShiftTwoLoop);
4513     __ b(ShiftSIMDLoop);
4514 
4515     __ BIND(ShiftTwoLoop);
4516     __ cbz(idx, Exit);
4517     __ cmp(idx, (u1)1);
4518     __ br(Assembler::EQ, ShiftOne);
4519 
4520     // Calculate the load addresses
4521     __ sub(idx, idx, 2);
4522     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4523     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4524     __ add(oldArrCur,  oldArrNext, 4);
4525 
4526     // Load 2 words and process
4527     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4528     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4529     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4530     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4531     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4532     __ st1(newElem,   __ T2S, Address(newArrCur));
4533     __ b(ShiftTwoLoop);
4534 
4535     __ BIND(ShiftThree);
4536     __ tbz(idx, 1, ShiftOne);
4537     __ tbz(idx, 0, ShiftTwo);
4538     __ ldrw(r10,  Address(oldArr, 12));
4539     __ ldrw(r11,  Address(oldArr, 8));
4540     __ lsrvw(r10, r10, shiftCount);
4541     __ lslvw(r11, r11, shiftRevCount);
4542     __ orrw(r12,  r10, r11);
4543     __ strw(r12,  Address(newArr, 8));
4544 
4545     __ BIND(ShiftTwo);
4546     __ ldrw(r10,  Address(oldArr, 8));
4547     __ ldrw(r11,  Address(oldArr, 4));
4548     __ lsrvw(r10, r10, shiftCount);
4549     __ lslvw(r11, r11, shiftRevCount);
4550     __ orrw(r12,  r10, r11);
4551     __ strw(r12,  Address(newArr, 4));
4552 
4553     __ BIND(ShiftOne);
4554     __ ldrw(r10,  Address(oldArr, 4));
4555     __ ldrw(r11,  Address(oldArr));
4556     __ lsrvw(r10, r10, shiftCount);
4557     __ lslvw(r11, r11, shiftRevCount);
4558     __ orrw(r12,  r10, r11);
4559     __ strw(r12,  Address(newArr));
4560 
4561     __ BIND(Exit);
4562     __ ret(lr);
4563 
4564     return start;
4565   }
4566 
4567   // Arguments:
4568   //
4569   // Input:
4570   //   c_rarg0   - newArr address
4571   //   c_rarg1   - oldArr address
4572   //   c_rarg2   - newIdx
4573   //   c_rarg3   - shiftCount
4574   //   c_rarg4   - numIter
4575   //
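       // A scalar sketch of what this stub computes (illustrative only; newArr is
       // first advanced by newIdx words, and >> denotes a logical 32-bit shift):
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
       //   }
       // processed from the lowest index up, 4 words per SIMD iteration with smaller tails.
       //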
4576   address generate_bigIntegerLeftShift() {
4577     __ align(CodeEntryAlignment);
4578     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4579     address start = __ pc();
4580 
4581     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4582 
4583     Register newArr        = c_rarg0;
4584     Register oldArr        = c_rarg1;
4585     Register newIdx        = c_rarg2;
4586     Register shiftCount    = c_rarg3;
4587     Register numIter       = c_rarg4;
4588 
4589     Register shiftRevCount = rscratch1;
4590     Register oldArrNext    = rscratch2;
4591 
4592     FloatRegister oldElem0        = v0;
4593     FloatRegister oldElem1        = v1;
4594     FloatRegister newElem         = v2;
4595     FloatRegister shiftVCount     = v3;
4596     FloatRegister shiftVRevCount  = v4;
4597 
4598     __ cbz(numIter, Exit);
4599 
4600     __ add(oldArrNext, oldArr, 4);
4601     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4602 
4603     // right shift count
4604     __ movw(shiftRevCount, 32);
4605     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4606 
4607     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4608     __ cmp(numIter, (u1)4);
4609     __ br(Assembler::LT, ShiftThree);
4610 
4611     __ dup(shiftVCount,     __ T4S, shiftCount);
4612     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4613     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4614 
4615     __ BIND(ShiftSIMDLoop);
4616 
4617     // load 4 words and process
4618     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4619     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4620     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4621     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4622     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4623     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4624     __ sub(numIter,   numIter, 4);
4625 
4626     __ cmp(numIter, (u1)4);
4627     __ br(Assembler::LT, ShiftTwoLoop);
4628     __ b(ShiftSIMDLoop);
4629 
4630     __ BIND(ShiftTwoLoop);
4631     __ cbz(numIter, Exit);
4632     __ cmp(numIter, (u1)1);
4633     __ br(Assembler::EQ, ShiftOne);
4634 
4635     // load 2 words and process
4636     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4637     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4638     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4639     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4640     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4641     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4642     __ sub(numIter,   numIter, 2);
4643     __ b(ShiftTwoLoop);
4644 
4645     __ BIND(ShiftThree);
4646     __ ldrw(r10,  __ post(oldArr, 4));
4647     __ ldrw(r11,  __ post(oldArrNext, 4));
4648     __ lslvw(r10, r10, shiftCount);
4649     __ lsrvw(r11, r11, shiftRevCount);
4650     __ orrw(r12,  r10, r11);
4651     __ strw(r12,  __ post(newArr, 4));
4652     __ tbz(numIter, 1, Exit);
4653     __ tbz(numIter, 0, ShiftOne);
4654 
4655     __ BIND(ShiftTwo);
4656     __ ldrw(r10,  __ post(oldArr, 4));
4657     __ ldrw(r11,  __ post(oldArrNext, 4));
4658     __ lslvw(r10, r10, shiftCount);
4659     __ lsrvw(r11, r11, shiftRevCount);
4660     __ orrw(r12,  r10, r11);
4661     __ strw(r12,  __ post(newArr, 4));
4662 
4663     __ BIND(ShiftOne);
4664     __ ldrw(r10,  Address(oldArr));
4665     __ ldrw(r11,  Address(oldArrNext));
4666     __ lslvw(r10, r10, shiftCount);
4667     __ lsrvw(r11, r11, shiftRevCount);
4668     __ orrw(r12,  r10, r11);
4669     __ strw(r12,  Address(newArr));
4670 
4671     __ BIND(Exit);
4672     __ ret(lr);
4673 
4674     return start;
4675   }
4676 
4677   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
4678                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
4679                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
4680     // Karatsuba multiplication performs a 128*128 -> 256-bit
4681     // multiplication in three 128-bit multiplications and a few
4682     // additions.
4683     //
4684     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
4685     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
4686     //
4687     // Inputs:
4688     //
4689     // A0 in a.d[0]     (subkey)
4690     // A1 in a.d[1]
4691     // (A1+A0) in a1_xor_a0.d[0]
4692     //
4693     // B0 in b.d[0]     (state)
4694     // B1 in b.d[1]
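         //
         // Since addition in GF(2) is XOR (carry-less), expanding E gives
         //   (A0+A1)(B0+B1) = A1*B1 + A1*B0 + A0*B1 + A0*B0 = C + D + (A1*B0 + A0*B1),
         // so the middle 128 bits of the product, A1*B0 + A0*B1, equal E + C + D,
         // which explains the second line of the formula above.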
4695 
4696     __ ext(tmp1, __ T16B, b, b, 0x08);
4697     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
4698     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
4699     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
4700     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
4701 
4702     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
4703     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
4704     __ eor(tmp2, __ T16B, tmp2, tmp4);
4705     __ eor(tmp2, __ T16B, tmp2, tmp3);
4706 
4707     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
4708     __ ins(result_hi, __ D, tmp2, 0, 1);
4709     __ ins(result_lo, __ D, tmp2, 1, 0);
4710   }
4711 
4712   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
4713                     FloatRegister p, FloatRegister z, FloatRegister t1) {
4714     const FloatRegister t0 = result;
4715 
4716     // The GCM field polynomial f is z^128 + p(z), where p =
4717     // z^7+z^2+z+1.
4718     //
4719     //    z^128 === -p(z)  (mod (z^128 + p(z)))
4720     //
4721     // so, given that the product we're reducing is
4722     //    a == lo + hi * z^128
4723     // substituting,
4724     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
4725     //
4726     // we reduce by multiplying hi by p(z) and subtracting the result
4727     // from (i.e. XORing it with) lo.  Because p has no nonzero high
4728     // bits we can do this with two 64-bit multiplications, lo*p and
4729     // hi*p.
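         //
         // Because hi has 128 bits, hi*p(z) is computed 64 bits at a time: the first
         // pmull2 forms hi.d[1]*p; its high 64 bits would again overflow past z^128,
         // so they are folded into hi.d[0] (to be multiplied by p by the second pmull),
         // while its low 64 bits are XORed into the upper half of lo. The final pmull
         // and eor then fold the remaining hi.d[0]*p into lo, giving the reduced result.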
4730 
4731     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
4732     __ ext(t1, __ T16B, t0, z, 8);
4733     __ eor(hi, __ T16B, hi, t1);
4734     __ ext(t1, __ T16B, z, t0, 8);
4735     __ eor(lo, __ T16B, lo, t1);
4736     __ pmull(t0, __ T1Q, hi, p, __ T1D);
4737     __ eor(result, __ T16B, lo, t0);
4738   }
4739 
4740   address generate_has_negatives(address &has_negatives_long) {
4741     const u1 large_loop_size = 64;
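         // A signed byte is negative iff its top bit (0x80) is set, so OR-ing loaded
         // words together and testing the result against UPPER_BIT_MASK detects whether
         // any byte in the chunk is negative (e.g. 0x7f leaves the mask bits clear,
         // while 0x80 sets one of them).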
4742     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4743     int dcache_line = VM_Version::dcache_line_size();
4744 
4745     Register ary1 = r1, len = r2, result = r0;
4746 
4747     __ align(CodeEntryAlignment);
4748 
4749     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4750 
4751     address entry = __ pc();
4752 
4753     __ enter();
4754 
4755   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16,
4756         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4757 
4758   __ cmp(len, (u1)15);
4759   __ br(Assembler::GT, LEN_OVER_15);
4760   // The only case when execution falls into this code is when the pointer is near
4761   // the end of a memory page and we have to avoid reading the next page
4762   __ add(ary1, ary1, len);
4763   __ subs(len, len, 8);
4764   __ br(Assembler::GT, LEN_OVER_8);
4765   __ ldr(rscratch2, Address(ary1, -8));
4766   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4767   __ lsrv(rscratch2, rscratch2, rscratch1);
4768   __ tst(rscratch2, UPPER_BIT_MASK);
4769   __ cset(result, Assembler::NE);
4770   __ leave();
4771   __ ret(lr);
4772   __ bind(LEN_OVER_8);
4773   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4774   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4775   __ tst(rscratch2, UPPER_BIT_MASK);
4776   __ br(Assembler::NE, RET_TRUE_NO_POP);
4777   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4778   __ lsrv(rscratch1, rscratch1, rscratch2);
4779   __ tst(rscratch1, UPPER_BIT_MASK);
4780   __ cset(result, Assembler::NE);
4781   __ leave();
4782   __ ret(lr);
4783 
4784   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4785   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4786 
4787   has_negatives_long = __ pc(); // 2nd entry point
4788 
4789   __ enter();
4790 
4791   __ bind(LEN_OVER_15);
4792     __ push(spilled_regs, sp);
4793     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4794     __ cbz(rscratch2, ALIGNED);
4795     __ ldp(tmp6, tmp1, Address(ary1));
4796     __ mov(tmp5, 16);
4797     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
4798     __ add(ary1, ary1, rscratch1);
4799     __ sub(len, len, rscratch1);
4800     __ orr(tmp6, tmp6, tmp1);
4801     __ tst(tmp6, UPPER_BIT_MASK);
4802     __ br(Assembler::NE, RET_TRUE);
4803 
4804   __ bind(ALIGNED);
4805     __ cmp(len, large_loop_size);
4806     __ br(Assembler::LT, CHECK_16);
4807     // Perform a 16-byte load as an early return in the pre-loop to handle the case
4808     // where an initially aligned large array has negative values in its starting bytes,
4809     // since LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
4810     // slower. Cases with negative bytes further ahead won't be affected much.
4811     // In fact, they'll be faster due to early loads, fewer instructions and
4812     // fewer branches in LARGE_LOOP.
4813     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4814     __ sub(len, len, 16);
4815     __ orr(tmp6, tmp6, tmp1);
4816     __ tst(tmp6, UPPER_BIT_MASK);
4817     __ br(Assembler::NE, RET_TRUE);
4818     __ cmp(len, large_loop_size);
4819     __ br(Assembler::LT, CHECK_16);
4820 
4821     if (SoftwarePrefetchHintDistance >= 0
4822         && SoftwarePrefetchHintDistance >= dcache_line) {
4823       // initial prefetch
4824       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4825     }
4826   __ bind(LARGE_LOOP);
4827     if (SoftwarePrefetchHintDistance >= 0) {
4828       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4829     }
4830     // Issue load instructions first, since this can save a few CPU/MEM cycles. Also,
4831     // instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
4832     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4833     // instructions per iteration and has fewer branches; however, this approach disables
4834     // the early return, so all 64 bytes are loaded and checked every time.
4835     __ ldp(tmp2, tmp3, Address(ary1));
4836     __ ldp(tmp4, tmp5, Address(ary1, 16));
4837     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4838     __ ldp(tmp6, tmp1, Address(ary1, 48));
4839     __ add(ary1, ary1, large_loop_size);
4840     __ sub(len, len, large_loop_size);
4841     __ orr(tmp2, tmp2, tmp3);
4842     __ orr(tmp4, tmp4, tmp5);
4843     __ orr(rscratch1, rscratch1, rscratch2);
4844     __ orr(tmp6, tmp6, tmp1);
4845     __ orr(tmp2, tmp2, tmp4);
4846     __ orr(rscratch1, rscratch1, tmp6);
4847     __ orr(tmp2, tmp2, rscratch1);
4848     __ tst(tmp2, UPPER_BIT_MASK);
4849     __ br(Assembler::NE, RET_TRUE);
4850     __ cmp(len, large_loop_size);
4851     __ br(Assembler::GE, LARGE_LOOP);
4852 
4853   __ bind(CHECK_16); // small 16-byte load pre-loop
4854     __ cmp(len, (u1)16);
4855     __ br(Assembler::LT, POST_LOOP16);
4856 
4857   __ bind(LOOP16); // small 16-byte load loop
4858     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4859     __ sub(len, len, 16);
4860     __ orr(tmp2, tmp2, tmp3);
4861     __ tst(tmp2, UPPER_BIT_MASK);
4862     __ br(Assembler::NE, RET_TRUE);
4863     __ cmp(len, (u1)16);
4864     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4865 
4866   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4867     __ cmp(len, (u1)8);
4868     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4869     __ ldr(tmp3, Address(__ post(ary1, 8)));
4870     __ sub(len, len, 8);
4871     __ tst(tmp3, UPPER_BIT_MASK);
4872     __ br(Assembler::NE, RET_TRUE);
4873 
4874   __ bind(POST_LOOP16_LOAD_TAIL);
4875     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4876     __ ldr(tmp1, Address(ary1));
4877     __ mov(tmp2, 64);
4878     __ sub(tmp4, tmp2, len, __ LSL, 3);
4879     __ lslv(tmp1, tmp1, tmp4);
4880     __ tst(tmp1, UPPER_BIT_MASK);
4881     __ br(Assembler::NE, RET_TRUE);
4882     // Fallthrough
4883 
4884   __ bind(RET_FALSE);
4885     __ pop(spilled_regs, sp);
4886     __ leave();
4887     __ mov(result, zr);
4888     __ ret(lr);
4889 
4890   __ bind(RET_TRUE);
4891     __ pop(spilled_regs, sp);
4892   __ bind(RET_TRUE_NO_POP);
4893     __ leave();
4894     __ mov(result, 1);
4895     __ ret(lr);
4896 
4897     return entry;
4898   }
4899 
4900   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4901         bool usePrefetch, Label &NOT_EQUAL) {
4902     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4903         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4904         tmp7 = r12, tmp8 = r13;
4905     Label LOOP;
4906 
4907     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4908     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4909     __ bind(LOOP);
4910     if (usePrefetch) {
4911       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4912       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4913     }
4914     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4915     __ eor(tmp1, tmp1, tmp2);
4916     __ eor(tmp3, tmp3, tmp4);
4917     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4918     __ orr(tmp1, tmp1, tmp3);
4919     __ cbnz(tmp1, NOT_EQUAL);
4920     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4921     __ eor(tmp5, tmp5, tmp6);
4922     __ eor(tmp7, tmp7, tmp8);
4923     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4924     __ orr(tmp5, tmp5, tmp7);
4925     __ cbnz(tmp5, NOT_EQUAL);
4926     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4927     __ eor(tmp1, tmp1, tmp2);
4928     __ eor(tmp3, tmp3, tmp4);
4929     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4930     __ orr(tmp1, tmp1, tmp3);
4931     __ cbnz(tmp1, NOT_EQUAL);
4932     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4933     __ eor(tmp5, tmp5, tmp6);
4934     __ sub(cnt1, cnt1, 8 * wordSize);
4935     __ eor(tmp7, tmp7, tmp8);
4936     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4937     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4938     // cmp) because subs allows an unlimited range of immediate operands.
4939     __ subs(tmp6, cnt1, loopThreshold);
4940     __ orr(tmp5, tmp5, tmp7);
4941     __ cbnz(tmp5, NOT_EQUAL);
4942     __ br(__ GE, LOOP);
4943     // post-loop
4944     __ eor(tmp1, tmp1, tmp2);
4945     __ eor(tmp3, tmp3, tmp4);
4946     __ orr(tmp1, tmp1, tmp3);
4947     __ sub(cnt1, cnt1, 2 * wordSize);
4948     __ cbnz(tmp1, NOT_EQUAL);
4949   }
4950 
4951   void generate_large_array_equals_loop_simd(int loopThreshold,
4952         bool usePrefetch, Label &NOT_EQUAL) {
4953     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4954         tmp2 = rscratch2;
4955     Label LOOP;
4956 
4957     __ bind(LOOP);
4958     if (usePrefetch) {
4959       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4960       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4961     }
4962     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4963     __ sub(cnt1, cnt1, 8 * wordSize);
4964     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4965     __ subs(tmp1, cnt1, loopThreshold);
4966     __ eor(v0, __ T16B, v0, v4);
4967     __ eor(v1, __ T16B, v1, v5);
4968     __ eor(v2, __ T16B, v2, v6);
4969     __ eor(v3, __ T16B, v3, v7);
4970     __ orr(v0, __ T16B, v0, v1);
4971     __ orr(v1, __ T16B, v2, v3);
4972     __ orr(v0, __ T16B, v0, v1);
4973     __ umov(tmp1, v0, __ D, 0);
4974     __ umov(tmp2, v0, __ D, 1);
4975     __ orr(tmp1, tmp1, tmp2);
4976     __ cbnz(tmp1, NOT_EQUAL);
4977     __ br(__ GE, LOOP);
4978   }
4979 
4980   // a1 = r1 - array1 address
4981   // a2 = r2 - array2 address
4982   // result = r0 - return value. Already contains "false"
4983   // cnt1 = r10 - number of elements left to check, reduced by wordSize
4984   // r3-r5 are reserved temporary registers
4985   address generate_large_array_equals() {
4986     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4987         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4988         tmp7 = r12, tmp8 = r13;
4989     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4990         SMALL_LOOP, POST_LOOP;
4991     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4992     // loop threshold chosen so that at least 32 prefetched bytes are actually used
4993     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4994     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4995     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4996     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4997         tmp5, tmp6, tmp7, tmp8);
4998 
4999     __ align(CodeEntryAlignment);
5000 
5001     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5002 
5003     address entry = __ pc();
5004     __ enter();
5005     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5006     // also advance pointers to use post-increment instead of pre-increment
5007     __ add(a1, a1, wordSize);
5008     __ add(a2, a2, wordSize);
5009     if (AvoidUnalignedAccesses) {
5010       // Both implementations (SIMD/non-SIMD) use relatively large load
5011       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
5012       // on some CPUs when the address is not at least 16-byte aligned.
5013       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
5014       // load if needed, at least for the 1st address, to make it 16-byte aligned.
5015       Label ALIGNED16;
5016       __ tbz(a1, 3, ALIGNED16);
5017       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5018       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5019       __ sub(cnt1, cnt1, wordSize);
5020       __ eor(tmp1, tmp1, tmp2);
5021       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5022       __ bind(ALIGNED16);
5023     }
5024     if (UseSIMDForArrayEquals) {
5025       if (SoftwarePrefetchHintDistance >= 0) {
5026         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5027         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5028         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5029             /* prfm = */ true, NOT_EQUAL);
5030         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5031         __ br(__ LT, TAIL);
5032       }
5033       __ bind(NO_PREFETCH_LARGE_LOOP);
5034       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5035           /* prfm = */ false, NOT_EQUAL);
5036     } else {
5037       __ push(spilled_regs, sp);
5038       if (SoftwarePrefetchHintDistance >= 0) {
5039         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5040         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5041         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5042             /* prfm = */ true, NOT_EQUAL);
5043         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5044         __ br(__ LT, TAIL);
5045       }
5046       __ bind(NO_PREFETCH_LARGE_LOOP);
5047       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5048           /* prfm = */ false, NOT_EQUAL);
5049     }
5050     __ bind(TAIL);
5051       __ cbz(cnt1, EQUAL);
5052       __ subs(cnt1, cnt1, wordSize);
5053       __ br(__ LE, POST_LOOP);
5054     __ bind(SMALL_LOOP);
5055       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5056       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5057       __ subs(cnt1, cnt1, wordSize);
5058       __ eor(tmp1, tmp1, tmp2);
5059       __ cbnz(tmp1, NOT_EQUAL);
5060       __ br(__ GT, SMALL_LOOP);
5061     __ bind(POST_LOOP);
5062       __ ldr(tmp1, Address(a1, cnt1));
5063       __ ldr(tmp2, Address(a2, cnt1));
5064       __ eor(tmp1, tmp1, tmp2);
5065       __ cbnz(tmp1, NOT_EQUAL);
5066     __ bind(EQUAL);
5067       __ mov(result, true);
5068     __ bind(NOT_EQUAL);
5069       if (!UseSIMDForArrayEquals) {
5070         __ pop(spilled_regs, sp);
5071       }
5072     __ bind(NOT_EQUAL_NO_POP);
5073     __ leave();
5074     __ ret(lr);
5075     return entry;
5076   }
5077 
5078   address generate_dsin_dcos(bool isCos) {
5079     __ align(CodeEntryAlignment);
5080     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5081     address start = __ pc();
5082     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5083         (address)StubRoutines::aarch64::_two_over_pi,
5084         (address)StubRoutines::aarch64::_pio2,
5085         (address)StubRoutines::aarch64::_dsin_coef,
5086         (address)StubRoutines::aarch64::_dcos_coef);
5087     return start;
5088   }
5089 
5090   address generate_dlog() {
5091     __ align(CodeEntryAlignment);
5092     StubCodeMark mark(this, "StubRoutines", "dlog");
5093     address entry = __ pc();
5094     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5095         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5096     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5097     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5098         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5099     return entry;
5100   }
5101 
5102 
5103   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
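       // The Latin1-to-UTF-16 widening relies on zip1/zip2 with a zero vector: interleaving
       // each Latin1 byte with a zero byte is, on little-endian AArch64, exactly the
       // zero-extension of each byte to a 16-bit code unit (e.g. 0x41 'A' becomes 0x0041).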
5104   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5105       Label &DIFF2) {
5106     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5107     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5108 
5109     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5110     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5111     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5112     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5113 
5114     __ fmovd(tmpL, vtmp3);
5115     __ eor(rscratch2, tmp3, tmpL);
5116     __ cbnz(rscratch2, DIFF2);
5117 
5118     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5119     __ umov(tmpL, vtmp3, __ D, 1);
5120     __ eor(rscratch2, tmpU, tmpL);
5121     __ cbnz(rscratch2, DIFF1);
5122 
5123     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5124     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5125     __ fmovd(tmpL, vtmp);
5126     __ eor(rscratch2, tmp3, tmpL);
5127     __ cbnz(rscratch2, DIFF2);
5128 
5129     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5130     __ umov(tmpL, vtmp, __ D, 1);
5131     __ eor(rscratch2, tmpU, tmpL);
5132     __ cbnz(rscratch2, DIFF1);
5133   }
5134 
5135   // r0  = result
5136   // r1  = str1
5137   // r2  = cnt1
5138   // r3  = str2
5139   // r4  = cnt2
5140   // r10 = tmp1
5141   // r11 = tmp2
5142   address generate_compare_long_string_different_encoding(bool isLU) {
5143     __ align(CodeEntryAlignment);
5144     StubCodeMark mark(this, "StubRoutines", isLU
5145         ? "compare_long_string_different_encoding LU"
5146         : "compare_long_string_different_encoding UL");
5147     address entry = __ pc();
5148     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5149         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5150         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5151     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5152         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5153     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5154     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5155 
5156     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5157 
5158     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5159     // cnt2 == number of characters left to compare
5160     // Check the first 4 symbols, which were already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5161     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5162     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5163     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5164     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5165     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5166     __ eor(rscratch2, tmp1, tmp2);
5167     __ mov(rscratch1, tmp2);
5168     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5169     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5170              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5171     __ push(spilled_regs, sp);
5172     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5173     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5174 
5175     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5176 
5177     if (SoftwarePrefetchHintDistance >= 0) {
5178       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5179       __ br(__ LT, NO_PREFETCH);
5180       __ bind(LARGE_LOOP_PREFETCH);
5181         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5182         __ mov(tmp4, 2);
5183         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5184         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5185           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5186           __ subs(tmp4, tmp4, 1);
5187           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5188           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5189           __ mov(tmp4, 2);
5190         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5191           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5192           __ subs(tmp4, tmp4, 1);
5193           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5194           __ sub(cnt2, cnt2, 64);
5195           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5196           __ br(__ GE, LARGE_LOOP_PREFETCH);
5197     }
5198     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5199     __ bind(NO_PREFETCH);
5200     __ subs(cnt2, cnt2, 16);
5201     __ br(__ LT, TAIL);
5202     __ align(OptoLoopAlignment);
5203     __ bind(SMALL_LOOP); // smaller loop
5204       __ subs(cnt2, cnt2, 16);
5205       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5206       __ br(__ GE, SMALL_LOOP);
5207       __ cmn(cnt2, (u1)16);
5208       __ br(__ EQ, LOAD_LAST);
5209     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5210       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5211       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5212       __ ldr(tmp3, Address(cnt1, -8));
5213       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5214       __ b(LOAD_LAST);
5215     __ bind(DIFF2);
5216       __ mov(tmpU, tmp3);
5217     __ bind(DIFF1);
5218       __ pop(spilled_regs, sp);
5219       __ b(CALCULATE_DIFFERENCE);
5220     __ bind(LOAD_LAST);
5221       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5222       // No need to load them again
5223       __ mov(tmpU, tmp3);
5224       __ pop(spilled_regs, sp);
5225 
5226       // tmp2 points to the address of the last 4 Latin1 characters right now
5227       __ ldrs(vtmp, Address(tmp2));
5228       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5229       __ fmovd(tmpL, vtmp);
5230 
5231       __ eor(rscratch2, tmpU, tmpL);
5232       __ cbz(rscratch2, DONE);
5233 
5234     // Find the first different characters in the longwords and
5235     // compute their difference.
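         // The XOR of the two longwords has non-zero bytes exactly where they differ.
         // rev reverses the byte order, so clz of the reversed value is 8 * (index of the
         // first differing byte) plus the leading-zero count inside that byte; and-ing
         // with -16 rounds that down to a 16-bit character boundary, and the lsrv/uxthw
         // pairs below then extract that character from each operand so their difference
         // can be returned.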
5236     __ bind(CALCULATE_DIFFERENCE);
5237       __ rev(rscratch2, rscratch2);
5238       __ clz(rscratch2, rscratch2);
5239       __ andr(rscratch2, rscratch2, -16);
5240       __ lsrv(tmp1, tmp1, rscratch2);
5241       __ uxthw(tmp1, tmp1);
5242       __ lsrv(rscratch1, rscratch1, rscratch2);
5243       __ uxthw(rscratch1, rscratch1);
5244       __ subw(result, tmp1, rscratch1);
5245     __ bind(DONE);
5246       __ ret(lr);
5247     return entry;
5248   }
5249 
5250   address generate_method_entry_barrier() {
5251     __ align(CodeEntryAlignment);
5252     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5253 
5254     Label deoptimize_label;
5255 
5256     address start = __ pc();
5257 
5258     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5259 
5260     __ enter();
5261     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5262 
5263     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5264 
5265     __ push_call_clobbered_registers();
5266 
5267     __ mov(c_rarg0, rscratch2);
5268     __ call_VM_leaf
5269          (CAST_FROM_FN_PTR
5270           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5271 
5272     __ reset_last_Java_frame(true);
5273 
5274     __ mov(rscratch1, r0);
5275 
5276     __ pop_call_clobbered_registers();
5277 
5278     __ cbnz(rscratch1, deoptimize_label);
5279 
5280     __ leave();
5281     __ ret(lr);
5282 
5283     __ BIND(deoptimize_label);
5284 
5285     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5286     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5287 
5288     __ mov(sp, rscratch1);
5289     __ br(rscratch2);
5290 
5291     return start;
5292   }
5293 
5294   address generate_check_lock_stack() {
5295     __ align(CodeEntryAlignment);
5296     StubCodeMark mark(this, "StubRoutines", "check_lock_stack");
5297 
5298     address start = __ pc();
5299 
5300     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5301     __ enter();
5302     __ push_call_clobbered_registers();
5303 
5304     __ mov(c_rarg0, r9);
5305     __ call_VM_leaf(CAST_FROM_FN_PTR(address, LockStack::ensure_lock_stack_size), 1);
5306
5308     __ pop_call_clobbered_registers();
5309     __ leave();
5310     __ reset_last_Java_frame(true);
5311 
5312     __ ret(lr);
5313 
5314     return start;
5315   }
5316 
5317   // r0  = result
5318   // r1  = str1
5319   // r2  = cnt1
5320   // r3  = str2
5321   // r4  = cnt2
5322   // r10 = tmp1
5323   // r11 = tmp2
5324   address generate_compare_long_string_same_encoding(bool isLL) {
5325     __ align(CodeEntryAlignment);
5326     StubCodeMark mark(this, "StubRoutines", isLL
5327         ? "compare_long_string_same_encoding LL"
5328         : "compare_long_string_same_encoding UU");
5329     address entry = __ pc();
5330     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5331         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5332 
5333     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5334 
5335     // exit from the large loop when fewer than 64 bytes are left to read or we're
5336     // about to prefetch memory past the end of the array
5337     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5338 
5339     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
5340     __ eor(rscratch2, tmp1, tmp2);
5341     __ cbnz(rscratch2, CAL_DIFFERENCE);
5342 
5343     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5344     // update pointers, because of previous read
5345     __ add(str1, str1, wordSize);
5346     __ add(str2, str2, wordSize);
5347     if (SoftwarePrefetchHintDistance >= 0) {
5348       __ align(OptoLoopAlignment);
5349       __ bind(LARGE_LOOP_PREFETCH);
5350         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5351         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5352 
5353         for (int i = 0; i < 4; i++) {
5354           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5355           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5356           __ cmp(tmp1, tmp2);
5357           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5358           __ br(Assembler::NE, DIFF);
5359         }
5360         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5361         __ add(str1, str1, 64);
5362         __ add(str2, str2, 64);
5363         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5364         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5365         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5366     }
5367 
5368     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5369     __ br(Assembler::LE, LESS16);
5370     __ align(OptoLoopAlignment);
5371     __ bind(LOOP_COMPARE16);
5372       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5373       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5374       __ cmp(tmp1, tmp2);
5375       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5376       __ br(Assembler::NE, DIFF);
5377       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5378       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5379       __ br(Assembler::LT, LESS16);
5380 
5381       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5382       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5383       __ cmp(tmp1, tmp2);
5384       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5385       __ br(Assembler::NE, DIFF);
5386       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5387       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5388       __ br(Assembler::GE, LOOP_COMPARE16);
5389       __ cbz(cnt2, LENGTH_DIFF);
5390 
5391     __ bind(LESS16);
5392       // compare 8 bytes from each string at a time
5393       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5394       __ br(Assembler::LE, LESS8);
5395       __ ldr(tmp1, Address(__ post(str1, 8)));
5396       __ ldr(tmp2, Address(__ post(str2, 8)));
5397       __ eor(rscratch2, tmp1, tmp2);
5398       __ cbnz(rscratch2, CAL_DIFFERENCE);
5399       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5400 
5401     __ bind(LESS8); // directly load last 8 bytes
5402       if (!isLL) {
5403         __ add(cnt2, cnt2, cnt2);
5404       }
5405       __ ldr(tmp1, Address(str1, cnt2));
5406       __ ldr(tmp2, Address(str2, cnt2));
5407       __ eor(rscratch2, tmp1, tmp2);
5408       __ cbz(rscratch2, LENGTH_DIFF);
5409       __ b(CAL_DIFFERENCE);
5410 
5411     __ bind(DIFF);
5412       __ cmp(tmp1, tmp2);
5413       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5414       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5415       // reuse rscratch2 register for the result of eor instruction
5416       __ eor(rscratch2, tmp1, tmp2);
5417 
5418     __ bind(CAL_DIFFERENCE);
5419       __ rev(rscratch2, rscratch2);
5420       __ clz(rscratch2, rscratch2);
5421       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5422       __ lsrv(tmp1, tmp1, rscratch2);
5423       __ lsrv(tmp2, tmp2, rscratch2);
5424       if (isLL) {
5425         __ uxtbw(tmp1, tmp1);
5426         __ uxtbw(tmp2, tmp2);
5427       } else {
5428         __ uxthw(tmp1, tmp1);
5429         __ uxthw(tmp2, tmp2);
5430       }
5431       __ subw(result, tmp1, tmp2);
5432 
5433     __ bind(LENGTH_DIFF);
5434       __ ret(lr);
5435     return entry;
5436   }
5437 
5438   void generate_compare_long_strings() {
5439       StubRoutines::aarch64::_compare_long_string_LL
5440           = generate_compare_long_string_same_encoding(true);
5441       StubRoutines::aarch64::_compare_long_string_UU
5442           = generate_compare_long_string_same_encoding(false);
5443       StubRoutines::aarch64::_compare_long_string_LU
5444           = generate_compare_long_string_different_encoding(true);
5445       StubRoutines::aarch64::_compare_long_string_UL
5446           = generate_compare_long_string_different_encoding(false);
5447   }
5448 
5449   // R0 = result
5450   // R1 = str2
5451   // R2 = cnt1
5452   // R3 = str1
5453   // R4 = cnt2
5454   // This generic linear code uses a few additional ideas that make it faster:
5455   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5456   // in order to skip the initial loading (helps on systems with 1 load pipeline)
5457   // 2) we can use a "fast" algorithm for finding the single character to search for:
5458   // the first symbol is located with fewer branches (1 branch per loaded register
5459   // instead of a branch per symbol), which is where constants like
5460   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5461   // 3) after loading and analyzing the 1st register of the source string, it can be
5462   // used to search for every occurrence of the 1st character, saving a few loads
5463   // compared with a "simpler-but-slower" implementation
5464   // 4) in order to avoid lots of push/pop operations, the code below heavily
5465   // re-uses/re-initializes/compresses register values, which makes the code
5466   // larger and a bit less readable; however, most of the extra operations are
5467   // issued during loads or branches, so the penalty is minimal
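       // The character search in 2) is the classic SWAR zero-byte test: after XOR-ing a
       // loaded word with the replicated first pattern character, matching positions hold
       // zero, and for a word v the expression
       //   (v - 0x0101010101010101) & ~v & 0x8080808080808080
       // is non-zero iff some byte of v is zero. The code below uses the equivalent form
       //   (v - 0x01...01) & ~(v | 0x7f...7f),
       // since ~(v | 0x7f...7f) == ~v & 0x80...80, with 0x0001...0001 / 0x7fff...7fff
       // playing the same role per 16-bit char in the UTF-16 case.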
5468   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5469     const char* stubName = str1_isL
5470         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5471         : "indexof_linear_uu";
5472     __ align(CodeEntryAlignment);
5473     StubCodeMark mark(this, "StubRoutines", stubName);
5474     address entry = __ pc();
5475 
5476     int str1_chr_size = str1_isL ? 1 : 2;
5477     int str2_chr_size = str2_isL ? 1 : 2;
5478     int str1_chr_shift = str1_isL ? 0 : 1;
5479     int str2_chr_shift = str2_isL ? 0 : 1;
5480     bool isL = str1_isL && str2_isL;
5481     // parameters
5482     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5483     // temporary registers
5484     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5485     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5486     // redefinitions
5487     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5488 
5489     __ push(spilled_regs, sp);
5490     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5491         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5492         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5493         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5494         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5495         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5496     // Read whole register from str1. It is safe, because length >=8 here
5497     __ ldr(ch1, Address(str1));
5498     // Read whole register from str2. It is safe, because length >=8 here
5499     __ ldr(ch2, Address(str2));
5500     __ sub(cnt2, cnt2, cnt1);
5501     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5502     if (str1_isL != str2_isL) {
5503       __ eor(v0, __ T16B, v0, v0);
5504     }
5505     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5506     __ mul(first, first, tmp1);
5507     // check if we have less than 1 register to check
5508     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5509     if (str1_isL != str2_isL) {
5510       __ fmovd(v1, ch1);
5511     }
5512     __ br(__ LE, L_SMALL);
5513     __ eor(ch2, first, ch2);
5514     if (str1_isL != str2_isL) {
5515       __ zip1(v1, __ T16B, v1, v0);
5516     }
5517     __ sub(tmp2, ch2, tmp1);
5518     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5519     __ bics(tmp2, tmp2, ch2);
5520     if (str1_isL != str2_isL) {
5521       __ fmovd(ch1, v1);
5522     }
5523     __ br(__ NE, L_HAS_ZERO);
5524     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5525     __ add(result, result, wordSize/str2_chr_size);
5526     __ add(str2, str2, wordSize);
5527     __ br(__ LT, L_POST_LOOP);
5528     __ BIND(L_LOOP);
5529       __ ldr(ch2, Address(str2));
5530       __ eor(ch2, first, ch2);
5531       __ sub(tmp2, ch2, tmp1);
5532       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5533       __ bics(tmp2, tmp2, ch2);
5534       __ br(__ NE, L_HAS_ZERO);
5535     __ BIND(L_LOOP_PROCEED);
5536       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5537       __ add(str2, str2, wordSize);
5538       __ add(result, result, wordSize/str2_chr_size);
5539       __ br(__ GE, L_LOOP);
5540     __ BIND(L_POST_LOOP);
5541       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5542       __ br(__ LE, NOMATCH);
5543       __ ldr(ch2, Address(str2));
5544       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5545       __ eor(ch2, first, ch2);
5546       __ sub(tmp2, ch2, tmp1);
5547       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5548       __ mov(tmp4, -1); // all bits set
5549       __ b(L_SMALL_PROCEED);
5550     __ align(OptoLoopAlignment);
5551     __ BIND(L_SMALL);
5552       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5553       __ eor(ch2, first, ch2);
5554       if (str1_isL != str2_isL) {
5555         __ zip1(v1, __ T16B, v1, v0);
5556       }
5557       __ sub(tmp2, ch2, tmp1);
5558       __ mov(tmp4, -1); // all bits set
5559       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5560       if (str1_isL != str2_isL) {
5561         __ fmovd(ch1, v1); // move converted 4 symbols
5562       }
5563     __ BIND(L_SMALL_PROCEED);
5564       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5565       __ bic(tmp2, tmp2, ch2);
5566       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5567       __ rbit(tmp2, tmp2);
5568       __ br(__ EQ, NOMATCH);
5569     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5570       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5571       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5572       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5573       if (str2_isL) { // LL
5574         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5575         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5576         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5577         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5578         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5579       } else {
5580         __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to an even value
5581         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5582         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5583         __ lslv(tmp2, tmp2, tmp4);
5584         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5585         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5586         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5587         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5588       }
5589       __ cmp(ch1, ch2);
5590       __ mov(tmp4, wordSize/str2_chr_size);
5591       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5592     __ BIND(L_SMALL_CMP_LOOP);
5593       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5594                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5595       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5596                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5597       __ add(tmp4, tmp4, 1);
5598       __ cmp(tmp4, cnt1);
5599       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5600       __ cmp(first, ch2);
5601       __ br(__ EQ, L_SMALL_CMP_LOOP);
5602     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5603       __ cbz(tmp2, NOMATCH); // no more matches. exit
5604       __ clz(tmp4, tmp2);
5605       __ add(result, result, 1); // advance index
5606       __ add(str2, str2, str2_chr_size); // advance pointer
5607       __ b(L_SMALL_HAS_ZERO_LOOP);
5608     __ align(OptoLoopAlignment);
5609     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5610       __ cmp(first, ch2);
5611       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5612       __ b(DONE);
5613     __ align(OptoLoopAlignment);
5614     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5615       if (str2_isL) { // LL
5616         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5617         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5618         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5619         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5620         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5621       } else {
5622         __ mov(ch2, 0xE); // 0b1110: mask to round the byte index down to an even value
5623         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5624         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5625         __ lslv(tmp2, tmp2, tmp4);
5626         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5627         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5628         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5629         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5630       }
5631       __ cmp(ch1, ch2);
5632       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5633       __ b(DONE);
5634     __ align(OptoLoopAlignment);
5635     __ BIND(L_HAS_ZERO);
5636       __ rbit(tmp2, tmp2);
5637       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5638       // Now, compress the counters (cnt2 and cnt1) into one register. This is fine
5639       // because both counters are 32-bit and are not changed in this loop; they are
5640       // restored on exit, so cnt1 can be re-used in this loop.
5641       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5642       __ sub(result, result, 1);
5643     __ BIND(L_HAS_ZERO_LOOP);
5644       __ mov(cnt1, wordSize/str2_chr_size);
5645       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5646       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5647       if (str2_isL) {
5648         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5649         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5650         __ lslv(tmp2, tmp2, tmp4);
5651         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5652         __ add(tmp4, tmp4, 1);
5653         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5654         __ lsl(tmp2, tmp2, 1);
5655         __ mov(tmp4, wordSize/str2_chr_size);
5656       } else {
5657         __ mov(ch2, 0xE);
5658         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5659         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5660         __ lslv(tmp2, tmp2, tmp4);
5661         __ add(tmp4, tmp4, 1);
5662         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5663         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5664         __ lsl(tmp2, tmp2, 1);
5665         __ mov(tmp4, wordSize/str2_chr_size);
5666         __ sub(str2, str2, str2_chr_size);
5667       }
5668       __ cmp(ch1, ch2);
5669       __ mov(tmp4, wordSize/str2_chr_size);
5670       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5671     __ BIND(L_CMP_LOOP);
5672       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5673                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5674       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5675                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5676       __ add(tmp4, tmp4, 1);
5677       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5678       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5679       __ cmp(cnt1, ch2);
5680       __ br(__ EQ, L_CMP_LOOP);
5681     __ BIND(L_CMP_LOOP_NOMATCH);
5682       // no match here
5683       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5684       __ clz(tmp4, tmp2);
5685       __ add(str2, str2, str2_chr_size); // advance pointer
5686       __ b(L_HAS_ZERO_LOOP);
5687     __ align(OptoLoopAlignment);
5688     __ BIND(L_CMP_LOOP_LAST_CMP);
5689       __ cmp(cnt1, ch2);
5690       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5691       __ b(DONE);
5692     __ align(OptoLoopAlignment);
5693     __ BIND(L_CMP_LOOP_LAST_CMP2);
5694       if (str2_isL) {
5695         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5696         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5697         __ lslv(tmp2, tmp2, tmp4);
5698         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5699         __ add(tmp4, tmp4, 1);
5700         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5701         __ lsl(tmp2, tmp2, 1);
5702       } else {
5703         __ mov(ch2, 0xE);
5704         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5705         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5706         __ lslv(tmp2, tmp2, tmp4);
5707         __ add(tmp4, tmp4, 1);
5708         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5709         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5710         __ lsl(tmp2, tmp2, 1);
5711         __ sub(str2, str2, str2_chr_size);
5712       }
5713       __ cmp(ch1, ch2);
5714       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5715       __ b(DONE);
5716     __ align(OptoLoopAlignment);
5717     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5718       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
5719       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5720       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5721       // respective high bits weren't changed. L_LOOP_PROCEED will increase result
5722       // by the number of analyzed characters, so we can just reset the lower bits
5723       // of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
5724       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5725       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is
5726       // the index of the last analyzed substring inside the current octet, so str2
5727       // is at the respective start address and must be advanced to the next octet.
5728       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5729       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5730       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5731       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5732       __ movw(cnt2, cnt2);
5733       __ b(L_LOOP_PROCEED);
5734     __ align(OptoLoopAlignment);
5735     __ BIND(NOMATCH);
5736       __ mov(result, -1);
5737     __ BIND(DONE);
5738       __ pop(spilled_regs, sp);
5739       __ ret(lr);
5740     return entry;
5741   }
5742 
5743   void generate_string_indexof_stubs() {
5744     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5745     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5746     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5747   }
5748 
5749   void inflate_and_store_2_fp_registers(bool generatePrfm,
5750       FloatRegister src1, FloatRegister src2) {
5751     Register dst = r1;
5752     __ zip1(v1, __ T16B, src1, v0);
5753     __ zip2(v2, __ T16B, src1, v0);
5754     if (generatePrfm) {
5755       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5756     }
5757     __ zip1(v3, __ T16B, src2, v0);
5758     __ zip2(v4, __ T16B, src2, v0);
5759     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5760   }
5761 
5762   // R0 = src
5763   // R1 = dst
5764   // R2 = len
5765   // R3 = len >> 3
5766   // V0 = 0
5767   // v1 = loaded 8 bytes
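  //
  // Conceptually this stub zero-extends each Latin-1 byte to a 16-bit char.
  // A scalar sketch of what the vectorized zip1/zip2-with-zero loop below
  // computes (illustrative only; the stub works on 64-byte chunks):
  //
  //   void inflate(const unsigned char* src, unsigned short* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       dst[i] = src[i];
  //     }
  //   }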
5768   address generate_large_byte_array_inflate() {
5769     __ align(CodeEntryAlignment);
5770     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5771     address entry = __ pc();
5772     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5773     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5774     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5775 
5776     // Do one more 8-byte read to make the address 16-byte aligned in most
5777     // cases; this also lets us use a single store instruction below.
5778     __ ldrd(v2, __ post(src, 8));
5779     __ sub(octetCounter, octetCounter, 2);
5780     __ zip1(v1, __ T16B, v1, v0);
5781     __ zip1(v2, __ T16B, v2, v0);
5782     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5783     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5784     __ subs(rscratch1, octetCounter, large_loop_threshold);
5785     __ br(__ LE, LOOP_START);
5786     __ b(LOOP_PRFM_START);
5787     __ bind(LOOP_PRFM);
5788       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5789     __ bind(LOOP_PRFM_START);
5790       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5791       __ sub(octetCounter, octetCounter, 8);
5792       __ subs(rscratch1, octetCounter, large_loop_threshold);
5793       inflate_and_store_2_fp_registers(true, v3, v4);
5794       inflate_and_store_2_fp_registers(true, v5, v6);
5795       __ br(__ GT, LOOP_PRFM);
5796       __ cmp(octetCounter, (u1)8);
5797       __ br(__ LT, DONE);
5798     __ bind(LOOP);
5799       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5800       __ bind(LOOP_START);
5801       __ sub(octetCounter, octetCounter, 8);
5802       __ cmp(octetCounter, (u1)8);
5803       inflate_and_store_2_fp_registers(false, v3, v4);
5804       inflate_and_store_2_fp_registers(false, v5, v6);
5805       __ br(__ GE, LOOP);
5806     __ bind(DONE);
5807       __ ret(lr);
5808     return entry;
5809   }
5810 
5811   /**
5812    *  Arguments:
5813    *
5814    *  Input:
5815    *  c_rarg0   - current state address
5816    *  c_rarg1   - H key address
5817    *  c_rarg2   - data address
5818    *  c_rarg3   - number of blocks
5819    *
5820    *  Output:
5821    *  Updated state at c_rarg0
5822    */
5823   address generate_ghash_processBlocks() {
5824     // Bafflingly, GCM uses little-endian for the byte order, but
5825     // big-endian for the bit order.  For example, the polynomial 1 is
5826     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5827     //
5828     // So, we must either reverse the bytes in each word and do
5829     // everything big-endian or reverse the bits in each byte and do
5830     // it little-endian.  On AArch64 it's more idiomatic to reverse
5831     // the bits in each byte (we have an instruction, RBIT, to do
5832     // that) and keep the data in little-endian bit order throughout the
5833     // calculation, bit-reversing the inputs and outputs.
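    //
    // In pseudocode the loop below computes, for each 16-byte block D
    // (a sketch of the GHASH definition, with all values kept bit-reversed
    // as described above):
    //
    //   state = (state ^ D) * H   // carry-less multiply in GF(2^128),
    //                             // reduced mod x^128 + x^7 + x^2 + x + 1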
5834 
5835     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5836     __ align(wordSize * 2);
5837     address p = __ pc();
5838     __ emit_int64(0x87);  // The low-order bits of the field
5839                           // polynomial (i.e. p = z^7+z^2+z+1)
5840                           // repeated in the low and high parts of a
5841                           // 128-bit vector
5842     __ emit_int64(0x87);
5843 
5844     __ align(CodeEntryAlignment);
5845     address start = __ pc();
5846 
5847     Register state   = c_rarg0;
5848     Register subkeyH = c_rarg1;
5849     Register data    = c_rarg2;
5850     Register blocks  = c_rarg3;
5851 
5852     FloatRegister vzr = v30;
5853     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5854 
5855     __ ldrq(v0, Address(state));
5856     __ ldrq(v1, Address(subkeyH));
5857 
5858     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5859     __ rbit(v0, __ T16B, v0);
5860     __ rev64(v1, __ T16B, v1);
5861     __ rbit(v1, __ T16B, v1);
5862 
5863     __ ldrq(v26, p);
5864 
5865     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5866     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5867 
5868     {
5869       Label L_ghash_loop;
5870       __ bind(L_ghash_loop);
5871 
5872       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5873                                                  // reversing each byte
5874       __ rbit(v2, __ T16B, v2);
5875       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5876 
5877       // Multiply state in v2 by subkey in v1
5878       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5879                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
5880                      /*temps*/v6, v20, v18, v21);
5881       // Reduce v7:v5 by the field polynomial
5882       ghash_reduce(v0, v5, v7, v26, vzr, v20);
5883 
5884       __ sub(blocks, blocks, 1);
5885       __ cbnz(blocks, L_ghash_loop);
5886     }
5887 
5888     // The bit-reversed result is at this point in v0
5889     __ rev64(v1, __ T16B, v0);
5890     __ rbit(v1, __ T16B, v1);
5891 
5892     __ st1(v1, __ T16B, state);
5893     __ ret(lr);
5894 
5895     return start;
5896   }
5897 
5898   void generate_base64_encode_simdround(Register src, Register dst,
5899         FloatRegister codec, u8 size) {
5900 
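    // Each group of three input bytes (24 bits) is split into four 6-bit
    // indices, which are then mapped through the codec table. In scalar
    // terms (illustrative only) the shifts below compute:
    //
    //   ind0 =   in0 >> 2;
    //   ind1 = ((in0 & 0x03) << 4) | (in1 >> 4);
    //   ind2 = ((in1 & 0x0f) << 2) | (in2 >> 6);
    //   ind3 =   in2 & 0x3f;
    //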
5901     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5902     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5903     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5904 
5905     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5906 
5907     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5908 
5909     __ ushr(ind0, arrangement, in0,  2);
5910 
5911     __ ushr(ind1, arrangement, in1,  2);
5912     __ shl(in0,   arrangement, in0,  6);
5913     __ orr(ind1,  arrangement, ind1, in0);
5914     __ ushr(ind1, arrangement, ind1, 2);
5915 
5916     __ ushr(ind2, arrangement, in2,  4);
5917     __ shl(in1,   arrangement, in1,  4);
5918     __ orr(ind2,  arrangement, in1,  ind2);
5919     __ ushr(ind2, arrangement, ind2, 2);
5920 
5921     __ shl(ind3,  arrangement, in2,  2);
5922     __ ushr(ind3, arrangement, ind3, 2);
5923 
5924     __ tbl(out0,  arrangement, codec,  4, ind0);
5925     __ tbl(out1,  arrangement, codec,  4, ind1);
5926     __ tbl(out2,  arrangement, codec,  4, ind2);
5927     __ tbl(out3,  arrangement, codec,  4, ind3);
5928 
5929     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5930   }
5931 
5932    /**
5933    *  Arguments:
5934    *
5935    *  Input:
5936    *  c_rarg0   - src_start
5937    *  c_rarg1   - src_offset
5938    *  c_rarg2   - src_length
5939    *  c_rarg3   - dest_start
5940    *  c_rarg4   - dest_offset
5941    *  c_rarg5   - isURL
5942    *
5943    */
5944   address generate_base64_encodeBlock() {
5945 
5946     static const char toBase64[64] = {
5947       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5948       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5949       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5950       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5951       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5952     };
5953 
5954     static const char toBase64URL[64] = {
5955       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5956       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5957       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5958       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5959       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5960     };
5961 
5962     __ align(CodeEntryAlignment);
5963     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5964     address start = __ pc();
5965 
5966     Register src   = c_rarg0;  // source array
5967     Register soff  = c_rarg1;  // source start offset
5968     Register send  = c_rarg2;  // source end offset
5969     Register dst   = c_rarg3;  // dest array
5970     Register doff  = c_rarg4;  // position for writing to dest array
5971     Register isURL = c_rarg5;  // Base64 or URL character set
5972 
5973     // c_rarg6 and c_rarg7 are free to use as temps
5974     Register codec  = c_rarg6;
5975     Register length = c_rarg7;
5976 
5977     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5978 
5979     __ add(src, src, soff);
5980     __ add(dst, dst, doff);
5981     __ sub(length, send, soff);
5982 
5983     // load the codec base address
5984     __ lea(codec, ExternalAddress((address) toBase64));
5985     __ cbz(isURL, ProcessData);
5986     __ lea(codec, ExternalAddress((address) toBase64URL));
5987 
5988     __ BIND(ProcessData);
5989 
5990     // too short to form a SIMD loop; fall back to the 3-byte loop
5991     __ cmp(length, (u1)24);
5992     __ br(Assembler::LT, Process3B);
5993 
5994     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5995 
5996     __ BIND(Process48B);
5997     __ cmp(length, (u1)48);
5998     __ br(Assembler::LT, Process24B);
5999     generate_base64_encode_simdround(src, dst, v0, 16);
6000     __ sub(length, length, 48);
6001     __ b(Process48B);
6002 
6003     __ BIND(Process24B);
6004     __ cmp(length, (u1)24);
6005     __ br(Assembler::LT, SIMDExit);
6006     generate_base64_encode_simdround(src, dst, v0, 8);
6007     __ sub(length, length, 24);
6008 
6009     __ BIND(SIMDExit);
6010     __ cbz(length, Exit);
6011 
6012     __ BIND(Process3B);
6013     //  3 src bytes, 24 bits
6014     __ ldrb(r10, __ post(src, 1));
6015     __ ldrb(r11, __ post(src, 1));
6016     __ ldrb(r12, __ post(src, 1));
6017     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6018     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6019     // codec index
6020     __ ubfmw(r15, r12, 18, 23);
6021     __ ubfmw(r14, r12, 12, 17);
6022     __ ubfmw(r13, r12, 6,  11);
6023     __ andw(r12,  r12, 63);
6024     // get the code based on the codec
6025     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6026     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6027     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6028     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6029     __ strb(r15, __ post(dst, 1));
6030     __ strb(r14, __ post(dst, 1));
6031     __ strb(r13, __ post(dst, 1));
6032     __ strb(r12, __ post(dst, 1));
6033     __ sub(length, length, 3);
6034     __ cbnz(length, Process3B);
6035 
6036     __ BIND(Exit);
6037     __ ret(lr);
6038 
6039     return start;
6040   }
6041 
6042   void generate_base64_decode_simdround(Register src, Register dst,
6043         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6044 
6045     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6046     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6047 
6048     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6049     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6050 
6051     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6052 
6053     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6054 
6055     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6056 
6057     // we need an unsigned saturating subtract to make sure all input values
6058     // in the range [0, 63] yield a 0 index for the higher-half lookup
6059     __ uqsubv(decH0, __ T16B, in0, v27);
6060     __ uqsubv(decH1, __ T16B, in1, v27);
6061     __ uqsubv(decH2, __ T16B, in2, v27);
6062     __ uqsubv(decH3, __ T16B, in3, v27);
6063 
6064     // lower half lookup
6065     __ tbl(decL0, arrangement, codecL, 4, in0);
6066     __ tbl(decL1, arrangement, codecL, 4, in1);
6067     __ tbl(decL2, arrangement, codecL, 4, in2);
6068     __ tbl(decL3, arrangement, codecL, 4, in3);
6069 
6070     // higher half lookup
6071     __ tbx(decH0, arrangement, codecH, 4, decH0);
6072     __ tbx(decH1, arrangement, codecH, 4, decH1);
6073     __ tbx(decH2, arrangement, codecH, 4, decH2);
6074     __ tbx(decH3, arrangement, codecH, 4, decH3);
6075 
6076     // combine lower and higher
6077     __ orr(decL0, arrangement, decL0, decH0);
6078     __ orr(decL1, arrangement, decL1, decH1);
6079     __ orr(decL2, arrangement, decL2, decH2);
6080     __ orr(decL3, arrangement, decL3, decH3);
6081 
6082     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6083     __ cmhi(decH0, arrangement, decL0, v27);
6084     __ cmhi(decH1, arrangement, decL1, v27);
6085     __ cmhi(decH2, arrangement, decL2, v27);
6086     __ cmhi(decH3, arrangement, decL3, v27);
6087     __ orr(in0, arrangement, decH0, decH1);
6088     __ orr(in1, arrangement, decH2, decH3);
6089     __ orr(in2, arrangement, in0,   in1);
6090     __ umaxv(in3, arrangement, in2);
6091     __ umov(rscratch2, in3, __ B, 0);
6092 
6093     // get the data to output
6094     __ shl(out0,  arrangement, decL0, 2);
6095     __ ushr(out1, arrangement, decL1, 4);
6096     __ orr(out0,  arrangement, out0,  out1);
6097     __ shl(out1,  arrangement, decL1, 4);
6098     __ ushr(out2, arrangement, decL2, 2);
6099     __ orr(out1,  arrangement, out1,  out2);
6100     __ shl(out2,  arrangement, decL2, 6);
6101     __ orr(out2,  arrangement, out2,  decL3);
6102 
6103     __ cbz(rscratch2, NoIllegalData);
6104 
6105     // handle illegal input
6106     __ umov(r10, in2, __ D, 0);
6107     if (size == 16) {
6108       __ cbnz(r10, ErrorInLowerHalf);
6109 
6110       // illegal input is in higher half, store the lower half now.
6111       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6112 
6113       __ umov(r10, in2,  __ D, 1);
6114       __ umov(r11, out0, __ D, 1);
6115       __ umov(r12, out1, __ D, 1);
6116       __ umov(r13, out2, __ D, 1);
6117       __ b(StoreLegalData);
6118 
6119       __ BIND(ErrorInLowerHalf);
6120     }
6121     __ umov(r11, out0, __ D, 0);
6122     __ umov(r12, out1, __ D, 0);
6123     __ umov(r13, out2, __ D, 0);
6124 
6125     __ BIND(StoreLegalData);
6126     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6127     __ strb(r11, __ post(dst, 1));
6128     __ strb(r12, __ post(dst, 1));
6129     __ strb(r13, __ post(dst, 1));
6130     __ lsr(r10, r10, 8);
6131     __ lsr(r11, r11, 8);
6132     __ lsr(r12, r12, 8);
6133     __ lsr(r13, r13, 8);
6134     __ b(StoreLegalData);
6135 
6136     __ BIND(NoIllegalData);
6137     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6138   }
6139 
6140 
6141    /**
6142    *  Arguments:
6143    *
6144    *  Input:
6145    *  c_rarg0   - src_start
6146    *  c_rarg1   - src_offset
6147    *  c_rarg2   - src_length
6148    *  c_rarg3   - dest_start
6149    *  c_rarg4   - dest_offset
6150    *  c_rarg5   - isURL
6151    *  c_rarg6   - isMIME
6152    *
6153    */
6154   address generate_base64_decodeBlock() {
6155 
6156     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6157     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6158     // titled "Base64 decoding".
6159 
6160     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
6161     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
6162     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6163     static const uint8_t fromBase64ForNoSIMD[256] = {
6164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6167        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6168       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6169        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6170       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6171        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6176       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6177       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6178       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6179       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6180     };
6181 
6182     static const uint8_t fromBase64URLForNoSIMD[256] = {
6183       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6184       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6185       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6186        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6187       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6188        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6189       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6190        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6191       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6192       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6193       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6194       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6195       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6196       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6197       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6198       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6199     };
6200 
6201     // A legal base64 code value is in the range [0, 127].  We need two table
6202     // lookups with tbl/tbx and combine their results to get the decoded data.
6203     // The 1st table vector lookup uses tbl: out-of-range indices are set to 0 in
6204     // the destination. The 2nd table vector lookup uses tbx: out-of-range indices
6205     // leave the destination unchanged. Input [64..126] is mapped to index [65, 127]
6206     // in the second lookup. The value at index 64 is set to 0, so that we know we
6207     // already got the decoded data from the 1st lookup.
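    //
    // In scalar terms, for an input byte b < 128 the tbl/tbx/orr combination
    // below amounts to (illustrative sketch; tab is the 128-byte table):
    //
    //   lo  = (b < 64) ? tab[b] : 0;            // tbl: out-of-range index -> 0
    //   idx = (b > 63) ? (b - 63) : 0;          // unsigned saturating subtract of 63
    //   hi  = (idx < 64) ? tab[64 + idx] : idx; // tbx: out-of-range index -> unchanged
    //   dec = lo | hi;                          // any result > 63 is illegal input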
6208     static const uint8_t fromBase64ForSIMD[128] = {
6209       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6210       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6211       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6212        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6213         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6214        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6215       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6216        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6217     };
6218 
6219     static const uint8_t fromBase64URLForSIMD[128] = {
6220       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6221       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6222       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6223        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6224         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6225        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6226        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6227        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6228     };
6229 
6230     __ align(CodeEntryAlignment);
6231     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6232     address start = __ pc();
6233 
6234     Register src    = c_rarg0;  // source array
6235     Register soff   = c_rarg1;  // source start offset
6236     Register send   = c_rarg2;  // source end offset
6237     Register dst    = c_rarg3;  // dest array
6238     Register doff   = c_rarg4;  // position for writing to dest array
6239     Register isURL  = c_rarg5;  // Base64 or URL character set
6240     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6241 
6242     Register length = send;    // reuse send as length of source data to process
6243 
6244     Register simd_codec   = c_rarg6;
6245     Register nosimd_codec = c_rarg7;
6246 
6247     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6248 
6249     __ enter();
6250 
6251     __ add(src, src, soff);
6252     __ add(dst, dst, doff);
6253 
6254     __ mov(doff, dst);
6255 
6256     __ sub(length, send, soff);
6257     __ bfm(length, zr, 0, 1);
6258 
6259     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6260     __ cbz(isURL, ProcessData);
6261     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6262 
6263     __ BIND(ProcessData);
6264     __ mov(rscratch1, length);
6265     __ cmp(length, (u1)144); // 144 = 80 + 64
6266     __ br(Assembler::LT, Process4B);
6267 
6268     // In the MIME case, the line length cannot be more than 76
6269     // bytes (see RFC 2045). This is too short a block for SIMD
6270     // to be worthwhile, so we use non-SIMD here.
6271     __ movw(rscratch1, 79);
6272 
6273     __ BIND(Process4B);
6274     __ ldrw(r14, __ post(src, 4));
6275     __ ubfxw(r10, r14, 0,  8);
6276     __ ubfxw(r11, r14, 8,  8);
6277     __ ubfxw(r12, r14, 16, 8);
6278     __ ubfxw(r13, r14, 24, 8);
6279     // look up the decoded values
6280     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6281     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6282     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6283     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6284     // error detection, 255u indicates an illegal input
6285     __ orrw(r14, r10, r11);
6286     __ orrw(r15, r12, r13);
6287     __ orrw(r14, r14, r15);
6288     __ tbnz(r14, 7, Exit);
6289     // recover the data
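    // (packs the four 6-bit values v0..v3 in r10..r13 back into three bytes:
    //  b0 = (v0 << 2) | (v1 >> 4), b1 = (v1 << 4) | (v2 >> 2), b2 = (v2 << 6) | v3;
    //  b0/b1 are stored with one strh, b2 with a strb)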
6290     __ lslw(r14, r10, 10);
6291     __ bfiw(r14, r11, 4, 6);
6292     __ bfmw(r14, r12, 2, 5);
6293     __ rev16w(r14, r14);
6294     __ bfiw(r13, r12, 6, 2);
6295     __ strh(r14, __ post(dst, 2));
6296     __ strb(r13, __ post(dst, 1));
6297     // non-simd loop
6298     __ subsw(rscratch1, rscratch1, 4);
6299     __ br(Assembler::GT, Process4B);
6300 
6301     // if exiting from the 80-byte pre-processing pass above (the Process4B
6302     // loop entered with rscratch1 == 79), rscratch1 == -1; otherwise rscratch1 == 0.
6303     __ cbzw(rscratch1, Exit);
6304     __ sub(length, length, 80);
6305 
6306     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6307     __ cbz(isURL, SIMDEnter);
6308     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6309 
6310     __ BIND(SIMDEnter);
6311     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6312     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6313     __ mov(rscratch1, 63);
6314     __ dup(v27, __ T16B, rscratch1);
6315 
6316     __ BIND(Process64B);
6317     __ cmp(length, (u1)64);
6318     __ br(Assembler::LT, Process32B);
6319     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6320     __ sub(length, length, 64);
6321     __ b(Process64B);
6322 
6323     __ BIND(Process32B);
6324     __ cmp(length, (u1)32);
6325     __ br(Assembler::LT, SIMDExit);
6326     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6327     __ sub(length, length, 32);
6328     __ b(Process32B);
6329 
6330     __ BIND(SIMDExit);
6331     __ cbz(length, Exit);
6332     __ movw(rscratch1, length);
6333     __ b(Process4B);
6334 
6335     __ BIND(Exit);
6336     __ sub(c_rarg0, dst, doff);
6337 
6338     __ leave();
6339     __ ret(lr);
6340 
6341     return start;
6342   }
6343 
6344   address generate_ghash_processBlocks_wide() {
6345     address small = generate_ghash_processBlocks();
6346 
6347     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6348     __ align(wordSize * 2);
6349     address p = __ pc();
6350     __ emit_int64(0x87);  // The low-order bits of the field
6351                           // polynomial (i.e. p = z^7+z^2+z+1)
6352                           // repeated in the low and high parts of a
6353                           // 128-bit vector
6354     __ emit_int64(0x87);
6355 
6356     __ align(CodeEntryAlignment);
6357     address start = __ pc();
6358 
6359     Register state   = c_rarg0;
6360     Register subkeyH = c_rarg1;
6361     Register data    = c_rarg2;
6362     Register blocks  = c_rarg3;
6363 
6364     const int unroll = 4;
6365 
6366     __ cmp(blocks, (unsigned char)(unroll * 2));
6367     __ br(__ LT, small);
6368 
6369     if (unroll > 1) {
6370       // Save state before entering routine
6371       __ sub(sp, sp, 4 * 16);
6372       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6373       __ sub(sp, sp, 4 * 16);
6374       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6375     }
6376 
6377     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6378 
6379     if (unroll > 1) {
6380       // And restore state
6381       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6382       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6383     }
6384 
6385     __ cmp(blocks, zr);
6386     __ br(__ GT, small);
6387 
6388     __ ret(lr);
6389 
6390     return start;
6391   }
6392 
6393   // Support for spin waits.
6394   address generate_spin_wait() {
6395     __ align(CodeEntryAlignment);
6396     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6397     address start = __ pc();
6398 
6399     __ spin_wait();
6400     __ ret(lr);
6401 
6402     return start;
6403   }
6404 
6405 #ifdef LINUX
6406 
6407   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6408   //
6409   // If LSE is in use, generate LSE versions of all the stubs. The
6410   // non-LSE versions are in atomic_aarch64.S.
6411 
6412   // class AtomicStubMark records the entry point of a stub and the
6413   // stub pointer which will point to it. The stub pointer is set to
6414   // the entry point when ~AtomicStubMark() is called, which must be
6415   // after ICache::invalidate_range. This ensures safe publication of
6416   // the generated code.
6417   class AtomicStubMark {
6418     address _entry_point;
6419     aarch64_atomic_stub_t *_stub;
6420     MacroAssembler *_masm;
6421   public:
6422     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6423       _masm = masm;
6424       __ align(32);
6425       _entry_point = __ pc();
6426       _stub = stub;
6427     }
6428     ~AtomicStubMark() {
6429       *_stub = (aarch64_atomic_stub_t)_entry_point;
6430     }
6431   };
6432 
6433   // NB: For memory_order_conservative we need a trailing membar after
6434   // LSE atomic operations but not a leading membar.
6435   //
6436   // We don't need a leading membar because a clause in the Arm ARM
6437   // says:
6438   //
6439   //   Barrier-ordered-before
6440   //
6441   //   Barrier instructions order prior Memory effects before subsequent
6442   //   Memory effects generated by the same Observer. A read or a write
6443   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6444   //   Observer if and only if RW1 appears in program order before RW2
6445   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6446   //   instruction with both Acquire and Release semantics.
6447   //
6448   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6449   // and Release semantics, therefore we don't need a leading
6450   // barrier. However, there is no corresponding Barrier-ordered-after
6451   // relationship, therefore we need a trailing membar to prevent a
6452   // later store or load from being reordered with the store in an
6453   // atomic instruction.
6454   //
6455   // This was checked by using the herd7 consistency model simulator
6456   // (http://diy.inria.fr/) with this test case:
6457   //
6458   // AArch64 LseCas
6459   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6460   // P0 | P1;
6461   // LDR W4, [X2] | MOV W3, #0;
6462   // DMB LD       | MOV W4, #1;
6463   // LDR W3, [X1] | CASAL W3, W4, [X1];
6464   //              | DMB ISH;
6465   //              | STR W4, [X2];
6466   // exists
6467   // (0:X3=0 /\ 0:X4=1)
6468   //
6469   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6470   // with the store to x in P1. Without the DMB in P1 this may happen.
6471   //
6472   // At the time of writing we don't know of any AArch64 hardware that
6473   // reorders stores in this way, but the Reference Manual permits it.
6474 
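  // In C-like terms the generated CAS entry performs (a sketch of the
  // intended semantics):
  //
  //   prev = *ptr; if (prev == compare_val) *ptr = exchange_val; return prev;
  //
  // with (c_rarg0, c_rarg1, c_rarg2) = (ptr, compare_val, exchange_val) and
  // the previous value returned in r0 (32- or 64-bit depending on size).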
6475   void gen_cas_entry(Assembler::operand_size size,
6476                      atomic_memory_order order) {
6477     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6478       exchange_val = c_rarg2;
6479     bool acquire, release;
6480     switch (order) {
6481       case memory_order_relaxed:
6482         acquire = false;
6483         release = false;
6484         break;
6485       case memory_order_release:
6486         acquire = false;
6487         release = true;
6488         break;
6489       default:
6490         acquire = true;
6491         release = true;
6492         break;
6493     }
6494     __ mov(prev, compare_val);
6495     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6496     if (order == memory_order_conservative) {
6497       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6498     }
6499     if (size == Assembler::xword) {
6500       __ mov(r0, prev);
6501     } else {
6502       __ movw(r0, prev);
6503     }
6504     __ ret(lr);
6505   }
6506 
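  // Generated fetch-and-add entry: atomically performs *addr += incr and
  // returns the previous value in r0 (c_rarg0 = addr, c_rarg1 = incr).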
6507   void gen_ldaddal_entry(Assembler::operand_size size) {
6508     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6509     __ ldaddal(size, incr, prev, addr);
6510     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6511     if (size == Assembler::xword) {
6512       __ mov(r0, prev);
6513     } else {
6514       __ movw(r0, prev);
6515     }
6516     __ ret(lr);
6517   }
6518 
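  // Generated exchange entry: atomically stores the new value (c_rarg1) into
  // *addr (c_rarg0) and returns the previous value in r0.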
6519   void gen_swpal_entry(Assembler::operand_size size) {
6520     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6521     __ swpal(size, incr, prev, addr);
6522     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6523     if (size == Assembler::xword) {
6524       __ mov(r0, prev);
6525     } else {
6526       __ movw(r0, prev);
6527     }
6528     __ ret(lr);
6529   }
6530 
6531   void generate_atomic_entry_points() {
6532     if (! UseLSE) {
6533       return;
6534     }
6535 
6536     __ align(CodeEntryAlignment);
6537     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6538     address first_entry = __ pc();
6539 
6540     // All memory_order_conservative
6541     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6542     gen_ldaddal_entry(Assembler::word);
6543     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6544     gen_ldaddal_entry(Assembler::xword);
6545 
6546     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6547     gen_swpal_entry(Assembler::word);
6548     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6549     gen_swpal_entry(Assembler::xword);
6550 
6551     // CAS, memory_order_conservative
6552     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6553     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6554     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6555     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6556     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6557     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6558 
6559     // CAS, memory_order_relaxed
6560     AtomicStubMark mark_cmpxchg_1_relaxed
6561       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6562     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6563     AtomicStubMark mark_cmpxchg_4_relaxed
6564       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6565     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6566     AtomicStubMark mark_cmpxchg_8_relaxed
6567       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6568     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6569 
6570     AtomicStubMark mark_cmpxchg_4_release
6571       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6572     gen_cas_entry(MacroAssembler::word, memory_order_release);
6573     AtomicStubMark mark_cmpxchg_8_release
6574       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6575     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6576 
6577     AtomicStubMark mark_cmpxchg_4_seq_cst
6578       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6579     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6580     AtomicStubMark mark_cmpxchg_8_seq_cst
6581       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6582     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6583 
6584     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6585   }
6586 #endif // LINUX
6587 
6588   // Continuation point for throwing of implicit exceptions that are
6589   // not handled in the current activation. Fabricates an exception
6590   // oop and initiates normal exception dispatching in this
6591   // frame. Since we need to preserve callee-saved values (currently
6592   // only for C2, but done for C1 as well) we need a callee-saved oop
6593   // map and therefore have to make these stubs into RuntimeStubs
6594   // rather than BufferBlobs.  If the compiler needs all registers to
6595   // be preserved between the fault point and the exception handler
6596   // then it must assume responsibility for that in
6597   // AbstractCompiler::continuation_for_implicit_null_exception or
6598   // continuation_for_implicit_division_by_zero_exception. All other
6599   // implicit exceptions (e.g., NullPointerException or
6600   // AbstractMethodError on entry) are either at call sites or
6601   // otherwise assume that stack unwinding will be initiated, so
6602   // caller saved registers were assumed volatile in the compiler.
6603 
6604 #undef __
6605 #define __ masm->
6606 
6607   address generate_throw_exception(const char* name,
6608                                    address runtime_entry,
6609                                    Register arg1 = noreg,
6610                                    Register arg2 = noreg) {
6611     // Information about frame layout at time of blocking runtime call.
6612     // Note that we only have to preserve callee-saved registers since
6613     // the compilers are responsible for supplying a continuation point
6614     // if they expect all registers to be preserved.
6615     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6616     enum layout {
6617       rfp_off = 0,
6618       rfp_off2,
6619       return_off,
6620       return_off2,
6621       framesize // inclusive of return address
6622     };
6623 
6624     int insts_size = 512;
6625     int locs_size  = 64;
6626 
6627     CodeBuffer code(name, insts_size, locs_size);
6628     OopMapSet* oop_maps  = new OopMapSet();
6629     MacroAssembler* masm = new MacroAssembler(&code);
6630 
6631     address start = __ pc();
6632 
6633     // This is an inlined and slightly modified version of call_VM
6634     // which has the ability to fetch the return PC out of
6635     // thread-local storage and also sets up last_Java_sp slightly
6636     // differently than the real call_VM
6637 
6638     __ enter(); // Save FP and LR before call
6639 
6640     assert(is_even(framesize/2), "sp not 16-byte aligned");
6641 
6642     // lr and fp are already in place
6643     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
6644 
6645     int frame_complete = __ pc() - start;
6646 
6647     // Set up last_Java_sp and last_Java_fp
6648     address the_pc = __ pc();
6649     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6650 
6651     // Call runtime
6652     if (arg1 != noreg) {
6653       assert(arg2 != c_rarg1, "clobbered");
6654       __ mov(c_rarg1, arg1);
6655     }
6656     if (arg2 != noreg) {
6657       __ mov(c_rarg2, arg2);
6658     }
6659     __ mov(c_rarg0, rthread);
6660     BLOCK_COMMENT("call runtime_entry");
6661     __ mov(rscratch1, runtime_entry);
6662     __ blr(rscratch1);
6663 
6664     // Generate oop map
6665     OopMap* map = new OopMap(framesize, 0);
6666 
6667     oop_maps->add_gc_map(the_pc - start, map);
6668 
6669     __ reset_last_Java_frame(true);
6670 
6671     // Reinitialize the ptrue predicate register, in case the external runtime
6672     // call clobbers ptrue reg, as we may return to SVE compiled code.
6673     __ reinitialize_ptrue();
6674 
6675     __ leave();
6676 
6677     // check for pending exceptions
6678 #ifdef ASSERT
6679     Label L;
6680     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6681     __ cbnz(rscratch1, L);
6682     __ should_not_reach_here();
6683     __ bind(L);
6684 #endif // ASSERT
6685     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6686 
6687 
6688     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6689     RuntimeStub* stub =
6690       RuntimeStub::new_runtime_stub(name,
6691                                     &code,
6692                                     frame_complete,
6693                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6694                                     oop_maps, false);
6695     return stub->entry_point();
6696   }
6697 
6698   class MontgomeryMultiplyGenerator : public MacroAssembler {
6699 
6700     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6701       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6702 
6703     RegSet _toSave;
6704     bool _squaring;
6705 
6706   public:
6707     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6708       : MacroAssembler(as->code()), _squaring(squaring) {
6709 
6710       // Register allocation
6711 
6712       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6713       Pa_base = *regs;       // Argument registers
6714       if (squaring)
6715         Pb_base = Pa_base;
6716       else
6717         Pb_base = *++regs;
6718       Pn_base = *++regs;
6719       Rlen= *++regs;
6720       inv = *++regs;
6721       Pm_base = *++regs;
6722 
6723                           // Working registers:
6724       Ra =  *++regs;        // The current digit of a, b, n, and m.
6725       Rb =  *++regs;
6726       Rm =  *++regs;
6727       Rn =  *++regs;
6728 
6729       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6730       Pb =  *++regs;
6731       Pm =  *++regs;
6732       Pn =  *++regs;
6733 
6734       t0 =  *++regs;        // Three registers which form a
6735       t1 =  *++regs;        // triple-precision accumulator.
6736       t2 =  *++regs;
6737 
6738       Ri =  *++regs;        // Inner and outer loop indexes.
6739       Rj =  *++regs;
6740 
6741       Rhi_ab = *++regs;     // Product registers: low and high parts
6742       Rlo_ab = *++regs;     // of a*b and m*n.
6743       Rhi_mn = *++regs;
6744       Rlo_mn = *++regs;
6745 
6746       // r19 and up are callee-saved.
6747       _toSave = RegSet::range(r19, *regs) + Pm_base;
6748     }
6749 
6750   private:
6751     void save_regs() {
6752       push(_toSave, sp);
6753     }
6754 
6755     void restore_regs() {
6756       pop(_toSave, sp);
6757     }
6758 
6759     template <typename T>
6760     void unroll_2(Register count, T block) {
6761       Label loop, end, odd;
6762       tbnz(count, 0, odd);
6763       cbz(count, end);
6764       align(16);
6765       bind(loop);
6766       (this->*block)();
6767       bind(odd);
6768       (this->*block)();
6769       subs(count, count, 2);
6770       br(Assembler::GT, loop);
6771       bind(end);
6772     }
6773 
6774     template <typename T>
6775     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6776       Label loop, end, odd;
6777       tbnz(count, 0, odd);
6778       cbz(count, end);
6779       align(16);
6780       bind(loop);
6781       (this->*block)(d, s, tmp);
6782       bind(odd);
6783       (this->*block)(d, s, tmp);
6784       subs(count, count, 2);
6785       br(Assembler::GT, loop);
6786       bind(end);
6787     }
6788 
6789     void pre1(RegisterOrConstant i) {
6790       block_comment("pre1");
6791       // Pa = Pa_base;
6792       // Pb = Pb_base + i;
6793       // Pm = Pm_base;
6794       // Pn = Pn_base + i;
6795       // Ra = *Pa;
6796       // Rb = *Pb;
6797       // Rm = *Pm;
6798       // Rn = *Pn;
6799       ldr(Ra, Address(Pa_base));
6800       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6801       ldr(Rm, Address(Pm_base));
6802       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6803       lea(Pa, Address(Pa_base));
6804       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6805       lea(Pm, Address(Pm_base));
6806       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6807 
6808       // Zero the m*n result.
6809       mov(Rhi_mn, zr);
6810       mov(Rlo_mn, zr);
6811     }
6812 
6813     // The core multiply-accumulate step of a Montgomery
6814     // multiplication.  The idea is to schedule operations as a
6815     // pipeline so that instructions with long latencies (loads and
6816     // multiplies) have time to complete before their results are
6817     // used.  This most benefits in-order implementations of the
6818     // architecture but out-of-order ones also benefit.
6819     void step() {
6820       block_comment("step");
6821       // MACC(Ra, Rb, t0, t1, t2);
6822       // Ra = *++Pa;
6823       // Rb = *--Pb;
6824       umulh(Rhi_ab, Ra, Rb);
6825       mul(Rlo_ab, Ra, Rb);
6826       ldr(Ra, pre(Pa, wordSize));
6827       ldr(Rb, pre(Pb, -wordSize));
6828       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6829                                        // previous iteration.
6830       // MACC(Rm, Rn, t0, t1, t2);
6831       // Rm = *++Pm;
6832       // Rn = *--Pn;
6833       umulh(Rhi_mn, Rm, Rn);
6834       mul(Rlo_mn, Rm, Rn);
6835       ldr(Rm, pre(Pm, wordSize));
6836       ldr(Rn, pre(Pn, -wordSize));
6837       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6838     }
6839 
6840     void post1() {
6841       block_comment("post1");
6842 
6843       // MACC(Ra, Rb, t0, t1, t2);
6844       // Ra = *++Pa;
6845       // Rb = *--Pb;
6846       umulh(Rhi_ab, Ra, Rb);
6847       mul(Rlo_ab, Ra, Rb);
6848       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6849       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6850 
6851       // *Pm = Rm = t0 * inv;
6852       mul(Rm, t0, inv);
6853       str(Rm, Address(Pm));
6854 
6855       // MACC(Rm, Rn, t0, t1, t2);
6856       // t0 = t1; t1 = t2; t2 = 0;
6857       umulh(Rhi_mn, Rm, Rn);
6858 
6859 #ifndef PRODUCT
6860       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6861       {
6862         mul(Rlo_mn, Rm, Rn);
6863         add(Rlo_mn, t0, Rlo_mn);
6864         Label ok;
6865         cbz(Rlo_mn, ok); {
6866           stop("broken Montgomery multiply");
6867         } bind(ok);
6868       }
6869 #endif
6870       // We have very carefully set things up so that
6871       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6872       // the lower half of Rm * Rn because we know the result already:
6873       // it must be -t0.  t0 + (-t0) must generate a carry iff
6874       // t0 != 0.  So, rather than do a mul and an adds we just set
6875       // the carry flag iff t0 is nonzero.
6876       //
6877       // mul(Rlo_mn, Rm, Rn);
6878       // adds(zr, t0, Rlo_mn);
6879       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6880       adcs(t0, t1, Rhi_mn);
6881       adc(t1, t2, zr);
6882       mov(t2, zr);
6883     }
6884 
6885     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6886       block_comment("pre2");
6887       // Pa = Pa_base + i-len;
6888       // Pb = Pb_base + len;
6889       // Pm = Pm_base + i-len;
6890       // Pn = Pn_base + len;
6891 
6892       if (i.is_register()) {
6893         sub(Rj, i.as_register(), len);
6894       } else {
6895         mov(Rj, i.as_constant());
6896         sub(Rj, Rj, len);
6897       }
6898       // Rj == i-len
6899 
6900       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6901       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6902       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6903       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6904 
6905       // Ra = *++Pa;
6906       // Rb = *--Pb;
6907       // Rm = *++Pm;
6908       // Rn = *--Pn;
6909       ldr(Ra, pre(Pa, wordSize));
6910       ldr(Rb, pre(Pb, -wordSize));
6911       ldr(Rm, pre(Pm, wordSize));
6912       ldr(Rn, pre(Pn, -wordSize));
6913 
6914       mov(Rhi_mn, zr);
6915       mov(Rlo_mn, zr);
6916     }
6917 
6918     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6919       block_comment("post2");
6920       if (i.is_constant()) {
6921         mov(Rj, i.as_constant()-len.as_constant());
6922       } else {
6923         sub(Rj, i.as_register(), len);
6924       }
6925 
6926       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6927 
6928       // As soon as we know the least significant digit of our result,
6929       // store it.
6930       // Pm_base[i-len] = t0;
6931       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6932 
6933       // t0 = t1; t1 = t2; t2 = 0;
6934       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6935       adc(t1, t2, zr);
6936       mov(t2, zr);
6937     }
6938 
6939     // A carry in t0 after Montgomery multiplication means that we
6940     // should subtract multiples of n from our result in m.  We'll
6941     // keep doing that until there is no carry.
6942     void normalize(RegisterOrConstant len) {
6943       block_comment("normalize");
6944       // while (t0)
6945       //   t0 = sub(Pm_base, Pn_base, t0, len);
6946       Label loop, post, again;
6947       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6948       cbz(t0, post); {
6949         bind(again); {
6950           mov(i, zr);
6951           mov(cnt, len);
6952           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6953           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6954           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6955           align(16);
6956           bind(loop); {
6957             sbcs(Rm, Rm, Rn);
6958             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6959             add(i, i, 1);
6960             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6961             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6962             sub(cnt, cnt, 1);
6963           } cbnz(cnt, loop);
6964           sbc(t0, t0, zr);
6965         } cbnz(t0, again);
6966       } bind(post);
6967     }
6968 
6969     // Move memory at s to d, reversing words.
6970     //    Increments d to end of copied memory
6971     //    Destroys tmp1, tmp2
6972     //    Preserves len
6973     //    Leaves s pointing to the address which was in d at start
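    //    In effect this converts between the big-endian int-array layout used
    //    by the Java code (most-significant int first) and the least-significant-
    //    word-first julong layout used by the multiply loops: the word order is
    //    reversed and the two 32-bit halves of each 64-bit word are swapped
    //    (the ror by 32 in reverse1 below).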
6974     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6975       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6976 
6977       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6978       mov(tmp1, len);
6979       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6980       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6981     }
6982     // where
6983     void reverse1(Register d, Register s, Register tmp) {
6984       ldr(tmp, pre(s, -wordSize));
6985       ror(tmp, tmp, 32);
6986       str(tmp, post(d, wordSize));
6987     }
6988 
6989     void step_squaring() {
6990       // An extra ACC
6991       step();
6992       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6993     }
6994 
6995     void last_squaring(RegisterOrConstant i) {
6996       Label dont;
6997       // if ((i & 1) == 0) {
6998       tbnz(i.as_register(), 0, dont); {
6999         // MACC(Ra, Rb, t0, t1, t2);
7000         // Ra = *++Pa;
7001         // Rb = *--Pb;
7002         umulh(Rhi_ab, Ra, Rb);
7003         mul(Rlo_ab, Ra, Rb);
7004         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7005       } bind(dont);
7006     }
7007 
7008     void extra_step_squaring() {
7009       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7010 
7011       // MACC(Rm, Rn, t0, t1, t2);
7012       // Rm = *++Pm;
7013       // Rn = *--Pn;
7014       umulh(Rhi_mn, Rm, Rn);
7015       mul(Rlo_mn, Rm, Rn);
7016       ldr(Rm, pre(Pm, wordSize));
7017       ldr(Rn, pre(Pn, -wordSize));
7018     }
7019 
7020     void post1_squaring() {
7021       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7022 
7023       // *Pm = Rm = t0 * inv;
7024       mul(Rm, t0, inv);
7025       str(Rm, Address(Pm));
7026 
7027       // MACC(Rm, Rn, t0, t1, t2);
7028       // t0 = t1; t1 = t2; t2 = 0;
7029       umulh(Rhi_mn, Rm, Rn);
7030 
7031 #ifndef PRODUCT
7032       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7033       {
7034         mul(Rlo_mn, Rm, Rn);
7035         add(Rlo_mn, t0, Rlo_mn);
7036         Label ok;
7037         cbz(Rlo_mn, ok); {
7038           stop("broken Montgomery multiply");
7039         } bind(ok);
7040       }
7041 #endif
7042       // We have very carefully set things up so that
7043       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7044       // the lower half of Rm * Rn because we know the result already:
7045       // it must be -t0.  t0 + (-t0) must generate a carry iff
7046       // t0 != 0.  So, rather than do a mul and an adds we just set
7047       // the carry flag iff t0 is nonzero.
7048       //
7049       // mul(Rlo_mn, Rm, Rn);
7050       // adds(zr, t0, Rlo_mn);
7051       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7052       adcs(t0, t1, Rhi_mn);
7053       adc(t1, t2, zr);
7054       mov(t2, zr);
7055     }
7056 
7057     void acc(Register Rhi, Register Rlo,
7058              Register t0, Register t1, Register t2) {
7059       adds(t0, t0, Rlo);
7060       adcs(t1, t1, Rhi);
7061       adc(t2, t2, zr);
7062     }
7063 
7064   public:
7065     /**
7066      * Fast Montgomery multiplication.  The derivation of the
7067      * algorithm is in A Cryptographic Library for the Motorola
7068      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7069      *
7070      * Arguments:
7071      *
7072      * Inputs for multiplication:
7073      *   c_rarg0   - int array elements a
7074      *   c_rarg1   - int array elements b
7075      *   c_rarg2   - int array elements n (the modulus)
7076      *   c_rarg3   - int length
7077      *   c_rarg4   - int inv
7078      *   c_rarg5   - int array elements m (the result)
7079      *
7080      * Inputs for squaring:
7081      *   c_rarg0   - int array elements a
7082      *   c_rarg1   - int array elements n (the modulus)
7083      *   c_rarg2   - int length
7084      *   c_rarg3   - int inv
7085      *   c_rarg4   - int array elements m (the result)
7086      *
7087      */
7088     address generate_multiply() {
7089       Label argh, nothing;
7090       bind(argh);
7091       stop("MontgomeryMultiply total_allocation must be <= 8192");
7092 
7093       align(CodeEntryAlignment);
7094       address entry = pc();
7095 
7096       cbzw(Rlen, nothing);
7097 
7098       enter();
7099 
7100       // Make room.
7101       cmpw(Rlen, 512);
7102       br(Assembler::HI, argh);
7103       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7104       andr(sp, Ra, -2 * wordSize);
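      // Ra = sp - length * 4 * sizeof(jint): room for four arrays of
      // "length" ints (the reversed copies of a, b and n, plus the result
      // m).  With length capped at 512 this is at most 8192 bytes, as the
      // guard above requires, and the andr() realigns sp down to 16 bytes.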
7105 
7106       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7107 
7108       {
7109         // Copy input args, reversing as we go.  We use Ra as a
7110         // temporary variable.
7111         reverse(Ra, Pa_base, Rlen, t0, t1);
7112         if (!_squaring)
7113           reverse(Ra, Pb_base, Rlen, t0, t1);
7114         reverse(Ra, Pn_base, Rlen, t0, t1);
7115       }
7116 
7117       // Push all call-saved registers and also Pm_base which we'll need
7118       // at the end.
7119       save_regs();
7120 
7121 #ifndef PRODUCT
7122       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7123       {
7124         ldr(Rn, Address(Pn_base, 0));
7125         mul(Rlo_mn, Rn, inv);
7126         subs(zr, Rlo_mn, -1);
7127         Label ok;
7128         br(EQ, ok); {
7129           stop("broken inverse in Montgomery multiply");
7130         } bind(ok);
7131       }
7132 #endif
7133 
7134       mov(Pm_base, Ra);
7135 
7136       mov(t0, zr);
7137       mov(t1, zr);
7138       mov(t2, zr);
7139 
7140       block_comment("for (int i = 0; i < len; i++) {");
7141       mov(Ri, zr); {
7142         Label loop, end;
7143         cmpw(Ri, Rlen);
7144         br(Assembler::GE, end);
7145 
7146         bind(loop);
7147         pre1(Ri);
7148 
7149         block_comment("  for (j = i; j; j--) {"); {
7150           movw(Rj, Ri);
7151           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7152         } block_comment("  } // j");
7153 
7154         post1();
7155         addw(Ri, Ri, 1);
7156         cmpw(Ri, Rlen);
7157         br(Assembler::LT, loop);
7158         bind(end);
7159         block_comment("} // i");
7160       }
7161 
7162       block_comment("for (int i = len; i < 2*len; i++) {");
7163       mov(Ri, Rlen); {
7164         Label loop, end;
7165         cmpw(Ri, Rlen, Assembler::LSL, 1);
7166         br(Assembler::GE, end);
7167 
7168         bind(loop);
7169         pre2(Ri, Rlen);
7170 
7171         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7172           lslw(Rj, Rlen, 1);
7173           subw(Rj, Rj, Ri);
7174           subw(Rj, Rj, 1);
7175           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7176         } block_comment("  } // j");
7177 
7178         post2(Ri, Rlen);
7179         addw(Ri, Ri, 1);
7180         cmpw(Ri, Rlen, Assembler::LSL, 1);
7181         br(Assembler::LT, loop);
7182         bind(end);
7183       }
7184       block_comment("} // i");
7185 
7186       normalize(Rlen);
7187 
7188       mov(Ra, Pm_base);  // Save Pm_base in Ra
7189       restore_regs();  // Restore caller's Pm_base
7190 
7191       // Copy our result into caller's Pm_base
7192       reverse(Pm_base, Ra, Rlen, t0, t1);
7193 
7194       leave();
7195       bind(nothing);
7196       ret(lr);
7197 
7198       return entry;
7199     }
7200     // In C, approximately:
7201 
7202     // void
7203     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7204     //                     julong Pn_base[], julong Pm_base[],
7205     //                     julong inv, int len) {
7206     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7207     //   julong *Pa, *Pb, *Pn, *Pm;
7208     //   julong Ra, Rb, Rn, Rm;
7209 
7210     //   int i;
7211 
7212     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7213 
7214     //   for (i = 0; i < len; i++) {
7215     //     int j;
7216 
7217     //     Pa = Pa_base;
7218     //     Pb = Pb_base + i;
7219     //     Pm = Pm_base;
7220     //     Pn = Pn_base + i;
7221 
7222     //     Ra = *Pa;
7223     //     Rb = *Pb;
7224     //     Rm = *Pm;
7225     //     Rn = *Pn;
7226 
7227     //     int iters = i;
7228     //     for (j = 0; iters--; j++) {
7229     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7230     //       MACC(Ra, Rb, t0, t1, t2);
7231     //       Ra = *++Pa;
7232     //       Rb = *--Pb;
7233     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7234     //       MACC(Rm, Rn, t0, t1, t2);
7235     //       Rm = *++Pm;
7236     //       Rn = *--Pn;
7237     //     }
7238 
7239     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7240     //     MACC(Ra, Rb, t0, t1, t2);
7241     //     *Pm = Rm = t0 * inv;
7242     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7243     //     MACC(Rm, Rn, t0, t1, t2);
7244 
7245     //     assert(t0 == 0, "broken Montgomery multiply");
7246 
7247     //     t0 = t1; t1 = t2; t2 = 0;
7248     //   }
7249 
7250     //   for (i = len; i < 2*len; i++) {
7251     //     int j;
7252 
7253     //     Pa = Pa_base + i-len;
7254     //     Pb = Pb_base + len;
7255     //     Pm = Pm_base + i-len;
7256     //     Pn = Pn_base + len;
7257 
7258     //     Ra = *++Pa;
7259     //     Rb = *--Pb;
7260     //     Rm = *++Pm;
7261     //     Rn = *--Pn;
7262 
7263     //     int iters = len*2-i-1;
7264     //     for (j = i-len+1; iters--; j++) {
7265     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7266     //       MACC(Ra, Rb, t0, t1, t2);
7267     //       Ra = *++Pa;
7268     //       Rb = *--Pb;
7269     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7270     //       MACC(Rm, Rn, t0, t1, t2);
7271     //       Rm = *++Pm;
7272     //       Rn = *--Pn;
7273     //     }
7274 
7275     //     Pm_base[i-len] = t0;
7276     //     t0 = t1; t1 = t2; t2 = 0;
7277     //   }
7278 
7279     //   while (t0)
7280     //     t0 = sub(Pm_base, Pn_base, t0, len);
7281     // }
7282 
7283     /**
7284      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication, so it should be up to
7286      * 25% faster.  However, its loop control is more complex and it
7287      * may actually run slower on some machines.
7288      *
7289      * Arguments:
7290      *
7291      * Inputs:
7292      *   c_rarg0   - int array elements a
7293      *   c_rarg1   - int array elements n (the modulus)
7294      *   c_rarg2   - int length
7295      *   c_rarg3   - int inv
7296      *   c_rarg4   - int array elements m (the result)
7297      *
7298      */
7299     address generate_square() {
7300       Label argh;
7301       bind(argh);
7302       stop("MontgomeryMultiply total_allocation must be <= 8192");
7303 
7304       align(CodeEntryAlignment);
7305       address entry = pc();
7306 
7307       enter();
7308 
7309       // Make room.
7310       cmpw(Rlen, 512);
7311       br(Assembler::HI, argh);
7312       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7313       andr(sp, Ra, -2 * wordSize);
7314 
7315       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7316 
7317       {
7318         // Copy input args, reversing as we go.  We use Ra as a
7319         // temporary variable.
7320         reverse(Ra, Pa_base, Rlen, t0, t1);
7321         reverse(Ra, Pn_base, Rlen, t0, t1);
7322       }
7323 
7324       // Push all call-saved registers and also Pm_base which we'll need
7325       // at the end.
7326       save_regs();
7327 
7328       mov(Pm_base, Ra);
7329 
7330       mov(t0, zr);
7331       mov(t1, zr);
7332       mov(t2, zr);
7333 
7334       block_comment("for (int i = 0; i < len; i++) {");
7335       mov(Ri, zr); {
7336         Label loop, end;
7337         bind(loop);
7338         cmp(Ri, Rlen);
7339         br(Assembler::GE, end);
7340 
7341         pre1(Ri);
7342 
7343         block_comment("for (j = (i+1)/2; j; j--) {"); {
7344           add(Rj, Ri, 1);
7345           lsr(Rj, Rj, 1);
7346           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7347         } block_comment("  } // j");
7348 
7349         last_squaring(Ri);
7350 
7351         block_comment("  for (j = i/2; j; j--) {"); {
7352           lsr(Rj, Ri, 1);
7353           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7354         } block_comment("  } // j");
7355 
7356         post1_squaring();
7357         add(Ri, Ri, 1);
7358         cmp(Ri, Rlen);
7359         br(Assembler::LT, loop);
7360 
7361         bind(end);
7362         block_comment("} // i");
7363       }
7364 
7365       block_comment("for (int i = len; i < 2*len; i++) {");
7366       mov(Ri, Rlen); {
7367         Label loop, end;
7368         bind(loop);
7369         cmp(Ri, Rlen, Assembler::LSL, 1);
7370         br(Assembler::GE, end);
7371 
7372         pre2(Ri, Rlen);
7373 
7374         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7375           lsl(Rj, Rlen, 1);
7376           sub(Rj, Rj, Ri);
7377           sub(Rj, Rj, 1);
7378           lsr(Rj, Rj, 1);
7379           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7380         } block_comment("  } // j");
7381 
7382         last_squaring(Ri);
7383 
7384         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7385           lsl(Rj, Rlen, 1);
7386           sub(Rj, Rj, Ri);
7387           lsr(Rj, Rj, 1);
7388           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7389         } block_comment("  } // j");
7390 
7391         post2(Ri, Rlen);
7392         add(Ri, Ri, 1);
7393         cmp(Ri, Rlen, Assembler::LSL, 1);
7394 
7395         br(Assembler::LT, loop);
7396         bind(end);
7397         block_comment("} // i");
7398       }
7399 
7400       normalize(Rlen);
7401 
7402       mov(Ra, Pm_base);  // Save Pm_base in Ra
7403       restore_regs();  // Restore caller's Pm_base
7404 
7405       // Copy our result into caller's Pm_base
7406       reverse(Pm_base, Ra, Rlen, t0, t1);
7407 
7408       leave();
7409       ret(lr);
7410 
7411       return entry;
7412     }
7413     // In C, approximately:
7414 
7415     // void
7416     // montgomery_square(julong Pa_base[], julong Pn_base[],
7417     //                   julong Pm_base[], julong inv, int len) {
7418     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7419     //   julong *Pa, *Pb, *Pn, *Pm;
7420     //   julong Ra, Rb, Rn, Rm;
7421 
7422     //   int i;
7423 
7424     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7425 
7426     //   for (i = 0; i < len; i++) {
7427     //     int j;
7428 
7429     //     Pa = Pa_base;
7430     //     Pb = Pa_base + i;
7431     //     Pm = Pm_base;
7432     //     Pn = Pn_base + i;
7433 
7434     //     Ra = *Pa;
7435     //     Rb = *Pb;
7436     //     Rm = *Pm;
7437     //     Rn = *Pn;
7438 
7439     //     int iters = (i+1)/2;
7440     //     for (j = 0; iters--; j++) {
7441     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7442     //       MACC2(Ra, Rb, t0, t1, t2);
7443     //       Ra = *++Pa;
7444     //       Rb = *--Pb;
7445     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7446     //       MACC(Rm, Rn, t0, t1, t2);
7447     //       Rm = *++Pm;
7448     //       Rn = *--Pn;
7449     //     }
7450     //     if ((i & 1) == 0) {
7451     //       assert(Ra == Pa_base[j], "must be");
7452     //       MACC(Ra, Ra, t0, t1, t2);
7453     //     }
7454     //     iters = i/2;
7455     //     assert(iters == i-j, "must be");
7456     //     for (; iters--; j++) {
7457     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7458     //       MACC(Rm, Rn, t0, t1, t2);
7459     //       Rm = *++Pm;
7460     //       Rn = *--Pn;
7461     //     }
7462 
7463     //     *Pm = Rm = t0 * inv;
7464     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7465     //     MACC(Rm, Rn, t0, t1, t2);
7466 
7467     //     assert(t0 == 0, "broken Montgomery multiply");
7468 
7469     //     t0 = t1; t1 = t2; t2 = 0;
7470     //   }
7471 
7472     //   for (i = len; i < 2*len; i++) {
7473     //     int start = i-len+1;
7474     //     int end = start + (len - start)/2;
7475     //     int j;
7476 
7477     //     Pa = Pa_base + i-len;
7478     //     Pb = Pa_base + len;
7479     //     Pm = Pm_base + i-len;
7480     //     Pn = Pn_base + len;
7481 
7482     //     Ra = *++Pa;
7483     //     Rb = *--Pb;
7484     //     Rm = *++Pm;
7485     //     Rn = *--Pn;
7486 
7487     //     int iters = (2*len-i-1)/2;
7488     //     assert(iters == end-start, "must be");
7489     //     for (j = start; iters--; j++) {
7490     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7491     //       MACC2(Ra, Rb, t0, t1, t2);
7492     //       Ra = *++Pa;
7493     //       Rb = *--Pb;
7494     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7495     //       MACC(Rm, Rn, t0, t1, t2);
7496     //       Rm = *++Pm;
7497     //       Rn = *--Pn;
7498     //     }
7499     //     if ((i & 1) == 0) {
7500     //       assert(Ra == Pa_base[j], "must be");
7501     //       MACC(Ra, Ra, t0, t1, t2);
7502     //     }
7503     //     iters =  (2*len-i)/2;
7504     //     assert(iters == len-j, "must be");
7505     //     for (; iters--; j++) {
7506     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7507     //       MACC(Rm, Rn, t0, t1, t2);
7508     //       Rm = *++Pm;
7509     //       Rn = *--Pn;
7510     //     }
7511     //     Pm_base[i-len] = t0;
7512     //     t0 = t1; t1 = t2; t2 = 0;
7513     //   }
7514 
7515     //   while (t0)
7516     //     t0 = sub(Pm_base, Pn_base, t0, len);
7517     // }
7518   };
7519 
7520 
7521   // Initialization
7522   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
7524 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
7530 
7531     StubRoutines::_forward_exception_entry = generate_forward_exception();
7532 
7533     StubRoutines::_call_stub_entry =
7534       generate_call_stub(StubRoutines::_call_stub_return_address);
7535 
    // This entry point is referenced by megamorphic calls.
7537     StubRoutines::_catch_exception_entry = generate_catch_exception();
7538 
7539     // Build this early so it's available for the interpreter.
7540     StubRoutines::_throw_StackOverflowError_entry =
7541       generate_throw_exception("StackOverflowError throw_exception",
7542                                CAST_FROM_FN_PTR(address,
7543                                                 SharedRuntime::throw_StackOverflowError));
7544     StubRoutines::_throw_delayed_StackOverflowError_entry =
7545       generate_throw_exception("delayed StackOverflowError throw_exception",
7546                                CAST_FROM_FN_PTR(address,
7547                                                 SharedRuntime::throw_delayed_StackOverflowError));
7548     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
7550       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7551       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7552     }
7553 
7554     if (UseCRC32CIntrinsics) {
7555       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7556     }
7557 
7558     // Disabled until JDK-8210858 is fixed
7559     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7560     //   StubRoutines::_dlog = generate_dlog();
7561     // }
7562 
7563     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7564       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7565     }
7566 
7567     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7568       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7569     }
7570 
7571     // Safefetch stubs.
7572     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7573                                                        &StubRoutines::_safefetch32_fault_pc,
7574                                                        &StubRoutines::_safefetch32_continuation_pc);
7575     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7576                                                        &StubRoutines::_safefetchN_fault_pc,
7577                                                        &StubRoutines::_safefetchN_continuation_pc);
7578   }
7579 
7580   void generate_all() {
7581     // support for verify_oop (must happen after universe_init)
7582     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7583     StubRoutines::_throw_AbstractMethodError_entry =
7584       generate_throw_exception("AbstractMethodError throw_exception",
7585                                CAST_FROM_FN_PTR(address,
7586                                                 SharedRuntime::
7587                                                 throw_AbstractMethodError));
7588 
7589     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7590       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7591                                CAST_FROM_FN_PTR(address,
7592                                                 SharedRuntime::
7593                                                 throw_IncompatibleClassChangeError));
7594 
7595     StubRoutines::_throw_NullPointerException_at_call_entry =
7596       generate_throw_exception("NullPointerException at call throw_exception",
7597                                CAST_FROM_FN_PTR(address,
7598                                                 SharedRuntime::
7599                                                 throw_NullPointerException_at_call));
7600 
7601     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7602 
7603     // arraycopy stubs used by compilers
7604     generate_arraycopy_stubs();
7605 
7606     // has negatives stub for large arrays.
7607     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7608 
7609     // array equals stub for large arrays.
7610     if (!UseSimpleArrayEquals) {
7611       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7612     }
7613 
7614     generate_compare_long_strings();
7615 
7616     generate_string_indexof_stubs();
7617 
7618     // byte_array_inflate stub for large arrays.
7619     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7620 
7621     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7622     if (bs_nm != NULL) {
7623       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7624     }
7625     if (UseFastLocking) {
7626       StubRoutines::aarch64::_check_lock_stack = generate_check_lock_stack();
7627     }
7628 #ifdef COMPILER2
7629     if (UseMultiplyToLenIntrinsic) {
7630       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7631     }
7632 
7633     if (UseSquareToLenIntrinsic) {
7634       StubRoutines::_squareToLen = generate_squareToLen();
7635     }
7636 
7637     if (UseMulAddIntrinsic) {
7638       StubRoutines::_mulAdd = generate_mulAdd();
7639     }
7640 
7641     if (UseSIMDForBigIntegerShiftIntrinsics) {
7642       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7643       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7644     }
7645 
7646     if (UseMontgomeryMultiplyIntrinsic) {
7647       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7648       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7649       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7650     }
7651 
7652     if (UseMontgomerySquareIntrinsic) {
7653       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7654       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7655       // We use generate_multiply() rather than generate_square()
7656       // because it's faster for the sizes of modulus we care about.
7657       StubRoutines::_montgomerySquare = g.generate_multiply();
7658     }
7659 #endif // COMPILER2
7660 
7661     // generate GHASH intrinsics code
7662     if (UseGHASHIntrinsics) {
7663       if (UseAESCTRIntrinsics) {
7664         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7665       } else {
7666         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7667       }
7668     }
7669 
7670     if (UseBASE64Intrinsics) {
7671         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7672         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7673     }
7674 
7675     // data cache line writeback
7676     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7677     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7678 
7679     if (UseAESIntrinsics) {
7680       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7681       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7682       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7683       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7684     }
7685 
7686     if (UseAESCTRIntrinsics) {
7687       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7688     }
7689 
7690     if (UseMD5Intrinsics) {
7691       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7692       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7693     }
7694     if (UseSHA1Intrinsics) {
7695       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7696       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7697     }
7698     if (UseSHA256Intrinsics) {
7699       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7700       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7701     }
7702     if (UseSHA512Intrinsics) {
7703       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7704       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7705     }
7706     if (UseSHA3Intrinsics) {
7707       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7708       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7709     }
7710 
7711     // generate Adler32 intrinsics code
7712     if (UseAdler32Intrinsics) {
7713       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7714     }
7715 
7716     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7717 
7718 #ifdef LINUX
7719 
7720     generate_atomic_entry_points();
7721 
7722 #endif // LINUX
7723 
7724     StubRoutines::aarch64::set_completed();
7725   }
7726 
7727  public:
7728   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7729     if (all) {
7730       generate_all();
7731     } else {
7732       generate_initial();
7733     }
7734   }
7735 }; // end class declaration
7736 
7737 #define UCM_TABLE_MAX_ENTRIES 8
7738 void StubGenerator_generate(CodeBuffer* code, bool all) {
7739   if (UnsafeCopyMemory::_table == NULL) {
7740     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7741   }
7742   StubGenerator g(code, all);
7743 }
7744 
7745 
7746 #ifdef LINUX
7747 
7748 // Define pointers to atomic stubs and initialize them to point to the
7749 // code in atomic_aarch64.S.
7750 
7751 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7752   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7753     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7754   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7755     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
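
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, approximately, to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;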
7756 
7757 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7758 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7759 DEFAULT_ATOMIC_OP(xchg, 4, )
7760 DEFAULT_ATOMIC_OP(xchg, 8, )
7761 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7762 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7763 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7764 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7765 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7766 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7767 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7768 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7769 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7770 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7771 
7772 #undef DEFAULT_ATOMIC_OP
7773 
7774 #endif // LINUX