1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #include "utilities/align.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
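     // TIMES_OOP scales an array index by the in-memory oop size:
     // lsl #2 when compressed oops are in use, lsl #3 otherwise.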
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     __ lea(rscratch2, ExternalAddress((address)&counter));
  86     __ ldrw(rscratch1, Address(rscratch2));
  87     __ addw(rscratch1, rscratch1, 1);
  88     __ strw(rscratch1, Address(rscratch2));
  89   }
  90 #define inc_counter_np(counter) \
  91   BLOCK_COMMENT("inc_counter " #counter); \
  92   inc_counter_np_(counter);
  93 #endif
  94 
  95   // Call stubs are used to call Java from C
  96   //
  97   // Arguments:
  98   //    c_rarg0:   call wrapper address                   address
  99   //    c_rarg1:   result                                 address
 100   //    c_rarg2:   result type                            BasicType
 101   //    c_rarg3:   method                                 Method*
 102   //    c_rarg4:   (interpreter) entry point              address
 103   //    c_rarg5:   parameters                             intptr_t*
 104   //    c_rarg6:   parameter size (in words)              int
 105   //    c_rarg7:   thread                                 Thread*
 106   //
 107   // There is no return from the stub itself as any Java result
 108   // is written to result
 109   //
 110   // we save r30 (lr) as the return PC at the base of the frame and
 111   // link r29 (fp) below it as the frame pointer installing sp (r31)
 112   // into fp.
 113   //
 114   // we save r0-r7, which accounts for all the c arguments.
 115   //
 116   // TODO: strictly do we need to save them all? they are treated as
 117   // volatile by C so could we omit saving the ones we are going to
 118   // place in global registers (thread? method?) or those we only use
 119   // during setup of the Java call?
 120   //
 121   // we don't need to save r8 which C uses as an indirect result location
 122   // return register.
 123   //
 124   // we don't need to save r9-r15 which both C and Java treat as
 125   // volatile
 126   //
 127   // we don't need to save r16-18 because Java does not use them
 128   //
 129   // we save r19-r28 which Java uses as scratch registers and C
 130   // expects to be callee-save
 131   //
 132   // we save the bottom 64 bits of each value stored in v8-v15; it is
 133   // the responsibility of the caller to preserve larger values.
 134   //
 135   // so the stub frame looks like this when we enter Java code
 136   //
 137   //     [ return_from_Java     ] <--- sp
 138   //     [ argument word n      ]
 139   //      ...
 140   // -27 [ argument word 1      ]
 141   // -26 [ saved v15            ] <--- sp_after_call
 142   // -25 [ saved v14            ]
 143   // -24 [ saved v13            ]
 144   // -23 [ saved v12            ]
 145   // -22 [ saved v11            ]
 146   // -21 [ saved v10            ]
 147   // -20 [ saved v9             ]
 148   // -19 [ saved v8             ]
 149   // -18 [ saved r28            ]
 150   // -17 [ saved r27            ]
 151   // -16 [ saved r26            ]
 152   // -15 [ saved r25            ]
 153   // -14 [ saved r24            ]
 154   // -13 [ saved r23            ]
 155   // -12 [ saved r22            ]
 156   // -11 [ saved r21            ]
 157   // -10 [ saved r20            ]
 158   //  -9 [ saved r19            ]
 159   //  -8 [ call wrapper    (r0) ]
 160   //  -7 [ result          (r1) ]
 161   //  -6 [ result type     (r2) ]
 162   //  -5 [ method          (r3) ]
 163   //  -4 [ entry point     (r4) ]
 164   //  -3 [ parameters      (r5) ]
 165   //  -2 [ parameter size  (r6) ]
 166   //  -1 [ thread (r7)          ]
 167   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 168   //   1 [ saved lr       (r30) ]
 169 
 170   // Call stub stack layout word offsets from fp
 171   enum call_stub_layout {
 172     sp_after_call_off = -26,
 173 
 174     d15_off            = -26,
 175     d13_off            = -24,
 176     d11_off            = -22,
 177     d9_off             = -20,
 178 
 179     r28_off            = -18,
 180     r26_off            = -16,
 181     r24_off            = -14,
 182     r22_off            = -12,
 183     r20_off            = -10,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameter_size_off =  -2,
 190     thread_off         =  -1,
 191     fp_f               =   0,
 192     retaddr_off        =   1,
 193   };
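       // Registers are saved in pairs with stp/stpd, so only every other slot
       // needs a named offset: e.g. r19 lands in the word above r20_off and
       // v14 in the word above d15_off, matching the frame diagram above.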
 194 
 195   address generate_call_stub(address& return_address) {
 196     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 197            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 198            "adjust this code");
 199 
 200     StubCodeMark mark(this, "StubRoutines", "call_stub");
 201     address start = __ pc();
 202 
 203     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 204 
 205     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 206     const Address result        (rfp, result_off         * wordSize);
 207     const Address result_type   (rfp, result_type_off    * wordSize);
 208     const Address method        (rfp, method_off         * wordSize);
 209     const Address entry_point   (rfp, entry_point_off    * wordSize);
 210     const Address parameter_size(rfp, parameter_size_off * wordSize);
 211 
 212     const Address thread        (rfp, thread_off         * wordSize);
 213 
 214     const Address d15_save      (rfp, d15_off * wordSize);
 215     const Address d13_save      (rfp, d13_off * wordSize);
 216     const Address d11_save      (rfp, d11_off * wordSize);
 217     const Address d9_save       (rfp, d9_off * wordSize);
 218 
 219     const Address r28_save      (rfp, r28_off * wordSize);
 220     const Address r26_save      (rfp, r26_off * wordSize);
 221     const Address r24_save      (rfp, r24_off * wordSize);
 222     const Address r22_save      (rfp, r22_off * wordSize);
 223     const Address r20_save      (rfp, r20_off * wordSize);
 224 
 225     // stub code
 226 
 227     address aarch64_entry = __ pc();
 228 
 229     // set up frame and move sp to end of save area
 230     __ enter();
 231     __ sub(sp, rfp, -sp_after_call_off * wordSize);
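         // sp_after_call_off is negative, so this places sp 26 words (208 bytes)
         // below the saved fp, just past the register save area laid out above.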
 232 
 233     // save register parameters and Java scratch/global registers
 234     // n.b. we save thread even though it gets installed in
 235     // rthread because we want to sanity check rthread later
 236     __ str(c_rarg7,  thread);
 237     __ strw(c_rarg6, parameter_size);
 238     __ stp(c_rarg4, c_rarg5,  entry_point);
 239     __ stp(c_rarg2, c_rarg3,  result_type);
 240     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 241 
 242     __ stp(r20, r19,   r20_save);
 243     __ stp(r22, r21,   r22_save);
 244     __ stp(r24, r23,   r24_save);
 245     __ stp(r26, r25,   r26_save);
 246     __ stp(r28, r27,   r28_save);
 247 
 248     __ stpd(v9,  v8,   d9_save);
 249     __ stpd(v11, v10,  d11_save);
 250     __ stpd(v13, v12,  d13_save);
 251     __ stpd(v15, v14,  d15_save);
 252 
 253     // install Java thread in global register now we have saved
 254     // whatever value it held
 255     __ mov(rthread, c_rarg7);
 256     // And method
 257     __ mov(rmethod, c_rarg3);
 258 
 259     // set up the heapbase register
 260     __ reinit_heapbase();
 261 
 262 #ifdef ASSERT
 263     // make sure we have no pending exceptions
 264     {
 265       Label L;
 266       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 267       __ cmp(rscratch1, (u1)NULL_WORD);
 268       __ br(Assembler::EQ, L);
 269       __ stop("StubRoutines::call_stub: entered with pending exception");
 270       __ BIND(L);
 271     }
 272 #endif
 273     // pass parameters if any
 274     __ mov(esp, sp);
 275     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 276     __ andr(sp, rscratch1, -2 * wordSize);
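         // sp must stay 16-byte aligned on AArch64, hence the round-down to a
         // multiple of 2 * wordSize after making room for the parameters.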
 277 
 278     BLOCK_COMMENT("pass parameters if any");
 279     Label parameters_done;
 280     // parameter count is still in c_rarg6
 281     // and parameter pointer identifying param 1 is in c_rarg5
 282     __ cbzw(c_rarg6, parameters_done);
 283 
 284     address loop = __ pc();
 285     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 286     __ subsw(c_rarg6, c_rarg6, 1);
 287     __ push(rscratch1);
 288     __ br(Assembler::GT, loop);
 289 
 290     __ BIND(parameters_done);
 291 
 292     // call Java entry -- passing Method*, and current sp
 293     //      rmethod: Method*
 294     //      r13: sender sp
 295     BLOCK_COMMENT("call Java function");
 296     __ mov(r13, sp);
 297     __ blr(c_rarg4);
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     // save current address for use by exception handling code
 309 
 310     return_address = __ pc();
 311 
 312     // store result depending on type (everything that is not
 313     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 314     // n.b. this assumes Java returns an integral result in r0
 315     // and a floating result in j_farg0
 316     __ ldr(j_rarg2, result);
 317     Label is_long, is_float, is_double, exit;
 318     __ ldr(j_rarg1, result_type);
 319     __ cmp(j_rarg1, (u1)T_OBJECT);
 320     __ br(Assembler::EQ, is_long);
 321     __ cmp(j_rarg1, (u1)T_LONG);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_FLOAT);
 324     __ br(Assembler::EQ, is_float);
 325     __ cmp(j_rarg1, (u1)T_DOUBLE);
 326     __ br(Assembler::EQ, is_double);
 327 
 328     // handle T_INT case
 329     __ strw(r0, Address(j_rarg2));
 330 
 331     __ BIND(exit);
 332 
 333     // pop parameters
 334     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 335 
 336 #ifdef ASSERT
 337     // verify that threads correspond
 338     {
 339       Label L, S;
 340       __ ldr(rscratch1, thread);
 341       __ cmp(rthread, rscratch1);
 342       __ br(Assembler::NE, S);
 343       __ get_thread(rscratch1);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::EQ, L);
 346       __ BIND(S);
 347       __ stop("StubRoutines::call_stub: threads must correspond");
 348       __ BIND(L);
 349     }
 350 #endif
 351 
 352     // restore callee-save registers
 353     __ ldpd(v15, v14,  d15_save);
 354     __ ldpd(v13, v12,  d13_save);
 355     __ ldpd(v11, v10,  d11_save);
 356     __ ldpd(v9,  v8,   d9_save);
 357 
 358     __ ldp(r28, r27,   r28_save);
 359     __ ldp(r26, r25,   r26_save);
 360     __ ldp(r24, r23,   r24_save);
 361     __ ldp(r22, r21,   r22_save);
 362     __ ldp(r20, r19,   r20_save);
 363 
 364     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 365     __ ldrw(c_rarg2, result_type);
 366     __ ldr(c_rarg3,  method);
 367     __ ldp(c_rarg4, c_rarg5,  entry_point);
 368     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 369 
 370     // leave frame and return to caller
 371     __ leave();
 372     __ ret(lr);
 373 
 374     // handle return types different from T_INT
 375 
 376     __ BIND(is_long);
 377     __ str(r0, Address(j_rarg2, 0));
 378     __ br(Assembler::AL, exit);
 379 
 380     __ BIND(is_float);
 381     __ strs(j_farg0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_double);
 385     __ strd(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     return start;
 389   }
 390 
 391   // Return point for a Java call if there's an exception thrown in
 392   // Java code.  The exception is caught and transformed into a
 393   // pending exception stored in JavaThread that can be tested from
 394   // within the VM.
 395   //
 396   // Note: Usually the parameters are removed by the callee. In case
 397   // of an exception crossing an activation frame boundary, that is
 398   // not the case if the callee is compiled code => need to setup the
 399   // rsp.
 400   //
 401   // r0: exception oop
 402 
 403   address generate_catch_exception() {
 404     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 405     address start = __ pc();
 406 
 407     // same as in generate_call_stub():
 408     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 409     const Address thread        (rfp, thread_off         * wordSize);
 410 
 411 #ifdef ASSERT
 412     // verify that threads correspond
 413     {
 414       Label L, S;
 415       __ ldr(rscratch1, thread);
 416       __ cmp(rthread, rscratch1);
 417       __ br(Assembler::NE, S);
 418       __ get_thread(rscratch1);
 419       __ cmp(rthread, rscratch1);
 420       __ br(Assembler::EQ, L);
 421       __ bind(S);
 422       __ stop("StubRoutines::catch_exception: threads must correspond");
 423       __ bind(L);
 424     }
 425 #endif
 426 
 427     // set pending exception
 428     __ verify_oop(r0);
 429 
 430     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 431     __ mov(rscratch1, (address)__FILE__);
 432     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 433     __ movw(rscratch1, (int)__LINE__);
 434     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 435 
 436     // complete return to VM
 437     assert(StubRoutines::_call_stub_return_address != NULL,
 438            "_call_stub_return_address must have been generated before");
 439     __ b(StubRoutines::_call_stub_return_address);
 440 
 441     return start;
 442   }
 443 
 444   // Continuation point for runtime calls returning with a pending
 445   // exception.  The pending exception check happened in the runtime
 446   // or native call stub.  The pending exception in Thread is
 447   // converted into a Java-level exception.
 448   //
 449   // Contract with Java-level exception handlers:
 450   // r0: exception
 451   // r3: throwing pc
 452   //
 453   // NOTE: At entry of this stub, exception-pc must be in LR !!
 454 
 455   // NOTE: this is always used as a jump target within generated code
 456   // so it just needs to be generated code with no prolog
 457 
 458   address generate_forward_exception() {
 459     StubCodeMark mark(this, "StubRoutines", "forward exception");
 460     address start = __ pc();
 461 
 462     // Upon entry, LR points to the return address returning into
 463     // Java (interpreted or compiled) code; i.e., the return address
 464     // becomes the throwing pc.
 465     //
 466     // Arguments pushed before the runtime call are still on the stack
 467     // but the exception handler will reset the stack pointer ->
 468     // ignore them.  A potential result in registers can be ignored as
 469     // well.
 470 
 471 #ifdef ASSERT
 472     // make sure this code is only executed if there is a pending exception
 473     {
 474       Label L;
 475       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 476       __ cbnz(rscratch1, L);
 477       __ stop("StubRoutines::forward exception: no pending exception (1)");
 478       __ bind(L);
 479     }
 480 #endif
 481 
 482     // compute exception handler into r19
 483 
 484     // call the VM to find the handler address associated with the
 485     // caller address. pass thread in r0 and caller pc (ret address)
 486     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 487     // the stack.
 488     __ mov(c_rarg1, lr);
 489     // lr will be trashed by the VM call so we move it to R19
 490     // (callee-saved) because we also need to pass it to the handler
 491     // returned by this call.
 492     __ mov(r19, lr);
 493     BLOCK_COMMENT("call exception_handler_for_return_address");
 494     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 495                          SharedRuntime::exception_handler_for_return_address),
 496                     rthread, c_rarg1);
 497     // Reinitialize the ptrue predicate register, in case the external runtime
 498     // call clobbers ptrue reg, as we may return to SVE compiled code.
 499     __ reinitialize_ptrue();
 500 
 501     // we should not really care that lr is no longer the callee
 502     // address. we saved the value the handler needs in r19 so we can
 503     // just copy it to r3. however, the C2 handler will push its own
 504     // frame and then calls into the VM and the VM code asserts that
 505     // the PC for the frame above the handler belongs to a compiled
 506     // Java method. So, we restore lr here to satisfy that assert.
 507     __ mov(lr, r19);
 508     // setup r0 & r3 & clear pending exception
 509     __ mov(r3, r19);
 510     __ mov(r19, r0);
 511     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 512     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 513 
 514 #ifdef ASSERT
 515     // make sure exception is set
 516     {
 517       Label L;
 518       __ cbnz(r0, L);
 519       __ stop("StubRoutines::forward exception: no pending exception (2)");
 520       __ bind(L);
 521     }
 522 #endif
 523 
 524     // continue at exception handler
 525     // r0: exception
 526     // r3: throwing pc
 527     // r19: exception handler
 528     __ verify_oop(r0);
 529     __ br(r19);
 530 
 531     return start;
 532   }
 533 
 534   // Non-destructive plausibility checks for oops
 535   //
 536   // Arguments:
 537   //    r0: oop to verify
 538   //    rscratch1: error message
 539   //
 540   // Stack after saving c_rarg3:
 541   //    [tos + 0]: saved c_rarg3
 542   //    [tos + 1]: saved c_rarg2
 543   //    [tos + 2]: saved lr
 544   //    [tos + 3]: saved rscratch2
 545   //    [tos + 4]: saved r0
 546   //    [tos + 5]: saved rscratch1
 547   address generate_verify_oop() {
 548 
 549     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 550     address start = __ pc();
 551 
 552     Label exit, error;
 553 
 554     // save c_rarg2 and c_rarg3
 555     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 556 
 557     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 559     __ ldr(c_rarg3, Address(c_rarg2));
 560     __ add(c_rarg3, c_rarg3, 1);
 561     __ str(c_rarg3, Address(c_rarg2));
 562 
 563     // object is in r0
 564     // make sure object is 'reasonable'
 565     __ cbz(r0, exit); // if obj is NULL it is OK
 566 
 567 #if INCLUDE_ZGC
 568     if (UseZGC) {
 569       // Check if mask is good.
 570       // verifies that ZAddressBadMask & r0 == 0
 571       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 572       __ andr(c_rarg2, r0, c_rarg3);
 573       __ cbnz(c_rarg2, error);
 574     }
 575 #endif
 576 
 577     // Check if the oop is in the right area of memory
 578     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 579     __ andr(c_rarg2, r0, c_rarg3);
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
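         // an oop is considered 'reasonable' here when
         // (oop & verify_oop_mask()) == verify_oop_bits()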
 581 
 582     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 583     // instruction here because the flags register is live.
 584     __ eor(c_rarg2, c_rarg2, c_rarg3);
 585     __ cbnz(c_rarg2, error);
 586 
 587     // make sure klass is 'reasonable', i.e. not zero.
 588     // NOTE: We used to load the Klass* here, and compare that to zero.
 589     // However, with the current Lilliput implementation, that would require
 590     // checking the locking bits and calling into the runtime, which
 591     // clobbers the condition flags, which may be live around this call.
 592     // OTOH, this is a simple NULL-check, and we can simply load the upper
 593     // 32 bits of the header as a narrowKlass and compare that to 0. The
 594     // worst that can happen (rarely) is that the object is locked and
 595     // we have lock pointer bits in the upper 32 bits. We can't get a false
 596     // negative.
 597     assert(oopDesc::klass_offset_in_bytes() % 4 == 0, "must be 4 byte aligned");
 598     __ ldrw(r0, Address(r0, oopDesc::klass_offset_in_bytes()));  // get klass
 599     __ cbzw(r0, error);      // if klass is NULL it is broken
 600 
 601     // return if everything seems ok
 602     __ bind(exit);
 603 
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605     __ ret(lr);
 606 
 607     // handle errors
 608     __ bind(error);
 609     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 610 
 611     __ push(RegSet::range(r0, r29), sp);
 612     // debug(char* msg, int64_t pc, int64_t regs[])
 613     __ mov(c_rarg0, rscratch1);      // pass address of error message
 614     __ mov(c_rarg1, lr);             // pass return address
 615     __ mov(c_rarg2, sp);             // pass address of regs on stack
 616 #ifndef PRODUCT
 617     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 618 #endif
 619     BLOCK_COMMENT("call MacroAssembler::debug");
 620     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 621     __ blr(rscratch1);
 622     __ hlt(0);
 623 
 624     return start;
 625   }
 626 
 627   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 628 
 629   // Generate indices for iota vector.
 630   address generate_iota_indices(const char *stub_name) {
 631     __ align(CodeEntryAlignment);
 632     StubCodeMark mark(this, "StubRoutines", stub_name);
 633     address start = __ pc();
 634     __ emit_data64(0x0706050403020100, relocInfo::none);
 635     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 636     return start;
 637   }
 638 
 639   // The inner part of zero_words().  This is the bulk operation,
 640   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 641   // caller is responsible for zeroing the last few words.
 642   //
 643   // Inputs:
 644   // r10: the HeapWord-aligned base address of an array to zero.
 645   // r11: the count in HeapWords, r11 > 0.
 646   //
 647   // Returns r10 and r11, adjusted for the caller to clear.
 648   // r10: the base address of the tail of words left to clear.
 649   // r11: the number of words in the tail.
 650   //      r11 < MacroAssembler::zero_words_block_size.
 651 
 652   address generate_zero_blocks() {
 653     Label done;
 654     Label base_aligned;
 655 
 656     Register base = r10, cnt = r11;
 657 
 658     __ align(CodeEntryAlignment);
 659     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 660     address start = __ pc();
 661 
 662     if (UseBlockZeroing) {
 663       int zva_length = VM_Version::zva_length();
 664 
 665       // Ensure ZVA length can be divided by 16. This is required by
 666       // the subsequent operations.
 667       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 668 
 669       __ tbz(base, 3, base_aligned);
 670       __ str(zr, Address(__ post(base, 8)));
 671       __ sub(cnt, cnt, 1);
 672       __ bind(base_aligned);
 673 
 674       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 675       // alignment.
 676       Label small;
 677       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
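           // low_limit is in bytes while cnt is in words, so shift the limit
           // right by 3 (log2 of the word size) before comparing.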
 678       __ subs(rscratch1, cnt, low_limit >> 3);
 679       __ br(Assembler::LT, small);
 680       __ zero_dcache_blocks(base, cnt);
 681       __ bind(small);
 682     }
 683 
 684     {
 685       // Number of stp instructions we'll unroll
 686       const int unroll =
 687         MacroAssembler::zero_words_block_size / 2;
 688       // Clear the remaining blocks.
 689       Label loop;
 690       __ subs(cnt, cnt, unroll * 2);
 691       __ br(Assembler::LT, done);
 692       __ bind(loop);
 693       for (int i = 0; i < unroll; i++)
 694         __ stp(zr, zr, __ post(base, 16));
 695       __ subs(cnt, cnt, unroll * 2);
 696       __ br(Assembler::GE, loop);
 697       __ bind(done);
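           // cnt went negative in the final subs, so adding back unroll * 2
           // leaves the number of words still to be cleared by the caller
           // (< MacroAssembler::zero_words_block_size).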
 698       __ add(cnt, cnt, unroll * 2);
 699     }
 700 
 701     __ ret(lr);
 702 
 703     return start;
 704   }
 705 
 706 
 707   typedef enum {
 708     copy_forwards = 1,
 709     copy_backwards = -1
 710   } copy_direction;
 711 
 712   // Bulk copy of blocks of 8 words.
 713   //
 714   // count is a count of words.
 715   //
 716   // Precondition: count >= 8
 717   //
 718   // Postconditions:
 719   //
 720   // The least significant bit of count contains the remaining count
 721   // of words to copy.  The rest of count is trash.
 722   //
 723   // s and d are adjusted to point to the remaining words to copy
 724   //
 725   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 726                            copy_direction direction) {
 727     int unit = wordSize * direction;
 728     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
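         // bias is the number of bytes transferred by one load/store pair (two
         // 64-bit GPRs, or two 128-bit Q registers with SIMD); forward copies
         // pre-bias s and d by this amount so that the same {2, 4, 6, 8} * unit
         // offsets work for both copy directions.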
 729 
 730     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 731       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 732     const Register stride = r13;
 733 
 734     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 735     assert_different_registers(s, d, count, rscratch1);
 736 
 737     Label again, drain;
 738     const char *stub_name;
 739     if (direction == copy_forwards)
 740       stub_name = "forward_copy_longs";
 741     else
 742       stub_name = "backward_copy_longs";
 743 
 744     __ align(CodeEntryAlignment);
 745 
 746     StubCodeMark mark(this, "StubRoutines", stub_name);
 747 
 748     __ bind(start);
 749 
 750     Label unaligned_copy_long;
 751     if (AvoidUnalignedAccesses) {
 752       __ tbnz(d, 3, unaligned_copy_long);
 753     }
 754 
 755     if (direction == copy_forwards) {
 756       __ sub(s, s, bias);
 757       __ sub(d, d, bias);
 758     }
 759 
 760 #ifdef ASSERT
 761     // Make sure we are never given < 8 words
 762     {
 763       Label L;
 764       __ cmp(count, (u1)8);
 765       __ br(Assembler::GE, L);
 766       __ stop("genrate_copy_longs called with < 8 words");
 767       __ bind(L);
 768     }
 769 #endif
 770 
 771     // Fill 8 registers
 772     if (UseSIMDForMemoryOps) {
 773       __ ldpq(v0, v1, Address(s, 4 * unit));
 774       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 775     } else {
 776       __ ldp(t0, t1, Address(s, 2 * unit));
 777       __ ldp(t2, t3, Address(s, 4 * unit));
 778       __ ldp(t4, t5, Address(s, 6 * unit));
 779       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 780     }
 781 
 782     __ subs(count, count, 16);
 783     __ br(Assembler::LO, drain);
 784 
 785     int prefetch = PrefetchCopyIntervalInBytes;
 786     bool use_stride = false;
 787     if (direction == copy_backwards) {
 788        use_stride = prefetch > 256;
 789        prefetch = -prefetch;
 790        if (use_stride) __ mov(stride, prefetch);
 791     }
 792 
 793     __ bind(again);
 794 
 795     if (PrefetchCopyIntervalInBytes > 0)
 796       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 797 
 798     if (UseSIMDForMemoryOps) {
 799       __ stpq(v0, v1, Address(d, 4 * unit));
 800       __ ldpq(v0, v1, Address(s, 4 * unit));
 801       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 802       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 803     } else {
 804       __ stp(t0, t1, Address(d, 2 * unit));
 805       __ ldp(t0, t1, Address(s, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ ldp(t2, t3, Address(s, 4 * unit));
 808       __ stp(t4, t5, Address(d, 6 * unit));
 809       __ ldp(t4, t5, Address(s, 6 * unit));
 810       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 811       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 812     }
 813 
 814     __ subs(count, count, 8);
 815     __ br(Assembler::HS, again);
 816 
 817     // Drain
 818     __ bind(drain);
 819     if (UseSIMDForMemoryOps) {
 820       __ stpq(v0, v1, Address(d, 4 * unit));
 821       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 822     } else {
 823       __ stp(t0, t1, Address(d, 2 * unit));
 824       __ stp(t2, t3, Address(d, 4 * unit));
 825       __ stp(t4, t5, Address(d, 6 * unit));
 826       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 827     }
 828 
 829     {
 830       Label L1, L2;
 831       __ tbz(count, exact_log2(4), L1);
 832       if (UseSIMDForMemoryOps) {
 833         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 834         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 835       } else {
 836         __ ldp(t0, t1, Address(s, 2 * unit));
 837         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 838         __ stp(t0, t1, Address(d, 2 * unit));
 839         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 840       }
 841       __ bind(L1);
 842 
 843       if (direction == copy_forwards) {
 844         __ add(s, s, bias);
 845         __ add(d, d, bias);
 846       }
 847 
 848       __ tbz(count, 1, L2);
 849       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 850       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 851       __ bind(L2);
 852     }
 853 
 854     __ ret(lr);
 855 
 856     if (AvoidUnalignedAccesses) {
 857       Label drain, again;
 858       // Register order for storing. Order is different for backward copy.
 859 
 860       __ bind(unaligned_copy_long);
 861 
 862       // source address is even aligned, target odd aligned
 863       //
 864       // when forward copying word pairs we read long pairs at offsets
 865       // {0, 2, 4, 6} (in long words). when backwards copying we read
 866       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 867       // address by -2 in the forwards case so we can compute the
 868       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 869       // or -1.
 870       //
 871       // when forward copying we need to store 1 word, 3 pairs and
 872       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 873       // zero offset we adjust the destination by -1, which means we
 874       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 875       //
 876       // When backwards copying we need to store 1 word, 3 pairs and
 877       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
 878       // offsets {1, 3, 5, 7, 8} * unit.
 879 
 880       if (direction == copy_forwards) {
 881         __ sub(s, s, 16);
 882         __ sub(d, d, 8);
 883       }
 884 
 885       // Fill 8 registers
 886       //
 887       // for a forwards copy s was offset by -16 from the original input
 888       // value of s, so the register contents are at these offsets
 889       // relative to the 64 byte block addressed by that original input,
 890       // and so on for each successive 64 byte block as s is updated
 891       //
 892       // t0 at offset 0,  t1 at offset 8
 893       // t2 at offset 16, t3 at offset 24
 894       // t4 at offset 32, t5 at offset 40
 895       // t6 at offset 48, t7 at offset 56
 896 
 897       // for backwards copy s was not offset so the register contents
 898       // are at these offsets into the preceding 64 byte block
 899       // relative to that original input and so on for each successive
 900       // preceding 64 byte block when s is updated. this explains the
 901       // slightly counter-intuitive looking pattern of register usage
 902       // in the stp instructions for backwards copy.
 903       //
 904       // t0 at offset -16, t1 at offset -8
 905       // t2 at offset -32, t3 at offset -24
 906       // t4 at offset -48, t5 at offset -40
 907       // t6 at offset -64, t7 at offset -56
 908 
 909       __ ldp(t0, t1, Address(s, 2 * unit));
 910       __ ldp(t2, t3, Address(s, 4 * unit));
 911       __ ldp(t4, t5, Address(s, 6 * unit));
 912       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 913 
 914       __ subs(count, count, 16);
 915       __ br(Assembler::LO, drain);
 916 
 917       int prefetch = PrefetchCopyIntervalInBytes;
 918       bool use_stride = false;
 919       if (direction == copy_backwards) {
 920          use_stride = prefetch > 256;
 921          prefetch = -prefetch;
 922          if (use_stride) __ mov(stride, prefetch);
 923       }
 924 
 925       __ bind(again);
 926 
 927       if (PrefetchCopyIntervalInBytes > 0)
 928         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 929 
 930       if (direction == copy_forwards) {
 931        // allowing for the offset of -8 the store instructions place
 932        // registers into the target 64 bit block at the following
 933        // offsets
 934        //
 935        // t0 at offset 0
 936        // t1 at offset 8,  t2 at offset 16
 937        // t3 at offset 24, t4 at offset 32
 938        // t5 at offset 40, t6 at offset 48
 939        // t7 at offset 56
 940 
 941         __ str(t0, Address(d, 1 * unit));
 942         __ stp(t1, t2, Address(d, 2 * unit));
 943         __ ldp(t0, t1, Address(s, 2 * unit));
 944         __ stp(t3, t4, Address(d, 4 * unit));
 945         __ ldp(t2, t3, Address(s, 4 * unit));
 946         __ stp(t5, t6, Address(d, 6 * unit));
 947         __ ldp(t4, t5, Address(s, 6 * unit));
 948         __ str(t7, Address(__ pre(d, 8 * unit)));
 949         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 950       } else {
 951        // d was not offset when we started so the registers are
 952        // written into the 64 byte block preceding d with the following
 953        // offsets
 954        //
 955        // t1 at offset -8
 956        // t3 at offset -24, t0 at offset -16
 957        // t5 at offset -40, t2 at offset -32
 958        // t7 at offset -56, t4 at offset -48
 959        //                   t6 at offset -64
 960        //
 961        // note that this matches the offsets previously noted for the
 962        // loads
 963 
 964         __ str(t1, Address(d, 1 * unit));
 965         __ stp(t3, t0, Address(d, 3 * unit));
 966         __ ldp(t0, t1, Address(s, 2 * unit));
 967         __ stp(t5, t2, Address(d, 5 * unit));
 968         __ ldp(t2, t3, Address(s, 4 * unit));
 969         __ stp(t7, t4, Address(d, 7 * unit));
 970         __ ldp(t4, t5, Address(s, 6 * unit));
 971         __ str(t6, Address(__ pre(d, 8 * unit)));
 972         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 973       }
 974 
 975       __ subs(count, count, 8);
 976       __ br(Assembler::HS, again);
 977 
 978       // Drain
 979       //
 980       // this uses the same pattern of offsets and register arguments
 981       // as above
 982       __ bind(drain);
 983       if (direction == copy_forwards) {
 984         __ str(t0, Address(d, 1 * unit));
 985         __ stp(t1, t2, Address(d, 2 * unit));
 986         __ stp(t3, t4, Address(d, 4 * unit));
 987         __ stp(t5, t6, Address(d, 6 * unit));
 988         __ str(t7, Address(__ pre(d, 8 * unit)));
 989       } else {
 990         __ str(t1, Address(d, 1 * unit));
 991         __ stp(t3, t0, Address(d, 3 * unit));
 992         __ stp(t5, t2, Address(d, 5 * unit));
 993         __ stp(t7, t4, Address(d, 7 * unit));
 994         __ str(t6, Address(__ pre(d, 8 * unit)));
 995       }
 996       // now we need to copy any remaining part block, which may
 997       // include a 4 word subblock and/or a 2 word subblock.
 998       // bits 2 and 1 in the count are the tell-tale for whether we
 999       // have each such subblock
1000       {
1001         Label L1, L2;
1002         __ tbz(count, exact_log2(4), L1);
1003        // this is the same as above but copying only 4 longs hence
1004        // with only one intervening stp between the str instructions
1005        // but note that the offsets and registers still follow the
1006        // same pattern
1007         __ ldp(t0, t1, Address(s, 2 * unit));
1008         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1009         if (direction == copy_forwards) {
1010           __ str(t0, Address(d, 1 * unit));
1011           __ stp(t1, t2, Address(d, 2 * unit));
1012           __ str(t3, Address(__ pre(d, 4 * unit)));
1013         } else {
1014           __ str(t1, Address(d, 1 * unit));
1015           __ stp(t3, t0, Address(d, 3 * unit));
1016           __ str(t2, Address(__ pre(d, 4 * unit)));
1017         }
1018         __ bind(L1);
1019 
1020         __ tbz(count, 1, L2);
1021        // this is the same as above but copying only 2 longs hence
1022        // there is no intervening stp between the str instructions
1023        // but note that the offset and register patterns are still
1024        // the same
1025         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1026         if (direction == copy_forwards) {
1027           __ str(t0, Address(d, 1 * unit));
1028           __ str(t1, Address(__ pre(d, 2 * unit)));
1029         } else {
1030           __ str(t1, Address(d, 1 * unit));
1031           __ str(t0, Address(__ pre(d, 2 * unit)));
1032         }
1033         __ bind(L2);
1034 
1035        // for a forwards copy we need to re-adjust the offsets we
1036        // applied so that s and d follow the last words written
1037 
1038        if (direction == copy_forwards) {
1039          __ add(s, s, 16);
1040          __ add(d, d, 8);
1041        }
1042 
1043       }
1044 
1045       __ ret(lr);
1046       }
1047   }
1048 
1049   // Small copy: less than 16 bytes.
1050   //
1051   // NB: Ignores all of the bits of count which represent more than 15
1052   // bytes, so a caller doesn't have to mask them.
1053 
1054   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1055     bool is_backwards = step < 0;
1056     size_t granularity = uabs(step);
1057     int direction = is_backwards ? -1 : 1;
1058     int unit = wordSize * direction;
1059 
1060     Label Lword, Lint, Lshort, Lbyte;
1061 
1062     assert(granularity
1063            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1064 
1065     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1066 
1067     // ??? I don't know if this bit-test-and-branch is the right thing
1068     // to do.  It does a lot of jumping, resulting in several
1069     // mispredicted branches.  It might make more sense to do this
1070     // with something like Duff's device with a single computed branch.
1071 
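         // Each set bit of count selects one power-of-two-sized tail chunk: with
         // element size g, bit (3 - log2(g)) of count says whether an 8-byte
         // chunk remains, the next lower bit whether a 4-byte chunk remains, and
         // so on down to a single element.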
1072     __ tbz(count, 3 - exact_log2(granularity), Lword);
1073     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1074     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1075     __ bind(Lword);
1076 
1077     if (granularity <= sizeof (jint)) {
1078       __ tbz(count, 2 - exact_log2(granularity), Lint);
1079       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1080       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1081       __ bind(Lint);
1082     }
1083 
1084     if (granularity <= sizeof (jshort)) {
1085       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1086       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1087       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1088       __ bind(Lshort);
1089     }
1090 
1091     if (granularity <= sizeof (jbyte)) {
1092       __ tbz(count, 0, Lbyte);
1093       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1094       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1095       __ bind(Lbyte);
1096     }
1097   }
1098 
1099   Label copy_f, copy_b;
1100 
1101   // All-singing all-dancing memory copy.
1102   //
1103   // Copy count units of memory from s to d.  The size of a unit is
1104   // step, which can be positive or negative depending on the direction
1105   // of copy.  If is_aligned is false, we align the source address.
1106   //
1107 
1108   void copy_memory(bool is_aligned, Register s, Register d,
1109                    Register count, Register tmp, int step) {
1110     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1111     bool is_backwards = step < 0;
1112     unsigned int granularity = uabs(step);
1113     const Register t0 = r3, t1 = r4;
1114 
1115     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't
1116     // matter because we always load all the data before writing anything.
1117     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1118     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1119     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1120     const Register send = r17, dend = r16;
1121 
1122     if (PrefetchCopyIntervalInBytes > 0)
1123       __ prfm(Address(s, 0), PLDL1KEEP);
1124     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1125     __ br(Assembler::HI, copy_big);
1126 
1127     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1128     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
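         // send and dend point just past the last source/destination element;
         // negative offsets from them let the straddling loads and stores below
         // cover the tail of the copy without an exact length calculation.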
1129 
1130     __ cmp(count, u1(16/granularity));
1131     __ br(Assembler::LS, copy16);
1132 
1133     __ cmp(count, u1(64/granularity));
1134     __ br(Assembler::HI, copy80);
1135 
1136     __ cmp(count, u1(32/granularity));
1137     __ br(Assembler::LS, copy32);
1138 
1139     // 33..64 bytes
1140     if (UseSIMDForMemoryOps) {
1141       __ ldpq(v0, v1, Address(s, 0));
1142       __ ldpq(v2, v3, Address(send, -32));
1143       __ stpq(v0, v1, Address(d, 0));
1144       __ stpq(v2, v3, Address(dend, -32));
1145     } else {
1146       __ ldp(t0, t1, Address(s, 0));
1147       __ ldp(t2, t3, Address(s, 16));
1148       __ ldp(t4, t5, Address(send, -32));
1149       __ ldp(t6, t7, Address(send, -16));
1150 
1151       __ stp(t0, t1, Address(d, 0));
1152       __ stp(t2, t3, Address(d, 16));
1153       __ stp(t4, t5, Address(dend, -32));
1154       __ stp(t6, t7, Address(dend, -16));
1155     }
1156     __ b(finish);
1157 
1158     // 17..32 bytes
1159     __ bind(copy32);
1160     __ ldp(t0, t1, Address(s, 0));
1161     __ ldp(t2, t3, Address(send, -16));
1162     __ stp(t0, t1, Address(d, 0));
1163     __ stp(t2, t3, Address(dend, -16));
1164     __ b(finish);
1165 
1166     // 65..80/96 bytes
1167     // (96 bytes if SIMD because we do 32 bytes per instruction)
1168     __ bind(copy80);
1169     if (UseSIMDForMemoryOps) {
1170       __ ldpq(v0, v1, Address(s, 0));
1171       __ ldpq(v2, v3, Address(s, 32));
1172       // Unaligned pointers can be an issue for copying.
1173       // The issue is more likely to occur when the granularity of the data
1174       // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1175       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1176       // The largest performance drop has been seen for the range 65-80 bytes.
1177       // In such cases, using a pair of ldp/stp instead of a third pair of
1178       // ldpq/stpq fixes the performance issue.
1179       if (granularity < sizeof (jint)) {
1180         Label copy96;
1181         __ cmp(count, u1(80/granularity));
1182         __ br(Assembler::HI, copy96);
1183         __ ldp(t0, t1, Address(send, -16));
1184 
1185         __ stpq(v0, v1, Address(d, 0));
1186         __ stpq(v2, v3, Address(d, 32));
1187         __ stp(t0, t1, Address(dend, -16));
1188         __ b(finish);
1189 
1190         __ bind(copy96);
1191       }
1192       __ ldpq(v4, v5, Address(send, -32));
1193 
1194       __ stpq(v0, v1, Address(d, 0));
1195       __ stpq(v2, v3, Address(d, 32));
1196       __ stpq(v4, v5, Address(dend, -32));
1197     } else {
1198       __ ldp(t0, t1, Address(s, 0));
1199       __ ldp(t2, t3, Address(s, 16));
1200       __ ldp(t4, t5, Address(s, 32));
1201       __ ldp(t6, t7, Address(s, 48));
1202       __ ldp(t8, t9, Address(send, -16));
1203 
1204       __ stp(t0, t1, Address(d, 0));
1205       __ stp(t2, t3, Address(d, 16));
1206       __ stp(t4, t5, Address(d, 32));
1207       __ stp(t6, t7, Address(d, 48));
1208       __ stp(t8, t9, Address(dend, -16));
1209     }
1210     __ b(finish);
1211 
1212     // 0..16 bytes
1213     __ bind(copy16);
1214     __ cmp(count, u1(8/granularity));
1215     __ br(Assembler::LO, copy8);
1216 
1217     // 8..16 bytes
1218     __ ldr(t0, Address(s, 0));
1219     __ ldr(t1, Address(send, -8));
1220     __ str(t0, Address(d, 0));
1221     __ str(t1, Address(dend, -8));
1222     __ b(finish);
1223 
1224     if (granularity < 8) {
1225       // 4..7 bytes
1226       __ bind(copy8);
1227       __ tbz(count, 2 - exact_log2(granularity), copy4);
1228       __ ldrw(t0, Address(s, 0));
1229       __ ldrw(t1, Address(send, -4));
1230       __ strw(t0, Address(d, 0));
1231       __ strw(t1, Address(dend, -4));
1232       __ b(finish);
1233       if (granularity < 4) {
1234         // 0..3 bytes
1235         __ bind(copy4);
1236         __ cbz(count, finish); // get rid of 0 case
1237         if (granularity == 2) {
1238           __ ldrh(t0, Address(s, 0));
1239           __ strh(t0, Address(d, 0));
1240         } else { // granularity == 1
1241           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1242           // the first and last byte.
1243           // Handle the 3 byte case by loading and storing base + count/2
1244           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1245           // This does mean that in the 1 byte case we load/store the same
1246           // byte 3 times.
1247           __ lsr(count, count, 1);
1248           __ ldrb(t0, Address(s, 0));
1249           __ ldrb(t1, Address(send, -1));
1250           __ ldrb(t2, Address(s, count));
1251           __ strb(t0, Address(d, 0));
1252           __ strb(t1, Address(dend, -1));
1253           __ strb(t2, Address(d, count));
1254         }
1255         __ b(finish);
1256       }
1257     }
1258 
1259     __ bind(copy_big);
1260     if (is_backwards) {
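           // for a backward copy, start from the one-past-the-end addresses and
           // work downwards; -step is the positive element size here.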
1261       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1262       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1263     }
1264 
1265     // Now that we've got the small case out of the way, we can align the
1266     // source address on a 2-word boundary.
1267 
1268     Label aligned;
1269 
1270     if (is_aligned) {
1271       // We may have to adjust by 1 word to get s 2-word-aligned.
1272       __ tbz(s, exact_log2(wordSize), aligned);
1273       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1274       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1275       __ sub(count, count, wordSize/granularity);
1276     } else {
1277       if (is_backwards) {
1278         __ andr(rscratch2, s, 2 * wordSize - 1);
1279       } else {
1280         __ neg(rscratch2, s);
1281         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1282       }
1283       // rscratch2 is the byte adjustment needed to align s.
1284       __ cbz(rscratch2, aligned);
1285       int shift = exact_log2(granularity);
1286       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1287       __ sub(count, count, rscratch2);
1288 
1289 #if 0
1290       // ?? This code is only correct for a disjoint copy.  It may or
1291       // may not make sense to use it in that case.
1292 
1293       // Copy the first pair; s and d may not be aligned.
1294       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1295       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1296 
1297       // Align s and d, adjust count
1298       if (is_backwards) {
1299         __ sub(s, s, rscratch2);
1300         __ sub(d, d, rscratch2);
1301       } else {
1302         __ add(s, s, rscratch2);
1303         __ add(d, d, rscratch2);
1304       }
1305 #else
1306       copy_memory_small(s, d, rscratch2, rscratch1, step);
1307 #endif
1308     }
1309 
1310     __ bind(aligned);
1311 
1312     // s is now 2-word-aligned.
1313 
1314     // We have a count of units and some trailing bytes.  Adjust the
1315     // count and do a bulk copy of words.
1316     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
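         // rscratch2 now holds the number of whole words to copy, which is the
         // count that the copy_f/copy_b blocks generated by generate_copy_longs
         // consume.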
1317     if (direction == copy_forwards)
1318       __ bl(copy_f);
1319     else
1320       __ bl(copy_b);
1321 
1322     // And the tail.
1323     copy_memory_small(s, d, count, tmp, step);
1324 
1325     if (granularity >= 8) __ bind(copy8);
1326     if (granularity >= 4) __ bind(copy4);
1327     __ bind(finish);
1328   }
1329 
1330 
1331   void clobber_registers() {
1332 #ifdef ASSERT
1333     RegSet clobbered
1334       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1335     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1336     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1337     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1338       __ mov(*it, rscratch1);
1339     }
1340 #endif
1341 
1342   }
1343 
1344   // Scan over array at a for count oops, verifying each one.
1345   // Preserves a and count, clobbers rscratch1 and rscratch2.
1346   void verify_oop_array (int size, Register a, Register count, Register temp) {
1347     Label loop, end;
1348     __ mov(rscratch1, a);
1349     __ mov(rscratch2, zr);
1350     __ bind(loop);
1351     __ cmp(rscratch2, count);
1352     __ br(Assembler::HS, end);
1353     if (size == wordSize) {
1354       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1355       __ verify_oop(temp);
1356     } else {
1357       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1358       __ decode_heap_oop(temp); // calls verify_oop
1359     }
1360     __ add(rscratch2, rscratch2, 1);
1361     __ b(loop);
1362     __ bind(end);
1363   }
1364 
1365   // Arguments:
1366   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1367   //             ignored
1368   //   is_oop  - true => oop array, so generate store check code
1369   //   name    - stub name string
1370   //
1371   // Inputs:
1372   //   c_rarg0   - source array address
1373   //   c_rarg1   - destination array address
1374   //   c_rarg2   - element count, treated as ssize_t, can be zero
1375   //
1376   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1377   // the hardware handle it.  The two dwords within qwords that span
1378   // cache line boundaries will still be loaded and stored atomically.
1379   //
1380   // Side Effects:
1381   //   disjoint_int_copy_entry is set to the no-overlap entry point
1382   //   used by generate_conjoint_int_oop_copy().
1383   //
1384   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1385                                   const char *name, bool dest_uninitialized = false) {
1386     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1387     RegSet saved_reg = RegSet::of(s, d, count);
1388     __ align(CodeEntryAlignment);
1389     StubCodeMark mark(this, "StubRoutines", name);
1390     address start = __ pc();
1391     __ enter();
1392 
1393     if (entry != NULL) {
1394       *entry = __ pc();
1395       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1396       BLOCK_COMMENT("Entry:");
1397     }
1398 
1399     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1400     if (dest_uninitialized) {
1401       decorators |= IS_DEST_UNINITIALIZED;
1402     }
1403     if (aligned) {
1404       decorators |= ARRAYCOPY_ALIGNED;
1405     }
1406 
1407     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1408     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1409 
1410     if (is_oop) {
1411       // save regs before copy_memory
1412       __ push(RegSet::of(d, count), sp);
1413     }
1414     {
1415       // UnsafeCopyMemory page error: continue after ucm
1416       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1417       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1418       copy_memory(aligned, s, d, count, rscratch1, size);
1419     }
1420 
1421     if (is_oop) {
1422       __ pop(RegSet::of(d, count), sp);
1423       if (VerifyOops)
1424         verify_oop_array(size, d, count, r16);
1425     }
1426 
1427     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1428 
1429     __ leave();
1430     __ mov(r0, zr); // return 0
1431     __ ret(lr);
1432     return start;
1433   }
1434 
1435   // Arguments:
1436   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1437   //             ignored
1438   //   is_oop  - true => oop array, so generate store check code
1439   //   name    - stub name string
1440   //
1441   // Inputs:
1442   //   c_rarg0   - source array address
1443   //   c_rarg1   - destination array address
1444   //   c_rarg2   - element count, treated as ssize_t, can be zero
1445   //
1446   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1447   // the hardware handle it.  The two dwords within qwords that span
1448   // cache line boundaries will still be loaded and stored atomically.
1449   //
1450   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1451                                  address *entry, const char *name,
1452                                  bool dest_uninitialized = false) {
1453     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1454     RegSet saved_regs = RegSet::of(s, d, count);
1455     StubCodeMark mark(this, "StubRoutines", name);
1456     address start = __ pc();
1457     __ enter();
1458 
1459     if (entry != NULL) {
1460       *entry = __ pc();
1461       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1462       BLOCK_COMMENT("Entry:");
1463     }
1464 
1465     // use fwd copy when (d-s) above_equal (count*size)
1466     __ sub(rscratch1, d, s);
1467     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1468     __ br(Assembler::HS, nooverlap_target);
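         // Note: the unsigned (HS) compare also routes the d < s case to the
         // forward copy, since the subtraction wraps around to a very large
         // unsigned value in that case.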
1469 
1470     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1471     if (dest_uninitialized) {
1472       decorators |= IS_DEST_UNINITIALIZED;
1473     }
1474     if (aligned) {
1475       decorators |= ARRAYCOPY_ALIGNED;
1476     }
1477 
1478     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1479     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1480 
1481     if (is_oop) {
1482       // save regs before copy_memory
1483       __ push(RegSet::of(d, count), sp);
1484     }
1485     {
1486       // UnsafeCopyMemory page error: continue after ucm
1487       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1488       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1489       copy_memory(aligned, s, d, count, rscratch1, -size);
1490     }
1491     if (is_oop) {
1492       __ pop(RegSet::of(d, count), sp);
1493       if (VerifyOops)
1494         verify_oop_array(size, d, count, r16);
1495     }
1496     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1497     __ leave();
1498     __ mov(r0, zr); // return 0
1499     __ ret(lr);
1500     return start;
1501   }
1502 
1503   // Arguments:
1504   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1505   //             ignored
1506   //   name    - stub name string
1507   //
1508   // Inputs:
1509   //   c_rarg0   - source array address
1510   //   c_rarg1   - destination array address
1511   //   c_rarg2   - element count, treated as ssize_t, can be zero
1512   //
1513   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1514   // we let the hardware handle it.  The one to eight bytes within words,
1515   // dwords or qwords that span cache line boundaries will still be loaded
1516   // and stored atomically.
1517   //
1518   // Side Effects:
1519   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1527   //   used by generate_conjoint_byte_copy().
1528   //
1529   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1530     const bool not_oop = false;
1531     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1532   }
1533 
1534   // Arguments:
1535   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1536   //             ignored
1537   //   name    - stub name string
1538   //
1539   // Inputs:
1540   //   c_rarg0   - source array address
1541   //   c_rarg1   - destination array address
1542   //   c_rarg2   - element count, treated as ssize_t, can be zero
1543   //
1544   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1545   // we let the hardware handle it.  The one to eight bytes within words,
1546   // dwords or qwords that span cache line boundaries will still be loaded
1547   // and stored atomically.
1548   //
1549   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1550                                       address* entry, const char *name) {
1551     const bool not_oop = false;
1552     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1553   }
1554 
1555   // Arguments:
1556   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1557   //             ignored
1558   //   name    - stub name string
1559   //
1560   // Inputs:
1561   //   c_rarg0   - source array address
1562   //   c_rarg1   - destination array address
1563   //   c_rarg2   - element count, treated as ssize_t, can be zero
1564   //
1565   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1566   // let the hardware handle it.  The two or four words within dwords
1567   // or qwords that span cache line boundaries will still be loaded
1568   // and stored atomically.
1569   //
1570   // Side Effects:
1571   //   disjoint_short_copy_entry is set to the no-overlap entry point
1572   //   used by generate_conjoint_short_copy().
1573   //
1574   address generate_disjoint_short_copy(bool aligned,
1575                                        address* entry, const char *name) {
1576     const bool not_oop = false;
1577     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1578   }
1579 
1580   // Arguments:
1581   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1582   //             ignored
1583   //   name    - stub name string
1584   //
1585   // Inputs:
1586   //   c_rarg0   - source array address
1587   //   c_rarg1   - destination array address
1588   //   c_rarg2   - element count, treated as ssize_t, can be zero
1589   //
1590   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1591   // let the hardware handle it.  The two or four words within dwords
1592   // or qwords that span cache line boundaries will still be loaded
1593   // and stored atomically.
1594   //
1595   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1596                                        address *entry, const char *name) {
1597     const bool not_oop = false;
1598     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1599   }
1600 
1601   // Arguments:
1602   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1603   //             ignored
1604   //   name    - stub name string
1605   //
1606   // Inputs:
1607   //   c_rarg0   - source array address
1608   //   c_rarg1   - destination array address
1609   //   c_rarg2   - element count, treated as ssize_t, can be zero
1610   //
1611   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1612   // the hardware handle it.  The two dwords within qwords that span
1613   // cache line boundaries will still be loaded and stored atomically.
1614   //
1615   // Side Effects:
1616   //   disjoint_int_copy_entry is set to the no-overlap entry point
1617   //   used by generate_conjoint_int_oop_copy().
1618   //
1619   address generate_disjoint_int_copy(bool aligned, address *entry,
1620                                          const char *name, bool dest_uninitialized = false) {
1621     const bool not_oop = false;
1622     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1623   }
1624 
1625   // Arguments:
1626   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1627   //             ignored
1628   //   name    - stub name string
1629   //
1630   // Inputs:
1631   //   c_rarg0   - source array address
1632   //   c_rarg1   - destination array address
1633   //   c_rarg2   - element count, treated as ssize_t, can be zero
1634   //
1635   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1636   // the hardware handle it.  The two dwords within qwords that span
1637   // cache line boundaries will still be loaded and stored atomically.
1638   //
1639   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1640                                      address *entry, const char *name,
1641                                      bool dest_uninitialized = false) {
1642     const bool not_oop = false;
1643     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1644   }
1645 
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as size_t, can be zero
1656   //
1657   // Side Effects:
1658   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1659   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1660   //
1661   address generate_disjoint_long_copy(bool aligned, address *entry,
1662                                           const char *name, bool dest_uninitialized = false) {
1663     const bool not_oop = false;
1664     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1665   }
1666 
1667   // Arguments:
1668   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1669   //             ignored
1670   //   name    - stub name string
1671   //
1672   // Inputs:
1673   //   c_rarg0   - source array address
1674   //   c_rarg1   - destination array address
1675   //   c_rarg2   - element count, treated as size_t, can be zero
1676   //
1677   address generate_conjoint_long_copy(bool aligned,
1678                                       address nooverlap_target, address *entry,
1679                                       const char *name, bool dest_uninitialized = false) {
1680     const bool not_oop = false;
1681     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1682   }
1683 
1684   // Arguments:
1685   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686   //             ignored
1687   //   name    - stub name string
1688   //
1689   // Inputs:
1690   //   c_rarg0   - source array address
1691   //   c_rarg1   - destination array address
1692   //   c_rarg2   - element count, treated as size_t, can be zero
1693   //
1694   // Side Effects:
1695   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1696   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1697   //
1698   address generate_disjoint_oop_copy(bool aligned, address *entry,
1699                                      const char *name, bool dest_uninitialized) {
1700     const bool is_oop = true;
1701     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1702     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1703   }
1704 
1705   // Arguments:
1706   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1707   //             ignored
1708   //   name    - stub name string
1709   //
1710   // Inputs:
1711   //   c_rarg0   - source array address
1712   //   c_rarg1   - destination array address
1713   //   c_rarg2   - element count, treated as size_t, can be zero
1714   //
1715   address generate_conjoint_oop_copy(bool aligned,
1716                                      address nooverlap_target, address *entry,
1717                                      const char *name, bool dest_uninitialized) {
1718     const bool is_oop = true;
1719     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1720     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1721                                   name, dest_uninitialized);
1722   }
1723 
1724 
1725   // Helper for generating a dynamic type check.
1726   // Smashes rscratch1, rscratch2.
1727   void generate_type_check(Register sub_klass,
1728                            Register super_check_offset,
1729                            Register super_klass,
1730                            Label& L_success) {
1731     assert_different_registers(sub_klass, super_check_offset, super_klass);
1732 
1733     BLOCK_COMMENT("type_check:");
1734 
1735     Label L_miss;
1736 
1737     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1738                                      super_check_offset);
1739     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1740 
1741     // Fall through on failure!
1742     __ BIND(L_miss);
1743   }
1744 
1745   //
1746   //  Generate checkcasting array copy stub
1747   //
1748   //  Input:
1749   //    c_rarg0   - source array address
1750   //    c_rarg1   - destination array address
1751   //    c_rarg2   - element count, treated as ssize_t, can be zero
1752   //    c_rarg3   - size_t ckoff (super_check_offset)
1753   //    c_rarg4   - oop ckval (super_klass)
1754   //
1755   //  Output:
1756   //    r0 ==  0  -  success
1757   //    r0 == -1^K - failure, where K is partial transfer count
1758   //
1759   address generate_checkcast_copy(const char *name, address *entry,
1760                                   bool dest_uninitialized = false) {
1761 
1762     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1763 
1764     // Input registers (after setup_arg_regs)
1765     const Register from        = c_rarg0;   // source array address
1766     const Register to          = c_rarg1;   // destination array address
1767     const Register count       = c_rarg2;   // elements count
1768     const Register ckoff       = c_rarg3;   // super_check_offset
1769     const Register ckval       = c_rarg4;   // super_klass
1770 
1771     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1772     RegSet wb_post_saved_regs = RegSet::of(count);
1773 
1774     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1775     const Register copied_oop  = r22;       // actual oop copied
1776     const Register count_save  = r21;       // orig elements count
1777     const Register start_to    = r20;       // destination array start address
1778     const Register r19_klass   = r19;       // oop._klass
1779 
1780     //---------------------------------------------------------------
1781     // Assembler stub will be used for this call to arraycopy
1782     // if the two arrays are subtypes of Object[] but the
1783     // destination array type is not equal to or a supertype
1784     // of the source type.  Each element must be separately
1785     // checked.
1786 
1787     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1788                                copied_oop, r19_klass, count_save);
1789 
1790     __ align(CodeEntryAlignment);
1791     StubCodeMark mark(this, "StubRoutines", name);
1792     address start = __ pc();
1793 
1794     __ enter(); // required for proper stackwalking of RuntimeStub frame
1795 
1796 #ifdef ASSERT
1797     // caller guarantees that the arrays really are different
1798     // otherwise, we would have to make conjoint checks
1799     { Label L;
1800       array_overlap_test(L, TIMES_OOP);
1801       __ stop("checkcast_copy within a single array");
1802       __ bind(L);
1803     }
1804 #endif //ASSERT
1805 
1806     // Caller of this entry point must set up the argument registers.
1807     if (entry != NULL) {
1808       *entry = __ pc();
1809       BLOCK_COMMENT("Entry:");
1810     }
1811 
1812      // Empty array:  Nothing to do.
1813     __ cbz(count, L_done);
1814     __ push(RegSet::of(r19, r20, r21, r22), sp);
1815 
1816 #ifdef ASSERT
1817     BLOCK_COMMENT("assert consistent ckoff/ckval");
1818     // The ckoff and ckval must be mutually consistent,
1819     // even though caller generates both.
1820     { Label L;
1821       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1822       __ ldrw(start_to, Address(ckval, sco_offset));
1823       __ cmpw(ckoff, start_to);
1824       __ br(Assembler::EQ, L);
1825       __ stop("super_check_offset inconsistent");
1826       __ bind(L);
1827     }
1828 #endif //ASSERT
1829 
1830     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1831     bool is_oop = true;
1832     if (dest_uninitialized) {
1833       decorators |= IS_DEST_UNINITIALIZED;
1834     }
1835 
1836     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1837     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1838 
1839     // save the original count
1840     __ mov(count_save, count);
1841 
1842     // Copy from low to high addresses
1843     __ mov(start_to, to);              // Save destination array start address
1844     __ b(L_load_element);
1845 
1846     // ======== begin loop ========
1847     // (Loop is rotated; its entry is L_load_element.)
1848     // Loop control:
1849     //   for (; count != 0; count--) {
1850     //     copied_oop = load_heap_oop(from++);
1851     //     ... generate_type_check ...;
1852     //     store_heap_oop(to++, copied_oop);
1853     //   }
1854     __ align(OptoLoopAlignment);
1855 
1856     __ BIND(L_store_element);
1857     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1858     __ sub(count, count, 1);
1859     __ cbz(count, L_do_card_marks);
1860 
1861     // ======== loop entry is here ========
1862     __ BIND(L_load_element);
1863     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1864     __ cbz(copied_oop, L_store_element);
1865 
1866     __ load_klass(r19_klass, copied_oop);// query the object klass
1867     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1868     // ======== end loop ========
1869 
1870     // It was a real error; we must depend on the caller to finish the job.
1871     // Register count = remaining oops, count_save = total oops.
1872     // Emit GC store barriers for the oops we have copied and report
1873     // their number to the caller.
1874 
1875     __ subs(count, count_save, count);     // K = partially copied oop count
1876     __ eon(count, count, zr);                   // report (-1^K) to caller
1877     __ br(Assembler::EQ, L_done_pop);
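         // The EQ branch uses the flags from the subs above: if no oops were
         // copied (K == 0) there is nothing to card-mark.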
1878 
1879     __ BIND(L_do_card_marks);
1880     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1881 
1882     __ bind(L_done_pop);
1883     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1884     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1885 
1886     __ bind(L_done);
1887     __ mov(r0, count);
1888     __ leave();
1889     __ ret(lr);
1890 
1891     return start;
1892   }
1893 
1894   // Perform range checks on the proposed arraycopy.
1895   // Kills temp, but nothing else.
1896   // Also, clean the sign bits of src_pos and dst_pos.
1897   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1898                               Register src_pos, // source position (c_rarg1)
1899                               Register dst,     // destination array oop (c_rarg2)
1900                               Register dst_pos, // destination position (c_rarg3)
1901                               Register length,
1902                               Register temp,
1903                               Label& L_failed) {
1904     BLOCK_COMMENT("arraycopy_range_checks:");
1905 
1906     assert_different_registers(rscratch1, temp);
1907 
1908     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1909     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1910     __ addw(temp, length, src_pos);
1911     __ cmpw(temp, rscratch1);
1912     __ br(Assembler::HI, L_failed);
1913 
1914     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1915     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1916     __ addw(temp, length, dst_pos);
1917     __ cmpw(temp, rscratch1);
1918     __ br(Assembler::HI, L_failed);
1919 
1920     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1921     __ movw(src_pos, src_pos);
1922     __ movw(dst_pos, dst_pos);
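         // (writing a 32-bit register zero-extends into the upper 32 bits)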
1923 
1924     BLOCK_COMMENT("arraycopy_range_checks done");
1925   }
1926 
1927   // These stubs get called from some dumb test routine.
1928   // I'll write them properly when they're called from
1929   // something that's actually doing something.
1930   static void fake_arraycopy_stub(address src, address dst, int count) {
1931     assert(count == 0, "huh?");
1932   }
1933 
1934 
1935   //
1936   //  Generate 'unsafe' array copy stub
1937   //  Though just as safe as the other stubs, it takes an unscaled
1938   //  size_t argument instead of an element count.
1939   //
1940   //  Input:
1941   //    c_rarg0   - source array address
1942   //    c_rarg1   - destination array address
1943   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1944   //
1945   // Examines the alignment of the operands and dispatches
1946   // to a long, int, short, or byte copy loop.
1947   //
1948   address generate_unsafe_copy(const char *name,
1949                                address byte_copy_entry,
1950                                address short_copy_entry,
1951                                address int_copy_entry,
1952                                address long_copy_entry) {
1953     Label L_long_aligned, L_int_aligned, L_short_aligned;
1954     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1955 
1956     __ align(CodeEntryAlignment);
1957     StubCodeMark mark(this, "StubRoutines", name);
1958     address start = __ pc();
1959     __ enter(); // required for proper stackwalking of RuntimeStub frame
1960 
1961     // bump this on entry, not on exit:
1962     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1963 
1964     __ orr(rscratch1, s, d);
1965     __ orr(rscratch1, rscratch1, count);
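         // rscratch1 now holds s | d | count; its low-order zero bits give the
         // coarsest alignment shared by both addresses and the byte count,
         // which the tests below use to pick the widest copy loop.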
1966 
1967     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1968     __ cbz(rscratch1, L_long_aligned);
1969     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1970     __ cbz(rscratch1, L_int_aligned);
1971     __ tbz(rscratch1, 0, L_short_aligned);
1972     __ b(RuntimeAddress(byte_copy_entry));
1973 
1974     __ BIND(L_short_aligned);
1975     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1976     __ b(RuntimeAddress(short_copy_entry));
1977     __ BIND(L_int_aligned);
1978     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1979     __ b(RuntimeAddress(int_copy_entry));
1980     __ BIND(L_long_aligned);
1981     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1982     __ b(RuntimeAddress(long_copy_entry));
1983 
1984     return start;
1985   }
1986 
1987   //
1988   //  Generate generic array copy stubs
1989   //
1990   //  Input:
1991   //    c_rarg0    -  src oop
1992   //    c_rarg1    -  src_pos (32-bits)
1993   //    c_rarg2    -  dst oop
1994   //    c_rarg3    -  dst_pos (32-bits)
1995   //    c_rarg4    -  element count (32-bits)
1996   //
1997   //  Output:
1998   //    r0 ==  0  -  success
1999   //    r0 == -1^K - failure, where K is partial transfer count
2000   //
2001   address generate_generic_copy(const char *name,
2002                                 address byte_copy_entry, address short_copy_entry,
2003                                 address int_copy_entry, address oop_copy_entry,
2004                                 address long_copy_entry, address checkcast_copy_entry) {
2005 
2006     Label L_failed, L_objArray;
2007     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2008 
2009     // Input registers
2010     const Register src        = c_rarg0;  // source array oop
2011     const Register src_pos    = c_rarg1;  // source position
2012     const Register dst        = c_rarg2;  // destination array oop
2013     const Register dst_pos    = c_rarg3;  // destination position
2014     const Register length     = c_rarg4;
2015 
2016 
2017     // Registers used as temps
2018     const Register dst_klass  = c_rarg5;
2019 
2020     __ align(CodeEntryAlignment);
2021 
2022     StubCodeMark mark(this, "StubRoutines", name);
2023 
2024     address start = __ pc();
2025 
2026     __ enter(); // required for proper stackwalking of RuntimeStub frame
2027 
2028     // bump this on entry, not on exit:
2029     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2030 
2031     //-----------------------------------------------------------------------
2032     // Assembler stub will be used for this call to arraycopy
2033     // if the following conditions are met:
2034     //
2035     // (1) src and dst must not be null.
2036     // (2) src_pos must not be negative.
2037     // (3) dst_pos must not be negative.
2038     // (4) length  must not be negative.
2039     // (5) src klass and dst klass should be the same and not NULL.
2040     // (6) src and dst should be arrays.
2041     // (7) src_pos + length must not exceed length of src.
2042     // (8) dst_pos + length must not exceed length of dst.
2043     //
2044 
2045     //  if (src == NULL) return -1;
2046     __ cbz(src, L_failed);
2047 
2048     //  if (src_pos < 0) return -1;
2049     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2050 
2051     //  if (dst == NULL) return -1;
2052     __ cbz(dst, L_failed);
2053 
2054     //  if (dst_pos < 0) return -1;
2055     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2056 
2057     // registers used as temp
2058     const Register scratch_length    = r16; // elements count to copy
2059     const Register scratch_src_klass = r17; // array klass
2060     const Register lh                = r15; // layout helper
2061 
2062     //  if (length < 0) return -1;
2063     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2064     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2065 
2066     __ load_klass(scratch_src_klass, src);
2067 #ifdef ASSERT
2068     //  assert(src->klass() != NULL);
2069     {
2070       BLOCK_COMMENT("assert klasses not null {");
2071       Label L1, L2;
2072       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2073       __ bind(L1);
2074       __ stop("broken null klass");
2075       __ bind(L2);
2076       __ load_klass(rscratch1, dst);
2077       __ cbz(rscratch1, L1);     // this would be broken also
2078       BLOCK_COMMENT("} assert klasses not null done");
2079     }
2080 #endif
2081 
2082     // Load layout helper (32-bits)
2083     //
2084     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2085     // 32        30    24            16              8     2                 0
2086     //
2087     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2088     //
2089 
2090     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2091 
2092     // Handle objArrays completely differently...
2093     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2094     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2095     __ movw(rscratch1, objArray_lh);
2096     __ eorw(rscratch2, lh, rscratch1);
2097     __ cbzw(rscratch2, L_objArray);
2098 
2099     //  if (src->klass() != dst->klass()) return -1;
2100     __ load_klass(rscratch2, dst);
2101     __ eor(rscratch2, rscratch2, scratch_src_klass);
2102     __ cbnz(rscratch2, L_failed);
2103 
2104     //  if (!src->is_Array()) return -1;
2105     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
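         // Array layout helpers carry their tag in the top bits, which makes the
         // value negative; a clear sign bit therefore means "not an array".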
2106 
2107     // At this point, it is known to be a typeArray (array_tag 0x3).
2108 #ifdef ASSERT
2109     {
2110       BLOCK_COMMENT("assert primitive array {");
2111       Label L;
2112       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2113       __ cmpw(lh, rscratch2);
2114       __ br(Assembler::GE, L);
2115       __ stop("must be a primitive array");
2116       __ bind(L);
2117       BLOCK_COMMENT("} assert primitive array done");
2118     }
2119 #endif
2120 
2121     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2122                            rscratch2, L_failed);
2123 
2124     // TypeArrayKlass
2125     //
2126     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2127     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2128     //
2129 
2130     const Register rscratch1_offset = rscratch1;    // array offset
2131     const Register r15_elsize = lh; // element size
2132 
2133     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2134            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2135     __ add(src, src, rscratch1_offset);           // src array offset
2136     __ add(dst, dst, rscratch1_offset);           // dst array offset
2137     BLOCK_COMMENT("choose copy loop based on element size");
2138 
2139     // The following registers must be set before the jump to the corresponding stub.
2140     const Register from     = c_rarg0;  // source array address
2141     const Register to       = c_rarg1;  // destination array address
2142     const Register count    = c_rarg2;  // elements count
2143 
2144     // 'from', 'to', 'count' registers should be set in this order
2145     // since they share c_rarg0..c_rarg2 with 'src', 'src_pos' and 'dst'.
2146 
2147     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2148 
2149     // The possible values of elsize are 0-3, i.e. exact_log2(element
2150     // size in bytes).  We do a simple bitwise binary search.
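         // Bit 1 of elsize separates {byte, short} from {int, long};
         // bit 0 then picks the element size within each pair.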
2151   __ BIND(L_copy_bytes);
2152     __ tbnz(r15_elsize, 1, L_copy_ints);
2153     __ tbnz(r15_elsize, 0, L_copy_shorts);
2154     __ lea(from, Address(src, src_pos));// src_addr
2155     __ lea(to,   Address(dst, dst_pos));// dst_addr
2156     __ movw(count, scratch_length); // length
2157     __ b(RuntimeAddress(byte_copy_entry));
2158 
2159   __ BIND(L_copy_shorts);
2160     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2161     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2162     __ movw(count, scratch_length); // length
2163     __ b(RuntimeAddress(short_copy_entry));
2164 
2165   __ BIND(L_copy_ints);
2166     __ tbnz(r15_elsize, 0, L_copy_longs);
2167     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2168     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2169     __ movw(count, scratch_length); // length
2170     __ b(RuntimeAddress(int_copy_entry));
2171 
2172   __ BIND(L_copy_longs);
2173 #ifdef ASSERT
2174     {
2175       BLOCK_COMMENT("assert long copy {");
2176       Label L;
2177       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2178       __ cmpw(r15_elsize, LogBytesPerLong);
2179       __ br(Assembler::EQ, L);
2180       __ stop("must be long copy, but elsize is wrong");
2181       __ bind(L);
2182       BLOCK_COMMENT("} assert long copy done");
2183     }
2184 #endif
2185     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2186     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2187     __ movw(count, scratch_length); // length
2188     __ b(RuntimeAddress(long_copy_entry));
2189 
2190     // ObjArrayKlass
2191   __ BIND(L_objArray);
2192     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2193 
2194     Label L_plain_copy, L_checkcast_copy;
2195     //  test array classes for subtyping
2196     __ load_klass(r15, dst);
2197     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2198     __ br(Assembler::NE, L_checkcast_copy);
2199 
2200     // Identically typed arrays can be copied without element-wise checks.
2201     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2202                            rscratch2, L_failed);
2203 
2204     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2205     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2206     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2207     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2208     __ movw(count, scratch_length); // length
2209   __ BIND(L_plain_copy);
2210     __ b(RuntimeAddress(oop_copy_entry));
2211 
2212   __ BIND(L_checkcast_copy);
2213     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2214     {
2215       // Before looking at dst.length, make sure dst is also an objArray.
2216       __ ldrw(rscratch1, Address(r15, lh_offset));
2217       __ movw(rscratch2, objArray_lh);
2218       __ eorw(rscratch1, rscratch1, rscratch2);
2219       __ cbnzw(rscratch1, L_failed);
2220 
2221       // It is safe to examine both src.length and dst.length.
2222       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2223                              r15, L_failed);
2224 
2225       __ load_klass(dst_klass, dst); // reload
2226 
2227       // Marshal the base address arguments now, freeing registers.
2228       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2229       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2230       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2231       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2232       __ movw(count, length);           // length (reloaded)
2233       Register sco_temp = c_rarg3;      // this register is free now
2234       assert_different_registers(from, to, count, sco_temp,
2235                                  dst_klass, scratch_src_klass);
2236       // assert_clean_int(count, sco_temp);
2237 
2238       // Generate the type check.
2239       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2240       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2241 
2242       // Smashes rscratch1, rscratch2
2243       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2244 
2245       // Fetch destination element klass from the ObjArrayKlass header.
2246       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2247       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2248       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2249 
2250       // the checkcast_copy loop needs two extra arguments:
2251       assert(c_rarg3 == sco_temp, "#3 already in place");
2252       // Set up arguments for checkcast_copy_entry.
2253       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2254       __ b(RuntimeAddress(checkcast_copy_entry));
2255     }
2256 
2257   __ BIND(L_failed);
2258     __ mov(r0, -1);
2259     __ leave();   // required for proper stackwalking of RuntimeStub frame
2260     __ ret(lr);
2261 
2262     return start;
2263   }
2264 
2265   //
2266   // Generate stub for array fill. If "aligned" is true, the
2267   // "to" address is assumed to be heapword aligned.
2268   //
2269   // Arguments for generated stub:
2270   //   to:    c_rarg0
2271   //   value: c_rarg1
2272   //   count: c_rarg2 treated as signed
2273   //
2274   address generate_fill(BasicType t, bool aligned, const char *name) {
2275     __ align(CodeEntryAlignment);
2276     StubCodeMark mark(this, "StubRoutines", name);
2277     address start = __ pc();
2278 
2279     BLOCK_COMMENT("Entry:");
2280 
2281     const Register to        = c_rarg0;  // destination array address
2282     const Register value     = c_rarg1;  // value
2283     const Register count     = c_rarg2;  // elements count
2284 
2285     const Register bz_base = r10;        // base for block_zero routine
2286     const Register cnt_words = r11;      // temp register
2287 
2288     __ enter();
2289 
2290     Label L_fill_elements, L_exit1;
2291 
2292     int shift = -1;
2293     switch (t) {
2294       case T_BYTE:
2295         shift = 0;
2296         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2297         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2298         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2299         __ br(Assembler::LO, L_fill_elements);
2300         break;
2301       case T_SHORT:
2302         shift = 1;
2303         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2304         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2305         __ br(Assembler::LO, L_fill_elements);
2306         break;
2307       case T_INT:
2308         shift = 2;
2309         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2310         __ br(Assembler::LO, L_fill_elements);
2311         break;
2312       default: ShouldNotReachHere();
2313     }
2314 
2315     // Align the destination address to an 8-byte boundary.
2316     Label L_skip_align1, L_skip_align2, L_skip_align4;
2317     if (!aligned) {
2318       switch (t) {
2319         case T_BYTE:
2320           // One byte misalignment happens only for byte arrays.
2321           __ tbz(to, 0, L_skip_align1);
2322           __ strb(value, Address(__ post(to, 1)));
2323           __ subw(count, count, 1);
2324           __ bind(L_skip_align1);
2325           // Fallthrough
2326         case T_SHORT:
2327           // Two bytes misalignment happens only for byte and short (char) arrays.
2328           __ tbz(to, 1, L_skip_align2);
2329           __ strh(value, Address(__ post(to, 2)));
2330           __ subw(count, count, 2 >> shift);
2331           __ bind(L_skip_align2);
2332           // Fallthrough
2333         case T_INT:
2334           // Align to 8 bytes, we know we are 4 byte aligned to start.
2335           __ tbz(to, 2, L_skip_align4);
2336           __ strw(value, Address(__ post(to, 4)));
2337           __ subw(count, count, 4 >> shift);
2338           __ bind(L_skip_align4);
2339           break;
2340         default: ShouldNotReachHere();
2341       }
2342     }
2343 
2344     //
2345     //  Fill large chunks
2346     //
2347     __ lsrw(cnt_words, count, 3 - shift); // number of words
2348     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2349     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
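         // value now holds the fill pattern replicated to 64 bits; cnt_words is
         // the number of 8-byte words to store, and count keeps the leftover
         // elements (less than one word) for the tail code below.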
2350     if (UseBlockZeroing) {
2351       Label non_block_zeroing, rest;
2352       // If the fill value is zero we can use the fast zero_words().
2353       __ cbnz(value, non_block_zeroing);
2354       __ mov(bz_base, to);
2355       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2356       __ zero_words(bz_base, cnt_words);
2357       __ b(rest);
2358       __ bind(non_block_zeroing);
2359       __ fill_words(to, cnt_words, value);
2360       __ bind(rest);
2361     } else {
2362       __ fill_words(to, cnt_words, value);
2363     }
2364 
2365     // Remaining count is less than 8 bytes. Fill it by a single store.
2366     // Note that the total length is no less than 8 bytes.
2367     if (t == T_BYTE || t == T_SHORT) {
2368       Label L_exit1;
2369       __ cbzw(count, L_exit1);
2370       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2371       __ str(value, Address(to, -8));    // overwrite some elements
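           // The 8-byte store ends exactly at the end of the region; it may
           // rewrite bytes already written by fill_words, which is harmless
           // since it stores the same pattern.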
2372       __ bind(L_exit1);
2373       __ leave();
2374       __ ret(lr);
2375     }
2376 
2377     // Handle fills of less than 8 bytes.
2378     Label L_fill_2, L_fill_4, L_exit2;
2379     __ bind(L_fill_elements);
2380     switch (t) {
2381       case T_BYTE:
2382         __ tbz(count, 0, L_fill_2);
2383         __ strb(value, Address(__ post(to, 1)));
2384         __ bind(L_fill_2);
2385         __ tbz(count, 1, L_fill_4);
2386         __ strh(value, Address(__ post(to, 2)));
2387         __ bind(L_fill_4);
2388         __ tbz(count, 2, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       case T_SHORT:
2392         __ tbz(count, 0, L_fill_4);
2393         __ strh(value, Address(__ post(to, 2)));
2394         __ bind(L_fill_4);
2395         __ tbz(count, 1, L_exit2);
2396         __ strw(value, Address(to));
2397         break;
2398       case T_INT:
2399         __ cbzw(count, L_exit2);
2400         __ strw(value, Address(to));
2401         break;
2402       default: ShouldNotReachHere();
2403     }
2404     __ bind(L_exit2);
2405     __ leave();
2406     __ ret(lr);
2407     return start;
2408   }
2409 
2410   address generate_data_cache_writeback() {
2411     const Register line        = c_rarg0;  // address of line to write back
2412 
2413     __ align(CodeEntryAlignment);
2414 
2415     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2416 
2417     address start = __ pc();
2418     __ enter();
2419     __ cache_wb(Address(line, 0));
2420     __ leave();
2421     __ ret(lr);
2422 
2423     return start;
2424   }
2425 
2426   address generate_data_cache_writeback_sync() {
2427     const Register is_pre     = c_rarg0;  // pre or post sync
2428 
2429     __ align(CodeEntryAlignment);
2430 
2431     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2432 
2433     // pre wbsync is a no-op
2434     // post wbsync translates to a memory barrier
2435 
2436     Label skip;
2437     address start = __ pc();
2438     __ enter();
2439     __ cbnz(is_pre, skip);
2440     __ cache_wbsync(false);
2441     __ bind(skip);
2442     __ leave();
2443     __ ret(lr);
2444 
2445     return start;
2446   }
2447 
2448   void generate_arraycopy_stubs() {
2449     address entry;
2450     address entry_jbyte_arraycopy;
2451     address entry_jshort_arraycopy;
2452     address entry_jint_arraycopy;
2453     address entry_oop_arraycopy;
2454     address entry_jlong_arraycopy;
2455     address entry_checkcast_arraycopy;
2456 
2457     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2458     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2459 
2460     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2461 
2462     //*** jbyte
2463     // Always need aligned and unaligned versions
2464     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2465                                                                                   "jbyte_disjoint_arraycopy");
2466     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2467                                                                                   &entry_jbyte_arraycopy,
2468                                                                                   "jbyte_arraycopy");
2469     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2470                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2471     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2472                                                                                   "arrayof_jbyte_arraycopy");
2473 
2474     //*** jshort
2475     // Always need aligned and unaligned versions
2476     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2477                                                                                     "jshort_disjoint_arraycopy");
2478     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2479                                                                                     &entry_jshort_arraycopy,
2480                                                                                     "jshort_arraycopy");
2481     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2482                                                                                     "arrayof_jshort_disjoint_arraycopy");
2483     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2484                                                                                     "arrayof_jshort_arraycopy");
2485 
2486     //*** jint
2487     // Aligned versions
2488     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2489                                                                                 "arrayof_jint_disjoint_arraycopy");
2490     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2491                                                                                 "arrayof_jint_arraycopy");
2492     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2493     // entry_jint_arraycopy always points to the unaligned version
2494     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2495                                                                                 "jint_disjoint_arraycopy");
2496     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2497                                                                                 &entry_jint_arraycopy,
2498                                                                                 "jint_arraycopy");
2499 
2500     //*** jlong
2501     // It is always aligned
2502     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2503                                                                                   "arrayof_jlong_disjoint_arraycopy");
2504     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2505                                                                                   "arrayof_jlong_arraycopy");
2506     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2507     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2508 
2509     //*** oops
2510     {
2511       // With compressed oops we need unaligned versions; notice that
2512       // we overwrite entry_oop_arraycopy.
2513       bool aligned = !UseCompressedOops;
2514 
2515       StubRoutines::_arrayof_oop_disjoint_arraycopy
2516         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2517                                      /*dest_uninitialized*/false);
2518       StubRoutines::_arrayof_oop_arraycopy
2519         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2520                                      /*dest_uninitialized*/false);
2521       // Aligned versions without pre-barriers
2522       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2523         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2524                                      /*dest_uninitialized*/true);
2525       StubRoutines::_arrayof_oop_arraycopy_uninit
2526         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2527                                      /*dest_uninitialized*/true);
2528     }
2529 
2530     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2531     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2532     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2533     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2534 
2535     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2536     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2537                                                                         /*dest_uninitialized*/true);
2538 
2539     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2540                                                               entry_jbyte_arraycopy,
2541                                                               entry_jshort_arraycopy,
2542                                                               entry_jint_arraycopy,
2543                                                               entry_jlong_arraycopy);
2544 
2545     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2546                                                                entry_jbyte_arraycopy,
2547                                                                entry_jshort_arraycopy,
2548                                                                entry_jint_arraycopy,
2549                                                                entry_oop_arraycopy,
2550                                                                entry_jlong_arraycopy,
2551                                                                entry_checkcast_arraycopy);
2552 
2553     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2554     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2555     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2556     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2557     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2558     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2559   }
2560 
2561   void generate_math_stubs() { Unimplemented(); }
2562 
2563   // Arguments:
2564   //
2565   // Inputs:
2566   //   c_rarg0   - source byte array address
2567   //   c_rarg1   - destination byte array address
2568   //   c_rarg2   - K (key) in little endian int array
2569   //
2570   address generate_aescrypt_encryptBlock() {
2571     __ align(CodeEntryAlignment);
2572     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2573 
2574     const Register from        = c_rarg0;  // source array address
2575     const Register to          = c_rarg1;  // destination array address
2576     const Register key         = c_rarg2;  // key array address
2577     const Register keylen      = rscratch1;
2578 
2579     address start = __ pc();
2580     __ enter();
2581 
2582     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2583 
2584     __ aesenc_loadkeys(key, keylen);
2585     __ aesecb_encrypt(from, to, keylen);
2586 
2587     __ mov(r0, 0);
2588 
2589     __ leave();
2590     __ ret(lr);
2591 
2592     return start;
2593   }
2594 
2595   // Arguments:
2596   //
2597   // Inputs:
2598   //   c_rarg0   - source byte array address
2599   //   c_rarg1   - destination byte array address
2600   //   c_rarg2   - K (key) in little endian int array
2601   //
2602   address generate_aescrypt_decryptBlock() {
2603     assert(UseAES, "need AES cryptographic extension support");
2604     __ align(CodeEntryAlignment);
2605     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2606     Label L_doLast;
2607 
2608     const Register from        = c_rarg0;  // source array address
2609     const Register to          = c_rarg1;  // destination array address
2610     const Register key         = c_rarg2;  // key array address
2611     const Register keylen      = rscratch1;
2612 
2613     address start = __ pc();
2614     __ enter(); // required for proper stackwalking of RuntimeStub frame
2615 
2616     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2617 
2618     __ aesecb_decrypt(from, to, key, keylen);
2619 
2620     __ mov(r0, 0);
2621 
2622     __ leave();
2623     __ ret(lr);
2624 
2625     return start;
2626   }
2627 
2628   // Arguments:
2629   //
2630   // Inputs:
2631   //   c_rarg0   - source byte array address
2632   //   c_rarg1   - destination byte array address
2633   //   c_rarg2   - K (key) in little endian int array
2634   //   c_rarg3   - r vector byte array address
2635   //   c_rarg4   - input length
2636   //
2637   // Output:
2638   //   x0        - input length
2639   //
2640   address generate_cipherBlockChaining_encryptAESCrypt() {
2641     assert(UseAES, "need AES cryptographic extension support");
2642     __ align(CodeEntryAlignment);
2643     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2644 
2645     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2646 
2647     const Register from        = c_rarg0;  // source array address
2648     const Register to          = c_rarg1;  // destination array address
2649     const Register key         = c_rarg2;  // key array address
2650     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2651                                            // and left with the results of the last encryption block
2652     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2653     const Register keylen      = rscratch1;
2654 
2655     address start = __ pc();
2656 
2657       __ enter();
2658 
2659       __ movw(rscratch2, len_reg);
2660 
2661       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2662 
2663       __ ld1(v0, __ T16B, rvec);
2664 
2665       __ cmpw(keylen, 52);
2666       __ br(Assembler::CC, L_loadkeys_44);
2667       __ br(Assembler::EQ, L_loadkeys_52);
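           // keylen is the expanded key length in 32-bit words: 44, 52 or 60
           // for AES-128/192/256.  The flags from the cmpw above are still live
           // at the br(CC)/br(EQ) inside L_aes_loop, so the same compare also
           // selects the number of rounds.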
2668 
2669       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2670       __ rev32(v17, __ T16B, v17);
2671       __ rev32(v18, __ T16B, v18);
2672     __ BIND(L_loadkeys_52);
2673       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2674       __ rev32(v19, __ T16B, v19);
2675       __ rev32(v20, __ T16B, v20);
2676     __ BIND(L_loadkeys_44);
2677       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2678       __ rev32(v21, __ T16B, v21);
2679       __ rev32(v22, __ T16B, v22);
2680       __ rev32(v23, __ T16B, v23);
2681       __ rev32(v24, __ T16B, v24);
2682       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2683       __ rev32(v25, __ T16B, v25);
2684       __ rev32(v26, __ T16B, v26);
2685       __ rev32(v27, __ T16B, v27);
2686       __ rev32(v28, __ T16B, v28);
2687       __ ld1(v29, v30, v31, __ T16B, key);
2688       __ rev32(v29, __ T16B, v29);
2689       __ rev32(v30, __ T16B, v30);
2690       __ rev32(v31, __ T16B, v31);
2691 
2692     __ BIND(L_aes_loop);
2693       __ ld1(v1, __ T16B, __ post(from, 16));
2694       __ eor(v0, __ T16B, v0, v1);
2695 
2696       __ br(Assembler::CC, L_rounds_44);
2697       __ br(Assembler::EQ, L_rounds_52);
2698 
2699       __ aese(v0, v17); __ aesmc(v0, v0);
2700       __ aese(v0, v18); __ aesmc(v0, v0);
2701     __ BIND(L_rounds_52);
2702       __ aese(v0, v19); __ aesmc(v0, v0);
2703       __ aese(v0, v20); __ aesmc(v0, v0);
2704     __ BIND(L_rounds_44);
2705       __ aese(v0, v21); __ aesmc(v0, v0);
2706       __ aese(v0, v22); __ aesmc(v0, v0);
2707       __ aese(v0, v23); __ aesmc(v0, v0);
2708       __ aese(v0, v24); __ aesmc(v0, v0);
2709       __ aese(v0, v25); __ aesmc(v0, v0);
2710       __ aese(v0, v26); __ aesmc(v0, v0);
2711       __ aese(v0, v27); __ aesmc(v0, v0);
2712       __ aese(v0, v28); __ aesmc(v0, v0);
2713       __ aese(v0, v29); __ aesmc(v0, v0);
2714       __ aese(v0, v30);
2715       __ eor(v0, __ T16B, v0, v31);
2716 
2717       __ st1(v0, __ T16B, __ post(to, 16));
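           // v0 (the ciphertext just produced) doubles as the CBC chaining
           // value for the next block.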
2718 
2719       __ subw(len_reg, len_reg, 16);
2720       __ cbnzw(len_reg, L_aes_loop);
2721 
2722       __ st1(v0, __ T16B, rvec);
2723 
2724       __ mov(r0, rscratch2);
2725 
2726       __ leave();
2727       __ ret(lr);
2728 
2729       return start;
2730   }
2731 
2732   // Arguments:
2733   //
2734   // Inputs:
2735   //   c_rarg0   - source byte array address
2736   //   c_rarg1   - destination byte array address
2737   //   c_rarg2   - K (key) in little endian int array
2738   //   c_rarg3   - r vector byte array address
2739   //   c_rarg4   - input length
2740   //
2741   // Output:
2742   //   r0        - input length
2743   //
2744   address generate_cipherBlockChaining_decryptAESCrypt() {
2745     assert(UseAES, "need AES cryptographic extension support");
2746     __ align(CodeEntryAlignment);
2747     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2748 
2749     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2750 
2751     const Register from        = c_rarg0;  // source array address
2752     const Register to          = c_rarg1;  // destination array address
2753     const Register key         = c_rarg2;  // key array address
2754     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2755                                            // and left with the last input ciphertext block
2756     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2757     const Register keylen      = rscratch1;
2758 
2759     address start = __ pc();
2760 
2761       __ enter();
2762 
2763       __ movw(rscratch2, len_reg);
2764 
2765       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2766 
2767       __ ld1(v2, __ T16B, rvec);
2768 
2769       __ ld1(v31, __ T16B, __ post(key, 16));
2770       __ rev32(v31, __ T16B, v31);
2771 
2772       __ cmpw(keylen, 52);
2773       __ br(Assembler::CC, L_loadkeys_44);
2774       __ br(Assembler::EQ, L_loadkeys_52);
2775 
2776       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2777       __ rev32(v17, __ T16B, v17);
2778       __ rev32(v18, __ T16B, v18);
2779     __ BIND(L_loadkeys_52);
2780       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2781       __ rev32(v19, __ T16B, v19);
2782       __ rev32(v20, __ T16B, v20);
2783     __ BIND(L_loadkeys_44);
2784       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2785       __ rev32(v21, __ T16B, v21);
2786       __ rev32(v22, __ T16B, v22);
2787       __ rev32(v23, __ T16B, v23);
2788       __ rev32(v24, __ T16B, v24);
2789       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2790       __ rev32(v25, __ T16B, v25);
2791       __ rev32(v26, __ T16B, v26);
2792       __ rev32(v27, __ T16B, v27);
2793       __ rev32(v28, __ T16B, v28);
2794       __ ld1(v29, v30, __ T16B, key);
2795       __ rev32(v29, __ T16B, v29);
2796       __ rev32(v30, __ T16B, v30);
2797 
2798     __ BIND(L_aes_loop);
2799       __ ld1(v0, __ T16B, __ post(from, 16));
2800       __ orr(v1, __ T16B, v0, v0);
2801 
2802       __ br(Assembler::CC, L_rounds_44);
2803       __ br(Assembler::EQ, L_rounds_52);
2804 
2805       __ aesd(v0, v17); __ aesimc(v0, v0);
2806       __ aesd(v0, v18); __ aesimc(v0, v0);
2807     __ BIND(L_rounds_52);
2808       __ aesd(v0, v19); __ aesimc(v0, v0);
2809       __ aesd(v0, v20); __ aesimc(v0, v0);
2810     __ BIND(L_rounds_44);
2811       __ aesd(v0, v21); __ aesimc(v0, v0);
2812       __ aesd(v0, v22); __ aesimc(v0, v0);
2813       __ aesd(v0, v23); __ aesimc(v0, v0);
2814       __ aesd(v0, v24); __ aesimc(v0, v0);
2815       __ aesd(v0, v25); __ aesimc(v0, v0);
2816       __ aesd(v0, v26); __ aesimc(v0, v0);
2817       __ aesd(v0, v27); __ aesimc(v0, v0);
2818       __ aesd(v0, v28); __ aesimc(v0, v0);
2819       __ aesd(v0, v29); __ aesimc(v0, v0);
2820       __ aesd(v0, v30);
2821       __ eor(v0, __ T16B, v0, v31);
2822       __ eor(v0, __ T16B, v0, v2);
2823 
2824       __ st1(v0, __ T16B, __ post(to, 16));
2825       __ orr(v2, __ T16B, v1, v1);
2826 
2827       __ subw(len_reg, len_reg, 16);
2828       __ cbnzw(len_reg, L_aes_loop);
2829 
2830       __ st1(v2, __ T16B, rvec);
2831 
2832       __ mov(r0, rscratch2);
2833 
2834       __ leave();
2835       __ ret(lr);
2836 
2837     return start;
2838   }
2839 
2840   // CTR AES crypt.
2841   // Arguments:
2842   //
2843   // Inputs:
2844   //   c_rarg0   - source byte array address
2845   //   c_rarg1   - destination byte array address
2846   //   c_rarg2   - K (key) in little endian int array
2847   //   c_rarg3   - counter vector byte array address
2848   //   c_rarg4   - input length
2849   //   c_rarg5   - saved encryptedCounter start
2850   //   c_rarg6   - saved used length
2851   //
2852   // Output:
2853   //   r0       - input length
2854   //
2855   address generate_counterMode_AESCrypt() {
2856     const Register in = c_rarg0;
2857     const Register out = c_rarg1;
2858     const Register key = c_rarg2;
2859     const Register counter = c_rarg3;
2860     const Register saved_len = c_rarg4, len = r10;
2861     const Register saved_encrypted_ctr = c_rarg5;
2862     const Register used_ptr = c_rarg6, used = r12;
2863 
2864     const Register offset = r7;
2865     const Register keylen = r11;
2866 
2867     const unsigned char block_size = 16;
2868     const int bulk_width = 4;
2869     // NB: bulk_width can be 4 or 8.  8 gives slightly better
2870     // performance with larger data sizes, but it also means that the
2871     // fast path is not taken until at least 8 blocks are available,
2872     // so up to 127 bytes of data go through the slow path.  For
2873     // that reason, and also so as not to blow away too much icache, 4
2874     // blocks seems like a sensible compromise.
2875 
2876     // Algorithm:
2877     //
2878     //    if (len == 0) {
2879     //        goto DONE;
2880     //    }
2881     //    int result = len;
2882     //    do {
2883     //        if (used >= blockSize) {
2884     //            if (len >= bulk_width * blockSize) {
2885     //                CTR_large_block();
2886     //                if (len == 0)
2887     //                    goto DONE;
2888     //            }
2889     //            for (;;) {
2890     //                16ByteVector v0 = counter;
2891     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2892     //                used = 0;
2893     //                if (len < blockSize)
2894     //                    break;    /* goto NEXT */
2895     //                16ByteVector v1 = load16Bytes(in, offset);
2896     //                v1 = v1 ^ encryptedCounter;
2897     //                store16Bytes(v1, out, offset);
2898     //                used = blockSize;
2899     //                offset += blockSize;
2900     //                len -= blockSize;
2901     //                if (len == 0)
2902     //                    goto DONE;
2903     //            }
2904     //        }
2905     //      NEXT:
2906     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2907     //        len--;
2908     //    } while (len != 0);
2909     //  DONE:
2910     //    return result;
2911     //
2912     // CTR_large_block()
2913     //    Wide bulk encryption of whole blocks.
2914 
2915     __ align(CodeEntryAlignment);
2916     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2917     const address start = __ pc();
2918     __ enter();
2919 
2920     Label DONE, CTR_large_block, large_block_return;
2921     __ ldrw(used, Address(used_ptr));
2922     __ cbzw(saved_len, DONE);
2923 
2924     __ mov(len, saved_len);
2925     __ mov(offset, 0);
2926 
2927     // Compute #rounds for AES based on the length of the key array
2928     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2929 
2930     __ aesenc_loadkeys(key, keylen);
2931 
2932     {
2933       Label L_CTR_loop, NEXT;
2934 
2935       __ bind(L_CTR_loop);
2936 
2937       __ cmp(used, block_size);
2938       __ br(__ LO, NEXT);
2939 
2940       // Maybe we have a lot of data
2941       __ subsw(rscratch1, len, bulk_width * block_size);
2942       __ br(__ HS, CTR_large_block);
2943       __ BIND(large_block_return);
2944       __ cbzw(len, DONE);
2945 
2946       // Setup the counter
2947       __ movi(v4, __ T4S, 0);
2948       __ movi(v5, __ T4S, 1);
2949       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2950 
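           // rev32 below gives a byte-swapped view of each 32-bit word of the
           // counter block, the vector add of v4 = { 0, 0, 0, 1 } bumps the
           // last word, and a second rev32 restores the byte order before the
           // incremented counter is stored back.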
2951       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2952       __ rev32(v16, __ T16B, v0);
2953       __ addv(v16, __ T4S, v16, v4);
2954       __ rev32(v16, __ T16B, v16);
2955       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2956 
2957       {
2958         // We have fewer than bulk_width blocks of data left. Encrypt
2959         // them one by one until there is less than a full block
2960         // remaining, being careful to save both the encrypted counter
2961         // and the counter.
2962 
2963         Label inner_loop;
2964         __ bind(inner_loop);
2965         // Counter to encrypt is in v0
2966         __ aesecb_encrypt(noreg, noreg, keylen);
2967         __ st1(v0, __ T16B, saved_encrypted_ctr);
2968 
2969         // Do we have a remaining full block?
2970 
2971         __ mov(used, 0);
2972         __ cmp(len, block_size);
2973         __ br(__ LO, NEXT);
2974 
2975         // Yes, we have a full block
2976         __ ldrq(v1, Address(in, offset));
2977         __ eor(v1, __ T16B, v1, v0);
2978         __ strq(v1, Address(out, offset));
2979         __ mov(used, block_size);
2980         __ add(offset, offset, block_size);
2981 
2982         __ subw(len, len, block_size);
2983         __ cbzw(len, DONE);
2984 
2985         // Increment the counter, store it back
2986         __ orr(v0, __ T16B, v16, v16);
2987         __ rev32(v16, __ T16B, v16);
2988         __ addv(v16, __ T4S, v16, v4);
2989         __ rev32(v16, __ T16B, v16);
2990         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2991 
2992         __ b(inner_loop);
2993       }
2994 
2995       __ BIND(NEXT);
2996 
2997       // Encrypt a single byte, and loop.
2998       // We expect this to be a rare event.
2999       __ ldrb(rscratch1, Address(in, offset));
3000       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3001       __ eor(rscratch1, rscratch1, rscratch2);
3002       __ strb(rscratch1, Address(out, offset));
3003       __ add(offset, offset, 1);
3004       __ add(used, used, 1);
3005       __ subw(len, len, 1);
3006       __ cbnzw(len, L_CTR_loop);
3007     }
3008 
3009     __ bind(DONE);
3010     __ strw(used, Address(used_ptr));
3011     __ mov(r0, saved_len);
3012 
3013     __ leave(); // required for proper stackwalking of RuntimeStub frame
3014     __ ret(lr);
3015 
3016     // Bulk encryption
3017 
3018     __ BIND (CTR_large_block);
3019     assert(bulk_width == 4 || bulk_width == 8, "must be");
3020 
3021     if (bulk_width == 8) {
3022       __ sub(sp, sp, 4 * 16);
3023       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3024     }
3025     __ sub(sp, sp, 4 * 16);
3026     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3027     RegSet saved_regs = (RegSet::of(in, out, offset)
3028                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3029     __ push(saved_regs, sp);
3030     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3031     __ add(in, in, offset);
3032     __ add(out, out, offset);
3033 
3034     // Keys should already be loaded into the correct registers
3035 
3036     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3037     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3038 
3039     // AES/CTR loop
3040     {
3041       Label L_CTR_loop;
3042       __ BIND(L_CTR_loop);
3043 
3044       // Setup the counters
3045       __ movi(v8, __ T4S, 0);
3046       __ movi(v9, __ T4S, 1);
3047       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3048 
3049       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3050         __ rev32(f, __ T16B, v16);
3051         __ addv(v16, __ T4S, v16, v8);
3052       }
3053 
3054       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3055 
3056       // Encrypt the counters
3057       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3058 
3059       if (bulk_width == 8) {
3060         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3061       }
3062 
3063       // XOR the encrypted counters with the inputs
3064       for (int i = 0; i < bulk_width; i++) {
3065         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3066       }
3067 
3068       // Write the encrypted data
3069       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3070       if (bulk_width == 8) {
3071         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3072       }
3073 
3074       __ subw(len, len, 16 * bulk_width);
3075       __ cbnzw(len, L_CTR_loop);
3076     }
3077 
3078     // Save the counter back where it goes
3079     __ rev32(v16, __ T16B, v16);
3080     __ st1(v16, __ T16B, counter);
3081 
3082     __ pop(saved_regs, sp);
3083 
3084     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3085     if (bulk_width == 8) {
3086       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3087     }
3088 
3089     __ andr(rscratch1, len, -16 * bulk_width);
3090     __ sub(len, len, rscratch1);
3091     __ add(offset, offset, rscratch1);
3092     __ mov(used, 16);
3093     __ strw(used, Address(used_ptr));
3094     __ b(large_block_return);
3095 
3096     return start;
3097   }
3098 
3099   // Vector AES Galois Counter Mode implementation. Parameters:
3100   //
3101   // in = c_rarg0
3102   // len = c_rarg1
3103   // ct = c_rarg2 - ciphertext that ghash will read (out for encrypt, in for decrypt)
3104   // out = c_rarg3
3105   // key = c_rarg4
3106   // state = c_rarg5 - GHASH.state
3107   // subkeyHtbl = c_rarg6 - powers of H
3108   // counter = c_rarg7 - 16 bytes of CTR
3109   // return - number of processed bytes
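       //
       // The stub processes len (rounded down below to a multiple of 128
       // bytes) in 8-block chunks: an 8-way unrolled AES/CTR pass
       // encrypts/decrypts the whole region first, and ghash_processBlocks_wide
       // then folds the ciphertext (ct) into the GHASH state.  Any remaining
       // tail is left to the caller, which is why the number of processed
       // bytes is returned.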
3110   address generate_galoisCounterMode_AESCrypt() {
3111     address ghash_polynomial = __ pc();
3112     __ emit_int64(0x87);  // The low-order bits of the field
3113                           // polynomial (i.e. p = z^7+z^2+z+1)
3114                           // repeated in the low and high parts of a
3115                           // 128-bit vector
3116     __ emit_int64(0x87);
3117 
3118     __ align(CodeEntryAlignment);
3119     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3120     address start = __ pc();
3121     __ enter();
3122 
3123     const Register in = c_rarg0;
3124     const Register len = c_rarg1;
3125     const Register ct = c_rarg2;
3126     const Register out = c_rarg3;
3128 
3129     const Register key = c_rarg4;
3130     const Register state = c_rarg5;
3131 
3132     const Register subkeyHtbl = c_rarg6;
3133 
3134     const Register counter = c_rarg7;  // updated with the incremented counter at the end
3135 
3136     const Register keylen = r10;
3137     // Save state before entering routine
3138     __ sub(sp, sp, 4 * 16);
3139     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3140     __ sub(sp, sp, 4 * 16);
3141     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3142 
3144     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3145     __ str(len, __ pre(sp, -2 * wordSize));
3146 
3147     Label DONE;
3148     __ cbz(len, DONE);
3149 
3150     // Compute #rounds for AES based on the length of the key array
3151     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3152 
3153     __ aesenc_loadkeys(key, keylen);
3154     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3155     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3156 
3157     // AES/CTR loop
3158     {
3159       Label L_CTR_loop;
3160       __ BIND(L_CTR_loop);
3161 
3162       // Setup the counters
3163       __ movi(v8, __ T4S, 0);
3164       __ movi(v9, __ T4S, 1);
3165       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3166       for (FloatRegister f = v0; f < v8; f++) {
3167         __ rev32(f, __ T16B, v16);
3168         __ addv(v16, __ T4S, v16, v8);
3169       }
3170 
3171       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3172 
3173       // Encrypt the counters
3174       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3175 
3176       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3177 
3178       // XOR the encrypted counters with the inputs
3179       for (int i = 0; i < 8; i++) {
3180         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3181       }
3182       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3183       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3184 
3185       __ subw(len, len, 16 * 8);
3186       __ cbnzw(len, L_CTR_loop);
3187     }
3188 
3189     __ rev32(v16, __ T16B, v16);
3190     __ st1(v16, __ T16B, counter);
3191 
3192     __ ldr(len, Address(sp));
3193     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3194 
3195     // GHASH/CTR loop
3196     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3197                                 len, /*unrolls*/4);
3198 
3199 #ifdef ASSERT
3200     { Label L;
3201       __ cmp(len, (unsigned char)0);
3202       __ br(Assembler::EQ, L);
3203       __ stop("stubGenerator: abort");
3204       __ bind(L);
3205     }
3206 #endif
3207 
3208     __ bind(DONE);
3209     // Return the number of bytes processed
3210     __ ldr(r0, __ post(sp, 2 * wordSize));
3211 
3212     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3213     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3214 
3215     __ leave(); // required for proper stackwalking of RuntimeStub frame
3216     __ ret(lr);
3217     return start;
3218   }
3219 
3220   // Arguments:
3221   //
3222   // Inputs:
3223   //   c_rarg0   - byte[]  source+offset
3224   //   c_rarg1   - int[]   MD5.state
3225   //   c_rarg2   - int     offset
3226   //   c_rarg3   - int     limit
3227   //
3228   address generate_md5_implCompress(bool multi_block, const char *name) {
3229     __ align(CodeEntryAlignment);
3230     StubCodeMark mark(this, "StubRoutines", name);
3231     address start = __ pc();
3232 
3233     Register buf       = c_rarg0;
3234     Register state     = c_rarg1;
3235     Register ofs       = c_rarg2;
3236     Register limit     = c_rarg3;
3237     Register a         = r4;
3238     Register b         = r5;
3239     Register c         = r6;
3240     Register d         = r7;
3241     Register rscratch3 = r10;
3242     Register rscratch4 = r11;
3243 
3245     Label md5_loop;
3246 
3247     __ BIND(md5_loop);
3248 
3249     // Save hash values for addition after rounds
3250     __ ldrw(a, Address(state,  0));
3251     __ ldrw(b, Address(state,  4));
3252     __ ldrw(c, Address(state,  8));
3253     __ ldrw(d, Address(state, 12));
3254 
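         // Each of the macros below performs one MD5 step,
         //   r1 = r2 + rotl(r1 + f(r2, r3, r4) + X[k] + t, s),
         // with the standard round functions of RFC 1321 computed branch-free:
         //   FF: f(x, y, z) = (x & y) | (~x & z)    GG: f(x, y, z) = (x & z) | (y & ~z)
         //   HH: f(x, y, z) = x ^ y ^ z             II: f(x, y, z) = y ^ (x | ~z)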
3255 #define FF(r1, r2, r3, r4, k, s, t)              \
3256     __ eorw(rscratch3, r3, r4);                  \
3257     __ movw(rscratch2, t);                       \
3258     __ andw(rscratch3, rscratch3, r2);           \
3259     __ addw(rscratch4, r1, rscratch2);           \
3260     __ ldrw(rscratch1, Address(buf, k*4));       \
3261     __ eorw(rscratch3, rscratch3, r4);           \
3262     __ addw(rscratch3, rscratch3, rscratch1);    \
3263     __ addw(rscratch3, rscratch3, rscratch4);    \
3264     __ rorw(rscratch2, rscratch3, 32 - s);       \
3265     __ addw(r1, rscratch2, r2);
3266 
3267 #define GG(r1, r2, r3, r4, k, s, t)              \
3268     __ eorw(rscratch2, r2, r3);                  \
3269     __ ldrw(rscratch1, Address(buf, k*4));       \
3270     __ andw(rscratch3, rscratch2, r4);           \
3271     __ movw(rscratch2, t);                       \
3272     __ eorw(rscratch3, rscratch3, r3);           \
3273     __ addw(rscratch4, r1, rscratch2);           \
3274     __ addw(rscratch3, rscratch3, rscratch1);    \
3275     __ addw(rscratch3, rscratch3, rscratch4);    \
3276     __ rorw(rscratch2, rscratch3, 32 - s);       \
3277     __ addw(r1, rscratch2, r2);
3278 
3279 #define HH(r1, r2, r3, r4, k, s, t)              \
3280     __ eorw(rscratch3, r3, r4);                  \
3281     __ movw(rscratch2, t);                       \
3282     __ addw(rscratch4, r1, rscratch2);           \
3283     __ ldrw(rscratch1, Address(buf, k*4));       \
3284     __ eorw(rscratch3, rscratch3, r2);           \
3285     __ addw(rscratch3, rscratch3, rscratch1);    \
3286     __ addw(rscratch3, rscratch3, rscratch4);    \
3287     __ rorw(rscratch2, rscratch3, 32 - s);       \
3288     __ addw(r1, rscratch2, r2);
3289 
3290 #define II(r1, r2, r3, r4, k, s, t)              \
3291     __ movw(rscratch3, t);                       \
3292     __ ornw(rscratch2, r2, r4);                  \
3293     __ addw(rscratch4, r1, rscratch3);           \
3294     __ ldrw(rscratch1, Address(buf, k*4));       \
3295     __ eorw(rscratch3, rscratch2, r3);           \
3296     __ addw(rscratch3, rscratch3, rscratch1);    \
3297     __ addw(rscratch3, rscratch3, rscratch4);    \
3298     __ rorw(rscratch2, rscratch3, 32 - s);       \
3299     __ addw(r1, rscratch2, r2);
3300 
3301     // Round 1
3302     FF(a, b, c, d,  0,  7, 0xd76aa478)
3303     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3304     FF(c, d, a, b,  2, 17, 0x242070db)
3305     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3306     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3307     FF(d, a, b, c,  5, 12, 0x4787c62a)
3308     FF(c, d, a, b,  6, 17, 0xa8304613)
3309     FF(b, c, d, a,  7, 22, 0xfd469501)
3310     FF(a, b, c, d,  8,  7, 0x698098d8)
3311     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3312     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3313     FF(b, c, d, a, 11, 22, 0x895cd7be)
3314     FF(a, b, c, d, 12,  7, 0x6b901122)
3315     FF(d, a, b, c, 13, 12, 0xfd987193)
3316     FF(c, d, a, b, 14, 17, 0xa679438e)
3317     FF(b, c, d, a, 15, 22, 0x49b40821)
3318 
3319     // Round 2
3320     GG(a, b, c, d,  1,  5, 0xf61e2562)
3321     GG(d, a, b, c,  6,  9, 0xc040b340)
3322     GG(c, d, a, b, 11, 14, 0x265e5a51)
3323     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3324     GG(a, b, c, d,  5,  5, 0xd62f105d)
3325     GG(d, a, b, c, 10,  9, 0x02441453)
3326     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3327     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3328     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3329     GG(d, a, b, c, 14,  9, 0xc33707d6)
3330     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3331     GG(b, c, d, a,  8, 20, 0x455a14ed)
3332     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3333     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3334     GG(c, d, a, b,  7, 14, 0x676f02d9)
3335     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3336 
3337     // Round 3
3338     HH(a, b, c, d,  5,  4, 0xfffa3942)
3339     HH(d, a, b, c,  8, 11, 0x8771f681)
3340     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3341     HH(b, c, d, a, 14, 23, 0xfde5380c)
3342     HH(a, b, c, d,  1,  4, 0xa4beea44)
3343     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3344     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3345     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3346     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3347     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3348     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3349     HH(b, c, d, a,  6, 23, 0x04881d05)
3350     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3351     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3352     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3353     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3354 
3355     // Round 4
3356     II(a, b, c, d,  0,  6, 0xf4292244)
3357     II(d, a, b, c,  7, 10, 0x432aff97)
3358     II(c, d, a, b, 14, 15, 0xab9423a7)
3359     II(b, c, d, a,  5, 21, 0xfc93a039)
3360     II(a, b, c, d, 12,  6, 0x655b59c3)
3361     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3362     II(c, d, a, b, 10, 15, 0xffeff47d)
3363     II(b, c, d, a,  1, 21, 0x85845dd1)
3364     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3365     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3366     II(c, d, a, b,  6, 15, 0xa3014314)
3367     II(b, c, d, a, 13, 21, 0x4e0811a1)
3368     II(a, b, c, d,  4,  6, 0xf7537e82)
3369     II(d, a, b, c, 11, 10, 0xbd3af235)
3370     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3371     II(b, c, d, a,  9, 21, 0xeb86d391)
3372 
3373 #undef FF
3374 #undef GG
3375 #undef HH
3376 #undef II
3377 
3378     // write hash values back in the correct order
3379     __ ldrw(rscratch1, Address(state,  0));
3380     __ addw(rscratch1, rscratch1, a);
3381     __ strw(rscratch1, Address(state,  0));
3382 
3383     __ ldrw(rscratch2, Address(state,  4));
3384     __ addw(rscratch2, rscratch2, b);
3385     __ strw(rscratch2, Address(state,  4));
3386 
3387     __ ldrw(rscratch3, Address(state,  8));
3388     __ addw(rscratch3, rscratch3, c);
3389     __ strw(rscratch3, Address(state,  8));
3390 
3391     __ ldrw(rscratch4, Address(state, 12));
3392     __ addw(rscratch4, rscratch4, d);
3393     __ strw(rscratch4, Address(state, 12));
3394 
3395     if (multi_block) {
3396       __ add(buf, buf, 64);
3397       __ add(ofs, ofs, 64);
3398       __ cmp(ofs, limit);
3399       __ br(Assembler::LE, md5_loop);
3400       __ mov(c_rarg0, ofs); // return ofs
3401     }
3402 
3403     __ ret(lr);
3404 
3405     return start;
3406   }
3407 
3408   // Arguments:
3409   //
3410   // Inputs:
3411   //   c_rarg0   - byte[]  source+offset
3412   //   c_rarg1   - int[]   SHA.state
3413   //   c_rarg2   - int     offset
3414   //   c_rarg3   - int     limit
3415   //
3416   address generate_sha1_implCompress(bool multi_block, const char *name) {
3417     __ align(CodeEntryAlignment);
3418     StubCodeMark mark(this, "StubRoutines", name);
3419     address start = __ pc();
3420 
3421     Register buf   = c_rarg0;
3422     Register state = c_rarg1;
3423     Register ofs   = c_rarg2;
3424     Register limit = c_rarg3;
3425 
3426     Label keys;
3427     Label sha1_loop;
3428 
3429     // load the keys into v0..v3
3430     __ adr(rscratch1, keys);
3431     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3432     // load the 5-word state into v6, v7
3433     __ ldrq(v6, Address(state, 0));
3434     __ ldrs(v7, Address(state, 16));
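         // v6/v7 hold the running hash state ({ a, b, c, d } and e).  Inside
         // the loop v20 is the working copy of { a, b, c, d }, v21/v22 (via
         // sha1h) carry the value playing the role of e, and the four round
         // constants stay broadcast across v0..v3 (loaded by ld4r above).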
3435 
3436 
3437     __ BIND(sha1_loop);
3438     // load 64 bytes of data into v16..v19
3439     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3440     __ rev32(v16, __ T16B, v16);
3441     __ rev32(v17, __ T16B, v17);
3442     __ rev32(v18, __ T16B, v18);
3443     __ rev32(v19, __ T16B, v19);
3444 
3445     // do the sha1
3446     __ addv(v4, __ T4S, v16, v0);
3447     __ orr(v20, __ T16B, v6, v6);
3448 
3449     FloatRegister d0 = v16;
3450     FloatRegister d1 = v17;
3451     FloatRegister d2 = v18;
3452     FloatRegister d3 = v19;
3453 
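         // The loop below covers all 80 SHA-1 rounds, four per iteration:
         // sha1c (Ch) for rounds 0..19, sha1p (Parity) for rounds 20..39 and
         // 60..79, and sha1m (Maj) for rounds 40..59.  d0..d3 rotate through
         // the 16-word message schedule, which sha1su0/sha1su1 extend in place.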
3454     for (int round = 0; round < 20; round++) {
3455       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3456       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3457       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3458       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3459       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3460 
3461       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3462       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3463       __ sha1h(tmp2, __ T4S, v20);
3464       if (round < 5)
3465         __ sha1c(v20, __ T4S, tmp3, tmp4);
3466       else if (round < 10 || round >= 15)
3467         __ sha1p(v20, __ T4S, tmp3, tmp4);
3468       else
3469         __ sha1m(v20, __ T4S, tmp3, tmp4);
3470       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3471 
3472       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3473     }
3474 
3475     __ addv(v7, __ T2S, v7, v21);
3476     __ addv(v6, __ T4S, v6, v20);
3477 
3478     if (multi_block) {
3479       __ add(ofs, ofs, 64);
3480       __ cmp(ofs, limit);
3481       __ br(Assembler::LE, sha1_loop);
3482       __ mov(c_rarg0, ofs); // return ofs
3483     }
3484 
3485     __ strq(v6, Address(state, 0));
3486     __ strs(v7, Address(state, 16));
3487 
3488     __ ret(lr);
3489 
3490     __ bind(keys);
3491     __ emit_int32(0x5a827999);
3492     __ emit_int32(0x6ed9eba1);
3493     __ emit_int32(0x8f1bbcdc);
3494     __ emit_int32(0xca62c1d6);
3495 
3496     return start;
3497   }
3498 
3499 
3500   // Arguments:
3501   //
3502   // Inputs:
3503   //   c_rarg0   - byte[]  source+offset
3504   //   c_rarg1   - int[]   SHA.state
3505   //   c_rarg2   - int     offset
3506   //   c_rarg3   - int     limit
3507   //
3508   address generate_sha256_implCompress(bool multi_block, const char *name) {
3509     static const uint32_t round_consts[64] = {
3510       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3511       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3512       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3513       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3514       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3515       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3516       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3517       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3518       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3519       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3520       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3521       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3522       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3523       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3524       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3525       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3526     };
3527     __ align(CodeEntryAlignment);
3528     StubCodeMark mark(this, "StubRoutines", name);
3529     address start = __ pc();
3530 
3531     Register buf   = c_rarg0;
3532     Register state = c_rarg1;
3533     Register ofs   = c_rarg2;
3534     Register limit = c_rarg3;
3535 
3536     Label sha256_loop;
3537 
3538     __ stpd(v8, v9, __ pre(sp, -32));
3539     __ stpd(v10, v11, Address(sp, 16));
3540 
3541     // dga == v0
3542     // dgb == v1
3543     // dg0 == v2
3544     // dg1 == v3
3545     // dg2 == v4
3546     // t0  == v6
3547     // t1  == v7
3548 
3549     // load 16 keys to v16..v31
3550     __ lea(rscratch1, ExternalAddress((address)round_consts));
3551     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3552     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3553     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3554     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3555 
3556     // load the 8-word (256-bit) state
3557     __ ldpq(v0, v1, state);
3558 
3559     __ BIND(sha256_loop);
3560     // load 64 bytes of data into v8..v11
3561     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3562     __ rev32(v8, __ T16B, v8);
3563     __ rev32(v9, __ T16B, v9);
3564     __ rev32(v10, __ T16B, v10);
3565     __ rev32(v11, __ T16B, v11);
3566 
3567     __ addv(v6, __ T4S, v8, v16);
3568     __ orr(v2, __ T16B, v0, v0);
3569     __ orr(v3, __ T16B, v1, v1);
3570 
3571     FloatRegister d0 = v8;
3572     FloatRegister d1 = v9;
3573     FloatRegister d2 = v10;
3574     FloatRegister d3 = v11;
3575 
3576 
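         // The loop below covers all 64 SHA-256 rounds, four per iteration.
         // sha256h/sha256h2 update the two 128-bit halves of the state (dg0 in
         // v2, dg1 in v3), sha256su0/sha256su1 extend the 16-word message
         // schedule held in d0..d3, and the round constants preloaded into
         // v16..v31 are folded in through t0/t1 (v6/v7).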
3577     for (int round = 0; round < 16; round++) {
3578       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3579       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3580       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3581       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3582 
3583       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3584        __ orr(v4, __ T16B, v2, v2);
3585       if (round < 15)
3586         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3587       __ sha256h(v2, __ T4S, v3, tmp2);
3588       __ sha256h2(v3, __ T4S, v4, tmp2);
3589       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3590 
3591       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3592     }
3593 
3594     __ addv(v0, __ T4S, v0, v2);
3595     __ addv(v1, __ T4S, v1, v3);
3596 
3597     if (multi_block) {
3598       __ add(ofs, ofs, 64);
3599       __ cmp(ofs, limit);
3600       __ br(Assembler::LE, sha256_loop);
3601       __ mov(c_rarg0, ofs); // return ofs
3602     }
3603 
3604     __ ldpd(v10, v11, Address(sp, 16));
3605     __ ldpd(v8, v9, __ post(sp, 32));
3606 
3607     __ stpq(v0, v1, state);
3608 
3609     __ ret(lr);
3610 
3611     return start;
3612   }
3613 
3614   // Arguments:
3615   //
3616   // Inputs:
3617   //   c_rarg0   - byte[]  source+offset
3618   //   c_rarg1   - int[]   SHA.state
3619   //   c_rarg2   - int     offset
3620   //   c_rarg3   - int     limit
3621   //
3622   address generate_sha512_implCompress(bool multi_block, const char *name) {
3623     static const uint64_t round_consts[80] = {
3624       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3625       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3626       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3627       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3628       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3629       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3630       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3631       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3632       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3633       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3634       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3635       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3636       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3637       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3638       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3639       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3640       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3641       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3642       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3643       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3644       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3645       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3646       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3647       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3648       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3649       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3650       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3651     };
3652 
3653     // Double rounds for sha512.
3654     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3655       if (dr < 36)                                                                   \
3656         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3657       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3658       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3659       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3660       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3661       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3662       if (dr < 32) {                                                                 \
3663         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3664         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3665       }                                                                              \
3666       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3667       if (dr < 32)                                                                   \
3668         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3669       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3670       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);
3671 
3672     __ align(CodeEntryAlignment);
3673     StubCodeMark mark(this, "StubRoutines", name);
3674     address start = __ pc();
3675 
3676     Register buf   = c_rarg0;
3677     Register state = c_rarg1;
3678     Register ofs   = c_rarg2;
3679     Register limit = c_rarg3;
3680 
3681     __ stpd(v8, v9, __ pre(sp, -64));
3682     __ stpd(v10, v11, Address(sp, 16));
3683     __ stpd(v12, v13, Address(sp, 32));
3684     __ stpd(v14, v15, Address(sp, 48));
3685 
3686     Label sha512_loop;
3687 
3688     // load state
3689     __ ld1(v8, v9, v10, v11, __ T2D, state);
3690 
3691     // load first 4 round constants
3692     __ lea(rscratch1, ExternalAddress((address)round_consts));
3693     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3694 
3695     __ BIND(sha512_loop);
3696     // load 128B of data into v12..v19
3697     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3698     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3699     __ rev64(v12, __ T16B, v12);
3700     __ rev64(v13, __ T16B, v13);
3701     __ rev64(v14, __ T16B, v14);
3702     __ rev64(v15, __ T16B, v15);
3703     __ rev64(v16, __ T16B, v16);
3704     __ rev64(v17, __ T16B, v17);
3705     __ rev64(v18, __ T16B, v18);
3706     __ rev64(v19, __ T16B, v19);
3707 
3708     __ mov(rscratch2, rscratch1);
3709 
3710     __ mov(v0, __ T16B, v8);
3711     __ mov(v1, __ T16B, v9);
3712     __ mov(v2, __ T16B, v10);
3713     __ mov(v3, __ T16B, v11);
3714 
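         // The 40 double-rounds below cover the 80 SHA-512 rounds, two per
         // invocation.  Each one consumes a pair of 64-bit round constants
         // (rc0, with the next pair prefetched into rc1 for the first 36
         // double-rounds) and, for the first 32 double-rounds, extends the
         // message schedule in v12..v19 with sha512su0/sha512su1.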
3715     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3716     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3717     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3718     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3719     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3720     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3721     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3722     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3723     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3724     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3725     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3726     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3727     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3728     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3729     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3730     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3731     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3732     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3733     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3734     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3735     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3736     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3737     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3738     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3739     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3740     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3741     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3742     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3743     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3744     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3745     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3746     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3747     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3748     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3749     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3750     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3751     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3752     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3753     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3754     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3755 
3756     __ addv(v8, __ T2D, v8, v0);
3757     __ addv(v9, __ T2D, v9, v1);
3758     __ addv(v10, __ T2D, v10, v2);
3759     __ addv(v11, __ T2D, v11, v3);
3760 
3761     if (multi_block) {
3762       __ add(ofs, ofs, 128);
3763       __ cmp(ofs, limit);
3764       __ br(Assembler::LE, sha512_loop);
3765       __ mov(c_rarg0, ofs); // return ofs
3766     }
3767 
3768     __ st1(v8, v9, v10, v11, __ T2D, state);
3769 
3770     __ ldpd(v14, v15, Address(sp, 48));
3771     __ ldpd(v12, v13, Address(sp, 32));
3772     __ ldpd(v10, v11, Address(sp, 16));
3773     __ ldpd(v8, v9, __ post(sp, 64));
3774 
3775     __ ret(lr);
3776 
3777     return start;
3778   }
3779 
3780   // Arguments:
3781   //
3782   // Inputs:
3783   //   c_rarg0   - byte[]  source+offset
3784   //   c_rarg1   - byte[]   SHA.state
3785   //   c_rarg2   - int     digest_length
3786   //   c_rarg3   - int     offset
3787   //   c_rarg4   - int     limit
3788   //
3789   address generate_sha3_implCompress(bool multi_block, const char *name) {
3790     static const uint64_t round_consts[24] = {
3791       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3792       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3793       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3794       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3795       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3796       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3797       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3798       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3799     };
3800 
3801     __ align(CodeEntryAlignment);
3802     StubCodeMark mark(this, "StubRoutines", name);
3803     address start = __ pc();
3804 
3805     Register buf           = c_rarg0;
3806     Register state         = c_rarg1;
3807     Register digest_length = c_rarg2;
3808     Register ofs           = c_rarg3;
3809     Register limit         = c_rarg4;
3810 
3811     Label sha3_loop, rounds24_loop;
3812     Label sha3_512, sha3_384_or_224, sha3_256;
3813 
3814     __ stpd(v8, v9, __ pre(sp, -64));
3815     __ stpd(v10, v11, Address(sp, 16));
3816     __ stpd(v12, v13, Address(sp, 32));
3817     __ stpd(v14, v15, Address(sp, 48));
3818 
3819     // load state
3820     __ add(rscratch1, state, 32);
3821     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3822     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3823     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3824     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3825     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3826     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3827     __ ld1(v24, __ T1D, rscratch1);
3828 
3829     __ BIND(sha3_loop);
3830 
3831     // 24 keccak rounds
3832     __ movw(rscratch2, 24);
3833 
3834     // load round_constants base
3835     __ lea(rscratch1, ExternalAddress((address) round_consts));
3836 
3837     // load input
3838     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3839     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3840     __ eor(v0, __ T8B, v0, v25);
3841     __ eor(v1, __ T8B, v1, v26);
3842     __ eor(v2, __ T8B, v2, v27);
3843     __ eor(v3, __ T8B, v3, v28);
3844     __ eor(v4, __ T8B, v4, v29);
3845     __ eor(v5, __ T8B, v5, v30);
3846     __ eor(v6, __ T8B, v6, v31);
3847 
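         // Keccak absorbs rate = 200 - 2 * digest_length bytes per block:
         // 72 for SHA3-512, 104 for SHA3-384, 136 for SHA3-256 and 144 for
         // SHA3-224.  The first 56 bytes were XORed into the state above; the
         // branches below absorb the remainder for each variant.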
3848     // digest_length == 64, SHA3-512
3849     __ tbnz(digest_length, 6, sha3_512);
3850 
3851     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3852     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3853     __ eor(v7, __ T8B, v7, v25);
3854     __ eor(v8, __ T8B, v8, v26);
3855     __ eor(v9, __ T8B, v9, v27);
3856     __ eor(v10, __ T8B, v10, v28);
3857     __ eor(v11, __ T8B, v11, v29);
3858     __ eor(v12, __ T8B, v12, v30);
3859 
3860     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3861     __ tbnz(digest_length, 4, sha3_384_or_224);
3862 
3863     // SHA3-256
3864     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3865     __ eor(v13, __ T8B, v13, v25);
3866     __ eor(v14, __ T8B, v14, v26);
3867     __ eor(v15, __ T8B, v15, v27);
3868     __ eor(v16, __ T8B, v16, v28);
3869     __ b(rounds24_loop);
3870 
3871     __ BIND(sha3_384_or_224);
3872     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3873 
3874     // SHA3-224
3875     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3876     __ ld1(v29, __ T8B, __ post(buf, 8));
3877     __ eor(v13, __ T8B, v13, v25);
3878     __ eor(v14, __ T8B, v14, v26);
3879     __ eor(v15, __ T8B, v15, v27);
3880     __ eor(v16, __ T8B, v16, v28);
3881     __ eor(v17, __ T8B, v17, v29);
3882     __ b(rounds24_loop);
3883 
3884     __ BIND(sha3_512);
3885     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3886     __ eor(v7, __ T8B, v7, v25);
3887     __ eor(v8, __ T8B, v8, v26);
3888 
3889     __ BIND(rounds24_loop);
3890     __ subw(rscratch2, rscratch2, 1);
3891 
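         // One round of Keccak-f[1600], using the SHA3 extension instructions:
         //   eor3       - theta: column parities
         //   rax1       - theta: D[x] = C[x-1] ^ rotl(C[x+1], 1)
         //   xar        - rho + pi: rotate each lane and move it to its new position
         //   bcax       - chi: a ^ (~b & c) along each row
         //   ld1r + eor - iota: XOR the round constant into lane (0, 0)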
3892     __ eor3(v29, __ T16B, v4, v9, v14);
3893     __ eor3(v26, __ T16B, v1, v6, v11);
3894     __ eor3(v28, __ T16B, v3, v8, v13);
3895     __ eor3(v25, __ T16B, v0, v5, v10);
3896     __ eor3(v27, __ T16B, v2, v7, v12);
3897     __ eor3(v29, __ T16B, v29, v19, v24);
3898     __ eor3(v26, __ T16B, v26, v16, v21);
3899     __ eor3(v28, __ T16B, v28, v18, v23);
3900     __ eor3(v25, __ T16B, v25, v15, v20);
3901     __ eor3(v27, __ T16B, v27, v17, v22);
3902 
3903     __ rax1(v30, __ T2D, v29, v26);
3904     __ rax1(v26, __ T2D, v26, v28);
3905     __ rax1(v28, __ T2D, v28, v25);
3906     __ rax1(v25, __ T2D, v25, v27);
3907     __ rax1(v27, __ T2D, v27, v29);
3908 
3909     __ eor(v0, __ T16B, v0, v30);
3910     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3911     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3912     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3913     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3914     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3915     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3916     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3917     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3918     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3919     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3920     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3921     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3922     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3923     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3924     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3925     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3926     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3927     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3928     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3929     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3930     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3931     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3932     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3933     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3934 
3935     __ bcax(v20, __ T16B, v31, v22, v8);
3936     __ bcax(v21, __ T16B, v8,  v23, v22);
3937     __ bcax(v22, __ T16B, v22, v24, v23);
3938     __ bcax(v23, __ T16B, v23, v31, v24);
3939     __ bcax(v24, __ T16B, v24, v8,  v31);
3940 
3941     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3942 
3943     __ bcax(v17, __ T16B, v25, v19, v3);
3944     __ bcax(v18, __ T16B, v3,  v15, v19);
3945     __ bcax(v19, __ T16B, v19, v16, v15);
3946     __ bcax(v15, __ T16B, v15, v25, v16);
3947     __ bcax(v16, __ T16B, v16, v3,  v25);
3948 
3949     __ bcax(v10, __ T16B, v29, v12, v26);
3950     __ bcax(v11, __ T16B, v26, v13, v12);
3951     __ bcax(v12, __ T16B, v12, v14, v13);
3952     __ bcax(v13, __ T16B, v13, v29, v14);
3953     __ bcax(v14, __ T16B, v14, v26, v29);
3954 
3955     __ bcax(v7, __ T16B, v30, v9,  v4);
3956     __ bcax(v8, __ T16B, v4,  v5,  v9);
3957     __ bcax(v9, __ T16B, v9,  v6,  v5);
3958     __ bcax(v5, __ T16B, v5,  v30, v6);
3959     __ bcax(v6, __ T16B, v6,  v4,  v30);
3960 
3961     __ bcax(v3, __ T16B, v27, v0,  v28);
3962     __ bcax(v4, __ T16B, v28, v1,  v0);
3963     __ bcax(v0, __ T16B, v0,  v2,  v1);
3964     __ bcax(v1, __ T16B, v1,  v27, v2);
3965     __ bcax(v2, __ T16B, v2,  v28, v27);
3966 
3967     __ eor(v0, __ T16B, v0, v31);
3968 
3969     __ cbnzw(rscratch2, rounds24_loop);
3970 
3971     if (multi_block) {
3972       // block_size =  200 - 2 * digest_length, ofs += block_size
3973       __ add(ofs, ofs, 200);
3974       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3975 
3976       __ cmp(ofs, limit);
3977       __ br(Assembler::LE, sha3_loop);
3978       __ mov(c_rarg0, ofs); // return ofs
3979     }
3980 
3981     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3982     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3983     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3984     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3985     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3986     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3987     __ st1(v24, __ T1D, state);
3988 
3989     __ ldpd(v14, v15, Address(sp, 48));
3990     __ ldpd(v12, v13, Address(sp, 32));
3991     __ ldpd(v10, v11, Address(sp, 16));
3992     __ ldpd(v8, v9, __ post(sp, 64));
3993 
3994     __ ret(lr);
3995 
3996     return start;
3997   }
3998 
3999   /**
4000    *  Arguments:
4001    *
4002    * Inputs:
4003    *   c_rarg0   - int crc
4004    *   c_rarg1   - byte* buf
4005    *   c_rarg2   - int length
4006    *
4007    * Output:
4008    *       r0   - int crc result
4009    */
4010   address generate_updateBytesCRC32() {
4011     assert(UseCRC32Intrinsics, "what are we doing here?");
4012 
4013     __ align(CodeEntryAlignment);
4014     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4015 
4016     address start = __ pc();
4017 
4018     const Register crc   = c_rarg0;  // crc
4019     const Register buf   = c_rarg1;  // source java byte array address
4020     const Register len   = c_rarg2;  // length
4021     const Register table0 = c_rarg3; // crc_table address
4022     const Register table1 = c_rarg4;
4023     const Register table2 = c_rarg5;
4024     const Register table3 = c_rarg6;
4025     const Register tmp3 = c_rarg7;
4026 
4027     BLOCK_COMMENT("Entry:");
4028     __ enter(); // required for proper stackwalking of RuntimeStub frame
4029 
4030     __ kernel_crc32(crc, buf, len,
4031               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4032 
4033     __ leave(); // required for proper stackwalking of RuntimeStub frame
4034     __ ret(lr);
4035 
4036     return start;
4037   }
4038 
4039   /**
4040    *  Arguments:
4041    *
4042    * Inputs:
4043    *   c_rarg0   - int crc
4044    *   c_rarg1   - byte* buf
4045    *   c_rarg2   - int length
4046    *   c_rarg3   - int* table
4047    *
4048    * Output:
4049    *       r0   - int crc result
4050    */
4051   address generate_updateBytesCRC32C() {
4052     assert(UseCRC32CIntrinsics, "what are we doing here?");
4053 
4054     __ align(CodeEntryAlignment);
4055     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4056 
4057     address start = __ pc();
4058 
4059     const Register crc   = c_rarg0;  // crc
4060     const Register buf   = c_rarg1;  // source java byte array address
4061     const Register len   = c_rarg2;  // length
4062     const Register table0 = c_rarg3; // crc_table address
4063     const Register table1 = c_rarg4;
4064     const Register table2 = c_rarg5;
4065     const Register table3 = c_rarg6;
4066     const Register tmp3 = c_rarg7;
4067 
4068     BLOCK_COMMENT("Entry:");
4069     __ enter(); // required for proper stackwalking of RuntimeStub frame
4070 
4071     __ kernel_crc32c(crc, buf, len,
4072               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4073 
4074     __ leave(); // required for proper stackwalking of RuntimeStub frame
4075     __ ret(lr);
4076 
4077     return start;
4078   }
4079 
4080   /**
4081    *  Arguments:
4082    *
4083    *  Inputs:
4084    *   c_rarg0   - int   adler
4085    *   c_rarg1   - byte* buff
4086    *   c_rarg2   - int   len
4087    *
4088    * Output:
4089    *   c_rarg0   - int adler result
4090    */
4091   address generate_updateBytesAdler32() {
4092     __ align(CodeEntryAlignment);
4093     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4094     address start = __ pc();
4095 
4096     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4097 
4098     // Aliases
4099     Register adler  = c_rarg0;
4100     Register s1     = c_rarg0;
4101     Register s2     = c_rarg3;
4102     Register buff   = c_rarg1;
4103     Register len    = c_rarg2;
4104     Register nmax  = r4;
4105     Register base  = r5;
4106     Register count = r6;
4107     Register temp0 = rscratch1;
4108     Register temp1 = rscratch2;
4109     FloatRegister vbytes = v0;
4110     FloatRegister vs1acc = v1;
4111     FloatRegister vs2acc = v2;
4112     FloatRegister vtable = v3;
4113 
4114     // Max number of bytes we can process before having to take the mod
4115     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4116     uint64_t BASE = 0xfff1;
4117     uint64_t NMAX = 0x15B0;
4118 
4119     __ mov(base, BASE);
4120     __ mov(nmax, NMAX);
4121 
4122     // Load accumulation coefficients for the upper 16 bits
4123     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4124     __ ld1(vtable, __ T16B, Address(temp0));
4125 
4126     // s1 is initialized to the lower 16 bits of adler
4127     // s2 is initialized to the upper 16 bits of adler
4128     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4129     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4130 
4131     // The pipelined loop needs at least 16 elements per iteration.  It does
4132     // check this itself, but it is more efficient to skip straight to the cleanup loop.
4133     __ cmp(len, (u1)16);
4134     __ br(Assembler::HS, L_nmax);
4135     __ cbz(len, L_combine);
4136 
4137     __ bind(L_simple_by1_loop);
4138     __ ldrb(temp0, Address(__ post(buff, 1)));
4139     __ add(s1, s1, temp0);
4140     __ add(s2, s2, s1);
4141     __ subs(len, len, 1);
4142     __ br(Assembler::HI, L_simple_by1_loop);
4143 
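         // The reductions modulo BASE below rely on 2^16 mod 65521 == 15:
         // folding x -> (x >> 16) * 15 + (x & 0xffff) (the lsr/lsl/sub/add
         // sequences) preserves x mod BASE.  The fold is repeated until the
         // value is below 2 * BASE, and one conditional subtract (subs/csel)
         // then completes the reduction.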
4144     // s1 = s1 % BASE
4145     __ subs(temp0, s1, base);
4146     __ csel(s1, temp0, s1, Assembler::HS);
4147 
4148     // s2 = s2 % BASE
4149     __ lsr(temp0, s2, 16);
4150     __ lsl(temp1, temp0, 4);
4151     __ sub(temp1, temp1, temp0);
4152     __ add(s2, temp1, s2, ext::uxth);
4153 
4154     __ subs(temp0, s2, base);
4155     __ csel(s2, temp0, s2, Assembler::HS);
4156 
4157     __ b(L_combine);
4158 
4159     __ bind(L_nmax);
4160     __ subs(len, len, nmax);
4161     __ sub(count, nmax, 16);
4162     __ br(Assembler::LO, L_by16);
4163 
4164     __ bind(L_nmax_loop);
4165 
4166     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4167                                       vbytes, vs1acc, vs2acc, vtable);
4168 
4169     __ subs(count, count, 16);
4170     __ br(Assembler::HS, L_nmax_loop);
4171 
4172     // s1 = s1 % BASE
4173     __ lsr(temp0, s1, 16);
4174     __ lsl(temp1, temp0, 4);
4175     __ sub(temp1, temp1, temp0);
4176     __ add(temp1, temp1, s1, ext::uxth);
4177 
4178     __ lsr(temp0, temp1, 16);
4179     __ lsl(s1, temp0, 4);
4180     __ sub(s1, s1, temp0);
4181     __ add(s1, s1, temp1, ext::uxth);
4182 
4183     __ subs(temp0, s1, base);
4184     __ csel(s1, temp0, s1, Assembler::HS);
4185 
4186     // s2 = s2 % BASE
4187     __ lsr(temp0, s2, 16);
4188     __ lsl(temp1, temp0, 4);
4189     __ sub(temp1, temp1, temp0);
4190     __ add(temp1, temp1, s2, ext::uxth);
4191 
4192     __ lsr(temp0, temp1, 16);
4193     __ lsl(s2, temp0, 4);
4194     __ sub(s2, s2, temp0);
4195     __ add(s2, s2, temp1, ext::uxth);
4196 
4197     __ subs(temp0, s2, base);
4198     __ csel(s2, temp0, s2, Assembler::HS);
4199 
4200     __ subs(len, len, nmax);
4201     __ sub(count, nmax, 16);
4202     __ br(Assembler::HS, L_nmax_loop);
4203 
4204     __ bind(L_by16);
4205     __ adds(len, len, count);
4206     __ br(Assembler::LO, L_by1);
4207 
4208     __ bind(L_by16_loop);
4209 
4210     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4211                                       vbytes, vs1acc, vs2acc, vtable);
4212 
4213     __ subs(len, len, 16);
4214     __ br(Assembler::HS, L_by16_loop);
4215 
4216     __ bind(L_by1);
4217     __ adds(len, len, 15);
4218     __ br(Assembler::LO, L_do_mod);
4219 
4220     __ bind(L_by1_loop);
4221     __ ldrb(temp0, Address(__ post(buff, 1)));
4222     __ add(s1, temp0, s1);
4223     __ add(s2, s2, s1);
4224     __ subs(len, len, 1);
4225     __ br(Assembler::HS, L_by1_loop);
4226 
4227     __ bind(L_do_mod);
4228     // s1 = s1 % BASE
4229     __ lsr(temp0, s1, 16);
4230     __ lsl(temp1, temp0, 4);
4231     __ sub(temp1, temp1, temp0);
4232     __ add(temp1, temp1, s1, ext::uxth);
4233 
4234     __ lsr(temp0, temp1, 16);
4235     __ lsl(s1, temp0, 4);
4236     __ sub(s1, s1, temp0);
4237     __ add(s1, s1, temp1, ext::uxth);
4238 
4239     __ subs(temp0, s1, base);
4240     __ csel(s1, temp0, s1, Assembler::HS);
4241 
4242     // s2 = s2 % BASE
4243     __ lsr(temp0, s2, 16);
4244     __ lsl(temp1, temp0, 4);
4245     __ sub(temp1, temp1, temp0);
4246     __ add(temp1, temp1, s2, ext::uxth);
4247 
4248     __ lsr(temp0, temp1, 16);
4249     __ lsl(s2, temp0, 4);
4250     __ sub(s2, s2, temp0);
4251     __ add(s2, s2, temp1, ext::uxth);
4252 
4253     __ subs(temp0, s2, base);
4254     __ csel(s2, temp0, s2, Assembler::HS);
4255 
4256     // Combine lower bits and higher bits
4257     __ bind(L_combine);
4258     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4259 
4260     __ ret(lr);
4261 
4262     return start;
4263   }
4264 
4265   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4266           Register temp0, Register temp1, FloatRegister vbytes,
4267           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4268     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4269     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4270     // In non-vectorized code, we update s1 and s2 as:
4271     //   s1 <- s1 + b1
4272     //   s2 <- s2 + s1
4273     //   s1 <- s1 + b2
4274     //   s2 <- s2 + s1
4275     //   ...
4276     //   s1 <- s1 + b16
4277     //   s2 <- s2 + s1
4278     // Putting above assignments together, we have:
4279     //   s1_new = s1 + b1 + b2 + ... + b16
4280     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4281     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4282     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
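         // As a scalar sketch, one call of this helper is equivalent to
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
         // where b[] are the 16 bytes loaded below and vtable is expected to hold the
         // weights (16, 15, ..., 1) used for the dot product.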
4283     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4284 
4285     // s2 = s2 + s1 * 16
4286     __ add(s2, s2, s1, Assembler::LSL, 4);
4287 
4288     // vs1acc = b1 + b2 + b3 + ... + b16
4289     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4290     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4291     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4292     __ uaddlv(vs1acc, __ T16B, vbytes);
4293     __ uaddlv(vs2acc, __ T8H, vs2acc);
4294 
4295     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4296     __ fmovd(temp0, vs1acc);
4297     __ fmovd(temp1, vs2acc);
4298     __ add(s1, s1, temp0);
4299     __ add(s2, s2, temp1);
4300   }
4301 
4302   /**
4303    *  Arguments:
4304    *
4305    *  Input:
4306    *    c_rarg0   - x address
4307    *    c_rarg1   - x length
4308    *    c_rarg2   - y address
4309    *    c_rarg3   - y length
4310    *    c_rarg4   - z address
4311    *    c_rarg5   - z length
4312    */
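       // Semantically this is the intrinsic form of BigInteger::multiplyToLen: x and y
       // are arrays of 32-bit limbs (most significant limb first) and the full product
       // is written to z. A rough scalar sketch, assuming z is zero-initialized:
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
       //       z[k] = (uint32_t)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (uint32_t)carry;
       //   }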
4313   address generate_multiplyToLen() {
4314     __ align(CodeEntryAlignment);
4315     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4316 
4317     address start = __ pc();
4318     const Register x     = r0;
4319     const Register xlen  = r1;
4320     const Register y     = r2;
4321     const Register ylen  = r3;
4322     const Register z     = r4;
4323     const Register zlen  = r5;
4324 
4325     const Register tmp1  = r10;
4326     const Register tmp2  = r11;
4327     const Register tmp3  = r12;
4328     const Register tmp4  = r13;
4329     const Register tmp5  = r14;
4330     const Register tmp6  = r15;
4331     const Register tmp7  = r16;
4332 
4333     BLOCK_COMMENT("Entry:");
4334     __ enter(); // required for proper stackwalking of RuntimeStub frame
4335     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4336     __ leave(); // required for proper stackwalking of RuntimeStub frame
4337     __ ret(lr);
4338 
4339     return start;
4340   }
4341 
4342   address generate_squareToLen() {
4343     // The squareToLen algorithm for sizes 1..127, as described in the Java code, is
4344     // faster than multiply_to_len on some CPUs and slower on others, but
4345     // multiply_to_len shows slightly better results overall.
4346     __ align(CodeEntryAlignment);
4347     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4348     address start = __ pc();
4349 
4350     const Register x     = r0;
4351     const Register xlen  = r1;
4352     const Register z     = r2;
4353     const Register zlen  = r3;
4354     const Register y     = r4; // == x
4355     const Register ylen  = r5; // == xlen
4356 
4357     const Register tmp1  = r10;
4358     const Register tmp2  = r11;
4359     const Register tmp3  = r12;
4360     const Register tmp4  = r13;
4361     const Register tmp5  = r14;
4362     const Register tmp6  = r15;
4363     const Register tmp7  = r16;
4364 
4365     RegSet spilled_regs = RegSet::of(y, ylen);
4366     BLOCK_COMMENT("Entry:");
4367     __ enter();
4368     __ push(spilled_regs, sp);
4369     __ mov(y, x);
4370     __ mov(ylen, xlen);
4371     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4372     __ pop(spilled_regs, sp);
4373     __ leave();
4374     __ ret(lr);
4375     return start;
4376   }
4377 
4378   address generate_mulAdd() {
4379     __ align(CodeEntryAlignment);
4380     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4381 
4382     address start = __ pc();
4383 
4384     const Register out     = r0;
4385     const Register in      = r1;
4386     const Register offset  = r2;
4387     const Register len     = r3;
4388     const Register k       = r4;
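         // Conceptually this is BigInteger::implMulAdd: multiply in[0..len) by the
         // 32-bit value k, add the product into out[] at the given offset, and
         // return the final carry (a sketch of the contract; the details live in
         // MacroAssembler::mul_add).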
4389 
4390     BLOCK_COMMENT("Entry:");
4391     __ enter();
4392     __ mul_add(out, in, offset, len, k);
4393     __ leave();
4394     __ ret(lr);
4395 
4396     return start;
4397   }
4398 
4399   // Arguments:
4400   //
4401   // Input:
4402   //   c_rarg0   - newArr address
4403   //   c_rarg1   - oldArr address
4404   //   c_rarg2   - newIdx
4405   //   c_rarg3   - shiftCount
4406   //   c_rarg4   - numIter
4407   //
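       // A scalar sketch of what the stub computes (assuming unsigned 32-bit words
       // and 0 < shiftCount < 32):
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount)) | (oldArr[i + 1] >> shiftCount);
       //   }
       // i.e. elements up to oldArr[numIter] are read. The loops below handle 4 words
       // at a time with SIMD, then 2, then 1.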
4408   address generate_bigIntegerRightShift() {
4409     __ align(CodeEntryAlignment);
4410     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4411     address start = __ pc();
4412 
4413     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4414 
4415     Register newArr        = c_rarg0;
4416     Register oldArr        = c_rarg1;
4417     Register newIdx        = c_rarg2;
4418     Register shiftCount    = c_rarg3;
4419     Register numIter       = c_rarg4;
4420     Register idx           = numIter;
4421 
4422     Register newArrCur     = rscratch1;
4423     Register shiftRevCount = rscratch2;
4424     Register oldArrCur     = r13;
4425     Register oldArrNext    = r14;
4426 
4427     FloatRegister oldElem0        = v0;
4428     FloatRegister oldElem1        = v1;
4429     FloatRegister newElem         = v2;
4430     FloatRegister shiftVCount     = v3;
4431     FloatRegister shiftVRevCount  = v4;
4432 
4433     __ cbz(idx, Exit);
4434 
4435     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4436 
4437     // left shift count
4438     __ movw(shiftRevCount, 32);
4439     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4440 
4441     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4442     __ cmp(numIter, (u1)4);
4443     __ br(Assembler::LT, ShiftThree);
4444 
4445     __ dup(shiftVCount,    __ T4S, shiftCount);
4446     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4447     __ negr(shiftVCount,   __ T4S, shiftVCount);
4448 
4449     __ BIND(ShiftSIMDLoop);
4450 
4451     // Calculate the load addresses
4452     __ sub(idx, idx, 4);
4453     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4454     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4455     __ add(oldArrCur,  oldArrNext, 4);
4456 
4457     // Load 4 words and process
4458     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4459     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4460     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4461     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4462     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4463     __ st1(newElem,   __ T4S,  Address(newArrCur));
4464 
4465     __ cmp(idx, (u1)4);
4466     __ br(Assembler::LT, ShiftTwoLoop);
4467     __ b(ShiftSIMDLoop);
4468 
4469     __ BIND(ShiftTwoLoop);
4470     __ cbz(idx, Exit);
4471     __ cmp(idx, (u1)1);
4472     __ br(Assembler::EQ, ShiftOne);
4473 
4474     // Calculate the load addresses
4475     __ sub(idx, idx, 2);
4476     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4477     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4478     __ add(oldArrCur,  oldArrNext, 4);
4479 
4480     // Load 2 words and process
4481     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4482     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4483     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4484     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4485     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4486     __ st1(newElem,   __ T2S, Address(newArrCur));
4487     __ b(ShiftTwoLoop);
4488 
4489     __ BIND(ShiftThree);
4490     __ tbz(idx, 1, ShiftOne);
4491     __ tbz(idx, 0, ShiftTwo);
4492     __ ldrw(r10,  Address(oldArr, 12));
4493     __ ldrw(r11,  Address(oldArr, 8));
4494     __ lsrvw(r10, r10, shiftCount);
4495     __ lslvw(r11, r11, shiftRevCount);
4496     __ orrw(r12,  r10, r11);
4497     __ strw(r12,  Address(newArr, 8));
4498 
4499     __ BIND(ShiftTwo);
4500     __ ldrw(r10,  Address(oldArr, 8));
4501     __ ldrw(r11,  Address(oldArr, 4));
4502     __ lsrvw(r10, r10, shiftCount);
4503     __ lslvw(r11, r11, shiftRevCount);
4504     __ orrw(r12,  r10, r11);
4505     __ strw(r12,  Address(newArr, 4));
4506 
4507     __ BIND(ShiftOne);
4508     __ ldrw(r10,  Address(oldArr, 4));
4509     __ ldrw(r11,  Address(oldArr));
4510     __ lsrvw(r10, r10, shiftCount);
4511     __ lslvw(r11, r11, shiftRevCount);
4512     __ orrw(r12,  r10, r11);
4513     __ strw(r12,  Address(newArr));
4514 
4515     __ BIND(Exit);
4516     __ ret(lr);
4517 
4518     return start;
4519   }
4520 
4521   // Arguments:
4522   //
4523   // Input:
4524   //   c_rarg0   - newArr address
4525   //   c_rarg1   - oldArr address
4526   //   c_rarg2   - newIdx
4527   //   c_rarg3   - shiftCount
4528   //   c_rarg4   - numIter
4529   //
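       // A scalar sketch of what the stub computes (assuming unsigned 32-bit words
       // and 0 < shiftCount < 32):
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
       //   }
       // The loops below handle 4 words at a time with SIMD, then 2, then 1.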
4530   address generate_bigIntegerLeftShift() {
4531     __ align(CodeEntryAlignment);
4532     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4533     address start = __ pc();
4534 
4535     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4536 
4537     Register newArr        = c_rarg0;
4538     Register oldArr        = c_rarg1;
4539     Register newIdx        = c_rarg2;
4540     Register shiftCount    = c_rarg3;
4541     Register numIter       = c_rarg4;
4542 
4543     Register shiftRevCount = rscratch1;
4544     Register oldArrNext    = rscratch2;
4545 
4546     FloatRegister oldElem0        = v0;
4547     FloatRegister oldElem1        = v1;
4548     FloatRegister newElem         = v2;
4549     FloatRegister shiftVCount     = v3;
4550     FloatRegister shiftVRevCount  = v4;
4551 
4552     __ cbz(numIter, Exit);
4553 
4554     __ add(oldArrNext, oldArr, 4);
4555     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4556 
4557     // right shift count
4558     __ movw(shiftRevCount, 32);
4559     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4560 
4561     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4562     __ cmp(numIter, (u1)4);
4563     __ br(Assembler::LT, ShiftThree);
4564 
4565     __ dup(shiftVCount,     __ T4S, shiftCount);
4566     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4567     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4568 
4569     __ BIND(ShiftSIMDLoop);
4570 
4571     // load 4 words and process
4572     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4573     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4574     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4575     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4576     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4577     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4578     __ sub(numIter,   numIter, 4);
4579 
4580     __ cmp(numIter, (u1)4);
4581     __ br(Assembler::LT, ShiftTwoLoop);
4582     __ b(ShiftSIMDLoop);
4583 
4584     __ BIND(ShiftTwoLoop);
4585     __ cbz(numIter, Exit);
4586     __ cmp(numIter, (u1)1);
4587     __ br(Assembler::EQ, ShiftOne);
4588 
4589     // load 2 words and process
4590     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4591     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4592     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4593     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4594     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4595     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4596     __ sub(numIter,   numIter, 2);
4597     __ b(ShiftTwoLoop);
4598 
4599     __ BIND(ShiftThree);
4600     __ ldrw(r10,  __ post(oldArr, 4));
4601     __ ldrw(r11,  __ post(oldArrNext, 4));
4602     __ lslvw(r10, r10, shiftCount);
4603     __ lsrvw(r11, r11, shiftRevCount);
4604     __ orrw(r12,  r10, r11);
4605     __ strw(r12,  __ post(newArr, 4));
4606     __ tbz(numIter, 1, Exit);
4607     __ tbz(numIter, 0, ShiftOne);
4608 
4609     __ BIND(ShiftTwo);
4610     __ ldrw(r10,  __ post(oldArr, 4));
4611     __ ldrw(r11,  __ post(oldArrNext, 4));
4612     __ lslvw(r10, r10, shiftCount);
4613     __ lsrvw(r11, r11, shiftRevCount);
4614     __ orrw(r12,  r10, r11);
4615     __ strw(r12,  __ post(newArr, 4));
4616 
4617     __ BIND(ShiftOne);
4618     __ ldrw(r10,  Address(oldArr));
4619     __ ldrw(r11,  Address(oldArrNext));
4620     __ lslvw(r10, r10, shiftCount);
4621     __ lsrvw(r11, r11, shiftRevCount);
4622     __ orrw(r12,  r10, r11);
4623     __ strw(r12,  Address(newArr));
4624 
4625     __ BIND(Exit);
4626     __ ret(lr);
4627 
4628     return start;
4629   }
4630 
4631   address generate_count_positives(address &count_positives_long) {
4632     const u1 large_loop_size = 64;
4633     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4634     int dcache_line = VM_Version::dcache_line_size();
4635 
4636     Register ary1 = r1, len = r2, result = r0;
4637 
4638     __ align(CodeEntryAlignment);
4639 
4640     StubCodeMark mark(this, "StubRoutines", "count_positives");
4641 
4642     address entry = __ pc();
4643 
4644     __ enter();
4645     // precondition: a copy of len is already in result
4646     // __ mov(result, len);
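         // What the stub computes, roughly (a scalar sketch): the count of leading
         // bytes with the sign bit clear, i.e.
         //   size_t count_positives(const int8_t* ary1, size_t len) {
         //     for (size_t i = 0; i < len; i++) {
         //       if (ary1[i] < 0) return i;
         //     }
         //     return len;
         //   }
         // except that, because the scan works in 8..64-byte blocks, a result smaller
         // than the exact index of the first negative byte may be returned (see the
         // RET_ADJUST paths below). UPPER_BIT_MASK tests 8 bytes at once: a 64-bit
         // word AND-ed with 0x8080808080808080 is non-zero iff at least one of its
         // bytes has the sign bit set.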
4647 
4648   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4649         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4650 
4651   __ cmp(len, (u1)15);
4652   __ br(Assembler::GT, LEN_OVER_15);
4653   // The only case in which execution falls into this code is when the pointer is near
4654   // the end of a memory page and we have to avoid reading the next page
4655   __ add(ary1, ary1, len);
4656   __ subs(len, len, 8);
4657   __ br(Assembler::GT, LEN_OVER_8);
4658   __ ldr(rscratch2, Address(ary1, -8));
4659   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4660   __ lsrv(rscratch2, rscratch2, rscratch1);
4661   __ tst(rscratch2, UPPER_BIT_MASK);
4662   __ csel(result, zr, result, Assembler::NE);
4663   __ leave();
4664   __ ret(lr);
4665   __ bind(LEN_OVER_8);
4666   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4667   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4668   __ tst(rscratch2, UPPER_BIT_MASK);
4669   __ br(Assembler::NE, RET_NO_POP);
4670   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4671   __ lsrv(rscratch1, rscratch1, rscratch2);
4672   __ tst(rscratch1, UPPER_BIT_MASK);
4673   __ bind(RET_NO_POP);
4674   __ csel(result, zr, result, Assembler::NE);
4675   __ leave();
4676   __ ret(lr);
4677 
4678   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4679   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4680 
4681   count_positives_long = __ pc(); // 2nd entry point
4682 
4683   __ enter();
4684 
4685   __ bind(LEN_OVER_15);
4686     __ push(spilled_regs, sp);
4687     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4688     __ cbz(rscratch2, ALIGNED);
4689     __ ldp(tmp6, tmp1, Address(ary1));
4690     __ mov(tmp5, 16);
4691     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4692     __ add(ary1, ary1, rscratch1);
4693     __ orr(tmp6, tmp6, tmp1);
4694     __ tst(tmp6, UPPER_BIT_MASK);
4695     __ br(Assembler::NE, RET_ADJUST);
4696     __ sub(len, len, rscratch1);
4697 
4698   __ bind(ALIGNED);
4699     __ cmp(len, large_loop_size);
4700     __ br(Assembler::LT, CHECK_16);
4701     // Perform a 16-byte load as an early return in the pre-loop to handle the case
4702     // where an initially aligned large array has negative values in its starting bytes,
4703     // so that LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case),
4704     // which is slower. Cases with negative bytes further ahead are barely affected;
4705     // in fact they become faster due to the early loads, fewer instructions and
4706     // fewer branches in LARGE_LOOP.
4707     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4708     __ sub(len, len, 16);
4709     __ orr(tmp6, tmp6, tmp1);
4710     __ tst(tmp6, UPPER_BIT_MASK);
4711     __ br(Assembler::NE, RET_ADJUST_16);
4712     __ cmp(len, large_loop_size);
4713     __ br(Assembler::LT, CHECK_16);
4714 
4715     if (SoftwarePrefetchHintDistance >= 0
4716         && SoftwarePrefetchHintDistance >= dcache_line) {
4717       // initial prefetch
4718       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4719     }
4720   __ bind(LARGE_LOOP);
4721     if (SoftwarePrefetchHintDistance >= 0) {
4722       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4723     }
4724     // Issue the load instructions first, since this can save a few CPU/MEM cycles. Also,
4725     // instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one for each ldp), it is
4726     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4727     // instructions per iteration and has fewer branches, but this approach disables the
4728     // early return, so all 64 bytes are loaded and checked every time.
4729     __ ldp(tmp2, tmp3, Address(ary1));
4730     __ ldp(tmp4, tmp5, Address(ary1, 16));
4731     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4732     __ ldp(tmp6, tmp1, Address(ary1, 48));
4733     __ add(ary1, ary1, large_loop_size);
4734     __ sub(len, len, large_loop_size);
4735     __ orr(tmp2, tmp2, tmp3);
4736     __ orr(tmp4, tmp4, tmp5);
4737     __ orr(rscratch1, rscratch1, rscratch2);
4738     __ orr(tmp6, tmp6, tmp1);
4739     __ orr(tmp2, tmp2, tmp4);
4740     __ orr(rscratch1, rscratch1, tmp6);
4741     __ orr(tmp2, tmp2, rscratch1);
4742     __ tst(tmp2, UPPER_BIT_MASK);
4743     __ br(Assembler::NE, RET_ADJUST_LONG);
4744     __ cmp(len, large_loop_size);
4745     __ br(Assembler::GE, LARGE_LOOP);
4746 
4747   __ bind(CHECK_16); // small 16-byte load pre-loop
4748     __ cmp(len, (u1)16);
4749     __ br(Assembler::LT, POST_LOOP16);
4750 
4751   __ bind(LOOP16); // small 16-byte load loop
4752     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4753     __ sub(len, len, 16);
4754     __ orr(tmp2, tmp2, tmp3);
4755     __ tst(tmp2, UPPER_BIT_MASK);
4756     __ br(Assembler::NE, RET_ADJUST_16);
4757     __ cmp(len, (u1)16);
4758     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4759 
4760   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4761     __ cmp(len, (u1)8);
4762     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4763     __ ldr(tmp3, Address(__ post(ary1, 8)));
4764     __ tst(tmp3, UPPER_BIT_MASK);
4765     __ br(Assembler::NE, RET_ADJUST);
4766     __ sub(len, len, 8);
4767 
4768   __ bind(POST_LOOP16_LOAD_TAIL);
4769     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4770     __ ldr(tmp1, Address(ary1));
4771     __ mov(tmp2, 64);
4772     __ sub(tmp4, tmp2, len, __ LSL, 3);
4773     __ lslv(tmp1, tmp1, tmp4);
4774     __ tst(tmp1, UPPER_BIT_MASK);
4775     __ br(Assembler::NE, RET_ADJUST);
4776     // Fallthrough
4777 
4778   __ bind(RET_LEN);
4779     __ pop(spilled_regs, sp);
4780     __ leave();
4781     __ ret(lr);
4782 
4783     // the difference result - len is the count of bytes guaranteed to be
4784     // positive
4785 
4786   __ bind(RET_ADJUST_LONG);
4787     __ add(len, len, (u1)(large_loop_size - 16));
4788   __ bind(RET_ADJUST_16);
4789     __ add(len, len, 16);
4790   __ bind(RET_ADJUST);
4791     __ pop(spilled_regs, sp);
4792     __ leave();
4793     __ sub(result, result, len);
4794     __ ret(lr);
4795 
4796     return entry;
4797   }
4798 
4799   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4800         bool usePrefetch, Label &NOT_EQUAL) {
4801     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4802         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4803         tmp7 = r12, tmp8 = r13;
4804     Label LOOP;
4805 
4806     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4807     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4808     __ bind(LOOP);
4809     if (usePrefetch) {
4810       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4811       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4812     }
4813     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4814     __ eor(tmp1, tmp1, tmp2);
4815     __ eor(tmp3, tmp3, tmp4);
4816     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4817     __ orr(tmp1, tmp1, tmp3);
4818     __ cbnz(tmp1, NOT_EQUAL);
4819     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4820     __ eor(tmp5, tmp5, tmp6);
4821     __ eor(tmp7, tmp7, tmp8);
4822     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4823     __ orr(tmp5, tmp5, tmp7);
4824     __ cbnz(tmp5, NOT_EQUAL);
4825     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4826     __ eor(tmp1, tmp1, tmp2);
4827     __ eor(tmp3, tmp3, tmp4);
4828     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4829     __ orr(tmp1, tmp1, tmp3);
4830     __ cbnz(tmp1, NOT_EQUAL);
4831     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4832     __ eor(tmp5, tmp5, tmp6);
4833     __ sub(cnt1, cnt1, 8 * wordSize);
4834     __ eor(tmp7, tmp7, tmp8);
4835     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4836     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4837     // cmp) because subs allows an unlimited range of immediate operands.
4838     __ subs(tmp6, cnt1, loopThreshold);
4839     __ orr(tmp5, tmp5, tmp7);
4840     __ cbnz(tmp5, NOT_EQUAL);
4841     __ br(__ GE, LOOP);
4842     // post-loop
4843     __ eor(tmp1, tmp1, tmp2);
4844     __ eor(tmp3, tmp3, tmp4);
4845     __ orr(tmp1, tmp1, tmp3);
4846     __ sub(cnt1, cnt1, 2 * wordSize);
4847     __ cbnz(tmp1, NOT_EQUAL);
4848   }
4849 
4850   void generate_large_array_equals_loop_simd(int loopThreshold,
4851         bool usePrefetch, Label &NOT_EQUAL) {
4852     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4853         tmp2 = rscratch2;
4854     Label LOOP;
4855 
4856     __ bind(LOOP);
4857     if (usePrefetch) {
4858       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4859       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4860     }
4861     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4862     __ sub(cnt1, cnt1, 8 * wordSize);
4863     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4864     __ subs(tmp1, cnt1, loopThreshold);
4865     __ eor(v0, __ T16B, v0, v4);
4866     __ eor(v1, __ T16B, v1, v5);
4867     __ eor(v2, __ T16B, v2, v6);
4868     __ eor(v3, __ T16B, v3, v7);
4869     __ orr(v0, __ T16B, v0, v1);
4870     __ orr(v1, __ T16B, v2, v3);
4871     __ orr(v0, __ T16B, v0, v1);
4872     __ umov(tmp1, v0, __ D, 0);
4873     __ umov(tmp2, v0, __ D, 1);
4874     __ orr(tmp1, tmp1, tmp2);
4875     __ cbnz(tmp1, NOT_EQUAL);
4876     __ br(__ GE, LOOP);
4877   }
4878 
4879   // a1 = r1 - array1 address
4880   // a2 = r2 - array2 address
4881   // result = r0 - return value. Already contains "false"
4882   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4883   // r3-r5 are reserved temporary registers
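       // A sketch of the contract: the first wordSize bytes are handled by the
       // caller, and the stub answers whether the remaining bytes of a1 and a2 are
       // identical (conceptually a memcmp(...) == 0 over the rest of the arrays).
       // The main loops below compare 64 bytes per iteration, eor-ing pairs of words
       // and orr-ing the results so that a single branch detects any difference.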
4884   address generate_large_array_equals() {
4885     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4886         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4887         tmp7 = r12, tmp8 = r13;
4888     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4889         SMALL_LOOP, POST_LOOP;
4890     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4891     // calculate if at least 32 prefetched bytes are used
4892     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4893     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4894     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4895     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4896         tmp5, tmp6, tmp7, tmp8);
4897 
4898     __ align(CodeEntryAlignment);
4899 
4900     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4901 
4902     address entry = __ pc();
4903     __ enter();
4904     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4905     // also advance pointers to use post-increment instead of pre-increment
4906     __ add(a1, a1, wordSize);
4907     __ add(a2, a2, wordSize);
4908     if (AvoidUnalignedAccesses) {
4909       // Both implementations (SIMD/non-SIMD) use relatively large load
4910       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
4911       // on some CPUs when the address is not at least 16-byte aligned.
4912       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
4913       // load if needed, at least for the 1st address, to make it 16-byte aligned.
4914       Label ALIGNED16;
4915       __ tbz(a1, 3, ALIGNED16);
4916       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4917       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4918       __ sub(cnt1, cnt1, wordSize);
4919       __ eor(tmp1, tmp1, tmp2);
4920       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4921       __ bind(ALIGNED16);
4922     }
4923     if (UseSIMDForArrayEquals) {
4924       if (SoftwarePrefetchHintDistance >= 0) {
4925         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4926         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4927         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4928             /* prfm = */ true, NOT_EQUAL);
4929         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4930         __ br(__ LT, TAIL);
4931       }
4932       __ bind(NO_PREFETCH_LARGE_LOOP);
4933       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4934           /* prfm = */ false, NOT_EQUAL);
4935     } else {
4936       __ push(spilled_regs, sp);
4937       if (SoftwarePrefetchHintDistance >= 0) {
4938         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4939         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4940         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4941             /* prfm = */ true, NOT_EQUAL);
4942         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4943         __ br(__ LT, TAIL);
4944       }
4945       __ bind(NO_PREFETCH_LARGE_LOOP);
4946       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4947           /* prfm = */ false, NOT_EQUAL);
4948     }
4949     __ bind(TAIL);
4950       __ cbz(cnt1, EQUAL);
4951       __ subs(cnt1, cnt1, wordSize);
4952       __ br(__ LE, POST_LOOP);
4953     __ bind(SMALL_LOOP);
4954       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4955       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4956       __ subs(cnt1, cnt1, wordSize);
4957       __ eor(tmp1, tmp1, tmp2);
4958       __ cbnz(tmp1, NOT_EQUAL);
4959       __ br(__ GT, SMALL_LOOP);
4960     __ bind(POST_LOOP);
4961       __ ldr(tmp1, Address(a1, cnt1));
4962       __ ldr(tmp2, Address(a2, cnt1));
4963       __ eor(tmp1, tmp1, tmp2);
4964       __ cbnz(tmp1, NOT_EQUAL);
4965     __ bind(EQUAL);
4966       __ mov(result, true);
4967     __ bind(NOT_EQUAL);
4968       if (!UseSIMDForArrayEquals) {
4969         __ pop(spilled_regs, sp);
4970       }
4971     __ bind(NOT_EQUAL_NO_POP);
4972     __ leave();
4973     __ ret(lr);
4974     return entry;
4975   }
4976 
4977   address generate_dsin_dcos(bool isCos) {
4978     __ align(CodeEntryAlignment);
4979     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4980     address start = __ pc();
4981     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4982         (address)StubRoutines::aarch64::_two_over_pi,
4983         (address)StubRoutines::aarch64::_pio2,
4984         (address)StubRoutines::aarch64::_dsin_coef,
4985         (address)StubRoutines::aarch64::_dcos_coef);
4986     return start;
4987   }
4988 
4989   address generate_dlog() {
4990     __ align(CodeEntryAlignment);
4991     StubCodeMark mark(this, "StubRoutines", "dlog");
4992     address entry = __ pc();
4993     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4994         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4995     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4996     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4997         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4998     return entry;
4999   }
5000 
5001 
5002   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
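       // The Latin1 side is widened to UTF-16 on the fly: zip1/zip2 with the zero
       // register interleave each byte with 0x00, e.g. (little-endian lanes)
       //   loaded bytes:  b0 b1 b2 ... b15
       //   zip1      ->   b0 00 b1 00 ... b7 00    (first 8 chars)
       //   zip2      ->   b8 00 b9 00 ... b15 00   (last 8 chars)
       // so each 64-bit half can be compared directly against 8 bytes (4 chars) of
       // the UTF-16 string.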
5003   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5004       Label &DIFF2) {
5005     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5006     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5007 
5008     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5009     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5010     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5011     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5012 
5013     __ fmovd(tmpL, vtmp3);
5014     __ eor(rscratch2, tmp3, tmpL);
5015     __ cbnz(rscratch2, DIFF2);
5016 
5017     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5018     __ umov(tmpL, vtmp3, __ D, 1);
5019     __ eor(rscratch2, tmpU, tmpL);
5020     __ cbnz(rscratch2, DIFF1);
5021 
5022     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5023     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5024     __ fmovd(tmpL, vtmp);
5025     __ eor(rscratch2, tmp3, tmpL);
5026     __ cbnz(rscratch2, DIFF2);
5027 
5028     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5029     __ umov(tmpL, vtmp, __ D, 1);
5030     __ eor(rscratch2, tmpU, tmpL);
5031     __ cbnz(rscratch2, DIFF1);
5032   }
5033 
5034   // r0  = result
5035   // r1  = str1
5036   // r2  = cnt1
5037   // r3  = str2
5038   // r4  = cnt2
5039   // r10 = tmp1
5040   // r11 = tmp2
5041   address generate_compare_long_string_different_encoding(bool isLU) {
5042     __ align(CodeEntryAlignment);
5043     StubCodeMark mark(this, "StubRoutines", isLU
5044         ? "compare_long_string_different_encoding LU"
5045         : "compare_long_string_different_encoding UL");
5046     address entry = __ pc();
5047     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5048         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5049         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5050     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5051         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5052     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5053     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5054 
5055     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5056 
5057     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5058     // cnt2 == number of characters left to compare
5059     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5060     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5061     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5062     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5063     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5064     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5065     __ eor(rscratch2, tmp1, tmp2);
5066     __ mov(rscratch1, tmp2);
5067     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5068     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5069              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5070     __ push(spilled_regs, sp);
5071     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5072     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5073 
5074     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5075 
5076     if (SoftwarePrefetchHintDistance >= 0) {
5077       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5078       __ br(__ LT, NO_PREFETCH);
5079       __ bind(LARGE_LOOP_PREFETCH);
5080         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5081         __ mov(tmp4, 2);
5082         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5083         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5084           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5085           __ subs(tmp4, tmp4, 1);
5086           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5087           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5088           __ mov(tmp4, 2);
5089         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5090           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5091           __ subs(tmp4, tmp4, 1);
5092           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5093           __ sub(cnt2, cnt2, 64);
5094           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5095           __ br(__ GE, LARGE_LOOP_PREFETCH);
5096     }
5097     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5098     __ bind(NO_PREFETCH);
5099     __ subs(cnt2, cnt2, 16);
5100     __ br(__ LT, TAIL);
5101     __ align(OptoLoopAlignment);
5102     __ bind(SMALL_LOOP); // smaller loop
5103       __ subs(cnt2, cnt2, 16);
5104       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5105       __ br(__ GE, SMALL_LOOP);
5106       __ cmn(cnt2, (u1)16);
5107       __ br(__ EQ, LOAD_LAST);
5108     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5109       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5110       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5111       __ ldr(tmp3, Address(cnt1, -8));
5112       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5113       __ b(LOAD_LAST);
5114     __ bind(DIFF2);
5115       __ mov(tmpU, tmp3);
5116     __ bind(DIFF1);
5117       __ pop(spilled_regs, sp);
5118       __ b(CALCULATE_DIFFERENCE);
5119     __ bind(LOAD_LAST);
5120       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5121       // No need to load them again
5122       __ mov(tmpU, tmp3);
5123       __ pop(spilled_regs, sp);
5124 
5125       // tmp2 points to the address of the last 4 Latin1 characters right now
5126       __ ldrs(vtmp, Address(tmp2));
5127       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5128       __ fmovd(tmpL, vtmp);
5129 
5130       __ eor(rscratch2, tmpU, tmpL);
5131       __ cbz(rscratch2, DONE);
5132 
5133     // Find the first different characters in the longwords and
5134     // compute their difference.
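         // How this works, roughly: rscratch2 is the XOR of the two 4-char groups
         // being compared (both already in UTF-16 form), so its lowest non-zero byte
         // marks the first difference. rev reverses the byte order so that clz can
         // measure how far that difference is from the start, andr(..., -16) rounds
         // the bit count down to a whole 16-bit char, and the lsrv/uxthw pairs then
         // extract the first differing chars for the signed subtraction.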
5135     __ bind(CALCULATE_DIFFERENCE);
5136       __ rev(rscratch2, rscratch2);
5137       __ clz(rscratch2, rscratch2);
5138       __ andr(rscratch2, rscratch2, -16);
5139       __ lsrv(tmp1, tmp1, rscratch2);
5140       __ uxthw(tmp1, tmp1);
5141       __ lsrv(rscratch1, rscratch1, rscratch2);
5142       __ uxthw(rscratch1, rscratch1);
5143       __ subw(result, tmp1, rscratch1);
5144     __ bind(DONE);
5145       __ ret(lr);
5146     return entry;
5147   }
5148 
5149   address generate_method_entry_barrier() {
5150     __ align(CodeEntryAlignment);
5151     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5152 
5153     Label deoptimize_label;
5154 
5155     address start = __ pc();
5156 
5157     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5158 
5159     __ enter();
5160     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5161 
5162     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5163 
5164     __ push_call_clobbered_registers();
5165 
5166     __ mov(c_rarg0, rscratch2);
5167     __ call_VM_leaf
5168          (CAST_FROM_FN_PTR
5169           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5170 
5171     __ reset_last_Java_frame(true);
5172 
5173     __ mov(rscratch1, r0);
5174 
5175     __ pop_call_clobbered_registers();
5176 
5177     __ cbnz(rscratch1, deoptimize_label);
5178 
5179     __ leave();
5180     __ ret(lr);
5181 
5182     __ BIND(deoptimize_label);
5183 
5184     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5185     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5186 
5187     __ mov(sp, rscratch1);
5188     __ br(rscratch2);
5189 
5190     return start;
5191   }
5192 
5193   // r0  = result
5194   // r1  = str1
5195   // r2  = cnt1
5196   // r3  = str2
5197   // r4  = cnt2
5198   // r10 = tmp1
5199   // r11 = tmp2
5200   address generate_compare_long_string_same_encoding(bool isLL) {
5201     __ align(CodeEntryAlignment);
5202     StubCodeMark mark(this, "StubRoutines", isLL
5203         ? "compare_long_string_same_encoding LL"
5204         : "compare_long_string_same_encoding UU");
5205     address entry = __ pc();
5206     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5207         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5208 
5209     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5210 
5211     // exit from the large loop when less than 64 bytes are left to read or we're about
5212     // to prefetch memory beyond the array boundary
5213     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5214 
5215     // 8 bytes were already pre-loaded before jumping to the stub, so do the comparison directly
5216     __ eor(rscratch2, tmp1, tmp2);
5217     __ cbnz(rscratch2, CAL_DIFFERENCE);
5218 
5219     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5220     // update pointers, because of previous read
5221     __ add(str1, str1, wordSize);
5222     __ add(str2, str2, wordSize);
5223     if (SoftwarePrefetchHintDistance >= 0) {
5224       __ align(OptoLoopAlignment);
5225       __ bind(LARGE_LOOP_PREFETCH);
5226         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5227         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5228 
5229         for (int i = 0; i < 4; i++) {
5230           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5231           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5232           __ cmp(tmp1, tmp2);
5233           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5234           __ br(Assembler::NE, DIFF);
5235         }
5236         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5237         __ add(str1, str1, 64);
5238         __ add(str2, str2, 64);
5239         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5240         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5241         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5242     }
5243 
5244     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5245     __ br(Assembler::LE, LESS16);
5246     __ align(OptoLoopAlignment);
5247     __ bind(LOOP_COMPARE16);
5248       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5249       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5250       __ cmp(tmp1, tmp2);
5251       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5252       __ br(Assembler::NE, DIFF);
5253       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5254       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5255       __ br(Assembler::LT, LESS16);
5256 
5257       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5258       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5259       __ cmp(tmp1, tmp2);
5260       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5261       __ br(Assembler::NE, DIFF);
5262       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5263       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5264       __ br(Assembler::GE, LOOP_COMPARE16);
5265       __ cbz(cnt2, LENGTH_DIFF);
5266 
5267     __ bind(LESS16);
5268       // each 8 compare
5269       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5270       __ br(Assembler::LE, LESS8);
5271       __ ldr(tmp1, Address(__ post(str1, 8)));
5272       __ ldr(tmp2, Address(__ post(str2, 8)));
5273       __ eor(rscratch2, tmp1, tmp2);
5274       __ cbnz(rscratch2, CAL_DIFFERENCE);
5275       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5276 
5277     __ bind(LESS8); // directly load last 8 bytes
5278       if (!isLL) {
5279         __ add(cnt2, cnt2, cnt2);
5280       }
5281       __ ldr(tmp1, Address(str1, cnt2));
5282       __ ldr(tmp2, Address(str2, cnt2));
5283       __ eor(rscratch2, tmp1, tmp2);
5284       __ cbz(rscratch2, LENGTH_DIFF);
5285       __ b(CAL_DIFFERENCE);
5286 
5287     __ bind(DIFF);
5288       __ cmp(tmp1, tmp2);
5289       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5290       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5291       // reuse rscratch2 register for the result of eor instruction
5292       __ eor(rscratch2, tmp1, tmp2);
5293 
5294     __ bind(CAL_DIFFERENCE);
5295       __ rev(rscratch2, rscratch2);
5296       __ clz(rscratch2, rscratch2);
5297       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5298       __ lsrv(tmp1, tmp1, rscratch2);
5299       __ lsrv(tmp2, tmp2, rscratch2);
5300       if (isLL) {
5301         __ uxtbw(tmp1, tmp1);
5302         __ uxtbw(tmp2, tmp2);
5303       } else {
5304         __ uxthw(tmp1, tmp1);
5305         __ uxthw(tmp2, tmp2);
5306       }
5307       __ subw(result, tmp1, tmp2);
5308 
5309     __ bind(LENGTH_DIFF);
5310       __ ret(lr);
5311     return entry;
5312   }
5313 
5314   void generate_compare_long_strings() {
5315       StubRoutines::aarch64::_compare_long_string_LL
5316           = generate_compare_long_string_same_encoding(true);
5317       StubRoutines::aarch64::_compare_long_string_UU
5318           = generate_compare_long_string_same_encoding(false);
5319       StubRoutines::aarch64::_compare_long_string_LU
5320           = generate_compare_long_string_different_encoding(true);
5321       StubRoutines::aarch64::_compare_long_string_UL
5322           = generate_compare_long_string_different_encoding(false);
5323   }
5324 
5325   // R0 = result
5326   // R1 = str2
5327   // R2 = cnt1
5328   // R3 = str1
5329   // R4 = cnt2
5330   // This generic linear code uses a few additional ideas which make it faster:
5331   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5332   // in order to skip the initial loading (helps on systems with 1 ld pipeline)
5333   // 2) we can use the "fast" algorithm of finding a single character to search for
5334   // the first symbol with fewer branches (1 branch per loaded register instead
5335   // of a branch per symbol); this is where constants like 0x0101...01,
5336   // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the note below)
5337   // 3) after loading and analyzing the 1st register of the source string, it can be
5338   // used to search for every occurrence of the 1st character, saving a few loads
5339   // compared with a "simpler-but-slower" implementation
5340   // 4) in order to avoid lots of push/pop operations, the code below heavily
5341   // re-uses/re-initializes/compresses register values, which makes the code
5342   // larger and a bit less readable; however, most of the extra operations are
5343   // issued during loads or branches, so the penalty is minimal
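       // Note on 2): this is the classic SWAR zero-element test. The first pattern
       // character is broadcast to every byte (or halfword) of a register by the
       // 0x0101...01 (or 0x00010001...) multiplication; a loaded source register ch2
       // contains that character iff v = ch2 ^ broadcast has a zero element, and
       //   (v - 0x0101...01) & ~(v | 0x7f7f...7f)
       // is non-zero exactly when such a zero element exists (halfword variants use
       // the 0x0001.../0x7fff... constants). The lowest set bit gives the first
       // candidate position, which the compare loops below then verify.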
5344   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5345     const char* stubName = str1_isL
5346         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5347         : "indexof_linear_uu";
5348     __ align(CodeEntryAlignment);
5349     StubCodeMark mark(this, "StubRoutines", stubName);
5350     address entry = __ pc();
5351 
5352     int str1_chr_size = str1_isL ? 1 : 2;
5353     int str2_chr_size = str2_isL ? 1 : 2;
5354     int str1_chr_shift = str1_isL ? 0 : 1;
5355     int str2_chr_shift = str2_isL ? 0 : 1;
5356     bool isL = str1_isL && str2_isL;
5357     // parameters
5358     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5359     // temporary registers
5360     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5361     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5362     // redefinitions
5363     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5364 
5365     __ push(spilled_regs, sp);
5366     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5367         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5368         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5369         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5370         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5371         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5372     // Read whole register from str1. It is safe, because length >=8 here
5373     __ ldr(ch1, Address(str1));
5374     // Read whole register from str2. It is safe, because length >=8 here
5375     __ ldr(ch2, Address(str2));
5376     __ sub(cnt2, cnt2, cnt1);
5377     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5378     if (str1_isL != str2_isL) {
5379       __ eor(v0, __ T16B, v0, v0);
5380     }
5381     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5382     __ mul(first, first, tmp1);
5383     // check if we have less than 1 register to check
5384     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5385     if (str1_isL != str2_isL) {
5386       __ fmovd(v1, ch1);
5387     }
5388     __ br(__ LE, L_SMALL);
5389     __ eor(ch2, first, ch2);
5390     if (str1_isL != str2_isL) {
5391       __ zip1(v1, __ T16B, v1, v0);
5392     }
5393     __ sub(tmp2, ch2, tmp1);
5394     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5395     __ bics(tmp2, tmp2, ch2);
5396     if (str1_isL != str2_isL) {
5397       __ fmovd(ch1, v1);
5398     }
5399     __ br(__ NE, L_HAS_ZERO);
5400     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5401     __ add(result, result, wordSize/str2_chr_size);
5402     __ add(str2, str2, wordSize);
5403     __ br(__ LT, L_POST_LOOP);
5404     __ BIND(L_LOOP);
5405       __ ldr(ch2, Address(str2));
5406       __ eor(ch2, first, ch2);
5407       __ sub(tmp2, ch2, tmp1);
5408       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5409       __ bics(tmp2, tmp2, ch2);
5410       __ br(__ NE, L_HAS_ZERO);
5411     __ BIND(L_LOOP_PROCEED);
5412       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5413       __ add(str2, str2, wordSize);
5414       __ add(result, result, wordSize/str2_chr_size);
5415       __ br(__ GE, L_LOOP);
5416     __ BIND(L_POST_LOOP);
5417       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5418       __ br(__ LE, NOMATCH);
5419       __ ldr(ch2, Address(str2));
5420       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5421       __ eor(ch2, first, ch2);
5422       __ sub(tmp2, ch2, tmp1);
5423       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5424       __ mov(tmp4, -1); // all bits set
5425       __ b(L_SMALL_PROCEED);
5426     __ align(OptoLoopAlignment);
5427     __ BIND(L_SMALL);
5428       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5429       __ eor(ch2, first, ch2);
5430       if (str1_isL != str2_isL) {
5431         __ zip1(v1, __ T16B, v1, v0);
5432       }
5433       __ sub(tmp2, ch2, tmp1);
5434       __ mov(tmp4, -1); // all bits set
5435       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5436       if (str1_isL != str2_isL) {
5437         __ fmovd(ch1, v1); // move converted 4 symbols
5438       }
5439     __ BIND(L_SMALL_PROCEED);
5440       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5441       __ bic(tmp2, tmp2, ch2);
5442       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5443       __ rbit(tmp2, tmp2);
5444       __ br(__ EQ, NOMATCH);
5445     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5446       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5447       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5448       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5449       if (str2_isL) { // LL
5450         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5451         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5452         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5453         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5454         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5455       } else {
5456         __ mov(ch2, 0xE); // all bits in byte set except last one
5457         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5458         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5459         __ lslv(tmp2, tmp2, tmp4);
5460         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5461         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5462         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5463         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5464       }
5465       __ cmp(ch1, ch2);
5466       __ mov(tmp4, wordSize/str2_chr_size);
5467       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5468     __ BIND(L_SMALL_CMP_LOOP);
5469       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5470                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5471       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5472                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5473       __ add(tmp4, tmp4, 1);
5474       __ cmp(tmp4, cnt1);
5475       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5476       __ cmp(first, ch2);
5477       __ br(__ EQ, L_SMALL_CMP_LOOP);
5478     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5479       __ cbz(tmp2, NOMATCH); // no more matches. exit
5480       __ clz(tmp4, tmp2);
5481       __ add(result, result, 1); // advance index
5482       __ add(str2, str2, str2_chr_size); // advance pointer
5483       __ b(L_SMALL_HAS_ZERO_LOOP);
5484     __ align(OptoLoopAlignment);
5485     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5486       __ cmp(first, ch2);
5487       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5488       __ b(DONE);
5489     __ align(OptoLoopAlignment);
5490     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5491       if (str2_isL) { // LL
5492         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5493         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5494         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5495         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5496         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5497       } else {
5498         __ mov(ch2, 0xE); // all bits in byte set except last one
5499         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5500         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5501         __ lslv(tmp2, tmp2, tmp4);
5502         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5503         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5504         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5505         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5506       }
5507       __ cmp(ch1, ch2);
5508       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5509       __ b(DONE);
5510     __ align(OptoLoopAlignment);
5511     __ BIND(L_HAS_ZERO);
5512       __ rbit(tmp2, tmp2);
5513       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5514       // Now, compress the counters (cnt2 and cnt1) into one register.
5515       // This is fine because both counters are 32-bit and are not changed in this
5516       // loop. Just restore them on exit, so cnt1 can be re-used in this loop.
5517       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5518       __ sub(result, result, 1);
5519     __ BIND(L_HAS_ZERO_LOOP);
5520       __ mov(cnt1, wordSize/str2_chr_size);
5521       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5522       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5523       if (str2_isL) {
5524         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5525         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5526         __ lslv(tmp2, tmp2, tmp4);
5527         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5528         __ add(tmp4, tmp4, 1);
5529         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5530         __ lsl(tmp2, tmp2, 1);
5531         __ mov(tmp4, wordSize/str2_chr_size);
5532       } else {
5533         __ mov(ch2, 0xE);
5534         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5535         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5536         __ lslv(tmp2, tmp2, tmp4);
5537         __ add(tmp4, tmp4, 1);
5538         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5539         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5540         __ lsl(tmp2, tmp2, 1);
5541         __ mov(tmp4, wordSize/str2_chr_size);
5542         __ sub(str2, str2, str2_chr_size);
5543       }
5544       __ cmp(ch1, ch2);
5545       __ mov(tmp4, wordSize/str2_chr_size);
5546       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5547     __ BIND(L_CMP_LOOP);
5548       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5549                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5550       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5551                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5552       __ add(tmp4, tmp4, 1);
5553       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5554       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5555       __ cmp(cnt1, ch2);
5556       __ br(__ EQ, L_CMP_LOOP);
5557     __ BIND(L_CMP_LOOP_NOMATCH);
5558       // here we're not matched
5559       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5560       __ clz(tmp4, tmp2);
5561       __ add(str2, str2, str2_chr_size); // advance pointer
5562       __ b(L_HAS_ZERO_LOOP);
5563     __ align(OptoLoopAlignment);
5564     __ BIND(L_CMP_LOOP_LAST_CMP);
5565       __ cmp(cnt1, ch2);
5566       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5567       __ b(DONE);
5568     __ align(OptoLoopAlignment);
5569     __ BIND(L_CMP_LOOP_LAST_CMP2);
5570       if (str2_isL) {
5571         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5572         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5573         __ lslv(tmp2, tmp2, tmp4);
5574         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5575         __ add(tmp4, tmp4, 1);
5576         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5577         __ lsl(tmp2, tmp2, 1);
5578       } else {
5579         __ mov(ch2, 0xE);
5580         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5581         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5582         __ lslv(tmp2, tmp2, tmp4);
5583         __ add(tmp4, tmp4, 1);
5584         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5585         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5586         __ lsl(tmp2, tmp2, 1);
5587         __ sub(str2, str2, str2_chr_size);
5588       }
5589       __ cmp(ch1, ch2);
5590       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5591       __ b(DONE);
5592     __ align(OptoLoopAlignment);
5593     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5594       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until the
5595       // L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5596       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5597       // respective high bits weren't changed. L_LOOP_PROCEED will increase
5598       // result by the number of analyzed characters, so we can just reset the lower bits
5599       // of result here. Clear the 2 lower bits for UU/UL and 3 bits for LL.
5600       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5601       // 3) Advance str2 to represent the next str2 octet. result & 7/3 is the
5602       // index of the last analyzed substring inside the current octet, so str2 is at
5603       // the respective start address. We need to advance it to the next octet.
5604       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5605       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5606       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5607       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5608       __ movw(cnt2, cnt2);
5609       __ b(L_LOOP_PROCEED);
5610     __ align(OptoLoopAlignment);
5611     __ BIND(NOMATCH);
5612       __ mov(result, -1);
5613     __ BIND(DONE);
5614       __ pop(spilled_regs, sp);
5615       __ ret(lr);
5616     return entry;
5617   }
5618 
5619   void generate_string_indexof_stubs() {
5620     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5621     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5622     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5623   }
5624 
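  // Helper for generate_large_byte_array_inflate: widens the 32 input bytes
  // held in src1/src2 to 64 bytes of chars by interleaving each byte with a
  // zero byte (v0 is expected to hold zero), then stores the result at r1
  // with post-increment.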
5625   void inflate_and_store_2_fp_registers(bool generatePrfm,
5626       FloatRegister src1, FloatRegister src2) {
5627     Register dst = r1;
5628     __ zip1(v1, __ T16B, src1, v0);
5629     __ zip2(v2, __ T16B, src1, v0);
5630     if (generatePrfm) {
5631       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5632     }
5633     __ zip1(v3, __ T16B, src2, v0);
5634     __ zip2(v4, __ T16B, src2, v0);
5635     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5636   }
5637 
5638   // R0 = src
5639   // R1 = dst
5640   // R2 = len
5641   // R3 = len >> 3
5642   // V0 = 0
5643   // v1 = loaded 8 bytes
5644   address generate_large_byte_array_inflate() {
5645     __ align(CodeEntryAlignment);
5646     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5647     address entry = __ pc();
5648     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5649     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5650     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5651 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
5654     __ ldrd(v2, __ post(src, 8));
5655     __ sub(octetCounter, octetCounter, 2);
5656     __ zip1(v1, __ T16B, v1, v0);
5657     __ zip1(v2, __ T16B, v2, v0);
5658     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5659     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5660     __ subs(rscratch1, octetCounter, large_loop_threshold);
5661     __ br(__ LE, LOOP_START);
5662     __ b(LOOP_PRFM_START);
5663     __ bind(LOOP_PRFM);
5664       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5665     __ bind(LOOP_PRFM_START);
5666       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5667       __ sub(octetCounter, octetCounter, 8);
5668       __ subs(rscratch1, octetCounter, large_loop_threshold);
5669       inflate_and_store_2_fp_registers(true, v3, v4);
5670       inflate_and_store_2_fp_registers(true, v5, v6);
5671       __ br(__ GT, LOOP_PRFM);
5672       __ cmp(octetCounter, (u1)8);
5673       __ br(__ LT, DONE);
5674     __ bind(LOOP);
5675       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5676       __ bind(LOOP_START);
5677       __ sub(octetCounter, octetCounter, 8);
5678       __ cmp(octetCounter, (u1)8);
5679       inflate_and_store_2_fp_registers(false, v3, v4);
5680       inflate_and_store_2_fp_registers(false, v5, v6);
5681       __ br(__ GE, LOOP);
5682     __ bind(DONE);
5683       __ ret(lr);
5684     return entry;
5685   }
5686 
5687   /**
5688    *  Arguments:
5689    *
5690    *  Input:
5691    *  c_rarg0   - current state address
5692    *  c_rarg1   - H key address
5693    *  c_rarg2   - data address
5694    *  c_rarg3   - number of blocks
5695    *
5696    *  Output:
5697    *  Updated state at c_rarg0
5698    */
5699   address generate_ghash_processBlocks() {
5700     // Bafflingly, GCM uses little-endian for the byte order, but
5701     // big-endian for the bit order.  For example, the polynomial 1 is
5702     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5703     //
5704     // So, we must either reverse the bytes in each word and do
5705     // everything big-endian or reverse the bits in each byte and do
5706     // it little-endian.  On AArch64 it's more idiomatic to reverse
5707     // the bits in each byte (we have an instruction, RBIT, to do
5708     // that) and keep the data in little-endian bit order through the
5709     // calculation, bit-reversing the inputs and outputs.
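    //
    // Per block, GHASH computes  state = (state ^ data_block) * H,  where the
    // multiplication is a carry-less multiply in GF(2^128) reduced by the
    // field polynomial; ghash_multiply/ghash_reduce below implement this on
    // the bit-reversed representation.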
5710 
5711     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5712     __ align(wordSize * 2);
5713     address p = __ pc();
5714     __ emit_int64(0x87);  // The low-order bits of the field
5715                           // polynomial (i.e. p = z^7+z^2+z+1)
5716                           // repeated in the low and high parts of a
5717                           // 128-bit vector
5718     __ emit_int64(0x87);
5719 
5720     __ align(CodeEntryAlignment);
5721     address start = __ pc();
5722 
5723     Register state   = c_rarg0;
5724     Register subkeyH = c_rarg1;
5725     Register data    = c_rarg2;
5726     Register blocks  = c_rarg3;
5727 
5728     FloatRegister vzr = v30;
5729     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5730 
5731     __ ldrq(v24, p);    // The field polynomial
5732 
5733     __ ldrq(v0, Address(state));
5734     __ ldrq(v1, Address(subkeyH));
5735 
5736     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5737     __ rbit(v0, __ T16B, v0);
5738     __ rev64(v1, __ T16B, v1);
5739     __ rbit(v1, __ T16B, v1);
5740 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
5742     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5743 
5744     {
5745       Label L_ghash_loop;
5746       __ bind(L_ghash_loop);
5747 
5748       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5749                                                  // reversing each byte
5750       __ rbit(v2, __ T16B, v2);
5751       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5752 
5753       // Multiply state in v2 by subkey in v1
5754       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5755                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5756                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5757       // Reduce v7:v5 by the field polynomial
5758       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5759 
5760       __ sub(blocks, blocks, 1);
5761       __ cbnz(blocks, L_ghash_loop);
5762     }
5763 
5764     // The bit-reversed result is at this point in v0
5765     __ rev64(v0, __ T16B, v0);
5766     __ rbit(v0, __ T16B, v0);
5767 
5768     __ st1(v0, __ T16B, state);
5769     __ ret(lr);
5770 
5771     return start;
5772   }
5773 
5774   address generate_ghash_processBlocks_wide() {
5775     address small = generate_ghash_processBlocks();
5776 
5777     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5778     __ align(wordSize * 2);
5779     address p = __ pc();
5780     __ emit_int64(0x87);  // The low-order bits of the field
5781                           // polynomial (i.e. p = z^7+z^2+z+1)
5782                           // repeated in the low and high parts of a
5783                           // 128-bit vector
5784     __ emit_int64(0x87);
5785 
5786     __ align(CodeEntryAlignment);
5787     address start = __ pc();
5788 
5789     Register state   = c_rarg0;
5790     Register subkeyH = c_rarg1;
5791     Register data    = c_rarg2;
5792     Register blocks  = c_rarg3;
5793 
5794     const int unroll = 4;
5795 
5796     __ cmp(blocks, (unsigned char)(unroll * 2));
5797     __ br(__ LT, small);
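    // Fewer than 2 * unroll blocks are handled entirely by the single-block
    // stub above; the wide path below processes the bulk and branches back to
    // that stub for any remaining blocks.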
5798 
    if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
      __ sub(sp, sp, 4 * 16);
5802       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5803       __ sub(sp, sp, 4 * 16);
5804       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5805     }
5806 
5807     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5808 
5809     if (unroll > 1) {
5810       // And restore state
5811       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5812       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5813     }
5814 
5815     __ cmp(blocks, (unsigned char)0);
5816     __ br(__ GT, small);
5817 
5818     __ ret(lr);
5819 
5820     return start;
5821   }
5822 
5823   void generate_base64_encode_simdround(Register src, Register dst,
5824         FloatRegister codec, u8 size) {
5825 
5826     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5827     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5828     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5829 
5830     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5831 
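    // Per lane, the four 6-bit codec indices are derived from the three input
    // bytes b0, b1, b2 roughly as follows (a sketch of the SIMD code below):
    //   ind0 =   b0 >> 2;
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
    //   ind3 =   b2 & 0x3f;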
5832     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5833 
5834     __ ushr(ind0, arrangement, in0,  2);
5835 
5836     __ ushr(ind1, arrangement, in1,  2);
5837     __ shl(in0,   arrangement, in0,  6);
5838     __ orr(ind1,  arrangement, ind1, in0);
5839     __ ushr(ind1, arrangement, ind1, 2);
5840 
5841     __ ushr(ind2, arrangement, in2,  4);
5842     __ shl(in1,   arrangement, in1,  4);
5843     __ orr(ind2,  arrangement, in1,  ind2);
5844     __ ushr(ind2, arrangement, ind2, 2);
5845 
5846     __ shl(ind3,  arrangement, in2,  2);
5847     __ ushr(ind3, arrangement, ind3, 2);
5848 
5849     __ tbl(out0,  arrangement, codec,  4, ind0);
5850     __ tbl(out1,  arrangement, codec,  4, ind1);
5851     __ tbl(out2,  arrangement, codec,  4, ind2);
5852     __ tbl(out3,  arrangement, codec,  4, ind3);
5853 
5854     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5855   }
5856 
5857    /**
5858    *  Arguments:
5859    *
5860    *  Input:
5861    *  c_rarg0   - src_start
5862    *  c_rarg1   - src_offset
5863    *  c_rarg2   - src_length
5864    *  c_rarg3   - dest_start
5865    *  c_rarg4   - dest_offset
5866    *  c_rarg5   - isURL
5867    *
5868    */
5869   address generate_base64_encodeBlock() {
5870 
5871     static const char toBase64[64] = {
5872       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5873       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5874       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5875       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5876       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5877     };
5878 
5879     static const char toBase64URL[64] = {
5880       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5881       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5882       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5883       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5884       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5885     };
5886 
5887     __ align(CodeEntryAlignment);
5888     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5889     address start = __ pc();
5890 
5891     Register src   = c_rarg0;  // source array
5892     Register soff  = c_rarg1;  // source start offset
5893     Register send  = c_rarg2;  // source end offset
5894     Register dst   = c_rarg3;  // dest array
5895     Register doff  = c_rarg4;  // position for writing to dest array
5896     Register isURL = c_rarg5;  // Base64 or URL character set
5897 
5898     // c_rarg6 and c_rarg7 are free to use as temps
5899     Register codec  = c_rarg6;
5900     Register length = c_rarg7;
5901 
5902     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5903 
5904     __ add(src, src, soff);
5905     __ add(dst, dst, doff);
5906     __ sub(length, send, soff);
5907 
5908     // load the codec base address
5909     __ lea(codec, ExternalAddress((address) toBase64));
5910     __ cbz(isURL, ProcessData);
5911     __ lea(codec, ExternalAddress((address) toBase64URL));
5912 
5913     __ BIND(ProcessData);
5914 
    // too short to form a SIMD loop; fall back to the scalar 3-byte path
5916     __ cmp(length, (u1)24);
5917     __ br(Assembler::LT, Process3B);
5918 
5919     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
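    // v0..v3 now hold the 64-byte codec table used by the tbl lookups inside
    // the SIMD rounds.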
5920 
5921     __ BIND(Process48B);
5922     __ cmp(length, (u1)48);
5923     __ br(Assembler::LT, Process24B);
5924     generate_base64_encode_simdround(src, dst, v0, 16);
5925     __ sub(length, length, 48);
5926     __ b(Process48B);
5927 
5928     __ BIND(Process24B);
5929     __ cmp(length, (u1)24);
5930     __ br(Assembler::LT, SIMDExit);
5931     generate_base64_encode_simdround(src, dst, v0, 8);
5932     __ sub(length, length, 24);
5933 
5934     __ BIND(SIMDExit);
5935     __ cbz(length, Exit);
5936 
5937     __ BIND(Process3B);
5938     //  3 src bytes, 24 bits
5939     __ ldrb(r10, __ post(src, 1));
5940     __ ldrb(r11, __ post(src, 1));
5941     __ ldrb(r12, __ post(src, 1));
5942     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5943     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5944     // codec index
5945     __ ubfmw(r15, r12, 18, 23);
5946     __ ubfmw(r14, r12, 12, 17);
5947     __ ubfmw(r13, r12, 6,  11);
5948     __ andw(r12,  r12, 63);
5949     // get the code based on the codec
5950     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5951     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5952     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5953     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5954     __ strb(r15, __ post(dst, 1));
5955     __ strb(r14, __ post(dst, 1));
5956     __ strb(r13, __ post(dst, 1));
5957     __ strb(r12, __ post(dst, 1));
5958     __ sub(length, length, 3);
5959     __ cbnz(length, Process3B);
5960 
5961     __ BIND(Exit);
5962     __ ret(lr);
5963 
5964     return start;
5965   }
5966 
5967   void generate_base64_decode_simdround(Register src, Register dst,
5968         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5969 
5970     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5971     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5972 
5973     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5974     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5975 
5976     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5977 
5978     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5979 
5980     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5981 
5982     // we need unsigned saturating subtract, to make sure all input values
5983     // in range [0, 63] will have 0U value in the higher half lookup
5984     __ uqsubv(decH0, __ T16B, in0, v27);
5985     __ uqsubv(decH1, __ T16B, in1, v27);
5986     __ uqsubv(decH2, __ T16B, in2, v27);
5987     __ uqsubv(decH3, __ T16B, in3, v27);
5988 
5989     // lower half lookup
5990     __ tbl(decL0, arrangement, codecL, 4, in0);
5991     __ tbl(decL1, arrangement, codecL, 4, in1);
5992     __ tbl(decL2, arrangement, codecL, 4, in2);
5993     __ tbl(decL3, arrangement, codecL, 4, in3);
5994 
5995     // higher half lookup
5996     __ tbx(decH0, arrangement, codecH, 4, decH0);
5997     __ tbx(decH1, arrangement, codecH, 4, decH1);
5998     __ tbx(decH2, arrangement, codecH, 4, decH2);
5999     __ tbx(decH3, arrangement, codecH, 4, decH3);
6000 
6001     // combine lower and higher
6002     __ orr(decL0, arrangement, decL0, decH0);
6003     __ orr(decL1, arrangement, decL1, decH1);
6004     __ orr(decL2, arrangement, decL2, decH2);
6005     __ orr(decL3, arrangement, decL3, decH3);
6006 
6007     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6008     __ cmhi(decH0, arrangement, decL0, v27);
6009     __ cmhi(decH1, arrangement, decL1, v27);
6010     __ cmhi(decH2, arrangement, decL2, v27);
6011     __ cmhi(decH3, arrangement, decL3, v27);
6012     __ orr(in0, arrangement, decH0, decH1);
6013     __ orr(in1, arrangement, decH2, decH3);
6014     __ orr(in2, arrangement, in0,   in1);
6015     __ umaxv(in3, arrangement, in2);
6016     __ umov(rscratch2, in3, __ B, 0);
6017 
6018     // get the data to output
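    // Per lane, the four decoded 6-bit values d0..d3 recombine into three
    // output bytes (a sketch):
    //   out0 = (d0 << 2) | (d1 >> 4);
    //   out1 = (d1 << 4) | (d2 >> 2);
    //   out2 = (d2 << 6) |  d3;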
6019     __ shl(out0,  arrangement, decL0, 2);
6020     __ ushr(out1, arrangement, decL1, 4);
6021     __ orr(out0,  arrangement, out0,  out1);
6022     __ shl(out1,  arrangement, decL1, 4);
6023     __ ushr(out2, arrangement, decL2, 2);
6024     __ orr(out1,  arrangement, out1,  out2);
6025     __ shl(out2,  arrangement, decL2, 6);
6026     __ orr(out2,  arrangement, out2,  decL3);
6027 
6028     __ cbz(rscratch2, NoIllegalData);
6029 
6030     // handle illegal input
6031     __ umov(r10, in2, __ D, 0);
6032     if (size == 16) {
6033       __ cbnz(r10, ErrorInLowerHalf);
6034 
6035       // illegal input is in higher half, store the lower half now.
6036       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6037 
6038       __ umov(r10, in2,  __ D, 1);
6039       __ umov(r11, out0, __ D, 1);
6040       __ umov(r12, out1, __ D, 1);
6041       __ umov(r13, out2, __ D, 1);
6042       __ b(StoreLegalData);
6043 
6044       __ BIND(ErrorInLowerHalf);
6045     }
6046     __ umov(r11, out0, __ D, 0);
6047     __ umov(r12, out1, __ D, 0);
6048     __ umov(r13, out2, __ D, 0);
6049 
6050     __ BIND(StoreLegalData);
6051     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6052     __ strb(r11, __ post(dst, 1));
6053     __ strb(r12, __ post(dst, 1));
6054     __ strb(r13, __ post(dst, 1));
6055     __ lsr(r10, r10, 8);
6056     __ lsr(r11, r11, 8);
6057     __ lsr(r12, r12, 8);
6058     __ lsr(r13, r13, 8);
6059     __ b(StoreLegalData);
6060 
6061     __ BIND(NoIllegalData);
6062     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6063   }
6064 
6065 
6066    /**
6067    *  Arguments:
6068    *
6069    *  Input:
6070    *  c_rarg0   - src_start
6071    *  c_rarg1   - src_offset
6072    *  c_rarg2   - src_length
6073    *  c_rarg3   - dest_start
6074    *  c_rarg4   - dest_offset
6075    *  c_rarg5   - isURL
6076    *  c_rarg6   - isMIME
6077    *
6078    */
6079   address generate_base64_decodeBlock() {
6080 
6081     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6082     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6083     // titled "Base64 decoding".
6084 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] == -2,
    // while fromBase(URL)64ForNoSIMD['='] == 255 here.
6088     static const uint8_t fromBase64ForNoSIMD[256] = {
6089       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6090       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6091       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6092        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6093       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6094        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6095       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6096        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6102       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6103       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6105     };
6106 
6107     static const uint8_t fromBase64URLForNoSIMD[256] = {
6108       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6109       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6110       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6111        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6112       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6113        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6114       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6115        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6121       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6122       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6123       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6124     };
6125 
    // A legal base64 code value is in the range [0, 127].  We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table vector
    // lookup uses tbl: out-of-range indices are set to 0 in the destination. The 2nd
    // table vector lookup uses tbx: out-of-range indices are left unchanged in the
    // destination. Input [64..126] is mapped to index [65, 127] in the second lookup.
    // The value at index 64 is set to 0, so that we know we already got the
    // decoded data from the 1st lookup.
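    // Per input byte c, the combined lookup is approximately (a sketch; v27
    // holds the constant 63):
    //   lo  = (c <= 63) ? table[c] : 0;            // tbl: out-of-range -> 0
    //   idx = (c <= 63) ? 0 : c - 63;              // uqsub (saturating)
    //   hi  = (idx <= 63) ? table[64 + idx] : idx; // tbx: out-of-range -> unchanged
    //   dec = lo | hi;                             // dec > 63 flags illegal input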
6133     static const uint8_t fromBase64ForSIMD[128] = {
6134       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6135       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6136       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6137        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6138         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6139        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6140       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6141        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6142     };
6143 
6144     static const uint8_t fromBase64URLForSIMD[128] = {
6145       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6148        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6149         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6150        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6151        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6152        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6153     };
6154 
6155     __ align(CodeEntryAlignment);
6156     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6157     address start = __ pc();
6158 
6159     Register src    = c_rarg0;  // source array
6160     Register soff   = c_rarg1;  // source start offset
6161     Register send   = c_rarg2;  // source end offset
6162     Register dst    = c_rarg3;  // dest array
6163     Register doff   = c_rarg4;  // position for writing to dest array
6164     Register isURL  = c_rarg5;  // Base64 or URL character set
6165     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6166 
6167     Register length = send;    // reuse send as length of source data to process
6168 
6169     Register simd_codec   = c_rarg6;
6170     Register nosimd_codec = c_rarg7;
6171 
6172     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6173 
6174     __ enter();
6175 
6176     __ add(src, src, soff);
6177     __ add(dst, dst, doff);
6178 
6179     __ mov(doff, dst);
6180 
6181     __ sub(length, send, soff);
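    // Round length down to a multiple of 4 (bfm clears the two low bits).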
6182     __ bfm(length, zr, 0, 1);
6183 
6184     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6185     __ cbz(isURL, ProcessData);
6186     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6187 
6188     __ BIND(ProcessData);
6189     __ mov(rscratch1, length);
6190     __ cmp(length, (u1)144); // 144 = 80 + 64
6191     __ br(Assembler::LT, Process4B);
6192 
6193     // In the MIME case, the line length cannot be more than 76
6194     // bytes (see RFC 2045). This is too short a block for SIMD
6195     // to be worthwhile, so we use non-SIMD here.
6196     __ movw(rscratch1, 79);
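    // Starting rscratch1 at 79 makes the scalar loop below run 20 iterations,
    // i.e. pre-process 80 bytes, before the SIMD path takes over.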
6197 
6198     __ BIND(Process4B);
6199     __ ldrw(r14, __ post(src, 4));
6200     __ ubfxw(r10, r14, 0,  8);
6201     __ ubfxw(r11, r14, 8,  8);
6202     __ ubfxw(r12, r14, 16, 8);
6203     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values in the table
6205     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6206     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6207     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6208     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6209     // error detection, 255u indicates an illegal input
6210     __ orrw(r14, r10, r11);
6211     __ orrw(r15, r12, r13);
6212     __ orrw(r14, r14, r15);
6213     __ tbnz(r14, 7, Exit);
6214     // recover the data
6215     __ lslw(r14, r10, 10);
6216     __ bfiw(r14, r11, 4, 6);
6217     __ bfmw(r14, r12, 2, 5);
6218     __ rev16w(r14, r14);
6219     __ bfiw(r13, r12, 6, 2);
6220     __ strh(r14, __ post(dst, 2));
6221     __ strb(r13, __ post(dst, 1));
6222     // non-simd loop
6223     __ subsw(rscratch1, rscratch1, 4);
6224     __ br(Assembler::GT, Process4B);
6225 
    // If we are exiting from the 80-byte pre-processing pass (rscratch1
    // started at 79), rscratch1 == -1; otherwise, rscratch1 == 0.
6228     __ cbzw(rscratch1, Exit);
6229     __ sub(length, length, 80);
6230 
6231     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6232     __ cbz(isURL, SIMDEnter);
6233     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6234 
6235     __ BIND(SIMDEnter);
6236     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6237     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6238     __ mov(rscratch1, 63);
6239     __ dup(v27, __ T16B, rscratch1);
6240 
6241     __ BIND(Process64B);
6242     __ cmp(length, (u1)64);
6243     __ br(Assembler::LT, Process32B);
6244     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6245     __ sub(length, length, 64);
6246     __ b(Process64B);
6247 
6248     __ BIND(Process32B);
6249     __ cmp(length, (u1)32);
6250     __ br(Assembler::LT, SIMDExit);
6251     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6252     __ sub(length, length, 32);
6253     __ b(Process32B);
6254 
6255     __ BIND(SIMDExit);
6256     __ cbz(length, Exit);
6257     __ movw(rscratch1, length);
6258     __ b(Process4B);
6259 
6260     __ BIND(Exit);
6261     __ sub(c_rarg0, dst, doff);
6262 
6263     __ leave();
6264     __ ret(lr);
6265 
6266     return start;
6267   }
6268 
6269   // Support for spin waits.
6270   address generate_spin_wait() {
6271     __ align(CodeEntryAlignment);
6272     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6273     address start = __ pc();
6274 
6275     __ spin_wait();
6276     __ ret(lr);
6277 
6278     return start;
6279   }
6280 
6281 #ifdef LINUX
6282 
6283   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6284   //
6285   // If LSE is in use, generate LSE versions of all the stubs. The
6286   // non-LSE versions are in atomic_aarch64.S.
6287 
6288   // class AtomicStubMark records the entry point of a stub and the
6289   // stub pointer which will point to it. The stub pointer is set to
6290   // the entry point when ~AtomicStubMark() is called, which must be
6291   // after ICache::invalidate_range. This ensures safe publication of
6292   // the generated code.
6293   class AtomicStubMark {
6294     address _entry_point;
6295     aarch64_atomic_stub_t *_stub;
6296     MacroAssembler *_masm;
6297   public:
6298     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6299       _masm = masm;
6300       __ align(32);
6301       _entry_point = __ pc();
6302       _stub = stub;
6303     }
6304     ~AtomicStubMark() {
6305       *_stub = (aarch64_atomic_stub_t)_entry_point;
6306     }
6307   };
6308 
6309   // NB: For memory_order_conservative we need a trailing membar after
6310   // LSE atomic operations but not a leading membar.
6311   //
6312   // We don't need a leading membar because a clause in the Arm ARM
6313   // says:
6314   //
6315   //   Barrier-ordered-before
6316   //
6317   //   Barrier instructions order prior Memory effects before subsequent
6318   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6322   //   instruction with both Acquire and Release semantics.
6323   //
6324   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6325   // and Release semantics, therefore we don't need a leading
6326   // barrier. However, there is no corresponding Barrier-ordered-after
6327   // relationship, therefore we need a trailing membar to prevent a
6328   // later store or load from being reordered with the store in an
6329   // atomic instruction.
6330   //
6331   // This was checked by using the herd7 consistency model simulator
6332   // (http://diy.inria.fr/) with this test case:
6333   //
6334   // AArch64 LseCas
6335   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6336   // P0 | P1;
6337   // LDR W4, [X2] | MOV W3, #0;
6338   // DMB LD       | MOV W4, #1;
6339   // LDR W3, [X1] | CASAL W3, W4, [X1];
6340   //              | DMB ISH;
6341   //              | STR W4, [X2];
6342   // exists
6343   // (0:X3=0 /\ 0:X4=1)
6344   //
6345   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6346   // with the store to x in P1. Without the DMB in P1 this may happen.
6347   //
6348   // At the time of writing we don't know of any AArch64 hardware that
6349   // reorders stores in this way, but the Reference Manual permits it.
6350 
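  // In C, the memory_order_conservative CAS stub below is approximately
  // (a sketch; CASAL/DMB name the emitted instructions):
  //
  //   uint64_t cas(uint64_t* ptr, uint64_t compare_val, uint64_t exchange_val) {
  //     uint64_t prev = CASAL(ptr, compare_val, exchange_val); // acquire+release CAS
  //     DMB();           // trailing full barrier (StoreStore|StoreLoad)
  //     return prev;     // old value is returned in r0
  //   }
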
6351   void gen_cas_entry(Assembler::operand_size size,
6352                      atomic_memory_order order) {
6353     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6354       exchange_val = c_rarg2;
6355     bool acquire, release;
6356     switch (order) {
6357       case memory_order_relaxed:
6358         acquire = false;
6359         release = false;
6360         break;
6361       case memory_order_release:
6362         acquire = false;
6363         release = true;
6364         break;
6365       default:
6366         acquire = true;
6367         release = true;
6368         break;
6369     }
6370     __ mov(prev, compare_val);
6371     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6372     if (order == memory_order_conservative) {
6373       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6374     }
6375     if (size == Assembler::xword) {
6376       __ mov(r0, prev);
6377     } else {
6378       __ movw(r0, prev);
6379     }
6380     __ ret(lr);
6381   }
6382 
6383   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6384     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6385     // If not relaxed, then default to conservative.  Relaxed is the only
6386     // case we use enough to be worth specializing.
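    // A sketch of the resulting stub in C:
    //   { T prev = *addr; *addr += incr; return prev; }  // done atomically by LDADD(AL)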
6387     if (order == memory_order_relaxed) {
6388       __ ldadd(size, incr, prev, addr);
6389     } else {
6390       __ ldaddal(size, incr, prev, addr);
6391       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6392     }
6393     if (size == Assembler::xword) {
6394       __ mov(r0, prev);
6395     } else {
6396       __ movw(r0, prev);
6397     }
6398     __ ret(lr);
6399   }
6400 
6401   void gen_swpal_entry(Assembler::operand_size size) {
6402     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6403     __ swpal(size, incr, prev, addr);
6404     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6405     if (size == Assembler::xword) {
6406       __ mov(r0, prev);
6407     } else {
6408       __ movw(r0, prev);
6409     }
6410     __ ret(lr);
6411   }
6412 
6413   void generate_atomic_entry_points() {
6414     if (! UseLSE) {
6415       return;
6416     }
6417 
6418     __ align(CodeEntryAlignment);
6419     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6420     address first_entry = __ pc();
6421 
6422     // ADD, memory_order_conservative
6423     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6424     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6425     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6426     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6427 
6428     // ADD, memory_order_relaxed
6429     AtomicStubMark mark_fetch_add_4_relaxed
6430       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6431     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6432     AtomicStubMark mark_fetch_add_8_relaxed
6433       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6434     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6435 
6436     // XCHG, memory_order_conservative
6437     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6438     gen_swpal_entry(Assembler::word);
6439     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6440     gen_swpal_entry(Assembler::xword);
6441 
6442     // CAS, memory_order_conservative
6443     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6444     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6445     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6446     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6447     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6448     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6449 
6450     // CAS, memory_order_relaxed
6451     AtomicStubMark mark_cmpxchg_1_relaxed
6452       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6453     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6454     AtomicStubMark mark_cmpxchg_4_relaxed
6455       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6456     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6457     AtomicStubMark mark_cmpxchg_8_relaxed
6458       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6459     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6460 
6461     AtomicStubMark mark_cmpxchg_4_release
6462       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6463     gen_cas_entry(MacroAssembler::word, memory_order_release);
6464     AtomicStubMark mark_cmpxchg_8_release
6465       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6466     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6467 
6468     AtomicStubMark mark_cmpxchg_4_seq_cst
6469       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6470     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6471     AtomicStubMark mark_cmpxchg_8_seq_cst
6472       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6473     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6474 
6475     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6476   }
6477 #endif // LINUX
6478 
6479   // Pass object argument in r0 (which has to be preserved outside this stub)
6480   // Pass back result in r0
6481   // Clobbers rscratch1
6482   address generate_load_nklass() {
6483     __ align(CodeEntryAlignment);
6484     StubCodeMark mark(this, "StubRoutines", "load_nklass");
6485 
6486     address start = __ pc();
6487 
6488     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
6489     __ enter();
6490     __ push(RegSet::of(rscratch1, rscratch2), sp);
6491     __ push_call_clobbered_registers_except(r0);
6492     __ call_VM_leaf(CAST_FROM_FN_PTR(address, oopDesc::load_nklass_runtime), 1);
6493     __ pop_call_clobbered_registers_except(r0);
6494     __ pop(RegSet::of(rscratch1, rscratch2), sp);
6495     __ leave();
6496     __ reset_last_Java_frame(true);
6497     __ ret(lr);
6498 
6499     return start;
6500   }
6501 
6502   // Continuation point for throwing of implicit exceptions that are
6503   // not handled in the current activation. Fabricates an exception
6504   // oop and initiates normal exception dispatching in this
6505   // frame. Since we need to preserve callee-saved values (currently
6506   // only for C2, but done for C1 as well) we need a callee-saved oop
6507   // map and therefore have to make these stubs into RuntimeStubs
6508   // rather than BufferBlobs.  If the compiler needs all registers to
6509   // be preserved between the fault point and the exception handler
6510   // then it must assume responsibility for that in
6511   // AbstractCompiler::continuation_for_implicit_null_exception or
6512   // continuation_for_implicit_division_by_zero_exception. All other
6513   // implicit exceptions (e.g., NullPointerException or
6514   // AbstractMethodError on entry) are either at call sites or
6515   // otherwise assume that stack unwinding will be initiated, so
6516   // caller saved registers were assumed volatile in the compiler.
6517 
6518 #undef __
6519 #define __ masm->
6520 
6521   address generate_throw_exception(const char* name,
6522                                    address runtime_entry,
6523                                    Register arg1 = noreg,
6524                                    Register arg2 = noreg) {
6525     // Information about frame layout at time of blocking runtime call.
6526     // Note that we only have to preserve callee-saved registers since
6527     // the compilers are responsible for supplying a continuation point
6528     // if they expect all registers to be preserved.
6529     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6530     enum layout {
6531       rfp_off = 0,
6532       rfp_off2,
6533       return_off,
6534       return_off2,
6535       framesize // inclusive of return address
6536     };
6537 
6538     int insts_size = 512;
6539     int locs_size  = 64;
6540 
6541     CodeBuffer code(name, insts_size, locs_size);
6542     OopMapSet* oop_maps  = new OopMapSet();
6543     MacroAssembler* masm = new MacroAssembler(&code);
6544 
6545     address start = __ pc();
6546 
6547     // This is an inlined and slightly modified version of call_VM
6548     // which has the ability to fetch the return PC out of
6549     // thread-local storage and also sets up last_Java_sp slightly
6550     // differently than the real call_VM
6551 
6552     __ enter(); // Save FP and LR before call
6553 
6554     assert(is_even(framesize/2), "sp not 16-byte aligned");
6555 
6556     // lr and fp are already in place
6557     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6558 
6559     int frame_complete = __ pc() - start;
6560 
6561     // Set up last_Java_sp and last_Java_fp
6562     address the_pc = __ pc();
6563     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6564 
6565     // Call runtime
6566     if (arg1 != noreg) {
6567       assert(arg2 != c_rarg1, "clobbered");
6568       __ mov(c_rarg1, arg1);
6569     }
6570     if (arg2 != noreg) {
6571       __ mov(c_rarg2, arg2);
6572     }
6573     __ mov(c_rarg0, rthread);
6574     BLOCK_COMMENT("call runtime_entry");
6575     __ mov(rscratch1, runtime_entry);
6576     __ blr(rscratch1);
6577 
6578     // Generate oop map
6579     OopMap* map = new OopMap(framesize, 0);
6580 
6581     oop_maps->add_gc_map(the_pc - start, map);
6582 
6583     __ reset_last_Java_frame(true);
6584 
6585     // Reinitialize the ptrue predicate register, in case the external runtime
6586     // call clobbers ptrue reg, as we may return to SVE compiled code.
6587     __ reinitialize_ptrue();
6588 
6589     __ leave();
6590 
6591     // check for pending exceptions
6592 #ifdef ASSERT
6593     Label L;
6594     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6595     __ cbnz(rscratch1, L);
6596     __ should_not_reach_here();
6597     __ bind(L);
6598 #endif // ASSERT
6599     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6600 
6601 
6602     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6603     RuntimeStub* stub =
6604       RuntimeStub::new_runtime_stub(name,
6605                                     &code,
6606                                     frame_complete,
6607                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6608                                     oop_maps, false);
6609     return stub->entry_point();
6610   }
6611 
6612   class MontgomeryMultiplyGenerator : public MacroAssembler {
6613 
6614     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6615       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6616 
6617     RegSet _toSave;
6618     bool _squaring;
6619 
6620   public:
6621     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6622       : MacroAssembler(as->code()), _squaring(squaring) {
6623 
6624       // Register allocation
6625 
6626       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6627       Pa_base = *regs;       // Argument registers
6628       if (squaring)
6629         Pb_base = Pa_base;
6630       else
6631         Pb_base = *++regs;
6632       Pn_base = *++regs;
6633       Rlen= *++regs;
6634       inv = *++regs;
6635       Pm_base = *++regs;
6636 
6637                           // Working registers:
6638       Ra =  *++regs;        // The current digit of a, b, n, and m.
6639       Rb =  *++regs;
6640       Rm =  *++regs;
6641       Rn =  *++regs;
6642 
6643       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6644       Pb =  *++regs;
6645       Pm =  *++regs;
6646       Pn =  *++regs;
6647 
6648       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
6650       t2 =  *++regs;
6651 
6652       Ri =  *++regs;        // Inner and outer loop indexes.
6653       Rj =  *++regs;
6654 
6655       Rhi_ab = *++regs;     // Product registers: low and high parts
6656       Rlo_ab = *++regs;     // of a*b and m*n.
6657       Rhi_mn = *++regs;
6658       Rlo_mn = *++regs;
6659 
6660       // r19 and up are callee-saved.
6661       _toSave = RegSet::range(r19, *regs) + Pm_base;
6662     }
6663 
6664   private:
6665     void save_regs() {
6666       push(_toSave, sp);
6667     }
6668 
6669     void restore_regs() {
6670       pop(_toSave, sp);
6671     }
6672 
6673     template <typename T>
6674     void unroll_2(Register count, T block) {
6675       Label loop, end, odd;
6676       tbnz(count, 0, odd);
6677       cbz(count, end);
6678       align(16);
6679       bind(loop);
6680       (this->*block)();
6681       bind(odd);
6682       (this->*block)();
6683       subs(count, count, 2);
6684       br(Assembler::GT, loop);
6685       bind(end);
6686     }
6687 
6688     template <typename T>
6689     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6690       Label loop, end, odd;
6691       tbnz(count, 0, odd);
6692       cbz(count, end);
6693       align(16);
6694       bind(loop);
6695       (this->*block)(d, s, tmp);
6696       bind(odd);
6697       (this->*block)(d, s, tmp);
6698       subs(count, count, 2);
6699       br(Assembler::GT, loop);
6700       bind(end);
6701     }
6702 
6703     void pre1(RegisterOrConstant i) {
6704       block_comment("pre1");
6705       // Pa = Pa_base;
6706       // Pb = Pb_base + i;
6707       // Pm = Pm_base;
6708       // Pn = Pn_base + i;
6709       // Ra = *Pa;
6710       // Rb = *Pb;
6711       // Rm = *Pm;
6712       // Rn = *Pn;
6713       ldr(Ra, Address(Pa_base));
6714       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6715       ldr(Rm, Address(Pm_base));
6716       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6717       lea(Pa, Address(Pa_base));
6718       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6719       lea(Pm, Address(Pm_base));
6720       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6721 
6722       // Zero the m*n result.
6723       mov(Rhi_mn, zr);
6724       mov(Rlo_mn, zr);
6725     }
6726 
6727     // The core multiply-accumulate step of a Montgomery
6728     // multiplication.  The idea is to schedule operations as a
6729     // pipeline so that instructions with long latencies (loads and
6730     // multiplies) have time to complete before their results are
6731     // used.  This most benefits in-order implementations of the
6732     // architecture but out-of-order ones also benefit.
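    //
    // In steady state each step effectively does
    //   t2:t1:t0 += Ra*Rb + Rm*Rn;  Pa++; Pb--; Pm++; Pn--;
    // with the m*n product of one iteration accumulated in the next, to hide
    // the multiply latency.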
6733     void step() {
6734       block_comment("step");
6735       // MACC(Ra, Rb, t0, t1, t2);
6736       // Ra = *++Pa;
6737       // Rb = *--Pb;
6738       umulh(Rhi_ab, Ra, Rb);
6739       mul(Rlo_ab, Ra, Rb);
6740       ldr(Ra, pre(Pa, wordSize));
6741       ldr(Rb, pre(Pb, -wordSize));
6742       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6743                                        // previous iteration.
6744       // MACC(Rm, Rn, t0, t1, t2);
6745       // Rm = *++Pm;
6746       // Rn = *--Pn;
6747       umulh(Rhi_mn, Rm, Rn);
6748       mul(Rlo_mn, Rm, Rn);
6749       ldr(Rm, pre(Pm, wordSize));
6750       ldr(Rn, pre(Pn, -wordSize));
6751       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6752     }
6753 
6754     void post1() {
6755       block_comment("post1");
6756 
6757       // MACC(Ra, Rb, t0, t1, t2);
6758       // Ra = *++Pa;
6759       // Rb = *--Pb;
6760       umulh(Rhi_ab, Ra, Rb);
6761       mul(Rlo_ab, Ra, Rb);
6762       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6763       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6764 
6765       // *Pm = Rm = t0 * inv;
6766       mul(Rm, t0, inv);
6767       str(Rm, Address(Pm));
6768 
6769       // MACC(Rm, Rn, t0, t1, t2);
6770       // t0 = t1; t1 = t2; t2 = 0;
6771       umulh(Rhi_mn, Rm, Rn);
6772 
6773 #ifndef PRODUCT
6774       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6775       {
6776         mul(Rlo_mn, Rm, Rn);
6777         add(Rlo_mn, t0, Rlo_mn);
6778         Label ok;
6779         cbz(Rlo_mn, ok); {
6780           stop("broken Montgomery multiply");
6781         } bind(ok);
6782       }
6783 #endif
6784       // We have very carefully set things up so that
6785       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6786       // the lower half of Rm * Rn because we know the result already:
6787       // it must be -t0.  t0 + (-t0) must generate a carry iff
6788       // t0 != 0.  So, rather than do a mul and an adds we just set
6789       // the carry flag iff t0 is nonzero.
6790       //
6791       // mul(Rlo_mn, Rm, Rn);
6792       // adds(zr, t0, Rlo_mn);
6793       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6794       adcs(t0, t1, Rhi_mn);
6795       adc(t1, t2, zr);
6796       mov(t2, zr);
6797     }
6798 
6799     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6800       block_comment("pre2");
6801       // Pa = Pa_base + i-len;
6802       // Pb = Pb_base + len;
6803       // Pm = Pm_base + i-len;
6804       // Pn = Pn_base + len;
6805 
6806       if (i.is_register()) {
6807         sub(Rj, i.as_register(), len);
6808       } else {
6809         mov(Rj, i.as_constant());
6810         sub(Rj, Rj, len);
6811       }
6812       // Rj == i-len
6813 
6814       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6815       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6816       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6817       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6818 
6819       // Ra = *++Pa;
6820       // Rb = *--Pb;
6821       // Rm = *++Pm;
6822       // Rn = *--Pn;
6823       ldr(Ra, pre(Pa, wordSize));
6824       ldr(Rb, pre(Pb, -wordSize));
6825       ldr(Rm, pre(Pm, wordSize));
6826       ldr(Rn, pre(Pn, -wordSize));
6827 
6828       mov(Rhi_mn, zr);
6829       mov(Rlo_mn, zr);
6830     }
6831 
6832     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6833       block_comment("post2");
6834       if (i.is_constant()) {
6835         mov(Rj, i.as_constant()-len.as_constant());
6836       } else {
6837         sub(Rj, i.as_register(), len);
6838       }
6839 
6840       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6841 
6842       // As soon as we know the least significant digit of our result,
6843       // store it.
6844       // Pm_base[i-len] = t0;
6845       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6846 
6847       // t0 = t1; t1 = t2; t2 = 0;
6848       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6849       adc(t1, t2, zr);
6850       mov(t2, zr);
6851     }
6852 
6853     // A carry in t0 after Montgomery multiplication means that we
6854     // should subtract multiples of n from our result in m.  We'll
6855     // keep doing that until there is no carry.
6856     void normalize(RegisterOrConstant len) {
6857       block_comment("normalize");
6858       // while (t0)
6859       //   t0 = sub(Pm_base, Pn_base, t0, len);
6860       Label loop, post, again;
6861       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6862       cbz(t0, post); {
6863         bind(again); {
6864           mov(i, zr);
6865           mov(cnt, len);
6866           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6867           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6868           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6869           align(16);
6870           bind(loop); {
6871             sbcs(Rm, Rm, Rn);
6872             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6873             add(i, i, 1);
6874             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6875             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6876             sub(cnt, cnt, 1);
6877           } cbnz(cnt, loop);
6878           sbc(t0, t0, zr);
6879         } cbnz(t0, again);
6880       } bind(post);
6881     }
6882 
6883     // Move memory at s to d, reversing words.
6884     //    Increments d to end of copied memory
6885     //    Destroys tmp1, tmp2
6886     //    Preserves len
6887     //    Leaves s pointing to the address which was in d at start
6888     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6889       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6890 
6891       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6892       mov(tmp1, len);
6893       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6894       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6895     }
6896     // where
6897     void reverse1(Register d, Register s, Register tmp) {
6898       ldr(tmp, pre(s, -wordSize));
6899       ror(tmp, tmp, 32);
6900       str(tmp, post(d, wordSize));
6901     }
6902 
6903     void step_squaring() {
6904       // An extra ACC
6905       step();
6906       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6907     }
6908 
6909     void last_squaring(RegisterOrConstant i) {
6910       Label dont;
6911       // if ((i & 1) == 0) {
6912       tbnz(i.as_register(), 0, dont); {
6913         // MACC(Ra, Rb, t0, t1, t2);
6914         // Ra = *++Pa;
6915         // Rb = *--Pb;
6916         umulh(Rhi_ab, Ra, Rb);
6917         mul(Rlo_ab, Ra, Rb);
6918         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6919       } bind(dont);
6920     }
6921 
6922     void extra_step_squaring() {
6923       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6924 
6925       // MACC(Rm, Rn, t0, t1, t2);
6926       // Rm = *++Pm;
6927       // Rn = *--Pn;
6928       umulh(Rhi_mn, Rm, Rn);
6929       mul(Rlo_mn, Rm, Rn);
6930       ldr(Rm, pre(Pm, wordSize));
6931       ldr(Rn, pre(Pn, -wordSize));
6932     }
6933 
6934     void post1_squaring() {
6935       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6936 
6937       // *Pm = Rm = t0 * inv;
6938       mul(Rm, t0, inv);
6939       str(Rm, Address(Pm));
6940 
6941       // MACC(Rm, Rn, t0, t1, t2);
6942       // t0 = t1; t1 = t2; t2 = 0;
6943       umulh(Rhi_mn, Rm, Rn);
6944 
6945 #ifndef PRODUCT
6946       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6947       {
6948         mul(Rlo_mn, Rm, Rn);
6949         add(Rlo_mn, t0, Rlo_mn);
6950         Label ok;
6951         cbz(Rlo_mn, ok); {
6952           stop("broken Montgomery multiply");
6953         } bind(ok);
6954       }
6955 #endif
6956       // We have very carefully set things up so that
6957       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6958       // the lower half of Rm * Rn because we know the result already:
6959       // it must be -t0.  t0 + (-t0) must generate a carry iff
6960       // t0 != 0.  So, rather than do a mul and an adds we just set
6961       // the carry flag iff t0 is nonzero.
6962       //
6963       // mul(Rlo_mn, Rm, Rn);
6964       // adds(zr, t0, Rlo_mn);
6965       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6966       adcs(t0, t1, Rhi_mn);
6967       adc(t1, t2, zr);
6968       mov(t2, zr);
6969     }
6970 
6971     void acc(Register Rhi, Register Rlo,
6972              Register t0, Register t1, Register t2) {
6973       adds(t0, t0, Rlo);
6974       adcs(t1, t1, Rhi);
6975       adc(t2, t2, zr);
6976     }
6977 
6978   public:
6979     /**
6980      * Fast Montgomery multiplication.  The derivation of the
6981      * algorithm is in A Cryptographic Library for the Motorola
6982      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6983      *
6984      * Arguments:
6985      *
6986      * Inputs for multiplication:
6987      *   c_rarg0   - int array elements a
6988      *   c_rarg1   - int array elements b
6989      *   c_rarg2   - int array elements n (the modulus)
6990      *   c_rarg3   - int length
6991      *   c_rarg4   - int inv
6992      *   c_rarg5   - int array elements m (the result)
6993      *
6994      * Inputs for squaring:
6995      *   c_rarg0   - int array elements a
6996      *   c_rarg1   - int array elements n (the modulus)
6997      *   c_rarg2   - int length
6998      *   c_rarg3   - int inv
6999      *   c_rarg4   - int array elements m (the result)
7000      *
7001      */
7002     address generate_multiply() {
7003       Label argh, nothing;
7004       bind(argh);
7005       stop("MontgomeryMultiply total_allocation must be <= 8192");
7006 
7007       align(CodeEntryAlignment);
7008       address entry = pc();
7009 
7010       cbzw(Rlen, nothing);
7011 
7012       enter();
7013 
7014       // Make room.
7015       cmpw(Rlen, 512);
7016       br(Assembler::HI, argh);
7017       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7018       andr(sp, Ra, -2 * wordSize);
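      // The scratch area holds four arrays of Rlen jints each (a, b, n
      // and the result m), i.e. Rlen * 4 * sizeof(jint) bytes; the
      // length check above caps this at 512 * 16 = 8192 bytes, matching
      // the stop() message at argh.  The andr aligns the new sp down to
      // a 16-byte boundary.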
7019 
7020       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7021 
7022       {
7023         // Copy input args, reversing as we go.  We use Ra as a
7024         // temporary variable.
7025         reverse(Ra, Pa_base, Rlen, t0, t1);
7026         if (!_squaring)
7027           reverse(Ra, Pb_base, Rlen, t0, t1);
7028         reverse(Ra, Pn_base, Rlen, t0, t1);
7029       }
7030 
7031       // Push all call-saved registers and also Pm_base which we'll need
7032       // at the end.
7033       save_regs();
7034 
7035 #ifndef PRODUCT
7036       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7037       {
7038         ldr(Rn, Address(Pn_base, 0));
7039         mul(Rlo_mn, Rn, inv);
7040         subs(zr, Rlo_mn, -1);
7041         Label ok;
7042         br(EQ, ok); {
7043           stop("broken inverse in Montgomery multiply");
7044         } bind(ok);
7045       }
7046 #endif
7047 
7048       mov(Pm_base, Ra);
7049 
7050       mov(t0, zr);
7051       mov(t1, zr);
7052       mov(t2, zr);
7053 
7054       block_comment("for (int i = 0; i < len; i++) {");
7055       mov(Ri, zr); {
7056         Label loop, end;
7057         cmpw(Ri, Rlen);
7058         br(Assembler::GE, end);
7059 
7060         bind(loop);
7061         pre1(Ri);
7062 
7063         block_comment("  for (j = i; j; j--) {"); {
7064           movw(Rj, Ri);
7065           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7066         } block_comment("  } // j");
7067 
7068         post1();
7069         addw(Ri, Ri, 1);
7070         cmpw(Ri, Rlen);
7071         br(Assembler::LT, loop);
7072         bind(end);
7073         block_comment("} // i");
7074       }
7075 
7076       block_comment("for (int i = len; i < 2*len; i++) {");
7077       mov(Ri, Rlen); {
7078         Label loop, end;
7079         cmpw(Ri, Rlen, Assembler::LSL, 1);
7080         br(Assembler::GE, end);
7081 
7082         bind(loop);
7083         pre2(Ri, Rlen);
7084 
7085         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7086           lslw(Rj, Rlen, 1);
7087           subw(Rj, Rj, Ri);
7088           subw(Rj, Rj, 1);
7089           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7090         } block_comment("  } // j");
7091 
7092         post2(Ri, Rlen);
7093         addw(Ri, Ri, 1);
7094         cmpw(Ri, Rlen, Assembler::LSL, 1);
7095         br(Assembler::LT, loop);
7096         bind(end);
7097       }
7098       block_comment("} // i");
7099 
7100       normalize(Rlen);
7101 
7102       mov(Ra, Pm_base);  // Save Pm_base in Ra
7103       restore_regs();  // Restore caller's Pm_base
7104 
7105       // Copy our result into caller's Pm_base
7106       reverse(Pm_base, Ra, Rlen, t0, t1);
7107 
7108       leave();
7109       bind(nothing);
7110       ret(lr);
7111 
7112       return entry;
7113     }
7114     // In C, approximately:
7115 
7116     // void
7117     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7118     //                     julong Pn_base[], julong Pm_base[],
7119     //                     julong inv, int len) {
7120     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7121     //   julong *Pa, *Pb, *Pn, *Pm;
7122     //   julong Ra, Rb, Rn, Rm;
7123 
7124     //   int i;
7125 
7126     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7127 
7128     //   for (i = 0; i < len; i++) {
7129     //     int j;
7130 
7131     //     Pa = Pa_base;
7132     //     Pb = Pb_base + i;
7133     //     Pm = Pm_base;
7134     //     Pn = Pn_base + i;
7135 
7136     //     Ra = *Pa;
7137     //     Rb = *Pb;
7138     //     Rm = *Pm;
7139     //     Rn = *Pn;
7140 
7141     //     int iters = i;
7142     //     for (j = 0; iters--; j++) {
7143     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7144     //       MACC(Ra, Rb, t0, t1, t2);
7145     //       Ra = *++Pa;
7146     //       Rb = *--Pb;
7147     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7148     //       MACC(Rm, Rn, t0, t1, t2);
7149     //       Rm = *++Pm;
7150     //       Rn = *--Pn;
7151     //     }
7152 
7153     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7154     //     MACC(Ra, Rb, t0, t1, t2);
7155     //     *Pm = Rm = t0 * inv;
7156     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7157     //     MACC(Rm, Rn, t0, t1, t2);
7158 
7159     //     assert(t0 == 0, "broken Montgomery multiply");
7160 
7161     //     t0 = t1; t1 = t2; t2 = 0;
7162     //   }
7163 
7164     //   for (i = len; i < 2*len; i++) {
7165     //     int j;
7166 
7167     //     Pa = Pa_base + i-len;
7168     //     Pb = Pb_base + len;
7169     //     Pm = Pm_base + i-len;
7170     //     Pn = Pn_base + len;
7171 
7172     //     Ra = *++Pa;
7173     //     Rb = *--Pb;
7174     //     Rm = *++Pm;
7175     //     Rn = *--Pn;
7176 
7177     //     int iters = len*2-i-1;
7178     //     for (j = i-len+1; iters--; j++) {
7179     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7180     //       MACC(Ra, Rb, t0, t1, t2);
7181     //       Ra = *++Pa;
7182     //       Rb = *--Pb;
7183     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7184     //       MACC(Rm, Rn, t0, t1, t2);
7185     //       Rm = *++Pm;
7186     //       Rn = *--Pn;
7187     //     }
7188 
7189     //     Pm_base[i-len] = t0;
7190     //     t0 = t1; t1 = t2; t2 = 0;
7191     //   }
7192 
7193     //   while (t0)
7194     //     t0 = sub(Pm_base, Pn_base, t0, len);
7195     // }
7196 
7197     /**
7198      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7199      * multiplies than Montgomery multiplication so it should be up to
7200      * 25% faster.  However, its loop control is more complex and it
7201      * may actually run slower on some machines.
7202      *
7203      * Arguments:
7204      *
7205      * Inputs:
7206      *   c_rarg0   - int array elements a
7207      *   c_rarg1   - int array elements n (the modulus)
7208      *   c_rarg2   - int length
7209      *   c_rarg3   - int inv
7210      *   c_rarg4   - int array elements m (the result)
7211      *
7212      */
7213     address generate_square() {
7214       Label argh;
7215       bind(argh);
7216       stop("MontgomeryMultiply total_allocation must be <= 8192");
7217 
7218       align(CodeEntryAlignment);
7219       address entry = pc();
7220 
7221       enter();
7222 
7223       // Make room.
7224       cmpw(Rlen, 512);
7225       br(Assembler::HI, argh);
7226       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7227       andr(sp, Ra, -2 * wordSize);
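      // Same scratch-area layout and 8192-byte cap as in
      // generate_multiply() above.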
7228 
7229       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7230 
7231       {
7232         // Copy input args, reversing as we go.  We use Ra as a
7233         // temporary variable.
7234         reverse(Ra, Pa_base, Rlen, t0, t1);
7235         reverse(Ra, Pn_base, Rlen, t0, t1);
7236       }
7237 
7238       // Push all call-saved registers and also Pm_base which we'll need
7239       // at the end.
7240       save_regs();
7241 
7242       mov(Pm_base, Ra);
7243 
7244       mov(t0, zr);
7245       mov(t1, zr);
7246       mov(t2, zr);
7247 
7248       block_comment("for (int i = 0; i < len; i++) {");
7249       mov(Ri, zr); {
7250         Label loop, end;
7251         bind(loop);
7252         cmp(Ri, Rlen);
7253         br(Assembler::GE, end);
7254 
7255         pre1(Ri);
7256 
7257         block_comment("for (j = (i+1)/2; j; j--) {"); {
7258           add(Rj, Ri, 1);
7259           lsr(Rj, Rj, 1);
7260           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7261         } block_comment("  } // j");
7262 
7263         last_squaring(Ri);
7264 
7265         block_comment("  for (j = i/2; j; j--) {"); {
7266           lsr(Rj, Ri, 1);
7267           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7268         } block_comment("  } // j");
7269 
7270         post1_squaring();
7271         add(Ri, Ri, 1);
7272         cmp(Ri, Rlen);
7273         br(Assembler::LT, loop);
7274 
7275         bind(end);
7276         block_comment("} // i");
7277       }
7278 
7279       block_comment("for (int i = len; i < 2*len; i++) {");
7280       mov(Ri, Rlen); {
7281         Label loop, end;
7282         bind(loop);
7283         cmp(Ri, Rlen, Assembler::LSL, 1);
7284         br(Assembler::GE, end);
7285 
7286         pre2(Ri, Rlen);
7287 
7288         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7289           lsl(Rj, Rlen, 1);
7290           sub(Rj, Rj, Ri);
7291           sub(Rj, Rj, 1);
7292           lsr(Rj, Rj, 1);
7293           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7294         } block_comment("  } // j");
7295 
7296         last_squaring(Ri);
7297 
7298         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7299           lsl(Rj, Rlen, 1);
7300           sub(Rj, Rj, Ri);
7301           lsr(Rj, Rj, 1);
7302           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7303         } block_comment("  } // j");
7304 
7305         post2(Ri, Rlen);
7306         add(Ri, Ri, 1);
7307         cmp(Ri, Rlen, Assembler::LSL, 1);
7308 
7309         br(Assembler::LT, loop);
7310         bind(end);
7311         block_comment("} // i");
7312       }
7313 
7314       normalize(Rlen);
7315 
7316       mov(Ra, Pm_base);  // Save Pm_base in Ra
7317       restore_regs();  // Restore caller's Pm_base
7318 
7319       // Copy our result into caller's Pm_base
7320       reverse(Pm_base, Ra, Rlen, t0, t1);
7321 
7322       leave();
7323       ret(lr);
7324 
7325       return entry;
7326     }
7327     // In C, approximately:
7328 
7329     // void
7330     // montgomery_square(julong Pa_base[], julong Pn_base[],
7331     //                   julong Pm_base[], julong inv, int len) {
7332     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7333     //   julong *Pa, *Pb, *Pn, *Pm;
7334     //   julong Ra, Rb, Rn, Rm;
7335 
7336     //   int i;
7337 
7338     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7339 
7340     //   for (i = 0; i < len; i++) {
7341     //     int j;
7342 
7343     //     Pa = Pa_base;
7344     //     Pb = Pa_base + i;
7345     //     Pm = Pm_base;
7346     //     Pn = Pn_base + i;
7347 
7348     //     Ra = *Pa;
7349     //     Rb = *Pb;
7350     //     Rm = *Pm;
7351     //     Rn = *Pn;
7352 
7353     //     int iters = (i+1)/2;
7354     //     for (j = 0; iters--; j++) {
7355     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7356     //       MACC2(Ra, Rb, t0, t1, t2);
7357     //       Ra = *++Pa;
7358     //       Rb = *--Pb;
7359     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7360     //       MACC(Rm, Rn, t0, t1, t2);
7361     //       Rm = *++Pm;
7362     //       Rn = *--Pn;
7363     //     }
7364     //     if ((i & 1) == 0) {
7365     //       assert(Ra == Pa_base[j], "must be");
7366     //       MACC(Ra, Ra, t0, t1, t2);
7367     //     }
7368     //     iters = i/2;
7369     //     assert(iters == i-j, "must be");
7370     //     for (; iters--; j++) {
7371     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7372     //       MACC(Rm, Rn, t0, t1, t2);
7373     //       Rm = *++Pm;
7374     //       Rn = *--Pn;
7375     //     }
7376 
7377     //     *Pm = Rm = t0 * inv;
7378     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7379     //     MACC(Rm, Rn, t0, t1, t2);
7380 
7381     //     assert(t0 == 0, "broken Montgomery multiply");
7382 
7383     //     t0 = t1; t1 = t2; t2 = 0;
7384     //   }
7385 
7386     //   for (i = len; i < 2*len; i++) {
7387     //     int start = i-len+1;
7388     //     int end = start + (len - start)/2;
7389     //     int j;
7390 
7391     //     Pa = Pa_base + i-len;
7392     //     Pb = Pa_base + len;
7393     //     Pm = Pm_base + i-len;
7394     //     Pn = Pn_base + len;
7395 
7396     //     Ra = *++Pa;
7397     //     Rb = *--Pb;
7398     //     Rm = *++Pm;
7399     //     Rn = *--Pn;
7400 
7401     //     int iters = (2*len-i-1)/2;
7402     //     assert(iters == end-start, "must be");
7403     //     for (j = start; iters--; j++) {
7404     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7405     //       MACC2(Ra, Rb, t0, t1, t2);
7406     //       Ra = *++Pa;
7407     //       Rb = *--Pb;
7408     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7409     //       MACC(Rm, Rn, t0, t1, t2);
7410     //       Rm = *++Pm;
7411     //       Rn = *--Pn;
7412     //     }
7413     //     if ((i & 1) == 0) {
7414     //       assert(Ra == Pa_base[j], "must be");
7415     //       MACC(Ra, Ra, t0, t1, t2);
7416     //     }
7417     //     iters =  (2*len-i)/2;
7418     //     assert(iters == len-j, "must be");
7419     //     for (; iters--; j++) {
7420     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7421     //       MACC(Rm, Rn, t0, t1, t2);
7422     //       Rm = *++Pm;
7423     //       Rn = *--Pn;
7424     //     }
7425     //     Pm_base[i-len] = t0;
7426     //     t0 = t1; t1 = t2; t2 = 0;
7427     //   }
7428 
7429     //   while (t0)
7430     //     t0 = sub(Pm_base, Pn_base, t0, len);
7431     // }
7432   };
7433 
7434 
7435   // Initialization
7436   void generate_initial() {
    // Generate the initial stubs and initialize their entry points.
7438 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.
7444 
7445     StubRoutines::_forward_exception_entry = generate_forward_exception();
7446 
7447     StubRoutines::_call_stub_entry =
7448       generate_call_stub(StubRoutines::_call_stub_return_address);
7449 
    // Referenced by megamorphic calls.
7451     StubRoutines::_catch_exception_entry = generate_catch_exception();
7452 
7453     // Build this early so it's available for the interpreter.
7454     StubRoutines::_throw_StackOverflowError_entry =
7455       generate_throw_exception("StackOverflowError throw_exception",
7456                                CAST_FROM_FN_PTR(address,
7457                                                 SharedRuntime::throw_StackOverflowError));
7458     StubRoutines::_throw_delayed_StackOverflowError_entry =
7459       generate_throw_exception("delayed StackOverflowError throw_exception",
7460                                CAST_FROM_FN_PTR(address,
7461                                                 SharedRuntime::throw_delayed_StackOverflowError));
7462     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
7464       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7465       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7466     }
7467 
7468     if (UseCRC32CIntrinsics) {
7469       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7470     }
7471 
7472     // Disabled until JDK-8210858 is fixed
7473     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7474     //   StubRoutines::_dlog = generate_dlog();
7475     // }
7476 
7477     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7478       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7479     }
7480 
7481     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7482       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7483     }
7484 
7485     StubRoutines::_load_nklass = generate_load_nklass();
7486   }
7487 
7488   void generate_all() {
7489     // support for verify_oop (must happen after universe_init)
7490     if (VerifyOops) {
7491       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7492     }
7493     StubRoutines::_throw_AbstractMethodError_entry =
7494       generate_throw_exception("AbstractMethodError throw_exception",
7495                                CAST_FROM_FN_PTR(address,
7496                                                 SharedRuntime::
7497                                                 throw_AbstractMethodError));
7498 
7499     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7500       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7501                                CAST_FROM_FN_PTR(address,
7502                                                 SharedRuntime::
7503                                                 throw_IncompatibleClassChangeError));
7504 
7505     StubRoutines::_throw_NullPointerException_at_call_entry =
7506       generate_throw_exception("NullPointerException at call throw_exception",
7507                                CAST_FROM_FN_PTR(address,
7508                                                 SharedRuntime::
7509                                                 throw_NullPointerException_at_call));
7510 
7511     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7512 
7513     // arraycopy stubs used by compilers
7514     generate_arraycopy_stubs();
7515 
7516     // countPositives stub for large arrays.
7517     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7518 
7519     // array equals stub for large arrays.
7520     if (!UseSimpleArrayEquals) {
7521       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7522     }
7523 
7524     generate_compare_long_strings();
7525 
7526     generate_string_indexof_stubs();
7527 
7528     // byte_array_inflate stub for large arrays.
7529     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7530 
7531     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7532     if (bs_nm != NULL) {
7533       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7534     }
7535 #ifdef COMPILER2
7536     if (UseMultiplyToLenIntrinsic) {
7537       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7538     }
7539 
7540     if (UseSquareToLenIntrinsic) {
7541       StubRoutines::_squareToLen = generate_squareToLen();
7542     }
7543 
7544     if (UseMulAddIntrinsic) {
7545       StubRoutines::_mulAdd = generate_mulAdd();
7546     }
7547 
7548     if (UseSIMDForBigIntegerShiftIntrinsics) {
7549       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7550       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7551     }
7552 
7553     if (UseMontgomeryMultiplyIntrinsic) {
7554       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7555       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7556       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7557     }
7558 
7559     if (UseMontgomerySquareIntrinsic) {
7560       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7561       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7562       // We use generate_multiply() rather than generate_square()
7563       // because it's faster for the sizes of modulus we care about.
7564       StubRoutines::_montgomerySquare = g.generate_multiply();
7565     }
7566 #endif // COMPILER2
7567 
7568     if (UseBASE64Intrinsics) {
7569         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7570         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7571     }
7572 
7573     // data cache line writeback
7574     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7575     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7576 
7577     if (UseAESIntrinsics) {
7578       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7579       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7580       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7581       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7582       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7583     }
7584     if (UseGHASHIntrinsics) {
7585       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7586       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7587     }
7588     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7589       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7590     }
7591 
7592     if (UseMD5Intrinsics) {
7593       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7594       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7595     }
7596     if (UseSHA1Intrinsics) {
7597       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7598       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7599     }
7600     if (UseSHA256Intrinsics) {
7601       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7602       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7603     }
7604     if (UseSHA512Intrinsics) {
7605       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7606       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7607     }
7608     if (UseSHA3Intrinsics) {
7609       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7610       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7611     }
7612 
7613     // generate Adler32 intrinsics code
7614     if (UseAdler32Intrinsics) {
7615       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7616     }
7617 
7618     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7619 
7620 #ifdef LINUX
7621 
7622     generate_atomic_entry_points();
7623 
7624 #endif // LINUX
7625 
7626     StubRoutines::aarch64::set_completed();
7627   }
7628 
7629  public:
7630   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7631     if (all) {
7632       generate_all();
7633     } else {
7634       generate_initial();
7635     }
7636   }
7637 }; // end class declaration
7638 
7639 #define UCM_TABLE_MAX_ENTRIES 8
7640 void StubGenerator_generate(CodeBuffer* code, bool all) {
7641   if (UnsafeCopyMemory::_table == NULL) {
7642     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7643   }
7644   StubGenerator g(code, all);
7645 }
7646 
7647 
7648 #ifdef LINUX
7649 
7650 // Define pointers to atomic stubs and initialize them to point to the
7651 // code in atomic_aarch64.S.
7652 
7653 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7654   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7655     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7656   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7657     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
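
// As an illustration, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands
// (modulo whitespace) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// so each operation starts out pointing at the assembly fallback in
// atomic_aarch64.S; generate_atomic_entry_points() is expected to
// repoint the *_impl variables at generated stubs where available.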
7658 
7659 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7660 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7661 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7662 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7663 DEFAULT_ATOMIC_OP(xchg, 4, )
7664 DEFAULT_ATOMIC_OP(xchg, 8, )
7665 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7666 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7667 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7668 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7669 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7670 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7671 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7672 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7673 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7674 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7675 
7676 #undef DEFAULT_ATOMIC_OP
7677 
7678 #endif // LINUX
--- EOF ---