1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #include "utilities/align.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     __ lea(rscratch2, ExternalAddress((address)&counter));
  86     __ ldrw(rscratch1, Address(rscratch2));
  87     __ addw(rscratch1, rscratch1, 1);
  88     __ strw(rscratch1, Address(rscratch2));
  89   }
  90 #define inc_counter_np(counter) \
  91   BLOCK_COMMENT("inc_counter " #counter); \
  92   inc_counter_np_(counter);
  93 #endif
  94 
  95   // Call stubs are used to call Java from C
  96   //
  97   // Arguments:
  98   //    c_rarg0:   call wrapper address                   address
  99   //    c_rarg1:   result                                 address
 100   //    c_rarg2:   result type                            BasicType
 101   //    c_rarg3:   method                                 Method*
 102   //    c_rarg4:   (interpreter) entry point              address
 103   //    c_rarg5:   parameters                             intptr_t*
 104   //    c_rarg6:   parameter size (in words)              int
 105   //    c_rarg7:   thread                                 Thread*
 106   //
 107   // There is no return from the stub itself as any Java result
 108   // is written to result
 109   //
 110   // we save r30 (lr) as the return PC at the base of the frame and
 111   // link r29 (fp) below it as the frame pointer installing sp (r31)
 112   // into fp.
 113   //
 114   // we save r0-r7, which accounts for all the c arguments.
 115   //
 116   // TODO: strictly do we need to save them all? they are treated as
 117   // volatile by C so could we omit saving the ones we are going to
 118   // place in global registers (thread? method?) or those we only use
 119   // during setup of the Java call?
 120   //
 121   // we don't need to save r8 which C uses as an indirect result location
 122   // return register.
 123   //
 124   // we don't need to save r9-r15 which both C and Java treat as
 125   // volatile
 126   //
 127   // we don't need to save r16-18 because Java does not use them
 128   //
 129   // we save r19-r28 which Java uses as scratch registers and C
 130   // expects to be callee-save
 131   //
 132   // we save the bottom 64 bits of each value stored in v8-v15; it is
 133   // the responsibility of the caller to preserve larger values.
 134   //
 135   // so the stub frame looks like this when we enter Java code
 136   //
 137   //     [ return_from_Java     ] <--- sp
 138   //     [ argument word n      ]
 139   //      ...
 140   // -27 [ argument word 1      ]
 141   // -26 [ saved v15            ] <--- sp_after_call
 142   // -25 [ saved v14            ]
 143   // -24 [ saved v13            ]
 144   // -23 [ saved v12            ]
 145   // -22 [ saved v11            ]
 146   // -21 [ saved v10            ]
 147   // -20 [ saved v9             ]
 148   // -19 [ saved v8             ]
 149   // -18 [ saved r28            ]
 150   // -17 [ saved r27            ]
 151   // -16 [ saved r26            ]
 152   // -15 [ saved r25            ]
 153   // -14 [ saved r24            ]
 154   // -13 [ saved r23            ]
 155   // -12 [ saved r22            ]
 156   // -11 [ saved r21            ]
 157   // -10 [ saved r20            ]
 158   //  -9 [ saved r19            ]
 159   //  -8 [ call wrapper    (r0) ]
 160   //  -7 [ result          (r1) ]
 161   //  -6 [ result type     (r2) ]
 162   //  -5 [ method          (r3) ]
 163   //  -4 [ entry point     (r4) ]
 164   //  -3 [ parameters      (r5) ]
 165   //  -2 [ parameter size  (r6) ]
 166   //  -1 [ thread (r7)          ]
 167   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 168   //   1 [ saved lr       (r30) ]
 169 
 170   // Call stub stack layout word offsets from fp
 171   enum call_stub_layout {
 172     sp_after_call_off = -26,
 173 
 174     d15_off            = -26,
 175     d13_off            = -24,
 176     d11_off            = -22,
 177     d9_off             = -20,
 178 
 179     r28_off            = -18,
 180     r26_off            = -16,
 181     r24_off            = -14,
 182     r22_off            = -12,
 183     r20_off            = -10,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameter_size_off =  -2,
 190     thread_off         =  -1,
 191     fp_f               =   0,
 192     retaddr_off        =   1,
 193   };
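
       // For reference (a sketch; see the CallStub typedef in stubRoutines.hpp):
       // the C++ caller reaches this stub through a function pointer of roughly
       // this shape, so the eight arguments arrive in c_rarg0..c_rarg7 exactly
       // as they are saved at the negative frame offsets listed above:
       //
       //   typedef void (*CallStub)(address   link,               // c_rarg0
       //                            intptr_t* result,             // c_rarg1
       //                            BasicType result_type,        // c_rarg2
       //                            Method*   method,             // c_rarg3
       //                            address   entry_point,        // c_rarg4
       //                            intptr_t* parameters,         // c_rarg5
       //                            int       size_of_parameters, // c_rarg6
       //                            TRAPS);                       // c_rarg7 (thread)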
 194 
 195   address generate_call_stub(address& return_address) {
 196     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 197            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 198            "adjust this code");
 199 
 200     StubCodeMark mark(this, "StubRoutines", "call_stub");
 201     address start = __ pc();
 202 
 203     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 204 
 205     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 206     const Address result        (rfp, result_off         * wordSize);
 207     const Address result_type   (rfp, result_type_off    * wordSize);
 208     const Address method        (rfp, method_off         * wordSize);
 209     const Address entry_point   (rfp, entry_point_off    * wordSize);
 210     const Address parameter_size(rfp, parameter_size_off * wordSize);
 211 
 212     const Address thread        (rfp, thread_off         * wordSize);
 213 
 214     const Address d15_save      (rfp, d15_off * wordSize);
 215     const Address d13_save      (rfp, d13_off * wordSize);
 216     const Address d11_save      (rfp, d11_off * wordSize);
 217     const Address d9_save       (rfp, d9_off * wordSize);
 218 
 219     const Address r28_save      (rfp, r28_off * wordSize);
 220     const Address r26_save      (rfp, r26_off * wordSize);
 221     const Address r24_save      (rfp, r24_off * wordSize);
 222     const Address r22_save      (rfp, r22_off * wordSize);
 223     const Address r20_save      (rfp, r20_off * wordSize);
 224 
 225     // stub code
 226 
 227     address aarch64_entry = __ pc();
 228 
 229     // set up frame and move sp to end of save area
 230     __ enter();
 231     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 232 
 233     // save register parameters and Java scratch/global registers
 234     // n.b. we save thread even though it gets installed in
 235     // rthread because we want to sanity check rthread later
 236     __ str(c_rarg7,  thread);
 237     __ strw(c_rarg6, parameter_size);
 238     __ stp(c_rarg4, c_rarg5,  entry_point);
 239     __ stp(c_rarg2, c_rarg3,  result_type);
 240     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 241 
 242     __ stp(r20, r19,   r20_save);
 243     __ stp(r22, r21,   r22_save);
 244     __ stp(r24, r23,   r24_save);
 245     __ stp(r26, r25,   r26_save);
 246     __ stp(r28, r27,   r28_save);
 247 
 248     __ stpd(v9,  v8,   d9_save);
 249     __ stpd(v11, v10,  d11_save);
 250     __ stpd(v13, v12,  d13_save);
 251     __ stpd(v15, v14,  d15_save);
 252 
 253     // install Java thread in global register now we have saved
 254     // whatever value it held
 255     __ mov(rthread, c_rarg7);
 256     // And method
 257     __ mov(rmethod, c_rarg3);
 258 
 259     // set up the heapbase register
 260     __ reinit_heapbase();
 261 
 262 #ifdef ASSERT
 263     // make sure we have no pending exceptions
 264     {
 265       Label L;
 266       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 267       __ cmp(rscratch1, (u1)NULL_WORD);
 268       __ br(Assembler::EQ, L);
 269       __ stop("StubRoutines::call_stub: entered with pending exception");
 270       __ BIND(L);
 271     }
 272 #endif
 273     // pass parameters if any
 274     __ mov(esp, sp);
 275     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 276     __ andr(sp, rscratch1, -2 * wordSize);
 277 
 278     BLOCK_COMMENT("pass parameters if any");
 279     Label parameters_done;
 280     // parameter count is still in c_rarg6
 281     // and parameter pointer identifying param 1 is in c_rarg5
 282     __ cbzw(c_rarg6, parameters_done);
 283 
 284     address loop = __ pc();
 285     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 286     __ subsw(c_rarg6, c_rarg6, 1);
 287     __ push(rscratch1);
 288     __ br(Assembler::GT, loop);
 289 
 290     __ BIND(parameters_done);
 291 
 292     // call Java entry -- passing Method* and current sp
 293     //      rmethod: Method*
 294     //      r13: sender sp
 295     BLOCK_COMMENT("call Java function");
 296     __ mov(r13, sp);
 297     __ blr(c_rarg4);
 298 
 299     // we do this here because the notify will already have been done
 300     // if we get to the next instruction via an exception
 301     //
 302     // n.b. adding this instruction here affects the calculation of
 303     // whether or not a routine returns to the call stub (used when
 304     // doing stack walks) since the normal test is to check the return
 305     // pc against the address saved below. so we may need to allow for
 306     // this extra instruction in the check.
 307 
 308     // save current address for use by exception handling code
 309 
 310     return_address = __ pc();
 311 
 312     // store result depending on type (everything that is not
 313     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 314     // n.b. this assumes Java returns an integral result in r0
 315     // and a floating result in j_farg0
 316     __ ldr(j_rarg2, result);
 317     Label is_long, is_float, is_double, exit;
 318     __ ldr(j_rarg1, result_type);
 319     __ cmp(j_rarg1, (u1)T_OBJECT);
 320     __ br(Assembler::EQ, is_long);
 321     __ cmp(j_rarg1, (u1)T_LONG);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_FLOAT);
 324     __ br(Assembler::EQ, is_float);
 325     __ cmp(j_rarg1, (u1)T_DOUBLE);
 326     __ br(Assembler::EQ, is_double);
 327 
 328     // handle T_INT case
 329     __ strw(r0, Address(j_rarg2));
 330 
 331     __ BIND(exit);
 332 
 333     // pop parameters
 334     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 335 
 336 #ifdef ASSERT
 337     // verify that threads correspond
 338     {
 339       Label L, S;
 340       __ ldr(rscratch1, thread);
 341       __ cmp(rthread, rscratch1);
 342       __ br(Assembler::NE, S);
 343       __ get_thread(rscratch1);
 344       __ cmp(rthread, rscratch1);
 345       __ br(Assembler::EQ, L);
 346       __ BIND(S);
 347       __ stop("StubRoutines::call_stub: threads must correspond");
 348       __ BIND(L);
 349     }
 350 #endif
 351 
 352     // restore callee-save registers
 353     __ ldpd(v15, v14,  d15_save);
 354     __ ldpd(v13, v12,  d13_save);
 355     __ ldpd(v11, v10,  d11_save);
 356     __ ldpd(v9,  v8,   d9_save);
 357 
 358     __ ldp(r28, r27,   r28_save);
 359     __ ldp(r26, r25,   r26_save);
 360     __ ldp(r24, r23,   r24_save);
 361     __ ldp(r22, r21,   r22_save);
 362     __ ldp(r20, r19,   r20_save);
 363 
 364     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 365     __ ldrw(c_rarg2, result_type);
 366     __ ldr(c_rarg3,  method);
 367     __ ldp(c_rarg4, c_rarg5,  entry_point);
 368     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 369 
 370     // leave frame and return to caller
 371     __ leave();
 372     __ ret(lr);
 373 
 374     // handle return types different from T_INT
 375 
 376     __ BIND(is_long);
 377     __ str(r0, Address(j_rarg2, 0));
 378     __ br(Assembler::AL, exit);
 379 
 380     __ BIND(is_float);
 381     __ strs(j_farg0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_double);
 385     __ strd(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     return start;
 389   }
 390 
 391   // Return point for a Java call if there's an exception thrown in
 392   // Java code.  The exception is caught and transformed into a
 393   // pending exception stored in JavaThread that can be tested from
 394   // within the VM.
 395   //
 396   // Note: Usually the parameters are removed by the callee. In case
 397   // of an exception crossing an activation frame boundary, that is
 398   // not the case if the callee is compiled code => need to set up the
 399   // sp.
 400   //
 401   // r0: exception oop
 402 
 403   address generate_catch_exception() {
 404     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 405     address start = __ pc();
 406 
 407     // same as in generate_call_stub():
 408     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 409     const Address thread        (rfp, thread_off         * wordSize);
 410 
 411 #ifdef ASSERT
 412     // verify that threads correspond
 413     {
 414       Label L, S;
 415       __ ldr(rscratch1, thread);
 416       __ cmp(rthread, rscratch1);
 417       __ br(Assembler::NE, S);
 418       __ get_thread(rscratch1);
 419       __ cmp(rthread, rscratch1);
 420       __ br(Assembler::EQ, L);
 421       __ bind(S);
 422       __ stop("StubRoutines::catch_exception: threads must correspond");
 423       __ bind(L);
 424     }
 425 #endif
 426 
 427     // set pending exception
 428     __ verify_oop(r0);
 429 
 430     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 431     __ mov(rscratch1, (address)__FILE__);
 432     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 433     __ movw(rscratch1, (int)__LINE__);
 434     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 435 
 436     // complete return to VM
 437     assert(StubRoutines::_call_stub_return_address != NULL,
 438            "_call_stub_return_address must have been generated before");
 439     __ b(StubRoutines::_call_stub_return_address);
 440 
 441     return start;
 442   }
 443 
 444   // Continuation point for runtime calls returning with a pending
 445   // exception.  The pending exception check happened in the runtime
 446   // or native call stub.  The pending exception in Thread is
 447   // converted into a Java-level exception.
 448   //
 449   // Contract with Java-level exception handlers:
 450   // r0: exception
 451   // r3: throwing pc
 452   //
 453   // NOTE: At entry of this stub, exception-pc must be in LR !!
 454 
 455   // NOTE: this is always used as a jump target within generated code
 456   // so it just needs to be generated code with no prolog
 457 
 458   address generate_forward_exception() {
 459     StubCodeMark mark(this, "StubRoutines", "forward exception");
 460     address start = __ pc();
 461 
 462     // Upon entry, LR points to the return address returning into
 463     // Java (interpreted or compiled) code; i.e., the return address
 464     // becomes the throwing pc.
 465     //
 466     // Arguments pushed before the runtime call are still on the stack
 467     // but the exception handler will reset the stack pointer ->
 468     // ignore them.  A potential result in registers can be ignored as
 469     // well.
 470 
 471 #ifdef ASSERT
 472     // make sure this code is only executed if there is a pending exception
 473     {
 474       Label L;
 475       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 476       __ cbnz(rscratch1, L);
 477       __ stop("StubRoutines::forward exception: no pending exception (1)");
 478       __ bind(L);
 479     }
 480 #endif
 481 
 482     // compute exception handler into r19
 483 
 484     // call the VM to find the handler address associated with the
 485     // caller address. pass thread in r0 and caller pc (ret address)
 486     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 487     // the stack.
 488     __ mov(c_rarg1, lr);
 489     // lr will be trashed by the VM call so we move it to R19
 490     // (callee-saved) because we also need to pass it to the handler
 491     // returned by this call.
 492     __ mov(r19, lr);
 493     BLOCK_COMMENT("call exception_handler_for_return_address");
 494     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 495                          SharedRuntime::exception_handler_for_return_address),
 496                     rthread, c_rarg1);
 497     // Reinitialize the ptrue predicate register, in case the external runtime
 498     // call clobbers ptrue reg, as we may return to SVE compiled code.
 499     __ reinitialize_ptrue();
 500 
 501     // we should not really care that lr is no longer the callee
 502     // address. we saved the value the handler needs in r19 so we can
 503     // just copy it to r3. however, the C2 handler will push its own
 504     // frame and then calls into the VM and the VM code asserts that
 505     // the PC for the frame above the handler belongs to a compiled
 506     // Java method. So, we restore lr here to satisfy that assert.
 507     __ mov(lr, r19);
 508     // setup r0 & r3 & clear pending exception
 509     __ mov(r3, r19);
 510     __ mov(r19, r0);
 511     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 512     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 513 
 514 #ifdef ASSERT
 515     // make sure exception is set
 516     {
 517       Label L;
 518       __ cbnz(r0, L);
 519       __ stop("StubRoutines::forward exception: no pending exception (2)");
 520       __ bind(L);
 521     }
 522 #endif
 523 
 524     // continue at exception handler
 525     // r0: exception
 526     // r3: throwing pc
 527     // r19: exception handler
 528     __ verify_oop(r0);
 529     __ br(r19);
 530 
 531     return start;
 532   }
 533 
 534   // Non-destructive plausibility checks for oops
 535   //
 536   // Arguments:
 537   //    r0: oop to verify
 538   //    rscratch1: error message
 539   //
 540   // Stack after saving c_rarg3:
 541   //    [tos + 0]: saved c_rarg3
 542   //    [tos + 1]: saved c_rarg2
 543   //    [tos + 2]: saved lr
 544   //    [tos + 3]: saved rscratch2
 545   //    [tos + 4]: saved r0
 546   //    [tos + 5]: saved rscratch1
 547   address generate_verify_oop() {
 548 
 549     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 550     address start = __ pc();
 551 
 552     Label exit, error;
 553 
 554     // save c_rarg2 and c_rarg3
 555     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 556 
 557     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 559     __ ldr(c_rarg3, Address(c_rarg2));
 560     __ add(c_rarg3, c_rarg3, 1);
 561     __ str(c_rarg3, Address(c_rarg2));
 562 
 563     // object is in r0
 564     // make sure object is 'reasonable'
 565     __ cbz(r0, exit); // if obj is NULL it is OK
 566 
 567 #if INCLUDE_ZGC
 568     if (UseZGC) {
 569       // Check if mask is good.
 570       // verifies that ZAddressBadMask & r0 == 0
 571       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 572       __ andr(c_rarg2, r0, c_rarg3);
 573       __ cbnz(c_rarg2, error);
 574     }
 575 #endif
 576 
 577     // Check if the oop is in the right area of memory
 578     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 579     __ andr(c_rarg2, r0, c_rarg3);
 580     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 581 
 582     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 583     // instruction here because the flags register is live.
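         // (A sketch of why this is equivalent: (r0 & mask) == bits exactly
         // when ((r0 & mask) ^ bits) == 0, so the eor/cbnz pair below makes
         // the same test as cmp/br(NE) without writing the condition flags.)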
 584     __ eor(c_rarg2, c_rarg2, c_rarg3);
 585     __ cbnz(c_rarg2, error);
 586 
 587     // make sure klass is 'reasonable', which is not zero.
 588     __ load_klass(r0, r0);  // get klass
 589     __ cbz(r0, error);      // if klass is NULL it is broken
 590
 591     // return if everything seems ok
 592     __ bind(exit);
 593 
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595     __ ret(lr);
 596 
 597     // handle errors
 598     __ bind(error);
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600 
 601     __ push(RegSet::range(r0, r29), sp);
 602     // debug(char* msg, int64_t pc, int64_t regs[])
 603     __ mov(c_rarg0, rscratch1);      // pass address of error message
 604     __ mov(c_rarg1, lr);             // pass return address
 605     __ mov(c_rarg2, sp);             // pass address of regs on stack
 606 #ifndef PRODUCT
 607     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 608 #endif
 609     BLOCK_COMMENT("call MacroAssembler::debug");
 610     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 611     __ blr(rscratch1);
 612     __ hlt(0);
 613 
 614     return start;
 615   }
 616 
 617   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 618 
 619   // Generate indices for iota vector.
 620   address generate_iota_indices(const char *stub_name) {
 621     __ align(CodeEntryAlignment);
 622     StubCodeMark mark(this, "StubRoutines", stub_name);
 623     address start = __ pc();
 624     __ emit_data64(0x0706050403020100, relocInfo::none);
 625     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 626     return start;
 627   }
 628 
 629   // The inner part of zero_words().  This is the bulk operation,
 630   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 631   // caller is responsible for zeroing the last few words.
 632   //
 633   // Inputs:
 634   // r10: the HeapWord-aligned base address of an array to zero.
 635   // r11: the count in HeapWords, r11 > 0.
 636   //
 637   // Returns r10 and r11, adjusted for the caller to clear.
 638   // r10: the base address of the tail of words left to clear.
 639   // r11: the number of words in the tail.
 640   //      r11 < MacroAssembler::zero_words_block_size.
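       //
       // For example (a sketch, assuming zero_words_block_size == 8 and
       // ignoring the DC ZVA path): entered with r11 == 23, the loop below
       // clears two 8-word blocks, advances r10 past the 16 cleared words
       // and returns with r11 == 7 for the caller (MacroAssembler::zero_words)
       // to finish.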
 641 
 642   address generate_zero_blocks() {
 643     Label done;
 644     Label base_aligned;
 645 
 646     Register base = r10, cnt = r11;
 647 
 648     __ align(CodeEntryAlignment);
 649     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 650     address start = __ pc();
 651 
 652     if (UseBlockZeroing) {
 653       int zva_length = VM_Version::zva_length();
 654 
 655       // Ensure ZVA length can be divided by 16. This is required by
 656       // the subsequent operations.
 657       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 658 
 659       __ tbz(base, 3, base_aligned);
 660       __ str(zr, Address(__ post(base, 8)));
 661       __ sub(cnt, cnt, 1);
 662       __ bind(base_aligned);
 663 
 664       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 665       // alignment.
 666       Label small;
 667       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
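           // The subs below compares the word count against low_limit
           // converted from bytes to words (>> 3). For instance (a sketch,
           // assuming zva_length == 64 and BlockZeroingLowLimit == 256),
           // low_limit is 256 bytes, so the DC ZVA path is only taken when
           // cnt >= 32 words.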
 668       __ subs(rscratch1, cnt, low_limit >> 3);
 669       __ br(Assembler::LT, small);
 670       __ zero_dcache_blocks(base, cnt);
 671       __ bind(small);
 672     }
 673 
 674     {
 675       // Number of stp instructions we'll unroll
 676       const int unroll =
 677         MacroAssembler::zero_words_block_size / 2;
 678       // Clear the remaining blocks.
 679       Label loop;
 680       __ subs(cnt, cnt, unroll * 2);
 681       __ br(Assembler::LT, done);
 682       __ bind(loop);
 683       for (int i = 0; i < unroll; i++)
 684         __ stp(zr, zr, __ post(base, 16));
 685       __ subs(cnt, cnt, unroll * 2);
 686       __ br(Assembler::GE, loop);
 687       __ bind(done);
 688       __ add(cnt, cnt, unroll * 2);
 689     }
 690 
 691     __ ret(lr);
 692 
 693     return start;
 694   }
 695 
 696 
 697   typedef enum {
 698     copy_forwards = 1,
 699     copy_backwards = -1
 700   } copy_direction;
 701 
 702   // Bulk copy of blocks of 8 words.
 703   //
 704   // count is a count of words.
 705   //
 706   // Precondition: count >= 8
 707   //
 708   // Postconditions:
 709   //
 710   // The least significant bit of count contains the remaining count
 711   // of words to copy.  The rest of count is trash.
 712   //
 713   // s and d are adjusted to point to the remaining words to copy
 714   //
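       // For example (a sketch): entered with count == 23, the main loop
       // copies two 8-word blocks, the tail tests on bits 2 and 1 of count
       // copy a further 4-word and a 2-word subblock, and bit 0 (here one
       // word) is left for the caller to copy.
       //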
 715   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 716                            copy_direction direction) {
 717     int unit = wordSize * direction;
 718     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 719 
 720     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 721       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 722     const Register stride = r13;
 723 
 724     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 725     assert_different_registers(s, d, count, rscratch1);
 726 
 727     Label again, drain;
 728     const char *stub_name;
 729     if (direction == copy_forwards)
 730       stub_name = "forward_copy_longs";
 731     else
 732       stub_name = "backward_copy_longs";
 733 
 734     __ align(CodeEntryAlignment);
 735 
 736     StubCodeMark mark(this, "StubRoutines", stub_name);
 737 
 738     __ bind(start);
 739 
 740     Label unaligned_copy_long;
 741     if (AvoidUnalignedAccesses) {
 742       __ tbnz(d, 3, unaligned_copy_long);
 743     }
 744 
 745     if (direction == copy_forwards) {
 746       __ sub(s, s, bias);
 747       __ sub(d, d, bias);
 748     }
 749 
 750 #ifdef ASSERT
 751     // Make sure we are never given < 8 words
 752     {
 753       Label L;
 754       __ cmp(count, (u1)8);
 755       __ br(Assembler::GE, L);
 756       __ stop("generate_copy_longs called with < 8 words");
 757       __ bind(L);
 758     }
 759 #endif
 760 
 761     // Fill 8 registers
 762     if (UseSIMDForMemoryOps) {
 763       __ ldpq(v0, v1, Address(s, 4 * unit));
 764       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 765     } else {
 766       __ ldp(t0, t1, Address(s, 2 * unit));
 767       __ ldp(t2, t3, Address(s, 4 * unit));
 768       __ ldp(t4, t5, Address(s, 6 * unit));
 769       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 770     }
 771 
 772     __ subs(count, count, 16);
 773     __ br(Assembler::LO, drain);
 774 
 775     int prefetch = PrefetchCopyIntervalInBytes;
 776     bool use_stride = false;
 777     if (direction == copy_backwards) {
 778        use_stride = prefetch > 256;
 779        prefetch = -prefetch;
 780        if (use_stride) __ mov(stride, prefetch);
 781     }
 782 
 783     __ bind(again);
 784 
 785     if (PrefetchCopyIntervalInBytes > 0)
 786       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 787 
 788     if (UseSIMDForMemoryOps) {
 789       __ stpq(v0, v1, Address(d, 4 * unit));
 790       __ ldpq(v0, v1, Address(s, 4 * unit));
 791       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 792       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 793     } else {
 794       __ stp(t0, t1, Address(d, 2 * unit));
 795       __ ldp(t0, t1, Address(s, 2 * unit));
 796       __ stp(t2, t3, Address(d, 4 * unit));
 797       __ ldp(t2, t3, Address(s, 4 * unit));
 798       __ stp(t4, t5, Address(d, 6 * unit));
 799       __ ldp(t4, t5, Address(s, 6 * unit));
 800       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 801       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 802     }
 803 
 804     __ subs(count, count, 8);
 805     __ br(Assembler::HS, again);
 806 
 807     // Drain
 808     __ bind(drain);
 809     if (UseSIMDForMemoryOps) {
 810       __ stpq(v0, v1, Address(d, 4 * unit));
 811       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 812     } else {
 813       __ stp(t0, t1, Address(d, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ stp(t4, t5, Address(d, 6 * unit));
 816       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 817     }
 818 
 819     {
 820       Label L1, L2;
 821       __ tbz(count, exact_log2(4), L1);
 822       if (UseSIMDForMemoryOps) {
 823         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 824         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 825       } else {
 826         __ ldp(t0, t1, Address(s, 2 * unit));
 827         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 828         __ stp(t0, t1, Address(d, 2 * unit));
 829         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 830       }
 831       __ bind(L1);
 832 
 833       if (direction == copy_forwards) {
 834         __ add(s, s, bias);
 835         __ add(d, d, bias);
 836       }
 837 
 838       __ tbz(count, 1, L2);
 839       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 840       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 841       __ bind(L2);
 842     }
 843 
 844     __ ret(lr);
 845 
 846     if (AvoidUnalignedAccesses) {
 847       Label drain, again;
 848       // Register order for storing. Order is different for backward copy.
 849 
 850       __ bind(unaligned_copy_long);
 851 
 852       // source address is even aligned, target odd aligned
 853       //
 854       // when forward copying word pairs we read long pairs at offsets
 855       // {0, 2, 4, 6} (in long words). when backwards copying we read
 856       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 857       // address by -2 in the forwards case so we can compute the
 858       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 859       // or -1.
 860       //
 861       // when forward copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 863       // zero offset we adjust the destination by -1, which means we
 864       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 865       //
 866       // When backwards copying we need to store 1 word, 3 pairs and
 867       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 868       // offsets {1, 3, 5, 7, 8} * unit.
 869 
 870       if (direction == copy_forwards) {
 871         __ sub(s, s, 16);
 872         __ sub(d, d, 8);
 873       }
 874 
 875       // Fill 8 registers
 876       //
 877       // for forwards copy s was offset by -16 from the original input
 878       // value of s so the register contents are at these offsets
 879       // relative to the 64 bit block addressed by that original input
 880       // and so on for each successive 64 byte block when s is updated
 881       //
 882       // t0 at offset 0,  t1 at offset 8
 883       // t2 at offset 16, t3 at offset 24
 884       // t4 at offset 32, t5 at offset 40
 885       // t6 at offset 48, t7 at offset 56
 886 
 887       // for backwards copy s was not offset so the register contents
 888       // are at these offsets into the preceding 64 byte block
 889       // relative to that original input and so on for each successive
 890       // preceding 64 byte block when s is updated. this explains the
 891       // slightly counter-intuitive looking pattern of register usage
 892       // in the stp instructions for backwards copy.
 893       //
 894       // t0 at offset -16, t1 at offset -8
 895       // t2 at offset -32, t3 at offset -24
 896       // t4 at offset -48, t5 at offset -40
 897       // t6 at offset -64, t7 at offset -56
 898 
 899       __ ldp(t0, t1, Address(s, 2 * unit));
 900       __ ldp(t2, t3, Address(s, 4 * unit));
 901       __ ldp(t4, t5, Address(s, 6 * unit));
 902       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 903 
 904       __ subs(count, count, 16);
 905       __ br(Assembler::LO, drain);
 906 
 907       int prefetch = PrefetchCopyIntervalInBytes;
 908       bool use_stride = false;
 909       if (direction == copy_backwards) {
 910          use_stride = prefetch > 256;
 911          prefetch = -prefetch;
 912          if (use_stride) __ mov(stride, prefetch);
 913       }
 914 
 915       __ bind(again);
 916 
 917       if (PrefetchCopyIntervalInBytes > 0)
 918         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 919 
 920       if (direction == copy_forwards) {
 921        // allowing for the offset of -8 the store instructions place
 922        // registers into the target 64 bit block at the following
 923        // offsets
 924        //
 925        // t0 at offset 0
 926        // t1 at offset 8,  t2 at offset 16
 927        // t3 at offset 24, t4 at offset 32
 928        // t5 at offset 40, t6 at offset 48
 929        // t7 at offset 56
 930 
 931         __ str(t0, Address(d, 1 * unit));
 932         __ stp(t1, t2, Address(d, 2 * unit));
 933         __ ldp(t0, t1, Address(s, 2 * unit));
 934         __ stp(t3, t4, Address(d, 4 * unit));
 935         __ ldp(t2, t3, Address(s, 4 * unit));
 936         __ stp(t5, t6, Address(d, 6 * unit));
 937         __ ldp(t4, t5, Address(s, 6 * unit));
 938         __ str(t7, Address(__ pre(d, 8 * unit)));
 939         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 940       } else {
 941        // d was not offset when we started so the registers are
 942        // written into the 64 bit block preceding d with the following
 943        // offsets
 944        //
 945        // t1 at offset -8
 946        // t3 at offset -24, t0 at offset -16
 947        // t5 at offset -40, t2 at offset -32
 948        // t7 at offset -56, t4 at offset -48
 949        //                   t6 at offset -64
 950        //
 951        // note that this matches the offsets previously noted for the
 952        // loads
 953 
 954         __ str(t1, Address(d, 1 * unit));
 955         __ stp(t3, t0, Address(d, 3 * unit));
 956         __ ldp(t0, t1, Address(s, 2 * unit));
 957         __ stp(t5, t2, Address(d, 5 * unit));
 958         __ ldp(t2, t3, Address(s, 4 * unit));
 959         __ stp(t7, t4, Address(d, 7 * unit));
 960         __ ldp(t4, t5, Address(s, 6 * unit));
 961         __ str(t6, Address(__ pre(d, 8 * unit)));
 962         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 963       }
 964 
 965       __ subs(count, count, 8);
 966       __ br(Assembler::HS, again);
 967 
 968       // Drain
 969       //
 970       // this uses the same pattern of offsets and register arguments
 971       // as above
 972       __ bind(drain);
 973       if (direction == copy_forwards) {
 974         __ str(t0, Address(d, 1 * unit));
 975         __ stp(t1, t2, Address(d, 2 * unit));
 976         __ stp(t3, t4, Address(d, 4 * unit));
 977         __ stp(t5, t6, Address(d, 6 * unit));
 978         __ str(t7, Address(__ pre(d, 8 * unit)));
 979       } else {
 980         __ str(t1, Address(d, 1 * unit));
 981         __ stp(t3, t0, Address(d, 3 * unit));
 982         __ stp(t5, t2, Address(d, 5 * unit));
 983         __ stp(t7, t4, Address(d, 7 * unit));
 984         __ str(t6, Address(__ pre(d, 8 * unit)));
 985       }
 986       // now we need to copy any remaining partial block, which may
 987       // include a 4 word subblock and/or a 2 word subblock.
 988       // bits 2 and 1 in the count are the tell-tale for whether we
 989       // have each such subblock
 990       {
 991         Label L1, L2;
 992         __ tbz(count, exact_log2(4), L1);
 993        // this is the same as above but copying only 4 longs hence
 994        // with only one intervening stp between the str instructions
 995        // but note that the offsets and registers still follow the
 996        // same pattern
 997         __ ldp(t0, t1, Address(s, 2 * unit));
 998         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 999         if (direction == copy_forwards) {
1000           __ str(t0, Address(d, 1 * unit));
1001           __ stp(t1, t2, Address(d, 2 * unit));
1002           __ str(t3, Address(__ pre(d, 4 * unit)));
1003         } else {
1004           __ str(t1, Address(d, 1 * unit));
1005           __ stp(t3, t0, Address(d, 3 * unit));
1006           __ str(t2, Address(__ pre(d, 4 * unit)));
1007         }
1008         __ bind(L1);
1009 
1010         __ tbz(count, 1, L2);
1011        // this is the same as above but copying only 2 longs hence
1012        // there is no intervening stp between the str instructions
1013        // but note that the offset and register patterns are still
1014        // the same
1015         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1016         if (direction == copy_forwards) {
1017           __ str(t0, Address(d, 1 * unit));
1018           __ str(t1, Address(__ pre(d, 2 * unit)));
1019         } else {
1020           __ str(t1, Address(d, 1 * unit));
1021           __ str(t0, Address(__ pre(d, 2 * unit)));
1022         }
1023         __ bind(L2);
1024 
1025        // for forwards copy we need to re-adjust the offsets we
1026        // applied so that s and d follow the last words written
1027 
1028        if (direction == copy_forwards) {
1029          __ add(s, s, 16);
1030          __ add(d, d, 8);
1031        }
1032 
1033       }
1034 
1035       __ ret(lr);
1036       }
1037   }
1038 
1039   // Small copy: less than 16 bytes.
1040   //
1041   // NB: Ignores all of the bits of count which represent more than 15
1042   // bytes, so a caller doesn't have to mask them.
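       //
       // For example (a sketch): a byte copy (step == 1) with count == 13
       // (0b1101) moves 8 bytes (bit 3), then 4 bytes (bit 2), skips the
       // 2-byte step (bit 1 is clear) and finishes with a single byte (bit 0).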
1043 
1044   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1045     bool is_backwards = step < 0;
1046     size_t granularity = uabs(step);
1047     int direction = is_backwards ? -1 : 1;
1048     int unit = wordSize * direction;
1049 
1050     Label Lword, Lint, Lshort, Lbyte;
1051 
1052     assert(granularity
1053            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1054 
1055     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1056 
1057     // ??? I don't know if this bit-test-and-branch is the right thing
1058     // to do.  It does a lot of jumping, resulting in several
1059     // mispredicted branches.  It might make more sense to do this
1060     // with something like Duff's device with a single computed branch.
1061 
1062     __ tbz(count, 3 - exact_log2(granularity), Lword);
1063     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1064     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1065     __ bind(Lword);
1066 
1067     if (granularity <= sizeof (jint)) {
1068       __ tbz(count, 2 - exact_log2(granularity), Lint);
1069       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1070       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1071       __ bind(Lint);
1072     }
1073 
1074     if (granularity <= sizeof (jshort)) {
1075       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1076       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1077       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1078       __ bind(Lshort);
1079     }
1080 
1081     if (granularity <= sizeof (jbyte)) {
1082       __ tbz(count, 0, Lbyte);
1083       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1084       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1085       __ bind(Lbyte);
1086     }
1087   }
1088 
1089   Label copy_f, copy_b;
1090 
1091   // All-singing all-dancing memory copy.
1092   //
1093   // Copy count units of memory from s to d.  The size of a unit is
1094   // step, which can be positive or negative depending on the direction
1095   // of copy.  If is_aligned is false, we align the source address.
1096   //
1097 
1098   void copy_memory(bool is_aligned, Register s, Register d,
1099                    Register count, Register tmp, int step) {
1100     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1101     bool is_backwards = step < 0;
1102     unsigned int granularity = uabs(step);
1103     const Register t0 = r3, t1 = r4;
1104 
1105     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1106     // load all the data before writing anything
1107     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1108     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1109     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1110     const Register send = r17, dend = r16;
1111 
1112     if (PrefetchCopyIntervalInBytes > 0)
1113       __ prfm(Address(s, 0), PLDL1KEEP);
1114     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1115     __ br(Assembler::HI, copy_big);
1116 
1117     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1118     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1119 
1120     __ cmp(count, u1(16/granularity));
1121     __ br(Assembler::LS, copy16);
1122 
1123     __ cmp(count, u1(64/granularity));
1124     __ br(Assembler::HI, copy80);
1125 
1126     __ cmp(count, u1(32/granularity));
1127     __ br(Assembler::LS, copy32);
1128 
1129     // 33..64 bytes
1130     if (UseSIMDForMemoryOps) {
1131       __ ldpq(v0, v1, Address(s, 0));
1132       __ ldpq(v2, v3, Address(send, -32));
1133       __ stpq(v0, v1, Address(d, 0));
1134       __ stpq(v2, v3, Address(dend, -32));
1135     } else {
1136       __ ldp(t0, t1, Address(s, 0));
1137       __ ldp(t2, t3, Address(s, 16));
1138       __ ldp(t4, t5, Address(send, -32));
1139       __ ldp(t6, t7, Address(send, -16));
1140 
1141       __ stp(t0, t1, Address(d, 0));
1142       __ stp(t2, t3, Address(d, 16));
1143       __ stp(t4, t5, Address(dend, -32));
1144       __ stp(t6, t7, Address(dend, -16));
1145     }
1146     __ b(finish);
1147 
1148     // 17..32 bytes
1149     __ bind(copy32);
1150     __ ldp(t0, t1, Address(s, 0));
1151     __ ldp(t2, t3, Address(send, -16));
1152     __ stp(t0, t1, Address(d, 0));
1153     __ stp(t2, t3, Address(dend, -16));
1154     __ b(finish);
1155 
1156     // 65..80/96 bytes
1157     // (96 bytes if SIMD because we do 32 bytes per instruction)
1158     __ bind(copy80);
1159     if (UseSIMDForMemoryOps) {
1160       __ ldpq(v0, v1, Address(s, 0));
1161       __ ldpq(v2, v3, Address(s, 32));
1162       // Unaligned pointers can be an issue for copying.
1163       // The issue has more chances to happen when granularity of data is
1164       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1165       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1166       // The most performance drop has been seen for the range 65-80 bytes.
1167       // For such cases using the pair of ldp/stp instead of the third pair of
1168       // ldpq/stpq fixes the performance issue.
1169       if (granularity < sizeof (jint)) {
1170         Label copy96;
1171         __ cmp(count, u1(80/granularity));
1172         __ br(Assembler::HI, copy96);
1173         __ ldp(t0, t1, Address(send, -16));
1174 
1175         __ stpq(v0, v1, Address(d, 0));
1176         __ stpq(v2, v3, Address(d, 32));
1177         __ stp(t0, t1, Address(dend, -16));
1178         __ b(finish);
1179 
1180         __ bind(copy96);
1181       }
1182       __ ldpq(v4, v5, Address(send, -32));
1183 
1184       __ stpq(v0, v1, Address(d, 0));
1185       __ stpq(v2, v3, Address(d, 32));
1186       __ stpq(v4, v5, Address(dend, -32));
1187     } else {
1188       __ ldp(t0, t1, Address(s, 0));
1189       __ ldp(t2, t3, Address(s, 16));
1190       __ ldp(t4, t5, Address(s, 32));
1191       __ ldp(t6, t7, Address(s, 48));
1192       __ ldp(t8, t9, Address(send, -16));
1193 
1194       __ stp(t0, t1, Address(d, 0));
1195       __ stp(t2, t3, Address(d, 16));
1196       __ stp(t4, t5, Address(d, 32));
1197       __ stp(t6, t7, Address(d, 48));
1198       __ stp(t8, t9, Address(dend, -16));
1199     }
1200     __ b(finish);
1201 
1202     // 0..16 bytes
1203     __ bind(copy16);
1204     __ cmp(count, u1(8/granularity));
1205     __ br(Assembler::LO, copy8);
1206 
1207     // 8..16 bytes
1208     __ ldr(t0, Address(s, 0));
1209     __ ldr(t1, Address(send, -8));
1210     __ str(t0, Address(d, 0));
1211     __ str(t1, Address(dend, -8));
1212     __ b(finish);
1213 
1214     if (granularity < 8) {
1215       // 4..7 bytes
1216       __ bind(copy8);
1217       __ tbz(count, 2 - exact_log2(granularity), copy4);
1218       __ ldrw(t0, Address(s, 0));
1219       __ ldrw(t1, Address(send, -4));
1220       __ strw(t0, Address(d, 0));
1221       __ strw(t1, Address(dend, -4));
1222       __ b(finish);
1223       if (granularity < 4) {
1224         // 0..3 bytes
1225         __ bind(copy4);
1226         __ cbz(count, finish); // get rid of 0 case
1227         if (granularity == 2) {
1228           __ ldrh(t0, Address(s, 0));
1229           __ strh(t0, Address(d, 0));
1230         } else { // granularity == 1
1231           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1232           // the first and last byte.
1233           // Handle the 3 byte case by loading and storing base + count/2
1234           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1235           // This does mean that in the 1 byte case we load/store the same
1236           // byte 3 times.
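               // Concretely (a sketch): count == 1 -> count/2 == 0, so all
               // three accesses hit byte 0; count == 2 -> t0/t1 copy bytes 0
               // and 1 and t2 re-copies byte 1; count == 3 -> t0 copies
               // byte 0, t1 byte 2 and t2 the middle byte at s + 1.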
1237           __ lsr(count, count, 1);
1238           __ ldrb(t0, Address(s, 0));
1239           __ ldrb(t1, Address(send, -1));
1240           __ ldrb(t2, Address(s, count));
1241           __ strb(t0, Address(d, 0));
1242           __ strb(t1, Address(dend, -1));
1243           __ strb(t2, Address(d, count));
1244         }
1245         __ b(finish);
1246       }
1247     }
1248 
1249     __ bind(copy_big);
1250     if (is_backwards) {
1251       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1252       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1253     }
1254 
1255     // Now we've got the small case out of the way we can align the
1256     // source address on a 2-word boundary.
1257 
1258     Label aligned;
1259 
1260     if (is_aligned) {
1261       // We may have to adjust by 1 word to get s 2-word-aligned.
1262       __ tbz(s, exact_log2(wordSize), aligned);
1263       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1264       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1265       __ sub(count, count, wordSize/granularity);
1266     } else {
1267       if (is_backwards) {
1268         __ andr(rscratch2, s, 2 * wordSize - 1);
1269       } else {
1270         __ neg(rscratch2, s);
1271         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1272       }
1273       // rscratch2 is the byte adjustment needed to align s.
1274       __ cbz(rscratch2, aligned);
1275       int shift = exact_log2(granularity);
1276       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1277       __ sub(count, count, rscratch2);
1278 
1279 #if 0
1280       // ?? This code is only correct for a disjoint copy.  It may or
1281       // may not make sense to use it in that case.
1282 
1283       // Copy the first pair; s and d may not be aligned.
1284       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1285       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1286 
1287       // Align s and d, adjust count
1288       if (is_backwards) {
1289         __ sub(s, s, rscratch2);
1290         __ sub(d, d, rscratch2);
1291       } else {
1292         __ add(s, s, rscratch2);
1293         __ add(d, d, rscratch2);
1294       }
1295 #else
1296       copy_memory_small(s, d, rscratch2, rscratch1, step);
1297 #endif
1298     }
1299 
1300     __ bind(aligned);
1301 
1302     // s is now 2-word-aligned.
1303 
1304     // We have a count of units and some trailing bytes.  Adjust the
1305     // count and do a bulk copy of words.
1306     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1307     if (direction == copy_forwards)
1308       __ bl(copy_f);
1309     else
1310       __ bl(copy_b);
1311 
1312     // And the tail.
1313     copy_memory_small(s, d, count, tmp, step);
1314 
1315     if (granularity >= 8) __ bind(copy8);
1316     if (granularity >= 4) __ bind(copy4);
1317     __ bind(finish);
1318   }
1319 
1320 
1321   void clobber_registers() {
1322 #ifdef ASSERT
1323     RegSet clobbered
1324       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1325     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1326     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1327     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1328       __ mov(*it, rscratch1);
1329     }
1330 #endif
1331 
1332   }
1333 
1334   // Scan over array at a for count oops, verifying each one.
1335   // Preserves a and count, clobbers rscratch1 and rscratch2.
1336   void verify_oop_array (int size, Register a, Register count, Register temp) {
1337     Label loop, end;
1338     __ mov(rscratch1, a);
1339     __ mov(rscratch2, zr);
1340     __ bind(loop);
1341     __ cmp(rscratch2, count);
1342     __ br(Assembler::HS, end);
1343     if (size == wordSize) {
1344       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1345       __ verify_oop(temp);
1346     } else {
1347       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1348       __ decode_heap_oop(temp); // calls verify_oop
1349     }
1350     __ add(rscratch2, rscratch2, 1);
1351     __ b(loop);
1352     __ bind(end);
1353   }
1354 
1355   // Arguments:
1356   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1357   //             ignored
1358   //   is_oop  - true => oop array, so generate store check code
1359   //   name    - stub name string
1360   //
1361   // Inputs:
1362   //   c_rarg0   - source array address
1363   //   c_rarg1   - destination array address
1364   //   c_rarg2   - element count, treated as ssize_t, can be zero
1365   //
1366   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1367   // the hardware handle it.  The two dwords within qwords that span
1368   // cache line boundaries will still be loaded and stored atomically.
1369   //
1370   // Side Effects:
1371   //   disjoint_int_copy_entry is set to the no-overlap entry point
1372   //   used by generate_conjoint_int_oop_copy().
1373   //
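       // A typical instantiation (a sketch; cf. generate_arraycopy_stubs()
       // later in this file) is the byte variant further below, registered
       // roughly as
       //
       //   StubRoutines::_jbyte_disjoint_arraycopy
       //       = generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
       //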
1374   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1375                                   const char *name, bool dest_uninitialized = false) {
1376     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1377     RegSet saved_reg = RegSet::of(s, d, count);
1378     __ align(CodeEntryAlignment);
1379     StubCodeMark mark(this, "StubRoutines", name);
1380     address start = __ pc();
1381     __ enter();
1382 
1383     if (entry != NULL) {
1384       *entry = __ pc();
1385       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1386       BLOCK_COMMENT("Entry:");
1387     }
1388 
1389     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1390     if (dest_uninitialized) {
1391       decorators |= IS_DEST_UNINITIALIZED;
1392     }
1393     if (aligned) {
1394       decorators |= ARRAYCOPY_ALIGNED;
1395     }
1396 
1397     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1398     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1399 
1400     if (is_oop) {
1401       // save regs before copy_memory
1402       __ push(RegSet::of(d, count), sp);
1403     }
1404     {
1405       // UnsafeCopyMemory page error: continue after ucm
1406       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1407       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1408       copy_memory(aligned, s, d, count, rscratch1, size);
1409     }
1410 
1411     if (is_oop) {
1412       __ pop(RegSet::of(d, count), sp);
1413       if (VerifyOops)
1414         verify_oop_array(size, d, count, r16);
1415     }
1416 
1417     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1418 
1419     __ leave();
1420     __ mov(r0, zr); // return 0
1421     __ ret(lr);
1422     return start;
1423   }
1424 
1425   // Arguments:
1426   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1427   //             ignored
1428   //   is_oop  - true => oop array, so generate store check code
1429   //   name    - stub name string
1430   //
1431   // Inputs:
1432   //   c_rarg0   - source array address
1433   //   c_rarg1   - destination array address
1434   //   c_rarg2   - element count, treated as ssize_t, can be zero
1435   //
1436   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1437   // the hardware handle it.  The two dwords within qwords that span
1438   // cache line boundaries will still be loaded and stored atomically.
1439   //
1440   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1441                                  address *entry, const char *name,
1442                                  bool dest_uninitialized = false) {
1443     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1444     RegSet saved_regs = RegSet::of(s, d, count);
1445     StubCodeMark mark(this, "StubRoutines", name);
1446     address start = __ pc();
1447     __ enter();
1448 
1449     if (entry != NULL) {
1450       *entry = __ pc();
1451       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1452       BLOCK_COMMENT("Entry:");
1453     }
1454 
1455     // use fwd copy when (d-s) above_equal (count*size)
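         // (A sketch of why the unsigned test is enough: when d is below s,
         // d - s wraps to a huge unsigned value, so both d < s and
         // d >= s + count*size branch to the forward (disjoint) copy; we only
         // fall through to the backward copy when s <= d < s + count*size,
         // i.e. when a forward copy could overwrite source words not yet read.)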
1456     __ sub(rscratch1, d, s);
1457     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1458     __ br(Assembler::HS, nooverlap_target);
1459 
1460     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1461     if (dest_uninitialized) {
1462       decorators |= IS_DEST_UNINITIALIZED;
1463     }
1464     if (aligned) {
1465       decorators |= ARRAYCOPY_ALIGNED;
1466     }
1467 
1468     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1469     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1470 
1471     if (is_oop) {
1472       // save regs before copy_memory
1473       __ push(RegSet::of(d, count), sp);
1474     }
1475     {
1476       // UnsafeCopyMemory page error: continue after ucm
1477       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1478       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1479       copy_memory(aligned, s, d, count, rscratch1, -size);
1480     }
1481     if (is_oop) {
1482       __ pop(RegSet::of(d, count), sp);
1483       if (VerifyOops)
1484         verify_oop_array(size, d, count, r16);
1485     }
1486     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1487     __ leave();
1488     __ mov(r0, zr); // return 0
1489     __ ret(lr);
1490     return start;
1491   }
1492 
1493   // Arguments:
1494   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1495   //             ignored
1496   //   name    - stub name string
1497   //
1498   // Inputs:
1499   //   c_rarg0   - source array address
1500   //   c_rarg1   - destination array address
1501   //   c_rarg2   - element count, treated as ssize_t, can be zero
1502   //
1503   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1504   // we let the hardware handle it.  The one to eight bytes within words,
1505   // dwords or qwords that span cache line boundaries will still be loaded
1506   // and stored atomically.
1507   //
1508   // Side Effects:
1509   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1517   //   used by generate_conjoint_byte_copy().
1518   //
1519   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1520     const bool not_oop = false;
1521     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1522   }
1523 
1524   // Arguments:
1525   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1526   //             ignored
1527   //   name    - stub name string
1528   //
1529   // Inputs:
1530   //   c_rarg0   - source array address
1531   //   c_rarg1   - destination array address
1532   //   c_rarg2   - element count, treated as ssize_t, can be zero
1533   //
1534   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1535   // we let the hardware handle it.  The one to eight bytes within words,
1536   // dwords or qwords that span cache line boundaries will still be loaded
1537   // and stored atomically.
1538   //
1539   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1540                                       address* entry, const char *name) {
1541     const bool not_oop = false;
1542     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1543   }
1544 
1545   // Arguments:
1546   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1547   //             ignored
1548   //   name    - stub name string
1549   //
1550   // Inputs:
1551   //   c_rarg0   - source array address
1552   //   c_rarg1   - destination array address
1553   //   c_rarg2   - element count, treated as ssize_t, can be zero
1554   //
1555   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1556   // let the hardware handle it.  The two or four words within dwords
1557   // or qwords that span cache line boundaries will still be loaded
1558   // and stored atomically.
1559   //
1560   // Side Effects:
1561   //   disjoint_short_copy_entry is set to the no-overlap entry point
1562   //   used by generate_conjoint_short_copy().
1563   //
1564   address generate_disjoint_short_copy(bool aligned,
1565                                        address* entry, const char *name) {
1566     const bool not_oop = false;
1567     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1568   }
1569 
1570   // Arguments:
1571   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1572   //             ignored
1573   //   name    - stub name string
1574   //
1575   // Inputs:
1576   //   c_rarg0   - source array address
1577   //   c_rarg1   - destination array address
1578   //   c_rarg2   - element count, treated as ssize_t, can be zero
1579   //
1580   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1581   // let the hardware handle it.  The two or four words within dwords
1582   // or qwords that span cache line boundaries will still be loaded
1583   // and stored atomically.
1584   //
1585   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1586                                        address *entry, const char *name) {
1587     const bool not_oop = false;
1588     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1589   }
1590 
1591   // Arguments:
1592   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1593   //             ignored
1594   //   name    - stub name string
1595   //
1596   // Inputs:
1597   //   c_rarg0   - source array address
1598   //   c_rarg1   - destination array address
1599   //   c_rarg2   - element count, treated as ssize_t, can be zero
1600   //
1601   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1602   // the hardware handle it.  The two dwords within qwords that span
1603   // cache line boundaries will still be loaded and stored atomically.
1604   //
1605   // Side Effects:
1606   //   disjoint_int_copy_entry is set to the no-overlap entry point
1607   //   used by generate_conjoint_int_copy().
1608   //
1609   address generate_disjoint_int_copy(bool aligned, address *entry,
1610                                          const char *name, bool dest_uninitialized = false) {
1611     const bool not_oop = false;
1612     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1613   }
1614 
1615   // Arguments:
1616   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1617   //             ignored
1618   //   name    - stub name string
1619   //
1620   // Inputs:
1621   //   c_rarg0   - source array address
1622   //   c_rarg1   - destination array address
1623   //   c_rarg2   - element count, treated as ssize_t, can be zero
1624   //
1625   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1626   // the hardware handle it.  The two dwords within qwords that span
1627   // cache line boundaries will still be loaded and stored atomically.
1628   //
1629   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1630                                      address *entry, const char *name,
1631                                      bool dest_uninitialized = false) {
1632     const bool not_oop = false;
1633     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1634   }
1635 
1636 
1637   // Arguments:
1638   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1639   //             ignored
1640   //   name    - stub name string
1641   //
1642   // Inputs:
1643   //   c_rarg0   - source array address
1644   //   c_rarg1   - destination array address
1645   //   c_rarg2   - element count, treated as size_t, can be zero
1646   //
1647   // Side Effects:
1648   //   disjoint_long_copy_entry is set to the no-overlap entry point
1649   //   used by generate_conjoint_long_copy().
1650   //
1651   address generate_disjoint_long_copy(bool aligned, address *entry,
1652                                           const char *name, bool dest_uninitialized = false) {
1653     const bool not_oop = false;
1654     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1655   }
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as size_t, can be zero
1666   //
1667   address generate_conjoint_long_copy(bool aligned,
1668                                       address nooverlap_target, address *entry,
1669                                       const char *name, bool dest_uninitialized = false) {
1670     const bool not_oop = false;
1671     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1672   }
1673 
1674   // Arguments:
1675   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1676   //             ignored
1677   //   name    - stub name string
1678   //
1679   // Inputs:
1680   //   c_rarg0   - source array address
1681   //   c_rarg1   - destination array address
1682   //   c_rarg2   - element count, treated as size_t, can be zero
1683   //
1684   // Side Effects:
1685   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1686   //   used by generate_conjoint_oop_copy().
1687   //
1688   address generate_disjoint_oop_copy(bool aligned, address *entry,
1689                                      const char *name, bool dest_uninitialized) {
1690     const bool is_oop = true;
1691     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1692     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1693   }
1694 
1695   // Arguments:
1696   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1697   //             ignored
1698   //   name    - stub name string
1699   //
1700   // Inputs:
1701   //   c_rarg0   - source array address
1702   //   c_rarg1   - destination array address
1703   //   c_rarg2   - element count, treated as size_t, can be zero
1704   //
1705   address generate_conjoint_oop_copy(bool aligned,
1706                                      address nooverlap_target, address *entry,
1707                                      const char *name, bool dest_uninitialized) {
1708     const bool is_oop = true;
1709     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1710     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1711                                   name, dest_uninitialized);
1712   }
1713 
1714 
1715   // Helper for generating a dynamic type check.
1716   // Smashes rscratch1, rscratch2.
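       // Conceptually (an illustrative sketch, not the emitted code):
       //
       //   if (sub_klass is a subtype of super_klass)  goto L_success;
       //   else fall through (L_miss) to the caller's failure path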
1717   void generate_type_check(Register sub_klass,
1718                            Register super_check_offset,
1719                            Register super_klass,
1720                            Label& L_success) {
1721     assert_different_registers(sub_klass, super_check_offset, super_klass);
1722 
1723     BLOCK_COMMENT("type_check:");
1724 
1725     Label L_miss;
1726 
1727     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1728                                      super_check_offset);
1729     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1730 
1731     // Fall through on failure!
1732     __ BIND(L_miss);
1733   }
1734 
1735   //
1736   //  Generate checkcasting array copy stub
1737   //
1738   //  Input:
1739   //    c_rarg0   - source array address
1740   //    c_rarg1   - destination array address
1741   //    c_rarg2   - element count, treated as ssize_t, can be zero
1742   //    c_rarg3   - size_t ckoff (super_check_offset)
1743   //    c_rarg4   - oop ckval (super_klass)
1744   //
1745   //  Output:
1746   //    r0 ==  0  -  success
1747   //    r0 == -1^K - failure, where K is partial transfer count
1748   //
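       //  Illustrative note (not emitted code): a caller can recover the number
       //  of elements successfully copied from the return value as
       //
       //    copied = (r0 == 0) ? count : ~r0;    // ~r0 == -1 ^ K
       //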
1749   address generate_checkcast_copy(const char *name, address *entry,
1750                                   bool dest_uninitialized = false) {
1751 
1752     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1753 
1754     // Input registers (after setup_arg_regs)
1755     const Register from        = c_rarg0;   // source array address
1756     const Register to          = c_rarg1;   // destination array address
1757     const Register count       = c_rarg2;   // elements count
1758     const Register ckoff       = c_rarg3;   // super_check_offset
1759     const Register ckval       = c_rarg4;   // super_klass
1760 
1761     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1762     RegSet wb_post_saved_regs = RegSet::of(count);
1763 
1764     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1765     const Register copied_oop  = r22;       // actual oop copied
1766     const Register count_save  = r21;       // orig elements count
1767     const Register start_to    = r20;       // destination array start address
1768     const Register r19_klass   = r19;       // oop._klass
1769 
1770     //---------------------------------------------------------------
1771     // Assembler stub will be used for this call to arraycopy
1772     // if the two arrays are subtypes of Object[] but the
1773     // destination array type is not equal to or a supertype
1774     // of the source type.  Each element must be separately
1775     // checked.
1776 
1777     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1778                                copied_oop, r19_klass, count_save);
1779 
1780     __ align(CodeEntryAlignment);
1781     StubCodeMark mark(this, "StubRoutines", name);
1782     address start = __ pc();
1783 
1784     __ enter(); // required for proper stackwalking of RuntimeStub frame
1785 
1786 #ifdef ASSERT
1787     // caller guarantees that the arrays really are different
1788     // otherwise, we would have to make conjoint checks
1789     { Label L;
1790       array_overlap_test(L, TIMES_OOP);
1791       __ stop("checkcast_copy within a single array");
1792       __ bind(L);
1793     }
1794 #endif //ASSERT
1795 
1796     // Caller of this entry point must set up the argument registers.
1797     if (entry != NULL) {
1798       *entry = __ pc();
1799       BLOCK_COMMENT("Entry:");
1800     }
1801 
1802     // Empty array:  Nothing to do.
1803     __ cbz(count, L_done);
1804     __ push(RegSet::of(r19, r20, r21, r22), sp);
1805 
1806 #ifdef ASSERT
1807     BLOCK_COMMENT("assert consistent ckoff/ckval");
1808     // The ckoff and ckval must be mutually consistent,
1809     // even though caller generates both.
1810     { Label L;
1811       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1812       __ ldrw(start_to, Address(ckval, sco_offset));
1813       __ cmpw(ckoff, start_to);
1814       __ br(Assembler::EQ, L);
1815       __ stop("super_check_offset inconsistent");
1816       __ bind(L);
1817     }
1818 #endif //ASSERT
1819 
1820     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1821     bool is_oop = true;
1822     if (dest_uninitialized) {
1823       decorators |= IS_DEST_UNINITIALIZED;
1824     }
1825 
1826     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1827     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1828 
1829     // save the original count
1830     __ mov(count_save, count);
1831 
1832     // Copy from low to high addresses
1833     __ mov(start_to, to);              // Save destination array start address
1834     __ b(L_load_element);
1835 
1836     // ======== begin loop ========
1837     // (Loop is rotated; its entry is L_load_element.)
1838     // Loop control:
1839     //   for (; count != 0; count--) {
1840     //     copied_oop = load_heap_oop(from++);
1841     //     ... generate_type_check ...;
1842     //     store_heap_oop(to++, copied_oop);
1843     //   }
1844     __ align(OptoLoopAlignment);
1845 
1846     __ BIND(L_store_element);
1847     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1848     __ sub(count, count, 1);
1849     __ cbz(count, L_do_card_marks);
1850 
1851     // ======== loop entry is here ========
1852     __ BIND(L_load_element);
1853     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1854     __ cbz(copied_oop, L_store_element);
1855 
1856     __ load_klass(r19_klass, copied_oop);// query the object klass
1857     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1858     // ======== end loop ========
1859 
1860     // It was a real error; we must depend on the caller to finish the job.
1861     // Register count = remaining oops, count_orig = total oops.
1862     // Emit GC store barriers for the oops we have copied and report
1863     // their number to the caller.
1864 
1865     __ subs(count, count_save, count);     // K = partially copied oop count
1866     __ eon(count, count, zr);                   // report (-1^K) to caller
1867     __ br(Assembler::EQ, L_done_pop);
1868 
1869     __ BIND(L_do_card_marks);
1870     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1871 
1872     __ bind(L_done_pop);
1873     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1874     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1875 
1876     __ bind(L_done);
1877     __ mov(r0, count);
1878     __ leave();
1879     __ ret(lr);
1880 
1881     return start;
1882   }
1883 
1884   // Perform range checks on the proposed arraycopy.
1885   // Kills temp, but nothing else.
1886   // Also, clean the sign bits of src_pos and dst_pos.
1887   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1888                               Register src_pos, // source position (c_rarg1)
1889                               Register dst,     // destination array oop (c_rarg2)
1890                               Register dst_pos, // destination position (c_rarg3)
1891                               Register length,
1892                               Register temp,
1893                               Label& L_failed) {
1894     BLOCK_COMMENT("arraycopy_range_checks:");
1895 
1896     assert_different_registers(rscratch1, temp);
1897 
1898     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1899     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1900     __ addw(temp, length, src_pos);
1901     __ cmpw(temp, rscratch1);
1902     __ br(Assembler::HI, L_failed);
1903 
1904     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1905     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1906     __ addw(temp, length, dst_pos);
1907     __ cmpw(temp, rscratch1);
1908     __ br(Assembler::HI, L_failed);
1909 
1910     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
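         // (Illustrative note: a 32-bit register write such as movw below
         //  zero-extends into the upper 32 bits of the destination register.)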
1911     __ movw(src_pos, src_pos);
1912     __ movw(dst_pos, dst_pos);
1913 
1914     BLOCK_COMMENT("arraycopy_range_checks done");
1915   }
1916 
1917   // These stubs get called from some dumb test routine.
1918   // I'll write them properly when they're called from
1919   // something that's actually doing something.
1920   static void fake_arraycopy_stub(address src, address dst, int count) {
1921     assert(count == 0, "huh?");
1922   }
1923 
1924 
1925   //
1926   //  Generate 'unsafe' array copy stub
1927   //  Though just as safe as the other stubs, it takes an unscaled
1928   //  size_t argument instead of an element count.
1929   //
1930   //  Input:
1931   //    c_rarg0   - source array address
1932   //    c_rarg1   - destination array address
1933   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1934   //
1935   // Examines the alignment of the operands and dispatches
1936   // to a long, int, short, or byte copy loop.
1937   //
1938   address generate_unsafe_copy(const char *name,
1939                                address byte_copy_entry,
1940                                address short_copy_entry,
1941                                address int_copy_entry,
1942                                address long_copy_entry) {
1943     Label L_long_aligned, L_int_aligned, L_short_aligned;
1944     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1945 
1946     __ align(CodeEntryAlignment);
1947     StubCodeMark mark(this, "StubRoutines", name);
1948     address start = __ pc();
1949     __ enter(); // required for proper stackwalking of RuntimeStub frame
1950 
1951     // bump this on entry, not on exit:
1952     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1953 
1954     __ orr(rscratch1, s, d);
1955     __ orr(rscratch1, rscratch1, count);
1956 
1957     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1958     __ cbz(rscratch1, L_long_aligned);
1959     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1960     __ cbz(rscratch1, L_int_aligned);
1961     __ tbz(rscratch1, 0, L_short_aligned);
1962     __ b(RuntimeAddress(byte_copy_entry));
1963 
1964     __ BIND(L_short_aligned);
1965     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1966     __ b(RuntimeAddress(short_copy_entry));
1967     __ BIND(L_int_aligned);
1968     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1969     __ b(RuntimeAddress(int_copy_entry));
1970     __ BIND(L_long_aligned);
1971     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1972     __ b(RuntimeAddress(long_copy_entry));
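         // For illustration only (not emitted code), the dispatch above is:
         //
         //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
         //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // count >>= 3
         //   else if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // count >>= 2
         //   else if ((bits & 1) == 0)                  goto short_copy;  // count >>= 1
         //   else                                       goto byte_copy;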
1973 
1974     return start;
1975   }
1976 
1977   //
1978   //  Generate generic array copy stubs
1979   //
1980   //  Input:
1981   //    c_rarg0    -  src oop
1982   //    c_rarg1    -  src_pos (32-bits)
1983   //    c_rarg2    -  dst oop
1984   //    c_rarg3    -  dst_pos (32-bits)
1985   //    c_rarg4    -  element count (32-bits)
1986   //
1987   //  Output:
1988   //    r0 ==  0  -  success
1989   //    r0 == -1^K - failure, where K is partial transfer count
1990   //
1991   address generate_generic_copy(const char *name,
1992                                 address byte_copy_entry, address short_copy_entry,
1993                                 address int_copy_entry, address oop_copy_entry,
1994                                 address long_copy_entry, address checkcast_copy_entry) {
1995 
1996     Label L_failed, L_objArray;
1997     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1998 
1999     // Input registers
2000     const Register src        = c_rarg0;  // source array oop
2001     const Register src_pos    = c_rarg1;  // source position
2002     const Register dst        = c_rarg2;  // destination array oop
2003     const Register dst_pos    = c_rarg3;  // destination position
2004     const Register length     = c_rarg4;
2005 
2006 
2007     // Registers used as temps
2008     const Register dst_klass  = c_rarg5;
2009 
2010     __ align(CodeEntryAlignment);
2011 
2012     StubCodeMark mark(this, "StubRoutines", name);
2013 
2014     address start = __ pc();
2015 
2016     __ enter(); // required for proper stackwalking of RuntimeStub frame
2017 
2018     // bump this on entry, not on exit:
2019     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2020 
2021     //-----------------------------------------------------------------------
2022     // Assembler stub will be used for this call to arraycopy
2023     // if the following conditions are met:
2024     //
2025     // (1) src and dst must not be null.
2026     // (2) src_pos must not be negative.
2027     // (3) dst_pos must not be negative.
2028     // (4) length  must not be negative.
2029     // (5) src klass and dst klass should be the same and not NULL.
2030     // (6) src and dst should be arrays.
2031     // (7) src_pos + length must not exceed length of src.
2032     // (8) dst_pos + length must not exceed length of dst.
2033     //
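         // For illustration, the same checks in Java-like pseudocode (not
         // emitted code; the objArray cases are refined further below):
         //
         //   if (src == null || dst == null)                  return -1;
         //   if (src_pos < 0 || dst_pos < 0 || length < 0)    return -1;
         //   if (src.klass == null || src.klass != dst.klass) return -1;
         //   if (!src.is_array())                             return -1;
         //   if (src_pos + length > src.length ||
         //       dst_pos + length > dst.length)               return -1;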
2034 
2035     //  if (src == NULL) return -1;
2036     __ cbz(src, L_failed);
2037 
2038     //  if (src_pos < 0) return -1;
2039     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2040 
2041     //  if (dst == NULL) return -1;
2042     __ cbz(dst, L_failed);
2043 
2044     //  if (dst_pos < 0) return -1;
2045     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2046 
2047     // registers used as temp
2048     const Register scratch_length    = r16; // elements count to copy
2049     const Register scratch_src_klass = r17; // array klass
2050     const Register lh                = r15; // layout helper
2051 
2052     //  if (length < 0) return -1;
2053     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2054     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2055 
2056     __ load_klass(scratch_src_klass, src);
2057 #ifdef ASSERT
2058     //  assert(src->klass() != NULL);
2059     {
2060       BLOCK_COMMENT("assert klasses not null {");
2061       Label L1, L2;
2062       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2063       __ bind(L1);
2064       __ stop("broken null klass");
2065       __ bind(L2);
2066       __ load_klass(rscratch1, dst);
2067       __ cbz(rscratch1, L1);     // this would be broken also
2068       BLOCK_COMMENT("} assert klasses not null done");
2069     }
2070 #endif
2071 
2072     // Load layout helper (32-bits)
2073     //
2074     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2075     // 32        30    24            16              8     2                 0
2076     //
2077     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2078     //
2079 
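         // For example (illustrative only), a jint[] klass has
         // array_tag == _lh_array_tag_type_value, element_type == T_INT and
         // log2_element_size == 2, so the element size can be recovered as
         //
         //   int elsize = 1 << (lh & Klass::_lh_log2_element_size_mask);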
2080     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2081 
2082     // Handle objArrays completely differently...
2083     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2084     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2085     __ movw(rscratch1, objArray_lh);
2086     __ eorw(rscratch2, lh, rscratch1);
2087     __ cbzw(rscratch2, L_objArray);
2088 
2089     //  if (src->klass() != dst->klass()) return -1;
2090     __ load_klass(rscratch2, dst);
2091     __ eor(rscratch2, rscratch2, scratch_src_klass);
2092     __ cbnz(rscratch2, L_failed);
2093 
2094     //  if (!src->is_Array()) return -1;
2095     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2096 
2097     // At this point, it is known to be a typeArray (array_tag 0x3).
2098 #ifdef ASSERT
2099     {
2100       BLOCK_COMMENT("assert primitive array {");
2101       Label L;
2102       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2103       __ cmpw(lh, rscratch2);
2104       __ br(Assembler::GE, L);
2105       __ stop("must be a primitive array");
2106       __ bind(L);
2107       BLOCK_COMMENT("} assert primitive array done");
2108     }
2109 #endif
2110 
2111     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2112                            rscratch2, L_failed);
2113 
2114     // TypeArrayKlass
2115     //
2116     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2117     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2118     //
2119 
2120     const Register rscratch1_offset = rscratch1;    // array offset
2121     const Register r15_elsize = lh; // element size
2122 
2123     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2124            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2125     __ add(src, src, rscratch1_offset);           // src array offset
2126     __ add(dst, dst, rscratch1_offset);           // dst array offset
2127     BLOCK_COMMENT("choose copy loop based on element size");
2128 
2129     // next registers should be set before the jump to corresponding stub
2130     const Register from     = c_rarg0;  // source array address
2131     const Register to       = c_rarg1;  // destination array address
2132     const Register count    = c_rarg2;  // elements count
2133 
2134     // 'from', 'to' and 'count' must be set up in this order, because they
2135     // are the same registers as 'src', 'src_pos' and 'dst'.
2136 
2137     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2138 
2139     // The possible values of elsize are 0-3, i.e. exact_log2(element
2140     // size in bytes).  We do a simple bitwise binary search.
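         // In other words (illustration only):
         //
         //   switch (elsize) {        // elsize == log2(element size in bytes)
         //     case 0: byte copy      // bit 1 clear, bit 0 clear
         //     case 1: short copy     // bit 1 clear, bit 0 set
         //     case 2: int copy       // bit 1 set,   bit 0 clear
         //     case 3: long copy      // bit 1 set,   bit 0 set
         //   }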
2141   __ BIND(L_copy_bytes);
2142     __ tbnz(r15_elsize, 1, L_copy_ints);
2143     __ tbnz(r15_elsize, 0, L_copy_shorts);
2144     __ lea(from, Address(src, src_pos));// src_addr
2145     __ lea(to,   Address(dst, dst_pos));// dst_addr
2146     __ movw(count, scratch_length); // length
2147     __ b(RuntimeAddress(byte_copy_entry));
2148 
2149   __ BIND(L_copy_shorts);
2150     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2151     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2152     __ movw(count, scratch_length); // length
2153     __ b(RuntimeAddress(short_copy_entry));
2154 
2155   __ BIND(L_copy_ints);
2156     __ tbnz(r15_elsize, 0, L_copy_longs);
2157     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2158     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2159     __ movw(count, scratch_length); // length
2160     __ b(RuntimeAddress(int_copy_entry));
2161 
2162   __ BIND(L_copy_longs);
2163 #ifdef ASSERT
2164     {
2165       BLOCK_COMMENT("assert long copy {");
2166       Label L;
2167       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2168       __ cmpw(r15_elsize, LogBytesPerLong);
2169       __ br(Assembler::EQ, L);
2170       __ stop("must be long copy, but elsize is wrong");
2171       __ bind(L);
2172       BLOCK_COMMENT("} assert long copy done");
2173     }
2174 #endif
2175     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(long_copy_entry));
2179 
2180     // ObjArrayKlass
2181   __ BIND(L_objArray);
2182     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2183 
2184     Label L_plain_copy, L_checkcast_copy;
2185     //  test array classes for subtyping
2186     __ load_klass(r15, dst);
2187     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2188     __ br(Assembler::NE, L_checkcast_copy);
2189 
2190     // Identically typed arrays can be copied without element-wise checks.
2191     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2192                            rscratch2, L_failed);
2193 
2194     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2195     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2196     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2197     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2198     __ movw(count, scratch_length); // length
2199   __ BIND(L_plain_copy);
2200     __ b(RuntimeAddress(oop_copy_entry));
2201 
2202   __ BIND(L_checkcast_copy);
2203     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2204     {
2205       // Before looking at dst.length, make sure dst is also an objArray.
2206       __ ldrw(rscratch1, Address(r15, lh_offset));
2207       __ movw(rscratch2, objArray_lh);
2208       __ eorw(rscratch1, rscratch1, rscratch2);
2209       __ cbnzw(rscratch1, L_failed);
2210 
2211       // It is safe to examine both src.length and dst.length.
2212       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2213                              r15, L_failed);
2214 
2215       __ load_klass(dst_klass, dst); // reload
2216 
2217       // Marshal the base address arguments now, freeing registers.
2218       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2219       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2220       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2221       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2222       __ movw(count, length);           // length (reloaded)
2223       Register sco_temp = c_rarg3;      // this register is free now
2224       assert_different_registers(from, to, count, sco_temp,
2225                                  dst_klass, scratch_src_klass);
2226       // assert_clean_int(count, sco_temp);
2227 
2228       // Generate the type check.
2229       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2230       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2231 
2232       // Smashes rscratch1, rscratch2
2233       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2234 
2235       // Fetch destination element klass from the ObjArrayKlass header.
2236       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2237       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2238       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2239 
2240       // the checkcast_copy loop needs two extra arguments:
2241       assert(c_rarg3 == sco_temp, "#3 already in place");
2242       // Set up arguments for checkcast_copy_entry.
2243       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2244       __ b(RuntimeAddress(checkcast_copy_entry));
2245     }
2246 
2247   __ BIND(L_failed);
2248     __ mov(r0, -1);
2249     __ leave();   // required for proper stackwalking of RuntimeStub frame
2250     __ ret(lr);
2251 
2252     return start;
2253   }
2254 
2255   //
2256   // Generate stub for array fill. If "aligned" is true, the
2257   // "to" address is assumed to be heapword aligned.
2258   //
2259   // Arguments for generated stub:
2260   //   to:    c_rarg0
2261   //   value: c_rarg1
2262   //   count: c_rarg2 treated as signed
2263   //
2264   address generate_fill(BasicType t, bool aligned, const char *name) {
2265     __ align(CodeEntryAlignment);
2266     StubCodeMark mark(this, "StubRoutines", name);
2267     address start = __ pc();
2268 
2269     BLOCK_COMMENT("Entry:");
2270 
2271     const Register to        = c_rarg0;  // destination array address
2272     const Register value     = c_rarg1;  // value
2273     const Register count     = c_rarg2;  // elements count
2274 
2275     const Register bz_base = r10;        // base for block_zero routine
2276     const Register cnt_words = r11;      // temp register
2277 
2278     __ enter();
2279 
2280     Label L_fill_elements, L_exit1;
2281 
2282     int shift = -1;
2283     switch (t) {
2284       case T_BYTE:
2285         shift = 0;
2286         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2287         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2288         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2289         __ br(Assembler::LO, L_fill_elements);
2290         break;
2291       case T_SHORT:
2292         shift = 1;
2293         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2294         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2295         __ br(Assembler::LO, L_fill_elements);
2296         break;
2297       case T_INT:
2298         shift = 2;
2299         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2300         __ br(Assembler::LO, L_fill_elements);
2301         break;
2302       default: ShouldNotReachHere();
2303     }
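         // At this point 'value' holds the fill pattern replicated to 32 bits.
         // Illustration for a byte fill of 0xAB (not emitted code):
         //
         //   0x000000AB -> 0x0000ABAB -> 0xABABABAB   // widened to 64 bits further below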
2304 
2305     // Align source address at 8 bytes address boundary.
2306     Label L_skip_align1, L_skip_align2, L_skip_align4;
2307     if (!aligned) {
2308       switch (t) {
2309         case T_BYTE:
2310           // One-byte misalignment happens only for byte arrays.
2311           __ tbz(to, 0, L_skip_align1);
2312           __ strb(value, Address(__ post(to, 1)));
2313           __ subw(count, count, 1);
2314           __ bind(L_skip_align1);
2315           // Fallthrough
2316         case T_SHORT:
2317           // Two-byte misalignment happens only for byte and short (char) arrays.
2318           __ tbz(to, 1, L_skip_align2);
2319           __ strh(value, Address(__ post(to, 2)));
2320           __ subw(count, count, 2 >> shift);
2321           __ bind(L_skip_align2);
2322           // Fallthrough
2323         case T_INT:
2324           // Align to 8 bytes, we know we are 4 byte aligned to start.
2325           __ tbz(to, 2, L_skip_align4);
2326           __ strw(value, Address(__ post(to, 4)));
2327           __ subw(count, count, 4 >> shift);
2328           __ bind(L_skip_align4);
2329           break;
2330         default: ShouldNotReachHere();
2331       }
2332     }
2333 
2334     //
2335     //  Fill large chunks
2336     //
2337     __ lsrw(cnt_words, count, 3 - shift); // number of words
2338     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2339     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2340     if (UseBlockZeroing) {
2341       Label non_block_zeroing, rest;
2342       // If the fill value is zero we can use the fast zero_words().
2343       __ cbnz(value, non_block_zeroing);
2344       __ mov(bz_base, to);
2345       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2346       __ zero_words(bz_base, cnt_words);
2347       __ b(rest);
2348       __ bind(non_block_zeroing);
2349       __ fill_words(to, cnt_words, value);
2350       __ bind(rest);
2351     } else {
2352       __ fill_words(to, cnt_words, value);
2353     }
2354 
2355     // Remaining count is less than 8 bytes. Fill it by a single store.
2356     // Note that the total length is no less than 8 bytes.
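         // (Illustrative note: the 8-byte store below is positioned to end exactly
         //  at the last element; it may rewrite up to 7 already-filled bytes, which
         //  is harmless because the same value is stored again.)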
2357     if (t == T_BYTE || t == T_SHORT) {
2358       Label L_exit1;
2359       __ cbzw(count, L_exit1);
2360       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2361       __ str(value, Address(to, -8));    // overwrite some elements
2362       __ bind(L_exit1);
2363       __ leave();
2364       __ ret(lr);
2365     }
2366 
2367     // Handle fills of less than 8 bytes.
2368     Label L_fill_2, L_fill_4, L_exit2;
2369     __ bind(L_fill_elements);
2370     switch (t) {
2371       case T_BYTE:
2372         __ tbz(count, 0, L_fill_2);
2373         __ strb(value, Address(__ post(to, 1)));
2374         __ bind(L_fill_2);
2375         __ tbz(count, 1, L_fill_4);
2376         __ strh(value, Address(__ post(to, 2)));
2377         __ bind(L_fill_4);
2378         __ tbz(count, 2, L_exit2);
2379         __ strw(value, Address(to));
2380         break;
2381       case T_SHORT:
2382         __ tbz(count, 0, L_fill_4);
2383         __ strh(value, Address(__ post(to, 2)));
2384         __ bind(L_fill_4);
2385         __ tbz(count, 1, L_exit2);
2386         __ strw(value, Address(to));
2387         break;
2388       case T_INT:
2389         __ cbzw(count, L_exit2);
2390         __ strw(value, Address(to));
2391         break;
2392       default: ShouldNotReachHere();
2393     }
2394     __ bind(L_exit2);
2395     __ leave();
2396     __ ret(lr);
2397     return start;
2398   }
2399 
2400   address generate_data_cache_writeback() {
2401     const Register line        = c_rarg0;  // address of line to write back
2402 
2403     __ align(CodeEntryAlignment);
2404 
2405     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2406 
2407     address start = __ pc();
2408     __ enter();
2409     __ cache_wb(Address(line, 0));
2410     __ leave();
2411     __ ret(lr);
2412 
2413     return start;
2414   }
2415 
2416   address generate_data_cache_writeback_sync() {
2417     const Register is_pre     = c_rarg0;  // pre or post sync
2418 
2419     __ align(CodeEntryAlignment);
2420 
2421     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2422 
2423     // pre wbsync is a no-op
2424     // post wbsync emits a memory barrier
2425 
2426     Label skip;
2427     address start = __ pc();
2428     __ enter();
2429     __ cbnz(is_pre, skip);
2430     __ cache_wbsync(false);
2431     __ bind(skip);
2432     __ leave();
2433     __ ret(lr);
2434 
2435     return start;
2436   }
2437 
2438   void generate_arraycopy_stubs() {
2439     address entry;
2440     address entry_jbyte_arraycopy;
2441     address entry_jshort_arraycopy;
2442     address entry_jint_arraycopy;
2443     address entry_oop_arraycopy;
2444     address entry_jlong_arraycopy;
2445     address entry_checkcast_arraycopy;
2446 
2447     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2448     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2449 
2450     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2451 
2452     //*** jbyte
2453     // Always need aligned and unaligned versions
2454     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2455                                                                                   "jbyte_disjoint_arraycopy");
2456     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2457                                                                                   &entry_jbyte_arraycopy,
2458                                                                                   "jbyte_arraycopy");
2459     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2460                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2461     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2462                                                                                   "arrayof_jbyte_arraycopy");
2463 
2464     //*** jshort
2465     // Always need aligned and unaligned versions
2466     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2467                                                                                     "jshort_disjoint_arraycopy");
2468     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2469                                                                                     &entry_jshort_arraycopy,
2470                                                                                     "jshort_arraycopy");
2471     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2472                                                                                     "arrayof_jshort_disjoint_arraycopy");
2473     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2474                                                                                     "arrayof_jshort_arraycopy");
2475 
2476     //*** jint
2477     // Aligned versions
2478     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2479                                                                                 "arrayof_jint_disjoint_arraycopy");
2480     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2481                                                                                 "arrayof_jint_arraycopy");
2482     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2483     // entry_jint_arraycopy always points to the unaligned version
2484     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2485                                                                                 "jint_disjoint_arraycopy");
2486     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2487                                                                                 &entry_jint_arraycopy,
2488                                                                                 "jint_arraycopy");
2489 
2490     //*** jlong
2491     // It is always aligned
2492     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2493                                                                                   "arrayof_jlong_disjoint_arraycopy");
2494     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2495                                                                                   "arrayof_jlong_arraycopy");
2496     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2497     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2498 
2499     //*** oops
2500     {
2501       // With compressed oops we need unaligned versions; notice that
2502       // we overwrite entry_oop_arraycopy.
2503       bool aligned = !UseCompressedOops;
2504 
2505       StubRoutines::_arrayof_oop_disjoint_arraycopy
2506         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2507                                      /*dest_uninitialized*/false);
2508       StubRoutines::_arrayof_oop_arraycopy
2509         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2510                                      /*dest_uninitialized*/false);
2511       // Aligned versions without pre-barriers
2512       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2513         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2514                                      /*dest_uninitialized*/true);
2515       StubRoutines::_arrayof_oop_arraycopy_uninit
2516         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2517                                      /*dest_uninitialized*/true);
2518     }
2519 
2520     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2521     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2522     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2523     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2524 
2525     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2526     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2527                                                                         /*dest_uninitialized*/true);
2528 
2529     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2530                                                               entry_jbyte_arraycopy,
2531                                                               entry_jshort_arraycopy,
2532                                                               entry_jint_arraycopy,
2533                                                               entry_jlong_arraycopy);
2534 
2535     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2536                                                                entry_jbyte_arraycopy,
2537                                                                entry_jshort_arraycopy,
2538                                                                entry_jint_arraycopy,
2539                                                                entry_oop_arraycopy,
2540                                                                entry_jlong_arraycopy,
2541                                                                entry_checkcast_arraycopy);
2542 
2543     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2544     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2545     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2546     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2547     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2548     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2549   }
2550 
2551   void generate_math_stubs() { Unimplemented(); }
2552 
2553   // Arguments:
2554   //
2555   // Inputs:
2556   //   c_rarg0   - source byte array address
2557   //   c_rarg1   - destination byte array address
2558   //   c_rarg2   - K (key) in little endian int array
2559   //
2560   address generate_aescrypt_encryptBlock() {
2561     __ align(CodeEntryAlignment);
2562     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2563 
2564     const Register from        = c_rarg0;  // source array address
2565     const Register to          = c_rarg1;  // destination array address
2566     const Register key         = c_rarg2;  // key array address
2567     const Register keylen      = rscratch1;
2568 
2569     address start = __ pc();
2570     __ enter();
2571 
2572     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2573 
2574     __ aesenc_loadkeys(key, keylen);
2575     __ aesecb_encrypt(from, to, keylen);
2576 
2577     __ mov(r0, 0);
2578 
2579     __ leave();
2580     __ ret(lr);
2581 
2582     return start;
2583   }
2584 
2585   // Arguments:
2586   //
2587   // Inputs:
2588   //   c_rarg0   - source byte array address
2589   //   c_rarg1   - destination byte array address
2590   //   c_rarg2   - K (key) in little endian int array
2591   //
2592   address generate_aescrypt_decryptBlock() {
2593     assert(UseAES, "need AES cryptographic extension support");
2594     __ align(CodeEntryAlignment);
2595     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2596     Label L_doLast;
2597 
2598     const Register from        = c_rarg0;  // source array address
2599     const Register to          = c_rarg1;  // destination array address
2600     const Register key         = c_rarg2;  // key array address
2601     const Register keylen      = rscratch1;
2602 
2603     address start = __ pc();
2604     __ enter(); // required for proper stackwalking of RuntimeStub frame
2605 
2606     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2607 
2608     __ aesecb_decrypt(from, to, key, keylen);
2609 
2610     __ mov(r0, 0);
2611 
2612     __ leave();
2613     __ ret(lr);
2614 
2615     return start;
2616   }
2617 
2618   // Arguments:
2619   //
2620   // Inputs:
2621   //   c_rarg0   - source byte array address
2622   //   c_rarg1   - destination byte array address
2623   //   c_rarg2   - K (key) in little endian int array
2624   //   c_rarg3   - r vector byte array address
2625   //   c_rarg4   - input length
2626   //
2627   // Output:
2628   //   x0        - input length
2629   //
2630   address generate_cipherBlockChaining_encryptAESCrypt() {
2631     assert(UseAES, "need AES cryptographic extension support");
2632     __ align(CodeEntryAlignment);
2633     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2634 
2635     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2636 
2637     const Register from        = c_rarg0;  // source array address
2638     const Register to          = c_rarg1;  // destination array address
2639     const Register key         = c_rarg2;  // key array address
2640     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector
2641                                            // and left holding the last encrypted block on return
2642     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2643     const Register keylen      = rscratch1;
2644 
2645     address start = __ pc();
2646 
2647       __ enter();
2648 
2649       __ movw(rscratch2, len_reg);
2650 
2651       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2652 
2653       __ ld1(v0, __ T16B, rvec);
2654 
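           // Illustrative note (not emitted code): keylen is the key array length
           // in 32-bit words -- 44, 52 or 60 for AES-128/192/256.  The compare
           // below is branched on again at L_aes_loop; nothing in the loop changes
           // the condition flags, so 10, 12 or 14 rounds are selected without
           // re-testing keylen.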
2655       __ cmpw(keylen, 52);
2656       __ br(Assembler::CC, L_loadkeys_44);
2657       __ br(Assembler::EQ, L_loadkeys_52);
2658 
2659       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2660       __ rev32(v17, __ T16B, v17);
2661       __ rev32(v18, __ T16B, v18);
2662     __ BIND(L_loadkeys_52);
2663       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2664       __ rev32(v19, __ T16B, v19);
2665       __ rev32(v20, __ T16B, v20);
2666     __ BIND(L_loadkeys_44);
2667       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2668       __ rev32(v21, __ T16B, v21);
2669       __ rev32(v22, __ T16B, v22);
2670       __ rev32(v23, __ T16B, v23);
2671       __ rev32(v24, __ T16B, v24);
2672       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2673       __ rev32(v25, __ T16B, v25);
2674       __ rev32(v26, __ T16B, v26);
2675       __ rev32(v27, __ T16B, v27);
2676       __ rev32(v28, __ T16B, v28);
2677       __ ld1(v29, v30, v31, __ T16B, key);
2678       __ rev32(v29, __ T16B, v29);
2679       __ rev32(v30, __ T16B, v30);
2680       __ rev32(v31, __ T16B, v31);
2681 
2682     __ BIND(L_aes_loop);
2683       __ ld1(v1, __ T16B, __ post(from, 16));
2684       __ eor(v0, __ T16B, v0, v1);
2685 
2686       __ br(Assembler::CC, L_rounds_44);
2687       __ br(Assembler::EQ, L_rounds_52);
2688 
2689       __ aese(v0, v17); __ aesmc(v0, v0);
2690       __ aese(v0, v18); __ aesmc(v0, v0);
2691     __ BIND(L_rounds_52);
2692       __ aese(v0, v19); __ aesmc(v0, v0);
2693       __ aese(v0, v20); __ aesmc(v0, v0);
2694     __ BIND(L_rounds_44);
2695       __ aese(v0, v21); __ aesmc(v0, v0);
2696       __ aese(v0, v22); __ aesmc(v0, v0);
2697       __ aese(v0, v23); __ aesmc(v0, v0);
2698       __ aese(v0, v24); __ aesmc(v0, v0);
2699       __ aese(v0, v25); __ aesmc(v0, v0);
2700       __ aese(v0, v26); __ aesmc(v0, v0);
2701       __ aese(v0, v27); __ aesmc(v0, v0);
2702       __ aese(v0, v28); __ aesmc(v0, v0);
2703       __ aese(v0, v29); __ aesmc(v0, v0);
2704       __ aese(v0, v30);
2705       __ eor(v0, __ T16B, v0, v31);
2706 
2707       __ st1(v0, __ T16B, __ post(to, 16));
2708 
2709       __ subw(len_reg, len_reg, 16);
2710       __ cbnzw(len_reg, L_aes_loop);
2711 
2712       __ st1(v0, __ T16B, rvec);
2713 
2714       __ mov(r0, rscratch2);
2715 
2716       __ leave();
2717       __ ret(lr);
2718 
2719       return start;
2720   }
2721 
2722   // Arguments:
2723   //
2724   // Inputs:
2725   //   c_rarg0   - source byte array address
2726   //   c_rarg1   - destination byte array address
2727   //   c_rarg2   - K (key) in little endian int array
2728   //   c_rarg3   - r vector byte array address
2729   //   c_rarg4   - input length
2730   //
2731   // Output:
2732   //   r0        - input length
2733   //
2734   address generate_cipherBlockChaining_decryptAESCrypt() {
2735     assert(UseAES, "need AES cryptographic extension support");
2736     __ align(CodeEntryAlignment);
2737     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2738 
2739     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2740 
2741     const Register from        = c_rarg0;  // source array address
2742     const Register to          = c_rarg1;  // destination array address
2743     const Register key         = c_rarg2;  // key array address
2744     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector
2745                                            // and left holding the last ciphertext block on return
2746     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2747     const Register keylen      = rscratch1;
2748 
2749     address start = __ pc();
2750 
2751       __ enter();
2752 
2753       __ movw(rscratch2, len_reg);
2754 
2755       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2756 
2757       __ ld1(v2, __ T16B, rvec);
2758 
2759       __ ld1(v31, __ T16B, __ post(key, 16));
2760       __ rev32(v31, __ T16B, v31);
2761 
2762       __ cmpw(keylen, 52);
2763       __ br(Assembler::CC, L_loadkeys_44);
2764       __ br(Assembler::EQ, L_loadkeys_52);
2765 
2766       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2767       __ rev32(v17, __ T16B, v17);
2768       __ rev32(v18, __ T16B, v18);
2769     __ BIND(L_loadkeys_52);
2770       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2771       __ rev32(v19, __ T16B, v19);
2772       __ rev32(v20, __ T16B, v20);
2773     __ BIND(L_loadkeys_44);
2774       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2775       __ rev32(v21, __ T16B, v21);
2776       __ rev32(v22, __ T16B, v22);
2777       __ rev32(v23, __ T16B, v23);
2778       __ rev32(v24, __ T16B, v24);
2779       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2780       __ rev32(v25, __ T16B, v25);
2781       __ rev32(v26, __ T16B, v26);
2782       __ rev32(v27, __ T16B, v27);
2783       __ rev32(v28, __ T16B, v28);
2784       __ ld1(v29, v30, __ T16B, key);
2785       __ rev32(v29, __ T16B, v29);
2786       __ rev32(v30, __ T16B, v30);
2787 
2788     __ BIND(L_aes_loop);
2789       __ ld1(v0, __ T16B, __ post(from, 16));
2790       __ orr(v1, __ T16B, v0, v0);
2791 
2792       __ br(Assembler::CC, L_rounds_44);
2793       __ br(Assembler::EQ, L_rounds_52);
2794 
2795       __ aesd(v0, v17); __ aesimc(v0, v0);
2796       __ aesd(v0, v18); __ aesimc(v0, v0);
2797     __ BIND(L_rounds_52);
2798       __ aesd(v0, v19); __ aesimc(v0, v0);
2799       __ aesd(v0, v20); __ aesimc(v0, v0);
2800     __ BIND(L_rounds_44);
2801       __ aesd(v0, v21); __ aesimc(v0, v0);
2802       __ aesd(v0, v22); __ aesimc(v0, v0);
2803       __ aesd(v0, v23); __ aesimc(v0, v0);
2804       __ aesd(v0, v24); __ aesimc(v0, v0);
2805       __ aesd(v0, v25); __ aesimc(v0, v0);
2806       __ aesd(v0, v26); __ aesimc(v0, v0);
2807       __ aesd(v0, v27); __ aesimc(v0, v0);
2808       __ aesd(v0, v28); __ aesimc(v0, v0);
2809       __ aesd(v0, v29); __ aesimc(v0, v0);
2810       __ aesd(v0, v30);
2811       __ eor(v0, __ T16B, v0, v31);
2812       __ eor(v0, __ T16B, v0, v2);
2813 
2814       __ st1(v0, __ T16B, __ post(to, 16));
2815       __ orr(v2, __ T16B, v1, v1);
2816 
2817       __ subw(len_reg, len_reg, 16);
2818       __ cbnzw(len_reg, L_aes_loop);
2819 
2820       __ st1(v2, __ T16B, rvec);
2821 
2822       __ mov(r0, rscratch2);
2823 
2824       __ leave();
2825       __ ret(lr);
2826 
2827     return start;
2828   }
2829 
2830   // CTR AES crypt.
2831   // Arguments:
2832   //
2833   // Inputs:
2834   //   c_rarg0   - source byte array address
2835   //   c_rarg1   - destination byte array address
2836   //   c_rarg2   - K (key) in little endian int array
2837   //   c_rarg3   - counter vector byte array address
2838   //   c_rarg4   - input length
2839   //   c_rarg5   - saved encryptedCounter start
2840   //   c_rarg6   - saved used length
2841   //
2842   // Output:
2843   //   r0       - input length
2844   //
2845   address generate_counterMode_AESCrypt() {
2846     const Register in = c_rarg0;
2847     const Register out = c_rarg1;
2848     const Register key = c_rarg2;
2849     const Register counter = c_rarg3;
2850     const Register saved_len = c_rarg4, len = r10;
2851     const Register saved_encrypted_ctr = c_rarg5;
2852     const Register used_ptr = c_rarg6, used = r12;
2853 
2854     const Register offset = r7;
2855     const Register keylen = r11;
2856 
2857     const unsigned char block_size = 16;
2858     const int bulk_width = 4;
2859     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2860     // performance with larger data sizes, but it also means that the
2861     // fast path isn't used until you have at least 8 blocks, and up
2862     // to 127 bytes of data will be processed on the slow path. For
2863     // that reason, and also so as not to blow away too much icache, 4
2864     // blocks seems like a sensible compromise.
2865 
2866     // Algorithm:
2867     //
2868     //    if (len == 0) {
2869     //        goto DONE;
2870     //    }
2871     //    int result = len;
2872     //    do {
2873     //        if (used >= blockSize) {
2874     //            if (len >= bulk_width * blockSize) {
2875     //                CTR_large_block();
2876     //                if (len == 0)
2877     //                    goto DONE;
2878     //            }
2879     //            for (;;) {
2880     //                16ByteVector v0 = counter;
2881     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2882     //                used = 0;
2883     //                if (len < blockSize)
2884     //                    break;    /* goto NEXT */
2885     //                16ByteVector v1 = load16Bytes(in, offset);
2886     //                v1 = v1 ^ encryptedCounter;
2887     //                store16Bytes(v1, out, offset);
2888     //                used = blockSize;
2889     //                offset += blockSize;
2890     //                len -= blockSize;
2891     //                if (len == 0)
2892     //                    goto DONE;
2893     //            }
2894     //        }
2895     //      NEXT:
2896     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2897     //        len--;
2898     //    } while (len != 0);
2899     //  DONE:
2900     //    return result;
2901     //
2902     // CTR_large_block()
2903     //    Wide bulk encryption of whole blocks.
2904 
2905     __ align(CodeEntryAlignment);
2906     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2907     const address start = __ pc();
2908     __ enter();
2909 
2910     Label DONE, CTR_large_block, large_block_return;
2911     __ ldrw(used, Address(used_ptr));
2912     __ cbzw(saved_len, DONE);
2913 
2914     __ mov(len, saved_len);
2915     __ mov(offset, 0);
2916 
2917     // Compute #rounds for AES based on the length of the key array
2918     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2919 
2920     __ aesenc_loadkeys(key, keylen);
2921 
2922     {
2923       Label L_CTR_loop, NEXT;
2924 
2925       __ bind(L_CTR_loop);
2926 
2927       __ cmp(used, block_size);
2928       __ br(__ LO, NEXT);
2929 
2930       // Maybe we have a lot of data
2931       __ subsw(rscratch1, len, bulk_width * block_size);
2932       __ br(__ HS, CTR_large_block);
2933       __ BIND(large_block_return);
2934       __ cbzw(len, DONE);
2935 
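           // Counter handling: the 16-byte counter is big-endian. rev32
           // byte-swaps each 32-bit word into host order, addv with
           // { 0, 0, 0, 1 } bumps only lane 3 (the low-order word of the
           // big-endian counter, with no carry into the upper words), and
           // a second rev32 restores byte order before the counter is
           // stored back.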
2936       // Setup the counter
2937       __ movi(v4, __ T4S, 0);
2938       __ movi(v5, __ T4S, 1);
2939       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2940 
2941       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2942       __ rev32(v16, __ T16B, v0);
2943       __ addv(v16, __ T4S, v16, v4);
2944       __ rev32(v16, __ T16B, v16);
2945       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2946 
2947       {
2948         // We have fewer than bulk_width blocks of data left. Encrypt
2949         // them one by one until there is less than a full block
2950         // remaining, being careful to save both the encrypted counter
2951         // and the counter.
2952 
2953         Label inner_loop;
2954         __ bind(inner_loop);
2955         // Counter to encrypt is in v0
2956         __ aesecb_encrypt(noreg, noreg, keylen);
2957         __ st1(v0, __ T16B, saved_encrypted_ctr);
2958 
2959         // Do we have a remaining full block?
2960 
2961         __ mov(used, 0);
2962         __ cmp(len, block_size);
2963         __ br(__ LO, NEXT);
2964 
2965         // Yes, we have a full block
2966         __ ldrq(v1, Address(in, offset));
2967         __ eor(v1, __ T16B, v1, v0);
2968         __ strq(v1, Address(out, offset));
2969         __ mov(used, block_size);
2970         __ add(offset, offset, block_size);
2971 
2972         __ subw(len, len, block_size);
2973         __ cbzw(len, DONE);
2974 
2975         // Increment the counter, store it back
2976         __ orr(v0, __ T16B, v16, v16);
2977         __ rev32(v16, __ T16B, v16);
2978         __ addv(v16, __ T4S, v16, v4);
2979         __ rev32(v16, __ T16B, v16);
2980         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2981 
2982         __ b(inner_loop);
2983       }
2984 
2985       __ BIND(NEXT);
2986 
2987       // Encrypt a single byte, and loop.
2988       // We expect this to be a rare event.
2989       __ ldrb(rscratch1, Address(in, offset));
2990       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2991       __ eor(rscratch1, rscratch1, rscratch2);
2992       __ strb(rscratch1, Address(out, offset));
2993       __ add(offset, offset, 1);
2994       __ add(used, used, 1);
2995       __ subw(len, len, 1);
2996       __ cbnzw(len, L_CTR_loop);
2997     }
2998 
2999     __ bind(DONE);
3000     __ strw(used, Address(used_ptr));
3001     __ mov(r0, saved_len);
3002 
3003     __ leave(); // required for proper stackwalking of RuntimeStub frame
3004     __ ret(lr);
3005 
3006     // Bulk encryption
3007 
3008     __ BIND(CTR_large_block);
3009     assert(bulk_width == 4 || bulk_width == 8, "must be");
3010 
3011     if (bulk_width == 8) {
3012       __ sub(sp, sp, 4 * 16);
3013       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3014     }
3015     __ sub(sp, sp, 4 * 16);
3016     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3017     RegSet saved_regs = (RegSet::of(in, out, offset)
3018                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3019     __ push(saved_regs, sp);
3020     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3021     __ add(in, in, offset);
3022     __ add(out, out, offset);
3023 
3024     // Keys should already be loaded into the correct registers
3025 
3026     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3027     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3028 
3029     // AES/CTR loop
3030     {
3031       Label L_CTR_loop;
3032       __ BIND(L_CTR_loop);
3033 
3034       // Setup the counters
3035       __ movi(v8, __ T4S, 0);
3036       __ movi(v9, __ T4S, 1);
3037       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3038 
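           // Materialize bulk_width consecutive counter blocks in
           // v0..v(bulk_width-1): each iteration writes the current
           // counter (byte order restored from the word-swapped copy in
           // v16) into the next register, then bumps the low-order word
           // of v16 by one via v8 = { 0, 0, 0, 1 }.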
3039       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3040         __ rev32(f, __ T16B, v16);
3041         __ addv(v16, __ T4S, v16, v8);
3042       }
3043 
3044       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3045 
3046       // Encrypt the counters
3047       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3048 
3049       if (bulk_width == 8) {
3050         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3051       }
3052 
3053       // XOR the encrypted counters with the inputs
3054       for (int i = 0; i < bulk_width; i++) {
3055         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3056       }
3057 
3058       // Write the encrypted data
3059       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3060       if (bulk_width == 8) {
3061         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3062       }
3063 
3064       __ subw(len, len, 16 * bulk_width);
3065       __ cbnzw(len, L_CTR_loop);
3066     }
3067 
3068     // Save the counter back where it goes
3069     __ rev32(v16, __ T16B, v16);
3070     __ st1(v16, __ T16B, counter);
3071 
3072     __ pop(saved_regs, sp);
3073 
3074     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3075     if (bulk_width == 8) {
3076       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3077     }
3078 
3079     __ andr(rscratch1, len, -16 * bulk_width);
3080     __ sub(len, len, rscratch1);
3081     __ add(offset, offset, rscratch1);
3082     __ mov(used, 16);
3083     __ strw(used, Address(used_ptr));
3084     __ b(large_block_return);
3085 
3086     return start;
3087   }
3088 
3089   // Vector AES Galois Counter Mode implementation. Parameters:
3090   //
3091   // in = c_rarg0
3092   // len = c_rarg1
3093   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3094   // out = c_rarg3
3095   // key = c_rarg4
3096   // state = c_rarg5 - GHASH.state
3097   // subkeyHtbl = c_rarg6 - powers of H
3098   // counter = c_rarg7 - 16 bytes of CTR
3099   // return - number of processed bytes
3100   address generate_galoisCounterMode_AESCrypt() {
3101     address ghash_polynomial = __ pc();
3102     __ emit_int64(0x87);  // The low-order bits of the field
3103                           // polynomial (i.e. p = z^7+z^2+z+1)
3104                           // repeated in the low and high parts of a
3105                           // 128-bit vector
3106     __ emit_int64(0x87);
3107 
3108     __ align(CodeEntryAlignment);
3109     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3110     address start = __ pc();
3111     __ enter();
3112 
3113     const Register in = c_rarg0;
3114     const Register len = c_rarg1;
3115     const Register ct = c_rarg2;
3116     const Register out = c_rarg3;
3117     // counter (c_rarg7) is written back with the incremented value at the end
3118 
3119     const Register key = c_rarg4;
3120     const Register state = c_rarg5;
3121 
3122     const Register subkeyHtbl = c_rarg6;
3123 
3124     const Register counter = c_rarg7;
3125 
3126     const Register keylen = r10;
3127     // Save state before entering routine
3128     __ sub(sp, sp, 4 * 16);
3129     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3130     __ sub(sp, sp, 4 * 16);
3131     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3132 
3133     // __ andr(len, len, -512);
3134     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3135     __ str(len, __ pre(sp, -2 * wordSize));
3136 
3137     Label DONE;
3138     __ cbz(len, DONE);
3139 
3140     // Compute #rounds for AES based on the length of the key array
3141     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3142 
3143     __ aesenc_loadkeys(key, keylen);
3144     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3145     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3146 
3147     // AES/CTR loop
3148     {
3149       Label L_CTR_loop;
3150       __ BIND(L_CTR_loop);
3151 
3152       // Setup the counters
3153       __ movi(v8, __ T4S, 0);
3154       __ movi(v9, __ T4S, 1);
3155       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
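           // The loop bound v8 is v0 + 8, so the eight consecutive
           // counter blocks land in v0..v7; v8..v11 are then free to be
           // re-used for the input data loaded below. v16 holds the
           // word-swapped counter and v8 = { 0, 0, 0, 1 } bumps its
           // low-order word each time round.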
3156       for (FloatRegister f = v0; f < v8; f++) {
3157         __ rev32(f, __ T16B, v16);
3158         __ addv(v16, __ T4S, v16, v8);
3159       }
3160 
3161       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3162 
3163       // Encrypt the counters
3164       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3165 
3166       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3167 
3168       // XOR the encrypted counters with the inputs
3169       for (int i = 0; i < 8; i++) {
3170         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3171       }
3172       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3173       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3174 
3175       __ subw(len, len, 16 * 8);
3176       __ cbnzw(len, L_CTR_loop);
3177     }
3178 
3179     __ rev32(v16, __ T16B, v16);
3180     __ st1(v16, __ T16B, counter);
3181 
3182     __ ldr(len, Address(sp));
3183     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3184 
3185     // GHASH/CTR loop
3186     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3187                                 len, /*unrolls*/4);
3188 
3189 #ifdef ASSERT
3190     { Label L;
3191       __ cmp(len, (unsigned char)0);
3192       __ br(Assembler::EQ, L);
3193       __ stop("stubGenerator: abort");
3194       __ bind(L);
3195     }
3196 #endif
3197 
3198     __ bind(DONE);
3199     // Return the number of bytes processed
3200     __ ldr(r0, __ post(sp, 2 * wordSize));
3201 
3202     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3203     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3204 
3205     __ leave(); // required for proper stackwalking of RuntimeStub frame
3206     __ ret(lr);
3207     return start;
3208   }
3209 
3210   // Arguments:
3211   //
3212   // Inputs:
3213   //   c_rarg0   - byte[]  source+offset
3214   //   c_rarg1   - int[]   MD5.state
3215   //   c_rarg2   - int     offset
3216   //   c_rarg3   - int     limit
3217   //
3218   address generate_md5_implCompress(bool multi_block, const char *name) {
3219     __ align(CodeEntryAlignment);
3220     StubCodeMark mark(this, "StubRoutines", name);
3221     address start = __ pc();
3222 
3223     Register buf       = c_rarg0;
3224     Register state     = c_rarg1;
3225     Register ofs       = c_rarg2;
3226     Register limit     = c_rarg3;
3227     Register a         = r4;
3228     Register b         = r5;
3229     Register c         = r6;
3230     Register d         = r7;
3231     Register rscratch3 = r10;
3232     Register rscratch4 = r11;
3233 
3234     Label keys;
3235     Label md5_loop;
3236 
3237     __ BIND(md5_loop);
3238 
3239     // Save hash values for addition after rounds
3240     __ ldrw(a, Address(state,  0));
3241     __ ldrw(b, Address(state,  4));
3242     __ ldrw(c, Address(state,  8));
3243     __ ldrw(d, Address(state, 12));
3244 
3245 #define FF(r1, r2, r3, r4, k, s, t)              \
3246     __ eorw(rscratch3, r3, r4);                  \
3247     __ movw(rscratch2, t);                       \
3248     __ andw(rscratch3, rscratch3, r2);           \
3249     __ addw(rscratch4, r1, rscratch2);           \
3250     __ ldrw(rscratch1, Address(buf, k*4));       \
3251     __ eorw(rscratch3, rscratch3, r4);           \
3252     __ addw(rscratch3, rscratch3, rscratch1);    \
3253     __ addw(rscratch3, rscratch3, rscratch4);    \
3254     __ rorw(rscratch2, rscratch3, 32 - s);       \
3255     __ addw(r1, rscratch2, r2);
3256 
3257 #define GG(r1, r2, r3, r4, k, s, t)              \
3258     __ eorw(rscratch2, r2, r3);                  \
3259     __ ldrw(rscratch1, Address(buf, k*4));       \
3260     __ andw(rscratch3, rscratch2, r4);           \
3261     __ movw(rscratch2, t);                       \
3262     __ eorw(rscratch3, rscratch3, r3);           \
3263     __ addw(rscratch4, r1, rscratch2);           \
3264     __ addw(rscratch3, rscratch3, rscratch1);    \
3265     __ addw(rscratch3, rscratch3, rscratch4);    \
3266     __ rorw(rscratch2, rscratch3, 32 - s);       \
3267     __ addw(r1, rscratch2, r2);
3268 
3269 #define HH(r1, r2, r3, r4, k, s, t)              \
3270     __ eorw(rscratch3, r3, r4);                  \
3271     __ movw(rscratch2, t);                       \
3272     __ addw(rscratch4, r1, rscratch2);           \
3273     __ ldrw(rscratch1, Address(buf, k*4));       \
3274     __ eorw(rscratch3, rscratch3, r2);           \
3275     __ addw(rscratch3, rscratch3, rscratch1);    \
3276     __ addw(rscratch3, rscratch3, rscratch4);    \
3277     __ rorw(rscratch2, rscratch3, 32 - s);       \
3278     __ addw(r1, rscratch2, r2);
3279 
3280 #define II(r1, r2, r3, r4, k, s, t)              \
3281     __ movw(rscratch3, t);                       \
3282     __ ornw(rscratch2, r2, r4);                  \
3283     __ addw(rscratch4, r1, rscratch3);           \
3284     __ ldrw(rscratch1, Address(buf, k*4));       \
3285     __ eorw(rscratch3, rscratch2, r3);           \
3286     __ addw(rscratch3, rscratch3, rscratch1);    \
3287     __ addw(rscratch3, rscratch3, rscratch4);    \
3288     __ rorw(rscratch2, rscratch3, 32 - s);       \
3289     __ addw(r1, rscratch2, r2);
3290 
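         // The four macros above implement the MD5 round functions using
         // xor identities that avoid a NOT:
         //   FF: F(b,c,d) = (b & c) | (~b & d)  computed as d ^ (b & (c ^ d))
         //   GG: G(b,c,d) = (b & d) | (c & ~d)  computed as c ^ (d & (b ^ c))
         //   HH: H(b,c,d) = b ^ c ^ d
         //   II: I(b,c,d) = c ^ (b | ~d)        computed with ornw
         // Each step also adds the message word buf[k] and the constant t,
         // rotates left by s (rorw by 32 - s) and adds b.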
3291     // Round 1
3292     FF(a, b, c, d,  0,  7, 0xd76aa478)
3293     FF(d, a, b, c,  1, 12, 0xe8c7b756)
3294     FF(c, d, a, b,  2, 17, 0x242070db)
3295     FF(b, c, d, a,  3, 22, 0xc1bdceee)
3296     FF(a, b, c, d,  4,  7, 0xf57c0faf)
3297     FF(d, a, b, c,  5, 12, 0x4787c62a)
3298     FF(c, d, a, b,  6, 17, 0xa8304613)
3299     FF(b, c, d, a,  7, 22, 0xfd469501)
3300     FF(a, b, c, d,  8,  7, 0x698098d8)
3301     FF(d, a, b, c,  9, 12, 0x8b44f7af)
3302     FF(c, d, a, b, 10, 17, 0xffff5bb1)
3303     FF(b, c, d, a, 11, 22, 0x895cd7be)
3304     FF(a, b, c, d, 12,  7, 0x6b901122)
3305     FF(d, a, b, c, 13, 12, 0xfd987193)
3306     FF(c, d, a, b, 14, 17, 0xa679438e)
3307     FF(b, c, d, a, 15, 22, 0x49b40821)
3308 
3309     // Round 2
3310     GG(a, b, c, d,  1,  5, 0xf61e2562)
3311     GG(d, a, b, c,  6,  9, 0xc040b340)
3312     GG(c, d, a, b, 11, 14, 0x265e5a51)
3313     GG(b, c, d, a,  0, 20, 0xe9b6c7aa)
3314     GG(a, b, c, d,  5,  5, 0xd62f105d)
3315     GG(d, a, b, c, 10,  9, 0x02441453)
3316     GG(c, d, a, b, 15, 14, 0xd8a1e681)
3317     GG(b, c, d, a,  4, 20, 0xe7d3fbc8)
3318     GG(a, b, c, d,  9,  5, 0x21e1cde6)
3319     GG(d, a, b, c, 14,  9, 0xc33707d6)
3320     GG(c, d, a, b,  3, 14, 0xf4d50d87)
3321     GG(b, c, d, a,  8, 20, 0x455a14ed)
3322     GG(a, b, c, d, 13,  5, 0xa9e3e905)
3323     GG(d, a, b, c,  2,  9, 0xfcefa3f8)
3324     GG(c, d, a, b,  7, 14, 0x676f02d9)
3325     GG(b, c, d, a, 12, 20, 0x8d2a4c8a)
3326 
3327     // Round 3
3328     HH(a, b, c, d,  5,  4, 0xfffa3942)
3329     HH(d, a, b, c,  8, 11, 0x8771f681)
3330     HH(c, d, a, b, 11, 16, 0x6d9d6122)
3331     HH(b, c, d, a, 14, 23, 0xfde5380c)
3332     HH(a, b, c, d,  1,  4, 0xa4beea44)
3333     HH(d, a, b, c,  4, 11, 0x4bdecfa9)
3334     HH(c, d, a, b,  7, 16, 0xf6bb4b60)
3335     HH(b, c, d, a, 10, 23, 0xbebfbc70)
3336     HH(a, b, c, d, 13,  4, 0x289b7ec6)
3337     HH(d, a, b, c,  0, 11, 0xeaa127fa)
3338     HH(c, d, a, b,  3, 16, 0xd4ef3085)
3339     HH(b, c, d, a,  6, 23, 0x04881d05)
3340     HH(a, b, c, d,  9,  4, 0xd9d4d039)
3341     HH(d, a, b, c, 12, 11, 0xe6db99e5)
3342     HH(c, d, a, b, 15, 16, 0x1fa27cf8)
3343     HH(b, c, d, a,  2, 23, 0xc4ac5665)
3344 
3345     // Round 4
3346     II(a, b, c, d,  0,  6, 0xf4292244)
3347     II(d, a, b, c,  7, 10, 0x432aff97)
3348     II(c, d, a, b, 14, 15, 0xab9423a7)
3349     II(b, c, d, a,  5, 21, 0xfc93a039)
3350     II(a, b, c, d, 12,  6, 0x655b59c3)
3351     II(d, a, b, c,  3, 10, 0x8f0ccc92)
3352     II(c, d, a, b, 10, 15, 0xffeff47d)
3353     II(b, c, d, a,  1, 21, 0x85845dd1)
3354     II(a, b, c, d,  8,  6, 0x6fa87e4f)
3355     II(d, a, b, c, 15, 10, 0xfe2ce6e0)
3356     II(c, d, a, b,  6, 15, 0xa3014314)
3357     II(b, c, d, a, 13, 21, 0x4e0811a1)
3358     II(a, b, c, d,  4,  6, 0xf7537e82)
3359     II(d, a, b, c, 11, 10, 0xbd3af235)
3360     II(c, d, a, b,  2, 15, 0x2ad7d2bb)
3361     II(b, c, d, a,  9, 21, 0xeb86d391)
3362 
3363 #undef FF
3364 #undef GG
3365 #undef HH
3366 #undef II
3367 
3368     // write hash values back in the correct order
3369     __ ldrw(rscratch1, Address(state,  0));
3370     __ addw(rscratch1, rscratch1, a);
3371     __ strw(rscratch1, Address(state,  0));
3372 
3373     __ ldrw(rscratch2, Address(state,  4));
3374     __ addw(rscratch2, rscratch2, b);
3375     __ strw(rscratch2, Address(state,  4));
3376 
3377     __ ldrw(rscratch3, Address(state,  8));
3378     __ addw(rscratch3, rscratch3, c);
3379     __ strw(rscratch3, Address(state,  8));
3380 
3381     __ ldrw(rscratch4, Address(state, 12));
3382     __ addw(rscratch4, rscratch4, d);
3383     __ strw(rscratch4, Address(state, 12));
3384 
3385     if (multi_block) {
3386       __ add(buf, buf, 64);
3387       __ add(ofs, ofs, 64);
3388       __ cmp(ofs, limit);
3389       __ br(Assembler::LE, md5_loop);
3390       __ mov(c_rarg0, ofs); // return ofs
3391     }
3392 
3393     __ ret(lr);
3394 
3395     return start;
3396   }
3397 
3398   // Arguments:
3399   //
3400   // Inputs:
3401   //   c_rarg0   - byte[]  source+offset
3402   //   c_rarg1   - int[]   SHA.state
3403   //   c_rarg2   - int     offset
3404   //   c_rarg3   - int     limit
3405   //
3406   address generate_sha1_implCompress(bool multi_block, const char *name) {
3407     __ align(CodeEntryAlignment);
3408     StubCodeMark mark(this, "StubRoutines", name);
3409     address start = __ pc();
3410 
3411     Register buf   = c_rarg0;
3412     Register state = c_rarg1;
3413     Register ofs   = c_rarg2;
3414     Register limit = c_rarg3;
3415 
3416     Label keys;
3417     Label sha1_loop;
3418 
3419     // load the keys into v0..v3
3420     __ adr(rscratch1, keys);
3421     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3422     // load 5 words state into v6, v7
3423     __ ldrq(v6, Address(state, 0));
3424     __ ldrs(v7, Address(state, 16));
3425 
3426 
3427     __ BIND(sha1_loop);
3428     // load 64 bytes of data into v16..v19
3429     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3430     __ rev32(v16, __ T16B, v16);
3431     __ rev32(v17, __ T16B, v17);
3432     __ rev32(v18, __ T16B, v18);
3433     __ rev32(v19, __ T16B, v19);
3434 
3435     // do the sha1
3436     __ addv(v4, __ T4S, v16, v0);
3437     __ orr(v20, __ T16B, v6, v6);
3438 
3439     FloatRegister d0 = v16;
3440     FloatRegister d1 = v17;
3441     FloatRegister d2 = v18;
3442     FloatRegister d3 = v19;
3443 
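         // Each of the 20 iterations below covers four of SHA-1's 80
         // rounds. sha1su0/sha1su1 extend the message schedule, sha1h
         // produces the next rotated e value, and sha1c/sha1p/sha1m apply
         // the Ch, Parity and Maj functions for rounds 0-19, 20-39/60-79
         // and 40-59 respectively. The a..d words stay in v20 while e
         // alternates between v21 and v22; the w+k value for the
         // following iteration is pre-added into v4/v5.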
3444     for (int round = 0; round < 20; round++) {
3445       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3446       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3447       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3448       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3449       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3450 
3451       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3452       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3453       __ sha1h(tmp2, __ T4S, v20);
3454       if (round < 5)
3455         __ sha1c(v20, __ T4S, tmp3, tmp4);
3456       else if (round < 10 || round >= 15)
3457         __ sha1p(v20, __ T4S, tmp3, tmp4);
3458       else
3459         __ sha1m(v20, __ T4S, tmp3, tmp4);
3460       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3461 
3462       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3463     }
3464 
3465     __ addv(v7, __ T2S, v7, v21);
3466     __ addv(v6, __ T4S, v6, v20);
3467 
3468     if (multi_block) {
3469       __ add(ofs, ofs, 64);
3470       __ cmp(ofs, limit);
3471       __ br(Assembler::LE, sha1_loop);
3472       __ mov(c_rarg0, ofs); // return ofs
3473     }
3474 
3475     __ strq(v6, Address(state, 0));
3476     __ strs(v7, Address(state, 16));
3477 
3478     __ ret(lr);
3479 
3480     __ bind(keys);
3481     __ emit_int32(0x5a827999);
3482     __ emit_int32(0x6ed9eba1);
3483     __ emit_int32(0x8f1bbcdc);
3484     __ emit_int32(0xca62c1d6);
3485 
3486     return start;
3487   }
3488 
3489 
3490   // Arguments:
3491   //
3492   // Inputs:
3493   //   c_rarg0   - byte[]  source+offset
3494   //   c_rarg1   - int[]   SHA.state
3495   //   c_rarg2   - int     offset
3496   //   c_rarg3   - int     limit
3497   //
3498   address generate_sha256_implCompress(bool multi_block, const char *name) {
3499     static const uint32_t round_consts[64] = {
3500       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3501       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3502       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3503       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3504       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3505       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3506       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3507       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3508       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3509       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3510       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3511       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3512       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3513       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3514       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3515       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3516     };
3517     __ align(CodeEntryAlignment);
3518     StubCodeMark mark(this, "StubRoutines", name);
3519     address start = __ pc();
3520 
3521     Register buf   = c_rarg0;
3522     Register state = c_rarg1;
3523     Register ofs   = c_rarg2;
3524     Register limit = c_rarg3;
3525 
3526     Label sha1_loop;
3527 
3528     __ stpd(v8, v9, __ pre(sp, -32));
3529     __ stpd(v10, v11, Address(sp, 16));
3530 
3531 // dga == v0
3532 // dgb == v1
3533 // dg0 == v2
3534 // dg1 == v3
3535 // dg2 == v4
3536 // t0 == v6
3537 // t1 == v7
3538 
3539     // load the 64 round constants into v16..v31
3540     __ lea(rscratch1, ExternalAddress((address)round_consts));
3541     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3542     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3543     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3544     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3545 
3546     // load 8 words (256 bits) state
3547     __ ldpq(v0, v1, state);
3548 
3549     __ BIND(sha1_loop);
3550     // load 64 bytes of data into v8..v11
3551     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3552     __ rev32(v8, __ T16B, v8);
3553     __ rev32(v9, __ T16B, v9);
3554     __ rev32(v10, __ T16B, v10);
3555     __ rev32(v11, __ T16B, v11);
3556 
3557     __ addv(v6, __ T4S, v8, v16);
3558     __ orr(v2, __ T16B, v0, v0);
3559     __ orr(v3, __ T16B, v1, v1);
3560 
3561     FloatRegister d0 = v8;
3562     FloatRegister d1 = v9;
3563     FloatRegister d2 = v10;
3564     FloatRegister d3 = v11;
3565 
3566 
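         // Each of the 16 iterations below covers four of SHA-256's 64
         // rounds. sha256h updates the abcd half (v2) and sha256h2 the
         // efgh half (v3), with v4 keeping the copy of v2 that sha256h2
         // consumes; sha256su0/sha256su1 extend the message schedule for
         // the first 12 iterations, and the w+k value for the next
         // iteration is pre-added from the constants in v17..v31.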
3567     for (int round = 0; round < 16; round++) {
3568       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3569       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3570       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3571       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3572 
3573       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3574        __ orr(v4, __ T16B, v2, v2);
3575       if (round < 15)
3576         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3577       __ sha256h(v2, __ T4S, v3, tmp2);
3578       __ sha256h2(v3, __ T4S, v4, tmp2);
3579       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3580 
3581       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3582     }
3583 
3584     __ addv(v0, __ T4S, v0, v2);
3585     __ addv(v1, __ T4S, v1, v3);
3586 
3587     if (multi_block) {
3588       __ add(ofs, ofs, 64);
3589       __ cmp(ofs, limit);
3590       __ br(Assembler::LE, sha1_loop);
3591       __ mov(c_rarg0, ofs); // return ofs
3592     }
3593 
3594     __ ldpd(v10, v11, Address(sp, 16));
3595     __ ldpd(v8, v9, __ post(sp, 32));
3596 
3597     __ stpq(v0, v1, state);
3598 
3599     __ ret(lr);
3600 
3601     return start;
3602   }
3603 
3604   // Arguments:
3605   //
3606   // Inputs:
3607   //   c_rarg0   - byte[]  source+offset
3608   //   c_rarg1   - int[]   SHA.state
3609   //   c_rarg2   - int     offset
3610   //   c_rarg3   - int     limit
3611   //
3612   address generate_sha512_implCompress(bool multi_block, const char *name) {
3613     static const uint64_t round_consts[80] = {
3614       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3615       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3616       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3617       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3618       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3619       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3620       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3621       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3622       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3623       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3624       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3625       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3626       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3627       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3628       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3629       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3630       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3631       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3632       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3633       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3634       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3635       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3636       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3637       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3638       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3639       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3640       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3641     };
3642 
3643     // Double rounds for sha512.
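         // Each expansion performs two of the 80 rounds (40 double-rounds
         // in total): rc0 holds the pair of round constants for this
         // double-round and rc1 is pre-loaded with the next pair while
         // constants remain (dr < 36); sha512su0/sha512su1 extend the
         // message schedule while there is still input to schedule
         // (dr < 32); sha512h/sha512h2 apply the compression function to
         // the rotating state vectors i0..i4.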
3644     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3645       if (dr < 36)                                                                   \
3646         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3647       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3648       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3649       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3650       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3651       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3652       if (dr < 32) {                                                                 \
3653         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3654         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3655       }                                                                              \
3656       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3657       if (dr < 32)                                                                   \
3658         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3659       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3660       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3661 
3662     __ align(CodeEntryAlignment);
3663     StubCodeMark mark(this, "StubRoutines", name);
3664     address start = __ pc();
3665 
3666     Register buf   = c_rarg0;
3667     Register state = c_rarg1;
3668     Register ofs   = c_rarg2;
3669     Register limit = c_rarg3;
3670 
3671     __ stpd(v8, v9, __ pre(sp, -64));
3672     __ stpd(v10, v11, Address(sp, 16));
3673     __ stpd(v12, v13, Address(sp, 32));
3674     __ stpd(v14, v15, Address(sp, 48));
3675 
3676     Label sha512_loop;
3677 
3678     // load state
3679     __ ld1(v8, v9, v10, v11, __ T2D, state);
3680 
3681     // load the first four pairs of round constants into v20..v23
3682     __ lea(rscratch1, ExternalAddress((address)round_consts));
3683     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3684 
3685     __ BIND(sha512_loop);
3686     // load 128B of data into v12..v19
3687     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3688     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3689     __ rev64(v12, __ T16B, v12);
3690     __ rev64(v13, __ T16B, v13);
3691     __ rev64(v14, __ T16B, v14);
3692     __ rev64(v15, __ T16B, v15);
3693     __ rev64(v16, __ T16B, v16);
3694     __ rev64(v17, __ T16B, v17);
3695     __ rev64(v18, __ T16B, v18);
3696     __ rev64(v19, __ T16B, v19);
3697 
3698     __ mov(rscratch2, rscratch1);
3699 
3700     __ mov(v0, __ T16B, v8);
3701     __ mov(v1, __ T16B, v9);
3702     __ mov(v2, __ T16B, v10);
3703     __ mov(v3, __ T16B, v11);
3704 
3705     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3706     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3707     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3708     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3709     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3710     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3711     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3712     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3713     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3714     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3715     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3716     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3717     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3718     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3719     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3720     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3721     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3722     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3723     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3724     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3725     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3726     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3727     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3728     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3729     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3730     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3731     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3732     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3733     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3734     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3735     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3736     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3737     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3738     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3739     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3740     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3741     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3742     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3743     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3744     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3745 
3746     __ addv(v8, __ T2D, v8, v0);
3747     __ addv(v9, __ T2D, v9, v1);
3748     __ addv(v10, __ T2D, v10, v2);
3749     __ addv(v11, __ T2D, v11, v3);
3750 
3751     if (multi_block) {
3752       __ add(ofs, ofs, 128);
3753       __ cmp(ofs, limit);
3754       __ br(Assembler::LE, sha512_loop);
3755       __ mov(c_rarg0, ofs); // return ofs
3756     }
3757 
3758     __ st1(v8, v9, v10, v11, __ T2D, state);
3759 
3760     __ ldpd(v14, v15, Address(sp, 48));
3761     __ ldpd(v12, v13, Address(sp, 32));
3762     __ ldpd(v10, v11, Address(sp, 16));
3763     __ ldpd(v8, v9, __ post(sp, 64));
3764 
3765     __ ret(lr);
3766 
3767     return start;
3768   }
3769 
3770   // Arguments:
3771   //
3772   // Inputs:
3773   //   c_rarg0   - byte[]  source+offset
3774   //   c_rarg1   - byte[]   SHA.state
3775   //   c_rarg2   - int     digest_length
3776   //   c_rarg3   - int     offset
3777   //   c_rarg4   - int     limit
3778   //
3779   address generate_sha3_implCompress(bool multi_block, const char *name) {
3780     static const uint64_t round_consts[24] = {
3781       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3782       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3783       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3784       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3785       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3786       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3787       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3788       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3789     };
3790 
3791     __ align(CodeEntryAlignment);
3792     StubCodeMark mark(this, "StubRoutines", name);
3793     address start = __ pc();
3794 
3795     Register buf           = c_rarg0;
3796     Register state         = c_rarg1;
3797     Register digest_length = c_rarg2;
3798     Register ofs           = c_rarg3;
3799     Register limit         = c_rarg4;
3800 
3801     Label sha3_loop, rounds24_loop;
3802     Label sha3_512, sha3_384_or_224, sha3_256;
3803 
3804     __ stpd(v8, v9, __ pre(sp, -64));
3805     __ stpd(v10, v11, Address(sp, 16));
3806     __ stpd(v12, v13, Address(sp, 32));
3807     __ stpd(v14, v15, Address(sp, 48));
3808 
3809     // load state
3810     __ add(rscratch1, state, 32);
3811     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3812     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3813     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3814     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3815     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3816     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3817     __ ld1(v24, __ T1D, rscratch1);
3818 
3819     __ BIND(sha3_loop);
3820 
3821     // 24 keccak rounds
3822     __ movw(rscratch2, 24);
3823 
3824     // load round_constants base
3825     __ lea(rscratch1, ExternalAddress((address) round_consts));
3826 
3827     // load input
3828     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3829     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3830     __ eor(v0, __ T8B, v0, v25);
3831     __ eor(v1, __ T8B, v1, v26);
3832     __ eor(v2, __ T8B, v2, v27);
3833     __ eor(v3, __ T8B, v3, v28);
3834     __ eor(v4, __ T8B, v4, v29);
3835     __ eor(v5, __ T8B, v5, v30);
3836     __ eor(v6, __ T8B, v6, v31);
3837 
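         // The number of bytes absorbed per block is the Keccak rate,
         // 200 - 2 * digest_length: 72 for SHA3-512, 104 for SHA3-384,
         // 136 for SHA3-256 and 144 for SHA3-224. 56 bytes have been
         // absorbed above; the bit tests on digest_length (64, 48, 32 or
         // 28) below select how many more to absorb.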
3838     // digest_length == 64, SHA3-512
3839     __ tbnz(digest_length, 6, sha3_512);
3840 
3841     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3842     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3843     __ eor(v7, __ T8B, v7, v25);
3844     __ eor(v8, __ T8B, v8, v26);
3845     __ eor(v9, __ T8B, v9, v27);
3846     __ eor(v10, __ T8B, v10, v28);
3847     __ eor(v11, __ T8B, v11, v29);
3848     __ eor(v12, __ T8B, v12, v30);
3849 
3850     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3851     __ tbnz(digest_length, 4, sha3_384_or_224);
3852 
3853     // SHA3-256
3854     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3855     __ eor(v13, __ T8B, v13, v25);
3856     __ eor(v14, __ T8B, v14, v26);
3857     __ eor(v15, __ T8B, v15, v27);
3858     __ eor(v16, __ T8B, v16, v28);
3859     __ b(rounds24_loop);
3860 
3861     __ BIND(sha3_384_or_224);
3862     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3863 
3864     // SHA3-224
3865     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3866     __ ld1(v29, __ T8B, __ post(buf, 8));
3867     __ eor(v13, __ T8B, v13, v25);
3868     __ eor(v14, __ T8B, v14, v26);
3869     __ eor(v15, __ T8B, v15, v27);
3870     __ eor(v16, __ T8B, v16, v28);
3871     __ eor(v17, __ T8B, v17, v29);
3872     __ b(rounds24_loop);
3873 
3874     __ BIND(sha3_512);
3875     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3876     __ eor(v7, __ T8B, v7, v25);
3877     __ eor(v8, __ T8B, v8, v26);
3878 
3879     __ BIND(rounds24_loop);
3880     __ subw(rscratch2, rscratch2, 1);
3881 
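         // One pass of this loop is a full Keccak-f[1600] round: the eor3
         // instructions compute the five column parities and rax1 the D
         // values (theta); the xar sequence applies the theta xor combined
         // with the rho rotations and the pi lane permutation; the bcax
         // groups implement chi plane by plane; the final ld1r/eor pair is
         // iota, xoring the round constant into lane (0, 0).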
3882     __ eor3(v29, __ T16B, v4, v9, v14);
3883     __ eor3(v26, __ T16B, v1, v6, v11);
3884     __ eor3(v28, __ T16B, v3, v8, v13);
3885     __ eor3(v25, __ T16B, v0, v5, v10);
3886     __ eor3(v27, __ T16B, v2, v7, v12);
3887     __ eor3(v29, __ T16B, v29, v19, v24);
3888     __ eor3(v26, __ T16B, v26, v16, v21);
3889     __ eor3(v28, __ T16B, v28, v18, v23);
3890     __ eor3(v25, __ T16B, v25, v15, v20);
3891     __ eor3(v27, __ T16B, v27, v17, v22);
3892 
3893     __ rax1(v30, __ T2D, v29, v26);
3894     __ rax1(v26, __ T2D, v26, v28);
3895     __ rax1(v28, __ T2D, v28, v25);
3896     __ rax1(v25, __ T2D, v25, v27);
3897     __ rax1(v27, __ T2D, v27, v29);
3898 
3899     __ eor(v0, __ T16B, v0, v30);
3900     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3901     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3902     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3903     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3904     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3905     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3906     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3907     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3908     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3909     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3910     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3911     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3912     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3913     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3914     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3915     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3916     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3917     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3918     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3919     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3920     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3921     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3922     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3923     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3924 
3925     __ bcax(v20, __ T16B, v31, v22, v8);
3926     __ bcax(v21, __ T16B, v8,  v23, v22);
3927     __ bcax(v22, __ T16B, v22, v24, v23);
3928     __ bcax(v23, __ T16B, v23, v31, v24);
3929     __ bcax(v24, __ T16B, v24, v8,  v31);
3930 
3931     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3932 
3933     __ bcax(v17, __ T16B, v25, v19, v3);
3934     __ bcax(v18, __ T16B, v3,  v15, v19);
3935     __ bcax(v19, __ T16B, v19, v16, v15);
3936     __ bcax(v15, __ T16B, v15, v25, v16);
3937     __ bcax(v16, __ T16B, v16, v3,  v25);
3938 
3939     __ bcax(v10, __ T16B, v29, v12, v26);
3940     __ bcax(v11, __ T16B, v26, v13, v12);
3941     __ bcax(v12, __ T16B, v12, v14, v13);
3942     __ bcax(v13, __ T16B, v13, v29, v14);
3943     __ bcax(v14, __ T16B, v14, v26, v29);
3944 
3945     __ bcax(v7, __ T16B, v30, v9,  v4);
3946     __ bcax(v8, __ T16B, v4,  v5,  v9);
3947     __ bcax(v9, __ T16B, v9,  v6,  v5);
3948     __ bcax(v5, __ T16B, v5,  v30, v6);
3949     __ bcax(v6, __ T16B, v6,  v4,  v30);
3950 
3951     __ bcax(v3, __ T16B, v27, v0,  v28);
3952     __ bcax(v4, __ T16B, v28, v1,  v0);
3953     __ bcax(v0, __ T16B, v0,  v2,  v1);
3954     __ bcax(v1, __ T16B, v1,  v27, v2);
3955     __ bcax(v2, __ T16B, v2,  v28, v27);
3956 
3957     __ eor(v0, __ T16B, v0, v31);
3958 
3959     __ cbnzw(rscratch2, rounds24_loop);
3960 
3961     if (multi_block) {
3962       // block_size =  200 - 2 * digest_length, ofs += block_size
3963       __ add(ofs, ofs, 200);
3964       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3965 
3966       __ cmp(ofs, limit);
3967       __ br(Assembler::LE, sha3_loop);
3968       __ mov(c_rarg0, ofs); // return ofs
3969     }
3970 
3971     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3972     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3973     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3974     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3975     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3976     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3977     __ st1(v24, __ T1D, state);
3978 
3979     __ ldpd(v14, v15, Address(sp, 48));
3980     __ ldpd(v12, v13, Address(sp, 32));
3981     __ ldpd(v10, v11, Address(sp, 16));
3982     __ ldpd(v8, v9, __ post(sp, 64));
3983 
3984     __ ret(lr);
3985 
3986     return start;
3987   }
3988 
3989   /**
3990    *  Arguments:
3991    *
3992    * Inputs:
3993    *   c_rarg0   - int crc
3994    *   c_rarg1   - byte* buf
3995    *   c_rarg2   - int length
3996    *
3997    * Output:
3998    *       r0   - int crc result
3999    */
4000   address generate_updateBytesCRC32() {
4001     assert(UseCRC32Intrinsics, "what are we doing here?");
4002 
4003     __ align(CodeEntryAlignment);
4004     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4005 
4006     address start = __ pc();
4007 
4008     const Register crc   = c_rarg0;  // crc
4009     const Register buf   = c_rarg1;  // source java byte array address
4010     const Register len   = c_rarg2;  // length
4011     const Register table0 = c_rarg3; // crc_table address
4012     const Register table1 = c_rarg4;
4013     const Register table2 = c_rarg5;
4014     const Register table3 = c_rarg6;
4015     const Register tmp3 = c_rarg7;
4016 
4017     BLOCK_COMMENT("Entry:");
4018     __ enter(); // required for proper stackwalking of RuntimeStub frame
4019 
4020     __ kernel_crc32(crc, buf, len,
4021               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4022 
4023     __ leave(); // required for proper stackwalking of RuntimeStub frame
4024     __ ret(lr);
4025 
4026     return start;
4027   }
4028 
4029   /**
4030    *  Arguments:
4031    *
4032    * Inputs:
4033    *   c_rarg0   - int crc
4034    *   c_rarg1   - byte* buf
4035    *   c_rarg2   - int length
4036    *   c_rarg3   - int* table
4037    *
4038    * Output:
4039    *       r0   - int crc result
4040    */
4041   address generate_updateBytesCRC32C() {
4042     assert(UseCRC32CIntrinsics, "what are we doing here?");
4043 
4044     __ align(CodeEntryAlignment);
4045     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4046 
4047     address start = __ pc();
4048 
4049     const Register crc   = c_rarg0;  // crc
4050     const Register buf   = c_rarg1;  // source java byte array address
4051     const Register len   = c_rarg2;  // length
4052     const Register table0 = c_rarg3; // crc_table address
4053     const Register table1 = c_rarg4;
4054     const Register table2 = c_rarg5;
4055     const Register table3 = c_rarg6;
4056     const Register tmp3 = c_rarg7;
4057 
4058     BLOCK_COMMENT("Entry:");
4059     __ enter(); // required for proper stackwalking of RuntimeStub frame
4060 
4061     __ kernel_crc32c(crc, buf, len,
4062               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4063 
4064     __ leave(); // required for proper stackwalking of RuntimeStub frame
4065     __ ret(lr);
4066 
4067     return start;
4068   }
4069 
4070   /**
4071    *  Arguments:
4072    *
4073    *  Inputs:
4074    *   c_rarg0   - int   adler
4075    *   c_rarg1   - byte* buff
4076    *   c_rarg2   - int   len
4077    *
4078    * Output:
4079    *   c_rarg0   - int adler result
4080    */
4081   address generate_updateBytesAdler32() {
4082     __ align(CodeEntryAlignment);
4083     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4084     address start = __ pc();
4085 
4086     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4087 
4088     // Aliases
4089     Register adler  = c_rarg0;
4090     Register s1     = c_rarg0;
4091     Register s2     = c_rarg3;
4092     Register buff   = c_rarg1;
4093     Register len    = c_rarg2;
4094     Register nmax  = r4;
4095     Register base  = r5;
4096     Register count = r6;
4097     Register temp0 = rscratch1;
4098     Register temp1 = rscratch2;
4099     FloatRegister vbytes = v0;
4100     FloatRegister vs1acc = v1;
4101     FloatRegister vs2acc = v2;
4102     FloatRegister vtable = v3;
4103 
4104     // Max number of bytes we can process before having to take the mod
4105     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
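         // For n = 5552 the worst-case s2 is 255*5552*5553/2 + 5553*(BASE-1)
         // = 4,294,690,200, which still fits in 32 bits, while n = 5553
         // would overflow; 5552 is also a multiple of 16, matching the
         // 16-byte inner loop.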
4106     uint64_t BASE = 0xfff1;
4107     uint64_t NMAX = 0x15B0;
4108 
4109     __ mov(base, BASE);
4110     __ mov(nmax, NMAX);
4111 
4112     // Load accumulation coefficients for the upper 16 bits
4113     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4114     __ ld1(vtable, __ T16B, Address(temp0));
4115 
4116     // s1 is initialized to the lower 16 bits of adler
4117     // s2 is initialized to the upper 16 bits of adler
4118     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4119     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4120 
4121     // The pipelined loop needs at least 16 elements for 1 iteration
4122     // It does check this, but it is more effective to skip to the cleanup loop
4123     __ cmp(len, (u1)16);
4124     __ br(Assembler::HS, L_nmax);
4125     __ cbz(len, L_combine);
4126 
4127     __ bind(L_simple_by1_loop);
4128     __ ldrb(temp0, Address(__ post(buff, 1)));
4129     __ add(s1, s1, temp0);
4130     __ add(s2, s2, s1);
4131     __ subs(len, len, 1);
4132     __ br(Assembler::HI, L_simple_by1_loop);
4133 
4134     // s1 = s1 % BASE
4135     __ subs(temp0, s1, base);
4136     __ csel(s1, temp0, s1, Assembler::HS);
4137 
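         // The reductions below use 2^16 mod BASE == 15 (65536 - 65521):
         // with s = hi * 2^16 + lo, s mod BASE == (hi * 15 + lo) mod BASE,
         // and hi * 15 is formed as (hi << 4) - hi. Here one such step plus
         // a conditional subtract is enough; the larger accumulators in the
         // nmax and L_do_mod paths apply the step twice.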
4138     // s2 = s2 % BASE
4139     __ lsr(temp0, s2, 16);
4140     __ lsl(temp1, temp0, 4);
4141     __ sub(temp1, temp1, temp0);
4142     __ add(s2, temp1, s2, ext::uxth);
4143 
4144     __ subs(temp0, s2, base);
4145     __ csel(s2, temp0, s2, Assembler::HS);
4146 
4147     __ b(L_combine);
4148 
4149     __ bind(L_nmax);
4150     __ subs(len, len, nmax);
4151     __ sub(count, nmax, 16);
4152     __ br(Assembler::LO, L_by16);
4153 
4154     __ bind(L_nmax_loop);
4155 
4156     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4157                                       vbytes, vs1acc, vs2acc, vtable);
4158 
4159     __ subs(count, count, 16);
4160     __ br(Assembler::HS, L_nmax_loop);
4161 
4162     // s1 = s1 % BASE
4163     __ lsr(temp0, s1, 16);
4164     __ lsl(temp1, temp0, 4);
4165     __ sub(temp1, temp1, temp0);
4166     __ add(temp1, temp1, s1, ext::uxth);
4167 
4168     __ lsr(temp0, temp1, 16);
4169     __ lsl(s1, temp0, 4);
4170     __ sub(s1, s1, temp0);
4171     __ add(s1, s1, temp1, ext::uxth);
4172 
4173     __ subs(temp0, s1, base);
4174     __ csel(s1, temp0, s1, Assembler::HS);
4175 
4176     // s2 = s2 % BASE
4177     __ lsr(temp0, s2, 16);
4178     __ lsl(temp1, temp0, 4);
4179     __ sub(temp1, temp1, temp0);
4180     __ add(temp1, temp1, s2, ext::uxth);
4181 
4182     __ lsr(temp0, temp1, 16);
4183     __ lsl(s2, temp0, 4);
4184     __ sub(s2, s2, temp0);
4185     __ add(s2, s2, temp1, ext::uxth);
4186 
4187     __ subs(temp0, s2, base);
4188     __ csel(s2, temp0, s2, Assembler::HS);
4189 
4190     __ subs(len, len, nmax);
4191     __ sub(count, nmax, 16);
4192     __ br(Assembler::HS, L_nmax_loop);
4193 
4194     __ bind(L_by16);
4195     __ adds(len, len, count);
4196     __ br(Assembler::LO, L_by1);
4197 
4198     __ bind(L_by16_loop);
4199 
4200     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4201                                       vbytes, vs1acc, vs2acc, vtable);
4202 
4203     __ subs(len, len, 16);
4204     __ br(Assembler::HS, L_by16_loop);
4205 
4206     __ bind(L_by1);
4207     __ adds(len, len, 15);
4208     __ br(Assembler::LO, L_do_mod);
4209 
4210     __ bind(L_by1_loop);
4211     __ ldrb(temp0, Address(__ post(buff, 1)));
4212     __ add(s1, temp0, s1);
4213     __ add(s2, s2, s1);
4214     __ subs(len, len, 1);
4215     __ br(Assembler::HS, L_by1_loop);
4216 
4217     __ bind(L_do_mod);
4218     // s1 = s1 % BASE
4219     __ lsr(temp0, s1, 16);
4220     __ lsl(temp1, temp0, 4);
4221     __ sub(temp1, temp1, temp0);
4222     __ add(temp1, temp1, s1, ext::uxth);
4223 
4224     __ lsr(temp0, temp1, 16);
4225     __ lsl(s1, temp0, 4);
4226     __ sub(s1, s1, temp0);
4227     __ add(s1, s1, temp1, ext::uxth);
4228 
4229     __ subs(temp0, s1, base);
4230     __ csel(s1, temp0, s1, Assembler::HS);
4231 
4232     // s2 = s2 % BASE
4233     __ lsr(temp0, s2, 16);
4234     __ lsl(temp1, temp0, 4);
4235     __ sub(temp1, temp1, temp0);
4236     __ add(temp1, temp1, s2, ext::uxth);
4237 
4238     __ lsr(temp0, temp1, 16);
4239     __ lsl(s2, temp0, 4);
4240     __ sub(s2, s2, temp0);
4241     __ add(s2, s2, temp1, ext::uxth);
4242 
4243     __ subs(temp0, s2, base);
4244     __ csel(s2, temp0, s2, Assembler::HS);
4245 
4246     // Combine lower bits and higher bits
4247     __ bind(L_combine);
4248     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4249 
4250     __ ret(lr);
4251 
4252     return start;
4253   }
4254 
4255   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4256           Register temp0, Register temp1, FloatRegister vbytes,
4257           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4258     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4259     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4260     // In non-vectorized code, we update s1 and s2 as:
4261     //   s1 <- s1 + b1
4262     //   s2 <- s2 + s1
4263     //   s1 <- s1 + b2
4264     //   s2 <- s2 + s1
4265     //   ...
4266     //   s1 <- s1 + b16
4267     //   s2 <- s2 + s1
4268     // Putting above assignments together, we have:
4269     //   s1_new = s1 + b1 + b2 + ... + b16
4270     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4271     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4272     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4273     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4274 
4275     // s2 = s2 + s1 * 16
4276     __ add(s2, s2, s1, Assembler::LSL, 4);
4277 
4278     // vs1acc = b1 + b2 + b3 + ... + b16
4279     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4280     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4281     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4282     __ uaddlv(vs1acc, __ T16B, vbytes);
4283     __ uaddlv(vs2acc, __ T8H, vs2acc);
4284 
4285     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4286     __ fmovd(temp0, vs1acc);
4287     __ fmovd(temp1, vs2acc);
4288     __ add(s1, s1, temp0);
4289     __ add(s2, s2, temp1);
4290   }
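
  // Illustrative scalar sketch (added for documentation only; the stubs above do
  // not call it): the 16-byte Adler-32 update identity that the vectorized
  // accumulation in generate_updateBytesAdler32_accum relies on.
  static void adler32_accum_reference(uint64_t& s1, uint64_t& s2, const uint8_t* b) {
    uint64_t sum = 0;  // b1 + b2 + ... + b16            (uaddlv        -> vs1acc)
    uint64_t dot = 0;  // b1*16 + b2*15 + ... + b16*1    (umullv/umlalv -> vs2acc)
    for (int i = 0; i < 16; i++) {
      sum += b[i];
      dot += (uint64_t)(16 - i) * b[i];
    }
    s2 += 16 * s1 + dot;  // s2_new = s2 + s1 * 16 + (b1, ..., b16) dot (16, ..., 1)
    s1 += sum;            // s1_new = s1 + b1 + ... + b16
  }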
4291 
4292   /**
4293    *  Arguments:
4294    *
4295    *  Input:
4296    *    c_rarg0   - x address
4297    *    c_rarg1   - x length
4298    *    c_rarg2   - y address
4299    *    c_rarg3   - y length
4300    *    c_rarg4   - z address
4301    *    c_rarg5   - z length
4302    */
4303   address generate_multiplyToLen() {
4304     __ align(CodeEntryAlignment);
4305     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4306 
4307     address start = __ pc();
4308     const Register x     = r0;
4309     const Register xlen  = r1;
4310     const Register y     = r2;
4311     const Register ylen  = r3;
4312     const Register z     = r4;
4313     const Register zlen  = r5;
4314 
4315     const Register tmp1  = r10;
4316     const Register tmp2  = r11;
4317     const Register tmp3  = r12;
4318     const Register tmp4  = r13;
4319     const Register tmp5  = r14;
4320     const Register tmp6  = r15;
4321     const Register tmp7  = r16;
4322 
4323     BLOCK_COMMENT("Entry:");
4324     __ enter(); // required for proper stackwalking of RuntimeStub frame
4325     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4326     __ leave(); // required for proper stackwalking of RuntimeStub frame
4327     __ ret(lr);
4328 
4329     return start;
4330   }
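
  // Illustrative sketch (added for documentation only; not used by the stub): the
  // schoolbook product that multiply_to_len is expected to compute over big-endian
  // arrays of 32-bit limbs, mirroring the Java fallback BigInteger.multiplyToLen.
  static void multiply_to_len_reference(const uint32_t* x, int xlen,
                                        const uint32_t* y, int ylen,
                                        uint32_t* z /* xlen + ylen limbs */) {
    for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
    for (int i = xlen - 1; i >= 0; i--) {
      uint64_t carry = 0;
      for (int j = ylen - 1; j >= 0; j--) {
        uint64_t p = (uint64_t)x[i] * y[j] + z[i + j + 1] + carry;
        z[i + j + 1] = (uint32_t)p;
        carry = p >> 32;
      }
      z[i] = (uint32_t)carry;  // top limb of this row
    }
  }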
4331 
4332   address generate_squareToLen() {
4333     // The squareToLen algorithm for sizes 1..127, described in the Java code, runs
4334     // faster than multiply_to_len on some CPUs and slower on others, but
4335     // multiply_to_len shows slightly better results overall.
4336     __ align(CodeEntryAlignment);
4337     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4338     address start = __ pc();
4339 
4340     const Register x     = r0;
4341     const Register xlen  = r1;
4342     const Register z     = r2;
4343     const Register zlen  = r3;
4344     const Register y     = r4; // == x
4345     const Register ylen  = r5; // == xlen
4346 
4347     const Register tmp1  = r10;
4348     const Register tmp2  = r11;
4349     const Register tmp3  = r12;
4350     const Register tmp4  = r13;
4351     const Register tmp5  = r14;
4352     const Register tmp6  = r15;
4353     const Register tmp7  = r16;
4354 
4355     RegSet spilled_regs = RegSet::of(y, ylen);
4356     BLOCK_COMMENT("Entry:");
4357     __ enter();
4358     __ push(spilled_regs, sp);
4359     __ mov(y, x);
4360     __ mov(ylen, xlen);
4361     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4362     __ pop(spilled_regs, sp);
4363     __ leave();
4364     __ ret(lr);
4365     return start;
4366   }
4367 
4368   address generate_mulAdd() {
4369     __ align(CodeEntryAlignment);
4370     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4371 
4372     address start = __ pc();
4373 
4374     const Register out     = r0;
4375     const Register in      = r1;
4376     const Register offset  = r2;
4377     const Register len     = r3;
4378     const Register k       = r4;
4379 
4380     BLOCK_COMMENT("Entry:");
4381     __ enter();
4382     __ mul_add(out, in, offset, len, k);
4383     __ leave();
4384     __ ret(lr);
4385 
4386     return start;
4387   }
4388 
4389   // Arguments:
4390   //
4391   // Input:
4392   //   c_rarg0   - newArr address
4393   //   c_rarg1   - oldArr address
4394   //   c_rarg2   - newIdx
4395   //   c_rarg3   - shiftCount
4396   //   c_rarg4   - numIter
4397   //
4398   address generate_bigIntegerRightShift() {
4399     __ align(CodeEntryAlignment);
4400     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4401     address start = __ pc();
4402 
4403     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4404 
4405     Register newArr        = c_rarg0;
4406     Register oldArr        = c_rarg1;
4407     Register newIdx        = c_rarg2;
4408     Register shiftCount    = c_rarg3;
4409     Register numIter       = c_rarg4;
4410     Register idx           = numIter;
4411 
4412     Register newArrCur     = rscratch1;
4413     Register shiftRevCount = rscratch2;
4414     Register oldArrCur     = r13;
4415     Register oldArrNext    = r14;
4416 
4417     FloatRegister oldElem0        = v0;
4418     FloatRegister oldElem1        = v1;
4419     FloatRegister newElem         = v2;
4420     FloatRegister shiftVCount     = v3;
4421     FloatRegister shiftVRevCount  = v4;
4422 
4423     __ cbz(idx, Exit);
4424 
4425     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4426 
4427     // left shift count
4428     __ movw(shiftRevCount, 32);
4429     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4430 
4431     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4432     __ cmp(numIter, (u1)4);
4433     __ br(Assembler::LT, ShiftThree);
4434 
4435     __ dup(shiftVCount,    __ T4S, shiftCount);
4436     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4437     __ negr(shiftVCount,   __ T4S, shiftVCount);
4438 
4439     __ BIND(ShiftSIMDLoop);
4440 
4441     // Calculate the load addresses
4442     __ sub(idx, idx, 4);
4443     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4444     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4445     __ add(oldArrCur,  oldArrNext, 4);
4446 
4447     // Load 4 words and process
4448     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4449     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4450     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4451     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4452     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4453     __ st1(newElem,   __ T4S,  Address(newArrCur));
4454 
4455     __ cmp(idx, (u1)4);
4456     __ br(Assembler::LT, ShiftTwoLoop);
4457     __ b(ShiftSIMDLoop);
4458 
4459     __ BIND(ShiftTwoLoop);
4460     __ cbz(idx, Exit);
4461     __ cmp(idx, (u1)1);
4462     __ br(Assembler::EQ, ShiftOne);
4463 
4464     // Calculate the load addresses
4465     __ sub(idx, idx, 2);
4466     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4467     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4468     __ add(oldArrCur,  oldArrNext, 4);
4469 
4470     // Load 2 words and process
4471     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4472     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4473     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4474     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4475     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4476     __ st1(newElem,   __ T2S, Address(newArrCur));
4477     __ b(ShiftTwoLoop);
4478 
4479     __ BIND(ShiftThree);
4480     __ tbz(idx, 1, ShiftOne);
4481     __ tbz(idx, 0, ShiftTwo);
4482     __ ldrw(r10,  Address(oldArr, 12));
4483     __ ldrw(r11,  Address(oldArr, 8));
4484     __ lsrvw(r10, r10, shiftCount);
4485     __ lslvw(r11, r11, shiftRevCount);
4486     __ orrw(r12,  r10, r11);
4487     __ strw(r12,  Address(newArr, 8));
4488 
4489     __ BIND(ShiftTwo);
4490     __ ldrw(r10,  Address(oldArr, 8));
4491     __ ldrw(r11,  Address(oldArr, 4));
4492     __ lsrvw(r10, r10, shiftCount);
4493     __ lslvw(r11, r11, shiftRevCount);
4494     __ orrw(r12,  r10, r11);
4495     __ strw(r12,  Address(newArr, 4));
4496 
4497     __ BIND(ShiftOne);
4498     __ ldrw(r10,  Address(oldArr, 4));
4499     __ ldrw(r11,  Address(oldArr));
4500     __ lsrvw(r10, r10, shiftCount);
4501     __ lslvw(r11, r11, shiftRevCount);
4502     __ orrw(r12,  r10, r11);
4503     __ strw(r12,  Address(newArr));
4504 
4505     __ BIND(Exit);
4506     __ ret(lr);
4507 
4508     return start;
4509   }
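
  // Illustrative sketch (added for documentation only; not used by the stub): the
  // per-word operation that both the SIMD and scalar paths above implement,
  // assuming 1 <= shiftCount <= 31 (an assumption; shiftCount == 0 would need a
  // plain copy instead).
  static void big_integer_right_shift_reference(uint32_t* newArr, const uint32_t* oldArr,
                                                int newIdx, int shiftCount, int numIter) {
    for (int i = 0; i < numIter; i++) {
      // combine the low bits of the higher word with the high bits of the next word
      newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount)) | (oldArr[i + 1] >> shiftCount);
    }
  }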
4510 
4511   // Arguments:
4512   //
4513   // Input:
4514   //   c_rarg0   - newArr address
4515   //   c_rarg1   - oldArr address
4516   //   c_rarg2   - newIdx
4517   //   c_rarg3   - shiftCount
4518   //   c_rarg4   - numIter
4519   //
4520   address generate_bigIntegerLeftShift() {
4521     __ align(CodeEntryAlignment);
4522     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4523     address start = __ pc();
4524 
4525     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4526 
4527     Register newArr        = c_rarg0;
4528     Register oldArr        = c_rarg1;
4529     Register newIdx        = c_rarg2;
4530     Register shiftCount    = c_rarg3;
4531     Register numIter       = c_rarg4;
4532 
4533     Register shiftRevCount = rscratch1;
4534     Register oldArrNext    = rscratch2;
4535 
4536     FloatRegister oldElem0        = v0;
4537     FloatRegister oldElem1        = v1;
4538     FloatRegister newElem         = v2;
4539     FloatRegister shiftVCount     = v3;
4540     FloatRegister shiftVRevCount  = v4;
4541 
4542     __ cbz(numIter, Exit);
4543 
4544     __ add(oldArrNext, oldArr, 4);
4545     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4546 
4547     // right shift count
4548     __ movw(shiftRevCount, 32);
4549     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4550 
4551     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4552     __ cmp(numIter, (u1)4);
4553     __ br(Assembler::LT, ShiftThree);
4554 
4555     __ dup(shiftVCount,     __ T4S, shiftCount);
4556     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4557     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4558 
4559     __ BIND(ShiftSIMDLoop);
4560 
4561     // load 4 words and process
4562     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4563     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4564     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4565     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4566     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4567     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4568     __ sub(numIter,   numIter, 4);
4569 
4570     __ cmp(numIter, (u1)4);
4571     __ br(Assembler::LT, ShiftTwoLoop);
4572     __ b(ShiftSIMDLoop);
4573 
4574     __ BIND(ShiftTwoLoop);
4575     __ cbz(numIter, Exit);
4576     __ cmp(numIter, (u1)1);
4577     __ br(Assembler::EQ, ShiftOne);
4578 
4579     // load 2 words and process
4580     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4581     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4582     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4583     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4584     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4585     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4586     __ sub(numIter,   numIter, 2);
4587     __ b(ShiftTwoLoop);
4588 
4589     __ BIND(ShiftThree);
4590     __ ldrw(r10,  __ post(oldArr, 4));
4591     __ ldrw(r11,  __ post(oldArrNext, 4));
4592     __ lslvw(r10, r10, shiftCount);
4593     __ lsrvw(r11, r11, shiftRevCount);
4594     __ orrw(r12,  r10, r11);
4595     __ strw(r12,  __ post(newArr, 4));
4596     __ tbz(numIter, 1, Exit);
4597     __ tbz(numIter, 0, ShiftOne);
4598 
4599     __ BIND(ShiftTwo);
4600     __ ldrw(r10,  __ post(oldArr, 4));
4601     __ ldrw(r11,  __ post(oldArrNext, 4));
4602     __ lslvw(r10, r10, shiftCount);
4603     __ lsrvw(r11, r11, shiftRevCount);
4604     __ orrw(r12,  r10, r11);
4605     __ strw(r12,  __ post(newArr, 4));
4606 
4607     __ BIND(ShiftOne);
4608     __ ldrw(r10,  Address(oldArr));
4609     __ ldrw(r11,  Address(oldArrNext));
4610     __ lslvw(r10, r10, shiftCount);
4611     __ lsrvw(r11, r11, shiftRevCount);
4612     __ orrw(r12,  r10, r11);
4613     __ strw(r12,  Address(newArr));
4614 
4615     __ BIND(Exit);
4616     __ ret(lr);
4617 
4618     return start;
4619   }
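
  // Illustrative sketch (added for documentation only; not used by the stub): the
  // mirror image of the right-shift reference above, under the same
  // 1 <= shiftCount <= 31 assumption.
  static void big_integer_left_shift_reference(uint32_t* newArr, const uint32_t* oldArr,
                                               int newIdx, int shiftCount, int numIter) {
    for (int i = 0; i < numIter; i++) {
      newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
    }
  }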
4620 
4621   address generate_count_positives(address &count_positives_long) {
4622     const u1 large_loop_size = 64;
4623     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4624     int dcache_line = VM_Version::dcache_line_size();
4625 
4626     Register ary1 = r1, len = r2, result = r0;
4627 
4628     __ align(CodeEntryAlignment);
4629 
4630     StubCodeMark mark(this, "StubRoutines", "count_positives");
4631 
4632     address entry = __ pc();
4633 
4634     __ enter();
4635     // precondition: a copy of len is already in result
4636     // __ mov(result, len);
4637 
4638   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4639         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4640 
4641   __ cmp(len, (u1)15);
4642   __ br(Assembler::GT, LEN_OVER_15);
4643   // The only case when execution falls into this code is when the pointer is near
4644   // the end of a memory page and we have to avoid reading past it.
4645   __ add(ary1, ary1, len);
4646   __ subs(len, len, 8);
4647   __ br(Assembler::GT, LEN_OVER_8);
4648   __ ldr(rscratch2, Address(ary1, -8));
4649   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4650   __ lsrv(rscratch2, rscratch2, rscratch1);
4651   __ tst(rscratch2, UPPER_BIT_MASK);
4652   __ csel(result, zr, result, Assembler::NE);
4653   __ leave();
4654   __ ret(lr);
4655   __ bind(LEN_OVER_8);
4656   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4657   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
4658   __ tst(rscratch2, UPPER_BIT_MASK);
4659   __ br(Assembler::NE, RET_NO_POP);
4660   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4661   __ lsrv(rscratch1, rscratch1, rscratch2);
4662   __ tst(rscratch1, UPPER_BIT_MASK);
4663   __ bind(RET_NO_POP);
4664   __ csel(result, zr, result, Assembler::NE);
4665   __ leave();
4666   __ ret(lr);
4667 
4668   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4669   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4670 
4671   count_positives_long = __ pc(); // 2nd entry point
4672 
4673   __ enter();
4674 
4675   __ bind(LEN_OVER_15);
4676     __ push(spilled_regs, sp);
4677     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4678     __ cbz(rscratch2, ALIGNED);
4679     __ ldp(tmp6, tmp1, Address(ary1));
4680     __ mov(tmp5, 16);
4681     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4682     __ add(ary1, ary1, rscratch1);
4683     __ orr(tmp6, tmp6, tmp1);
4684     __ tst(tmp6, UPPER_BIT_MASK);
4685     __ br(Assembler::NE, RET_ADJUST);
4686     __ sub(len, len, rscratch1);
4687 
4688   __ bind(ALIGNED);
4689     __ cmp(len, large_loop_size);
4690     __ br(Assembler::LT, CHECK_16);
4691     // Perform a 16-byte load as an early return in the pre-loop to handle the case
4692     // where an initially aligned large array has negative values at its starting bytes,
4693     // so LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
4694     // slower. Cases with negative bytes further ahead won't be affected that
4695     // much. In fact, it'll be faster due to early loads, fewer instructions and
4696     // fewer branches in LARGE_LOOP.
4697     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4698     __ sub(len, len, 16);
4699     __ orr(tmp6, tmp6, tmp1);
4700     __ tst(tmp6, UPPER_BIT_MASK);
4701     __ br(Assembler::NE, RET_ADJUST_16);
4702     __ cmp(len, large_loop_size);
4703     __ br(Assembler::LT, CHECK_16);
4704 
4705     if (SoftwarePrefetchHintDistance >= 0
4706         && SoftwarePrefetchHintDistance >= dcache_line) {
4707       // initial prefetch
4708       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4709     }
4710   __ bind(LARGE_LOOP);
4711     if (SoftwarePrefetchHintDistance >= 0) {
4712       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4713     }
4714     // Issue load instructions first, since this can save a few CPU/MEM cycles. Also,
4715     // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp), it is
4716     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4717     // instructions per iteration and has fewer branches; however, this approach disables
4718     // early return, so all 64 bytes are loaded and checked every time.
4719     __ ldp(tmp2, tmp3, Address(ary1));
4720     __ ldp(tmp4, tmp5, Address(ary1, 16));
4721     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4722     __ ldp(tmp6, tmp1, Address(ary1, 48));
4723     __ add(ary1, ary1, large_loop_size);
4724     __ sub(len, len, large_loop_size);
4725     __ orr(tmp2, tmp2, tmp3);
4726     __ orr(tmp4, tmp4, tmp5);
4727     __ orr(rscratch1, rscratch1, rscratch2);
4728     __ orr(tmp6, tmp6, tmp1);
4729     __ orr(tmp2, tmp2, tmp4);
4730     __ orr(rscratch1, rscratch1, tmp6);
4731     __ orr(tmp2, tmp2, rscratch1);
4732     __ tst(tmp2, UPPER_BIT_MASK);
4733     __ br(Assembler::NE, RET_ADJUST_LONG);
4734     __ cmp(len, large_loop_size);
4735     __ br(Assembler::GE, LARGE_LOOP);
4736 
4737   __ bind(CHECK_16); // small 16-byte load pre-loop
4738     __ cmp(len, (u1)16);
4739     __ br(Assembler::LT, POST_LOOP16);
4740 
4741   __ bind(LOOP16); // small 16-byte load loop
4742     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4743     __ sub(len, len, 16);
4744     __ orr(tmp2, tmp2, tmp3);
4745     __ tst(tmp2, UPPER_BIT_MASK);
4746     __ br(Assembler::NE, RET_ADJUST_16);
4747     __ cmp(len, (u1)16);
4748     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4749 
4750   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4751     __ cmp(len, (u1)8);
4752     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4753     __ ldr(tmp3, Address(__ post(ary1, 8)));
4754     __ tst(tmp3, UPPER_BIT_MASK);
4755     __ br(Assembler::NE, RET_ADJUST);
4756     __ sub(len, len, 8);
4757 
4758   __ bind(POST_LOOP16_LOAD_TAIL);
4759     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4760     __ ldr(tmp1, Address(ary1));
4761     __ mov(tmp2, 64);
4762     __ sub(tmp4, tmp2, len, __ LSL, 3);
4763     __ lslv(tmp1, tmp1, tmp4);
4764     __ tst(tmp1, UPPER_BIT_MASK);
4765     __ br(Assembler::NE, RET_ADJUST);
4766     // Fallthrough
4767 
4768   __ bind(RET_LEN);
4769     __ pop(spilled_regs, sp);
4770     __ leave();
4771     __ ret(lr);
4772 
4773     // The difference (result - len) is the count of bytes that are guaranteed
4774     // to be positive.
4775 
4776   __ bind(RET_ADJUST_LONG);
4777     __ add(len, len, (u1)(large_loop_size - 16));
4778   __ bind(RET_ADJUST_16);
4779     __ add(len, len, 16);
4780   __ bind(RET_ADJUST);
4781     __ pop(spilled_regs, sp);
4782     __ leave();
4783     __ sub(result, result, len);
4784     __ ret(lr);
4785 
4786     return entry;
4787   }
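
  // Illustrative sketch (added for documentation only; not used by the stub): the
  // 8-bytes-at-a-time test behind UPPER_BIT_MASK above -- a byte is negative
  // exactly when its top bit is set, so one tst per 64-bit word is enough.
  static bool word_has_negative_byte(uint64_t word) {
    return (word & 0x8080808080808080ULL) != 0;
  }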
4788 
4789   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4790         bool usePrefetch, Label &NOT_EQUAL) {
4791     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4792         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4793         tmp7 = r12, tmp8 = r13;
4794     Label LOOP;
4795 
4796     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4797     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4798     __ bind(LOOP);
4799     if (usePrefetch) {
4800       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4801       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4802     }
4803     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4804     __ eor(tmp1, tmp1, tmp2);
4805     __ eor(tmp3, tmp3, tmp4);
4806     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4807     __ orr(tmp1, tmp1, tmp3);
4808     __ cbnz(tmp1, NOT_EQUAL);
4809     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4810     __ eor(tmp5, tmp5, tmp6);
4811     __ eor(tmp7, tmp7, tmp8);
4812     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4813     __ orr(tmp5, tmp5, tmp7);
4814     __ cbnz(tmp5, NOT_EQUAL);
4815     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4816     __ eor(tmp1, tmp1, tmp2);
4817     __ eor(tmp3, tmp3, tmp4);
4818     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4819     __ orr(tmp1, tmp1, tmp3);
4820     __ cbnz(tmp1, NOT_EQUAL);
4821     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4822     __ eor(tmp5, tmp5, tmp6);
4823     __ sub(cnt1, cnt1, 8 * wordSize);
4824     __ eor(tmp7, tmp7, tmp8);
4825     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4826     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4827     // cmp) because subs allows an unlimited range of immediate operand.
4828     __ subs(tmp6, cnt1, loopThreshold);
4829     __ orr(tmp5, tmp5, tmp7);
4830     __ cbnz(tmp5, NOT_EQUAL);
4831     __ br(__ GE, LOOP);
4832     // post-loop
4833     __ eor(tmp1, tmp1, tmp2);
4834     __ eor(tmp3, tmp3, tmp4);
4835     __ orr(tmp1, tmp1, tmp3);
4836     __ sub(cnt1, cnt1, 2 * wordSize);
4837     __ cbnz(tmp1, NOT_EQUAL);
4838   }
4839 
4840   void generate_large_array_equals_loop_simd(int loopThreshold,
4841         bool usePrefetch, Label &NOT_EQUAL) {
4842     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4843         tmp2 = rscratch2;
4844     Label LOOP;
4845 
4846     __ bind(LOOP);
4847     if (usePrefetch) {
4848       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4849       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4850     }
4851     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4852     __ sub(cnt1, cnt1, 8 * wordSize);
4853     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4854     __ subs(tmp1, cnt1, loopThreshold);
4855     __ eor(v0, __ T16B, v0, v4);
4856     __ eor(v1, __ T16B, v1, v5);
4857     __ eor(v2, __ T16B, v2, v6);
4858     __ eor(v3, __ T16B, v3, v7);
4859     __ orr(v0, __ T16B, v0, v1);
4860     __ orr(v1, __ T16B, v2, v3);
4861     __ orr(v0, __ T16B, v0, v1);
4862     __ umov(tmp1, v0, __ D, 0);
4863     __ umov(tmp2, v0, __ D, 1);
4864     __ orr(tmp1, tmp1, tmp2);
4865     __ cbnz(tmp1, NOT_EQUAL);
4866     __ br(__ GE, LOOP);
4867   }
4868 
4869   // a1 = r1 - array1 address
4870   // a2 = r2 - array2 address
4871   // result = r0 - return value. Already contains "false"
4872   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4873   // r3-r5 are reserved temporary registers
4874   address generate_large_array_equals() {
4875     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4876         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4877         tmp7 = r12, tmp8 = r13;
4878     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4879         SMALL_LOOP, POST_LOOP;
4880     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4881     // calculate if at least 32 prefetched bytes are used
4882     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4883     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4884     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4885     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4886         tmp5, tmp6, tmp7, tmp8);
4887 
4888     __ align(CodeEntryAlignment);
4889 
4890     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4891 
4892     address entry = __ pc();
4893     __ enter();
4894     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4895     // also advance pointers to use post-increment instead of pre-increment
4896     __ add(a1, a1, wordSize);
4897     __ add(a2, a2, wordSize);
4898     if (AvoidUnalignedAccesses) {
4899       // Both implementations (SIMD/non-SIMD) use relatively large load
4900       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
4901       // on some CPUs when the address is not at least 16-byte aligned.
4902       // Arrays are currently 8-byte aligned, so, if needed, we do an additional
4903       // 8-byte load to make at least the first address 16-byte aligned.
4904       Label ALIGNED16;
4905       __ tbz(a1, 3, ALIGNED16);
4906       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4907       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4908       __ sub(cnt1, cnt1, wordSize);
4909       __ eor(tmp1, tmp1, tmp2);
4910       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4911       __ bind(ALIGNED16);
4912     }
4913     if (UseSIMDForArrayEquals) {
4914       if (SoftwarePrefetchHintDistance >= 0) {
4915         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4916         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4917         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4918             /* prfm = */ true, NOT_EQUAL);
4919         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4920         __ br(__ LT, TAIL);
4921       }
4922       __ bind(NO_PREFETCH_LARGE_LOOP);
4923       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4924           /* prfm = */ false, NOT_EQUAL);
4925     } else {
4926       __ push(spilled_regs, sp);
4927       if (SoftwarePrefetchHintDistance >= 0) {
4928         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4929         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4930         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4931             /* prfm = */ true, NOT_EQUAL);
4932         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4933         __ br(__ LT, TAIL);
4934       }
4935       __ bind(NO_PREFETCH_LARGE_LOOP);
4936       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4937           /* prfm = */ false, NOT_EQUAL);
4938     }
4939     __ bind(TAIL);
4940       __ cbz(cnt1, EQUAL);
4941       __ subs(cnt1, cnt1, wordSize);
4942       __ br(__ LE, POST_LOOP);
4943     __ bind(SMALL_LOOP);
4944       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4945       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4946       __ subs(cnt1, cnt1, wordSize);
4947       __ eor(tmp1, tmp1, tmp2);
4948       __ cbnz(tmp1, NOT_EQUAL);
4949       __ br(__ GT, SMALL_LOOP);
4950     __ bind(POST_LOOP);
4951       __ ldr(tmp1, Address(a1, cnt1));
4952       __ ldr(tmp2, Address(a2, cnt1));
4953       __ eor(tmp1, tmp1, tmp2);
4954       __ cbnz(tmp1, NOT_EQUAL);
4955     __ bind(EQUAL);
4956       __ mov(result, true);
4957     __ bind(NOT_EQUAL);
4958       if (!UseSIMDForArrayEquals) {
4959         __ pop(spilled_regs, sp);
4960       }
4961     __ bind(NOT_EQUAL_NO_POP);
4962     __ leave();
4963     __ ret(lr);
4964     return entry;
4965   }
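
  // Illustrative sketch (added for documentation only; not used by the stub): the
  // word-wise idiom the unrolled loops above are built from -- XOR corresponding
  // words and OR the differences together, so one branch covers several pairs.
  static bool words_equal(const uint64_t* a, const uint64_t* b, int nwords) {
    uint64_t diff = 0;
    for (int i = 0; i < nwords; i++) {
      diff |= a[i] ^ b[i];  // non-zero iff any pair differs
    }
    return diff == 0;
  }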
4966 
4967   address generate_dsin_dcos(bool isCos) {
4968     __ align(CodeEntryAlignment);
4969     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4970     address start = __ pc();
4971     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4972         (address)StubRoutines::aarch64::_two_over_pi,
4973         (address)StubRoutines::aarch64::_pio2,
4974         (address)StubRoutines::aarch64::_dsin_coef,
4975         (address)StubRoutines::aarch64::_dcos_coef);
4976     return start;
4977   }
4978 
4979   address generate_dlog() {
4980     __ align(CodeEntryAlignment);
4981     StubCodeMark mark(this, "StubRoutines", "dlog");
4982     address entry = __ pc();
4983     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4984         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4985     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4986     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4987         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4988     return entry;
4989   }
4990 
4991 
4992   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4993   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4994       Label &DIFF2) {
4995     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4996     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4997 
4998     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4999     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5000     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5001     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5002 
5003     __ fmovd(tmpL, vtmp3);
5004     __ eor(rscratch2, tmp3, tmpL);
5005     __ cbnz(rscratch2, DIFF2);
5006 
5007     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5008     __ umov(tmpL, vtmp3, __ D, 1);
5009     __ eor(rscratch2, tmpU, tmpL);
5010     __ cbnz(rscratch2, DIFF1);
5011 
5012     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5013     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5014     __ fmovd(tmpL, vtmp);
5015     __ eor(rscratch2, tmp3, tmpL);
5016     __ cbnz(rscratch2, DIFF2);
5017 
5018     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5019     __ umov(tmpL, vtmp, __ D, 1);
5020     __ eor(rscratch2, tmpU, tmpL);
5021     __ cbnz(rscratch2, DIFF1);
5022   }
5023 
5024   // r0  = result
5025   // r1  = str1
5026   // r2  = cnt1
5027   // r3  = str2
5028   // r4  = cnt2
5029   // r10 = tmp1
5030   // r11 = tmp2
5031   address generate_compare_long_string_different_encoding(bool isLU) {
5032     __ align(CodeEntryAlignment);
5033     StubCodeMark mark(this, "StubRoutines", isLU
5034         ? "compare_long_string_different_encoding LU"
5035         : "compare_long_string_different_encoding UL");
5036     address entry = __ pc();
5037     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5038         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5039         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5040     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5041         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5042     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5043     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5044 
5045     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5046 
5047     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5048     // cnt2 == amount of characters left to compare
5049     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5050     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5051     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5052     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5053     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5054     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5055     __ eor(rscratch2, tmp1, tmp2);
5056     __ mov(rscratch1, tmp2);
5057     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5058     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5059              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5060     __ push(spilled_regs, sp);
5061     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5062     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5063 
5064     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5065 
5066     if (SoftwarePrefetchHintDistance >= 0) {
5067       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5068       __ br(__ LT, NO_PREFETCH);
5069       __ bind(LARGE_LOOP_PREFETCH);
5070         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5071         __ mov(tmp4, 2);
5072         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5073         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5074           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5075           __ subs(tmp4, tmp4, 1);
5076           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5077           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5078           __ mov(tmp4, 2);
5079         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5080           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5081           __ subs(tmp4, tmp4, 1);
5082           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5083           __ sub(cnt2, cnt2, 64);
5084           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5085           __ br(__ GE, LARGE_LOOP_PREFETCH);
5086     }
5087     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5088     __ bind(NO_PREFETCH);
5089     __ subs(cnt2, cnt2, 16);
5090     __ br(__ LT, TAIL);
5091     __ align(OptoLoopAlignment);
5092     __ bind(SMALL_LOOP); // smaller loop
5093       __ subs(cnt2, cnt2, 16);
5094       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5095       __ br(__ GE, SMALL_LOOP);
5096       __ cmn(cnt2, (u1)16);
5097       __ br(__ EQ, LOAD_LAST);
5098     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5099       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5100       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5101       __ ldr(tmp3, Address(cnt1, -8));
5102       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5103       __ b(LOAD_LAST);
5104     __ bind(DIFF2);
5105       __ mov(tmpU, tmp3);
5106     __ bind(DIFF1);
5107       __ pop(spilled_regs, sp);
5108       __ b(CALCULATE_DIFFERENCE);
5109     __ bind(LOAD_LAST);
5110       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5111       // No need to load it again
5112       __ mov(tmpU, tmp3);
5113       __ pop(spilled_regs, sp);
5114 
5115       // tmp2 points to the address of the last 4 Latin1 characters right now
5116       __ ldrs(vtmp, Address(tmp2));
5117       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5118       __ fmovd(tmpL, vtmp);
5119 
5120       __ eor(rscratch2, tmpU, tmpL);
5121       __ cbz(rscratch2, DONE);
5122 
5123     // Find the first different characters in the longwords and
5124     // compute their difference.
5125     __ bind(CALCULATE_DIFFERENCE);
5126       __ rev(rscratch2, rscratch2);
5127       __ clz(rscratch2, rscratch2);
5128       __ andr(rscratch2, rscratch2, -16);
5129       __ lsrv(tmp1, tmp1, rscratch2);
5130       __ uxthw(tmp1, tmp1);
5131       __ lsrv(rscratch1, rscratch1, rscratch2);
5132       __ uxthw(rscratch1, rscratch1);
5133       __ subw(result, tmp1, rscratch1);
5134     __ bind(DONE);
5135       __ ret(lr);
5136     return entry;
5137   }
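
  // Illustrative sketch (added for documentation only; not used by the stub):
  // CALCULATE_DIFFERENCE above uses rev + clz to find the bit offset of the
  // first differing 16-bit character in the XOR of two little-endian 8-byte
  // chunks; the portable loop below computes the same shift amount.
  static int first_differing_char_shift(uint64_t xor_of_chunks /* assumed non-zero */) {
    int bit = 0;
    while (((xor_of_chunks >> bit) & 1) == 0) bit++;  // lowest set bit
    return bit & ~15;  // round down to the start of the 16-bit char lane
  }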
5138 
5139   address generate_method_entry_barrier() {
5140     __ align(CodeEntryAlignment);
5141     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5142 
5143     Label deoptimize_label;
5144 
5145     address start = __ pc();
5146 
5147     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5148 
5149     __ enter();
5150     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5151 
5152     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5153 
5154     __ push_call_clobbered_registers();
5155 
5156     __ mov(c_rarg0, rscratch2);
5157     __ call_VM_leaf
5158          (CAST_FROM_FN_PTR
5159           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5160 
5161     __ reset_last_Java_frame(true);
5162 
5163     __ mov(rscratch1, r0);
5164 
5165     __ pop_call_clobbered_registers();
5166 
5167     __ cbnz(rscratch1, deoptimize_label);
5168 
5169     __ leave();
5170     __ ret(lr);
5171 
5172     __ BIND(deoptimize_label);
5173 
5174     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5175     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5176 
5177     __ mov(sp, rscratch1);
5178     __ br(rscratch2);
5179 
5180     return start;
5181   }
5182 
5183   // r0  = result
5184   // r1  = str1
5185   // r2  = cnt1
5186   // r3  = str2
5187   // r4  = cnt2
5188   // r10 = tmp1
5189   // r11 = tmp2
5190   address generate_compare_long_string_same_encoding(bool isLL) {
5191     __ align(CodeEntryAlignment);
5192     StubCodeMark mark(this, "StubRoutines", isLL
5193         ? "compare_long_string_same_encoding LL"
5194         : "compare_long_string_same_encoding UU");
5195     address entry = __ pc();
5196     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5197         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5198 
5199     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5200 
5201     // exit from the large loop when fewer than 64 bytes are left to read, or when
5202     // we're about to prefetch memory beyond the array border
5203     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5204 
5205     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
5206     __ eor(rscratch2, tmp1, tmp2);
5207     __ cbnz(rscratch2, CAL_DIFFERENCE);
5208 
5209     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5210     // update pointers, because of previous read
5211     __ add(str1, str1, wordSize);
5212     __ add(str2, str2, wordSize);
5213     if (SoftwarePrefetchHintDistance >= 0) {
5214       __ align(OptoLoopAlignment);
5215       __ bind(LARGE_LOOP_PREFETCH);
5216         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5217         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5218 
5219         for (int i = 0; i < 4; i++) {
5220           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5221           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5222           __ cmp(tmp1, tmp2);
5223           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5224           __ br(Assembler::NE, DIFF);
5225         }
5226         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5227         __ add(str1, str1, 64);
5228         __ add(str2, str2, 64);
5229         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5230         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5231         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5232     }
5233 
5234     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5235     __ br(Assembler::LE, LESS16);
5236     __ align(OptoLoopAlignment);
5237     __ bind(LOOP_COMPARE16);
5238       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5239       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5240       __ cmp(tmp1, tmp2);
5241       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5242       __ br(Assembler::NE, DIFF);
5243       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5244       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5245       __ br(Assembler::LT, LESS16);
5246 
5247       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5248       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5249       __ cmp(tmp1, tmp2);
5250       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5251       __ br(Assembler::NE, DIFF);
5252       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5253       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5254       __ br(Assembler::GE, LOOP_COMPARE16);
5255       __ cbz(cnt2, LENGTH_DIFF);
5256 
5257     __ bind(LESS16);
5258       // each 8 compare
5259       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5260       __ br(Assembler::LE, LESS8);
5261       __ ldr(tmp1, Address(__ post(str1, 8)));
5262       __ ldr(tmp2, Address(__ post(str2, 8)));
5263       __ eor(rscratch2, tmp1, tmp2);
5264       __ cbnz(rscratch2, CAL_DIFFERENCE);
5265       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5266 
5267     __ bind(LESS8); // directly load last 8 bytes
5268       if (!isLL) {
5269         __ add(cnt2, cnt2, cnt2);
5270       }
5271       __ ldr(tmp1, Address(str1, cnt2));
5272       __ ldr(tmp2, Address(str2, cnt2));
5273       __ eor(rscratch2, tmp1, tmp2);
5274       __ cbz(rscratch2, LENGTH_DIFF);
5275       __ b(CAL_DIFFERENCE);
5276 
5277     __ bind(DIFF);
5278       __ cmp(tmp1, tmp2);
5279       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5280       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5281       // reuse rscratch2 register for the result of eor instruction
5282       __ eor(rscratch2, tmp1, tmp2);
5283 
5284     __ bind(CAL_DIFFERENCE);
5285       __ rev(rscratch2, rscratch2);
5286       __ clz(rscratch2, rscratch2);
5287       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5288       __ lsrv(tmp1, tmp1, rscratch2);
5289       __ lsrv(tmp2, tmp2, rscratch2);
5290       if (isLL) {
5291         __ uxtbw(tmp1, tmp1);
5292         __ uxtbw(tmp2, tmp2);
5293       } else {
5294         __ uxthw(tmp1, tmp1);
5295         __ uxthw(tmp2, tmp2);
5296       }
5297       __ subw(result, tmp1, tmp2);
5298 
5299     __ bind(LENGTH_DIFF);
5300       __ ret(lr);
5301     return entry;
5302   }
5303 
5304   void generate_compare_long_strings() {
5305       StubRoutines::aarch64::_compare_long_string_LL
5306           = generate_compare_long_string_same_encoding(true);
5307       StubRoutines::aarch64::_compare_long_string_UU
5308           = generate_compare_long_string_same_encoding(false);
5309       StubRoutines::aarch64::_compare_long_string_LU
5310           = generate_compare_long_string_different_encoding(true);
5311       StubRoutines::aarch64::_compare_long_string_UL
5312           = generate_compare_long_string_different_encoding(false);
5313   }
5314 
5315   // R0 = result
5316   // R1 = str2
5317   // R2 = cnt1
5318   // R3 = str1
5319   // R4 = cnt2
5320   // This generic linear code uses a few additional ideas that make it faster:
5321   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5322   //    in order to skip initial loading (helps on systems with a single load pipeline)
5323   // 2) we can use a "fast" single-character search to find the first pattern symbol,
5324   //    with fewer branches (1 branch per loaded register instead of 1 per symbol);
5325   //    this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
5326   //    0x7fff7fff...7fff come from (a scalar sketch of this match-mask trick follows
5327   //    the stub body below)
5328   // 3) after loading and analyzing the 1st register of the source string, it can be
5329   //    used to search for every occurrence of the 1st character, saving a few loads
5330   //    compared with a simpler-but-slower implementation
5331   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5332   //    re-initializes and compresses register values, which makes it larger and a bit
5333   //    less readable; however, most extra operations overlap loads or branches, so the penalty is minimal
5334   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5335     const char* stubName = str1_isL
5336         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5337         : "indexof_linear_uu";
5338     __ align(CodeEntryAlignment);
5339     StubCodeMark mark(this, "StubRoutines", stubName);
5340     address entry = __ pc();
5341 
5342     int str1_chr_size = str1_isL ? 1 : 2;
5343     int str2_chr_size = str2_isL ? 1 : 2;
5344     int str1_chr_shift = str1_isL ? 0 : 1;
5345     int str2_chr_shift = str2_isL ? 0 : 1;
5346     bool isL = str1_isL && str2_isL;
5347     // parameters
5348     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5349     // temporary registers
5350     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5351     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5352     // redefinitions
5353     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5354 
5355     __ push(spilled_regs, sp);
5356     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5357         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5358         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5359         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5360         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5361         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5362     // Read whole register from str1. It is safe, because length >=8 here
5363     __ ldr(ch1, Address(str1));
5364     // Read whole register from str2. It is safe, because length >=8 here
5365     __ ldr(ch2, Address(str2));
5366     __ sub(cnt2, cnt2, cnt1);
5367     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5368     if (str1_isL != str2_isL) {
5369       __ eor(v0, __ T16B, v0, v0);
5370     }
5371     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5372     __ mul(first, first, tmp1);
5373     // check if we have less than 1 register to check
5374     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5375     if (str1_isL != str2_isL) {
5376       __ fmovd(v1, ch1);
5377     }
5378     __ br(__ LE, L_SMALL);
5379     __ eor(ch2, first, ch2);
5380     if (str1_isL != str2_isL) {
5381       __ zip1(v1, __ T16B, v1, v0);
5382     }
5383     __ sub(tmp2, ch2, tmp1);
5384     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5385     __ bics(tmp2, tmp2, ch2);
5386     if (str1_isL != str2_isL) {
5387       __ fmovd(ch1, v1);
5388     }
5389     __ br(__ NE, L_HAS_ZERO);
5390     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5391     __ add(result, result, wordSize/str2_chr_size);
5392     __ add(str2, str2, wordSize);
5393     __ br(__ LT, L_POST_LOOP);
5394     __ BIND(L_LOOP);
5395       __ ldr(ch2, Address(str2));
5396       __ eor(ch2, first, ch2);
5397       __ sub(tmp2, ch2, tmp1);
5398       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5399       __ bics(tmp2, tmp2, ch2);
5400       __ br(__ NE, L_HAS_ZERO);
5401     __ BIND(L_LOOP_PROCEED);
5402       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5403       __ add(str2, str2, wordSize);
5404       __ add(result, result, wordSize/str2_chr_size);
5405       __ br(__ GE, L_LOOP);
5406     __ BIND(L_POST_LOOP);
5407       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5408       __ br(__ LE, NOMATCH);
5409       __ ldr(ch2, Address(str2));
5410       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5411       __ eor(ch2, first, ch2);
5412       __ sub(tmp2, ch2, tmp1);
5413       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5414       __ mov(tmp4, -1); // all bits set
5415       __ b(L_SMALL_PROCEED);
5416     __ align(OptoLoopAlignment);
5417     __ BIND(L_SMALL);
5418       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5419       __ eor(ch2, first, ch2);
5420       if (str1_isL != str2_isL) {
5421         __ zip1(v1, __ T16B, v1, v0);
5422       }
5423       __ sub(tmp2, ch2, tmp1);
5424       __ mov(tmp4, -1); // all bits set
5425       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5426       if (str1_isL != str2_isL) {
5427         __ fmovd(ch1, v1); // move converted 4 symbols
5428       }
5429     __ BIND(L_SMALL_PROCEED);
5430       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5431       __ bic(tmp2, tmp2, ch2);
5432       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5433       __ rbit(tmp2, tmp2);
5434       __ br(__ EQ, NOMATCH);
5435     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5436       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5437       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5438       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5439       if (str2_isL) { // LL
5440         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5441         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5442         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5443         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5444         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5445       } else {
5446         __ mov(ch2, 0xE); // all bits in byte set except last one
5447         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5448         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5449         __ lslv(tmp2, tmp2, tmp4);
5450         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5451         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5452         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5453         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5454       }
5455       __ cmp(ch1, ch2);
5456       __ mov(tmp4, wordSize/str2_chr_size);
5457       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5458     __ BIND(L_SMALL_CMP_LOOP);
5459       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5460                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5461       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5462                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5463       __ add(tmp4, tmp4, 1);
5464       __ cmp(tmp4, cnt1);
5465       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5466       __ cmp(first, ch2);
5467       __ br(__ EQ, L_SMALL_CMP_LOOP);
5468     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5469       __ cbz(tmp2, NOMATCH); // no more matches. exit
5470       __ clz(tmp4, tmp2);
5471       __ add(result, result, 1); // advance index
5472       __ add(str2, str2, str2_chr_size); // advance pointer
5473       __ b(L_SMALL_HAS_ZERO_LOOP);
5474     __ align(OptoLoopAlignment);
5475     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5476       __ cmp(first, ch2);
5477       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5478       __ b(DONE);
5479     __ align(OptoLoopAlignment);
5480     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5481       if (str2_isL) { // LL
5482         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5483         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5484         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5485         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5486         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5487       } else {
5488         __ mov(ch2, 0xE); // all bits in byte set except last one
5489         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5490         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5491         __ lslv(tmp2, tmp2, tmp4);
5492         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5493         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5494         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5495         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5496       }
5497       __ cmp(ch1, ch2);
5498       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5499       __ b(DONE);
5500     __ align(OptoLoopAlignment);
5501     __ BIND(L_HAS_ZERO);
5502       __ rbit(tmp2, tmp2);
5503       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5504       // Now, compress the counters (cnt2 and cnt1) into one register.
5505       // This is fine because both counters are 32-bit and are not changed in this
5506       // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
5507       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5508       __ sub(result, result, 1);
5509     __ BIND(L_HAS_ZERO_LOOP);
5510       __ mov(cnt1, wordSize/str2_chr_size);
5511       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5512       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5513       if (str2_isL) {
5514         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5515         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5516         __ lslv(tmp2, tmp2, tmp4);
5517         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5518         __ add(tmp4, tmp4, 1);
5519         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5520         __ lsl(tmp2, tmp2, 1);
5521         __ mov(tmp4, wordSize/str2_chr_size);
5522       } else {
5523         __ mov(ch2, 0xE);
5524         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5525         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5526         __ lslv(tmp2, tmp2, tmp4);
5527         __ add(tmp4, tmp4, 1);
5528         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5529         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5530         __ lsl(tmp2, tmp2, 1);
5531         __ mov(tmp4, wordSize/str2_chr_size);
5532         __ sub(str2, str2, str2_chr_size);
5533       }
5534       __ cmp(ch1, ch2);
5535       __ mov(tmp4, wordSize/str2_chr_size);
5536       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5537     __ BIND(L_CMP_LOOP);
5538       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5539                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5540       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5541                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5542       __ add(tmp4, tmp4, 1);
5543       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5544       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5545       __ cmp(cnt1, ch2);
5546       __ br(__ EQ, L_CMP_LOOP);
5547     __ BIND(L_CMP_LOOP_NOMATCH);
5548       // here we're not matched
5549       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5550       __ clz(tmp4, tmp2);
5551       __ add(str2, str2, str2_chr_size); // advance pointer
5552       __ b(L_HAS_ZERO_LOOP);
5553     __ align(OptoLoopAlignment);
5554     __ BIND(L_CMP_LOOP_LAST_CMP);
5555       __ cmp(cnt1, ch2);
5556       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5557       __ b(DONE);
5558     __ align(OptoLoopAlignment);
5559     __ BIND(L_CMP_LOOP_LAST_CMP2);
5560       if (str2_isL) {
5561         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5562         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5563         __ lslv(tmp2, tmp2, tmp4);
5564         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5565         __ add(tmp4, tmp4, 1);
5566         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5567         __ lsl(tmp2, tmp2, 1);
5568       } else {
5569         __ mov(ch2, 0xE);
5570         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5571         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5572         __ lslv(tmp2, tmp2, tmp4);
5573         __ add(tmp4, tmp4, 1);
5574         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5575         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5576         __ lsl(tmp2, tmp2, 1);
5577         __ sub(str2, str2, str2_chr_size);
5578       }
5579       __ cmp(ch1, ch2);
5580       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5581       __ b(DONE);
5582     __ align(OptoLoopAlignment);
5583     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5584       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
5585       // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5586       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5587       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
5588       // result by the number of analyzed characters, so we can just reset the lower
5589       // bits in result here. Clear the 2 lower bits for UU/UL and 3 bits for LL.
5590       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5591       // 3) Advance str2 to represent the next str2 octet. result & 7/3 is the
5592       // index of the last analyzed substring inside the current octet, so str2 is at
5593       // the respective start address. We need to advance it to the next octet.
5594       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5595       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5596       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5597       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5598       __ movw(cnt2, cnt2);
5599       __ b(L_LOOP_PROCEED);
5600     __ align(OptoLoopAlignment);
5601     __ BIND(NOMATCH);
5602       __ mov(result, -1);
5603     __ BIND(DONE);
5604       __ pop(spilled_regs, sp);
5605       __ ret(lr);
5606     return entry;
5607   }
5608 
5609   void generate_string_indexof_stubs() {
5610     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5611     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5612     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5613   }
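  // The _ll/_uu/_ul suffixes reflect the (str1_isL, str2_isL) flags passed above,
  // i.e. whether each of the two strings is Latin-1 (one byte per element) or
  // UTF-16 (two bytes per element). In all three variants the contract is,
  // roughly (a sketch only, ignoring the register-level calling convention):
  //
  //   int indexof_linear(pattern, pattern_cnt, source, source_cnt) {
  //     for (int i = 0; i + pattern_cnt <= source_cnt; i++)
  //       if (source[i .. i + pattern_cnt) equals pattern) return i;
  //     return -1;                       // the NOMATCH exit above
  //   }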
5614 
5615   void inflate_and_store_2_fp_registers(bool generatePrfm,
5616       FloatRegister src1, FloatRegister src2) {
5617     Register dst = r1;
5618     __ zip1(v1, __ T16B, src1, v0);
5619     __ zip2(v2, __ T16B, src1, v0);
5620     if (generatePrfm) {
5621       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5622     }
5623     __ zip1(v3, __ T16B, src2, v0);
5624     __ zip2(v4, __ T16B, src2, v0);
5625     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5626   }
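  // A rough scalar sketch of one call above (assuming v0 holds zero, as set up
  // by the caller): interleaving each source byte with a zero byte via zip1/zip2
  // zero-extends it to a 16-bit char, so the call effectively performs
  //
  //   for (int i = 0; i < 32; i++)       // 2 x 16 source bytes per call
  //     dst[i] = (jchar)(src[i] & 0xff);
  //
  // followed by a single 64-byte post-incremented store through r1.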
5627 
5628   // R0 = src
5629   // R1 = dst
5630   // R2 = len
5631   // R3 = len >> 3
5632   // V0 = 0
5633   // v1 = loaded 8 bytes
5634   address generate_large_byte_array_inflate() {
5635     __ align(CodeEntryAlignment);
5636     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5637     address entry = __ pc();
5638     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5639     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5640     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5641 
5642     // Do one more 8-byte read so the address is 16-byte aligned in most cases;
5643     // this also lets us use a single store instruction below.
5644     __ ldrd(v2, __ post(src, 8));
5645     __ sub(octetCounter, octetCounter, 2);
5646     __ zip1(v1, __ T16B, v1, v0);
5647     __ zip1(v2, __ T16B, v2, v0);
5648     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5649     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5650     __ subs(rscratch1, octetCounter, large_loop_threshold);
5651     __ br(__ LE, LOOP_START);
5652     __ b(LOOP_PRFM_START);
5653     __ bind(LOOP_PRFM);
5654       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5655     __ bind(LOOP_PRFM_START);
5656       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5657       __ sub(octetCounter, octetCounter, 8);
5658       __ subs(rscratch1, octetCounter, large_loop_threshold);
5659       inflate_and_store_2_fp_registers(true, v3, v4);
5660       inflate_and_store_2_fp_registers(true, v5, v6);
5661       __ br(__ GT, LOOP_PRFM);
5662       __ cmp(octetCounter, (u1)8);
5663       __ br(__ LT, DONE);
5664     __ bind(LOOP);
5665       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5666       __ bind(LOOP_START);
5667       __ sub(octetCounter, octetCounter, 8);
5668       __ cmp(octetCounter, (u1)8);
5669       inflate_and_store_2_fp_registers(false, v3, v4);
5670       inflate_and_store_2_fp_registers(false, v5, v6);
5671       __ br(__ GE, LOOP);
5672     __ bind(DONE);
5673       __ ret(lr);
5674     return entry;
5675   }
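  // In C, the stub above is approximately (a sketch; the register usage is
  // documented in the comment just before the stub):
  //
  //   void large_byte_array_inflate(u1 *src, jchar *dst, size_t len) {
  //     for (size_t i = 0; i < len; i++)
  //       dst[i] = src[i];               // zero-extend each byte to a char
  //   }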
5676 
5677   /**
5678    *  Arguments:
5679    *
5680    *  Input:
5681    *  c_rarg0   - current state address
5682    *  c_rarg1   - H key address
5683    *  c_rarg2   - data address
5684    *  c_rarg3   - number of blocks
5685    *
5686    *  Output:
5687    *  Updated state at c_rarg0
5688    */
5689   address generate_ghash_processBlocks() {
5690     // Bafflingly, GCM uses little-endian for the byte order, but
5691     // big-endian for the bit order.  For example, the polynomial 1 is
5692     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5693     //
5694     // So, we must either reverse the bytes in each word and do
5695     // everything big-endian or reverse the bits in each byte and do
5696     // it little-endian.  On AArch64 it's more idiomatic to reverse
5697     // the bits in each byte (we have an instruction, RBIT, to do
5698     // that) and keep the data in little-endian bit order through the
5699     // calculation, bit-reversing the inputs and outputs.
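    // As a small illustration (a sketch, not generated code), RBIT applied to
    // one byte is equivalent to
    //
    //   uint8_t rbit8(uint8_t b) {
    //     uint8_t r = 0;
    //     for (int i = 0; i < 8; i++)
    //       r |= ((b >> i) & 1) << (7 - i);
    //     return r;
    //   }
    //
    // so the GCM encoding of the polynomial 1 (0x80 in the leading byte) becomes
    // the ordinary integer 1 once the bits are reversed.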
5700 
5701     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5702     __ align(wordSize * 2);
5703     address p = __ pc();
5704     __ emit_int64(0x87);  // The low-order bits of the field
5705                           // polynomial (i.e. p = z^7+z^2+z+1)
5706                           // repeated in the low and high parts of a
5707                           // 128-bit vector
5708     __ emit_int64(0x87);
5709 
5710     __ align(CodeEntryAlignment);
5711     address start = __ pc();
5712 
5713     Register state   = c_rarg0;
5714     Register subkeyH = c_rarg1;
5715     Register data    = c_rarg2;
5716     Register blocks  = c_rarg3;
5717 
5718     FloatRegister vzr = v30;
5719     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5720 
5721     __ ldrq(v24, p);    // The field polynomial
5722 
5723     __ ldrq(v0, Address(state));
5724     __ ldrq(v1, Address(subkeyH));
5725 
5726     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5727     __ rbit(v0, __ T16B, v0);
5728     __ rev64(v1, __ T16B, v1);
5729     __ rbit(v1, __ T16B, v1);
5730 
5731     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5732     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5733 
5734     {
5735       Label L_ghash_loop;
5736       __ bind(L_ghash_loop);
5737 
5738       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5739                                                  // reversing each byte
5740       __ rbit(v2, __ T16B, v2);
5741       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5742 
5743       // Multiply state in v2 by subkey in v1
5744       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5745                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5746                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5747       // Reduce v7:v5 by the field polynomial
5748       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5749 
5750       __ sub(blocks, blocks, 1);
5751       __ cbnz(blocks, L_ghash_loop);
5752     }
5753 
5754     // The bit-reversed result is at this point in v0
5755     __ rev64(v0, __ T16B, v0);
5756     __ rbit(v0, __ T16B, v0);
5757 
5758     __ st1(v0, __ T16B, state);
5759     __ ret(lr);
5760 
5761     return start;
5762   }
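  // In C, the loop above is approximately (a sketch; gf128_mul stands for the
  // carry-less multiplication and reduction done by ghash_multiply/ghash_reduce):
  //
  //   for (int i = 0; i < blocks; i++)
  //     state = gf128_mul(state ^ data[i], subkeyH);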
5763 
5764   address generate_ghash_processBlocks_wide() {
5765     address small = generate_ghash_processBlocks();
5766 
5767     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5768     __ align(wordSize * 2);
5769     address p = __ pc();
5770     __ emit_int64(0x87);  // The low-order bits of the field
5771                           // polynomial (i.e. p = z^7+z^2+z+1)
5772                           // repeated in the low and high parts of a
5773                           // 128-bit vector
5774     __ emit_int64(0x87);
5775 
5776     __ align(CodeEntryAlignment);
5777     address start = __ pc();
5778 
5779     Register state   = c_rarg0;
5780     Register subkeyH = c_rarg1;
5781     Register data    = c_rarg2;
5782     Register blocks  = c_rarg3;
5783 
5784     const int unroll = 4;
5785 
5786     __ cmp(blocks, (unsigned char)(unroll * 2));
5787     __ br(__ LT, small);
5788 
5789     if (unroll > 1) {
5790       // Save state before entering routine
5791       __ sub(sp, sp, 4 * 16);
5792       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5793       __ sub(sp, sp, 4 * 16);
5794       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5795     }
5796 
5797     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5798 
5799     if (unroll > 1) {
5800       // And restore state
5801       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5802       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5803     }
5804 
5805     __ cmp(blocks, (unsigned char)0);
5806     __ br(__ GT, small);
5807 
5808     __ ret(lr);
5809 
5810     return start;
5811   }
5812 
5813   void generate_base64_encode_simdround(Register src, Register dst,
5814         FloatRegister codec, u8 size) {
5815 
5816     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5817     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5818     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5819 
5820     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5821 
5822     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5823 
5824     __ ushr(ind0, arrangement, in0,  2);
5825 
5826     __ ushr(ind1, arrangement, in1,  2);
5827     __ shl(in0,   arrangement, in0,  6);
5828     __ orr(ind1,  arrangement, ind1, in0);
5829     __ ushr(ind1, arrangement, ind1, 2);
5830 
5831     __ ushr(ind2, arrangement, in2,  4);
5832     __ shl(in1,   arrangement, in1,  4);
5833     __ orr(ind2,  arrangement, in1,  ind2);
5834     __ ushr(ind2, arrangement, ind2, 2);
5835 
5836     __ shl(ind3,  arrangement, in2,  2);
5837     __ ushr(ind3, arrangement, ind3, 2);
5838 
5839     __ tbl(out0,  arrangement, codec,  4, ind0);
5840     __ tbl(out1,  arrangement, codec,  4, ind1);
5841     __ tbl(out2,  arrangement, codec,  4, ind2);
5842     __ tbl(out3,  arrangement, codec,  4, ind3);
5843 
5844     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5845   }
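  // The shift/orr sequence above computes the standard Base64 index split; per
  // input byte triple (b0, b1, b2) it is, in scalar terms (a sketch):
  //
  //   ind0 = b0 >> 2;
  //   ind1 = ((b0 << 4) | (b1 >> 4)) & 0x3f;
  //   ind2 = ((b1 << 2) | (b2 >> 6)) & 0x3f;
  //   ind3 = b2 & 0x3f;
  //
  // and each index is then translated through the 64-entry codec table by tbl.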
5846 
5847    /**
5848    *  Arguments:
5849    *
5850    *  Input:
5851    *  c_rarg0   - src_start
5852    *  c_rarg1   - src_offset
5853    *  c_rarg2   - src_length
5854    *  c_rarg3   - dest_start
5855    *  c_rarg4   - dest_offset
5856    *  c_rarg5   - isURL
5857    *
5858    */
5859   address generate_base64_encodeBlock() {
5860 
5861     static const char toBase64[64] = {
5862       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5863       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5864       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5865       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5866       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5867     };
5868 
5869     static const char toBase64URL[64] = {
5870       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5871       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5872       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5873       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5874       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5875     };
5876 
5877     __ align(CodeEntryAlignment);
5878     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5879     address start = __ pc();
5880 
5881     Register src   = c_rarg0;  // source array
5882     Register soff  = c_rarg1;  // source start offset
5883     Register send  = c_rarg2;  // source end offset
5884     Register dst   = c_rarg3;  // dest array
5885     Register doff  = c_rarg4;  // position for writing to dest array
5886     Register isURL = c_rarg5;  // Base64 or URL character set
5887 
5888     // c_rarg6 and c_rarg7 are free to use as temps
5889     Register codec  = c_rarg6;
5890     Register length = c_rarg7;
5891 
5892     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5893 
5894     __ add(src, src, soff);
5895     __ add(dst, dst, doff);
5896     __ sub(length, send, soff);
5897 
5898     // load the codec base address
5899     __ lea(codec, ExternalAddress((address) toBase64));
5900     __ cbz(isURL, ProcessData);
5901     __ lea(codec, ExternalAddress((address) toBase64URL));
5902 
5903     __ BIND(ProcessData);
5904 
5905     // too short to form up a SIMD loop; fall back to the scalar 3-byte loop
5906     __ cmp(length, (u1)24);
5907     __ br(Assembler::LT, Process3B);
5908 
5909     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5910 
5911     __ BIND(Process48B);
5912     __ cmp(length, (u1)48);
5913     __ br(Assembler::LT, Process24B);
5914     generate_base64_encode_simdround(src, dst, v0, 16);
5915     __ sub(length, length, 48);
5916     __ b(Process48B);
5917 
5918     __ BIND(Process24B);
5919     __ cmp(length, (u1)24);
5920     __ br(Assembler::LT, SIMDExit);
5921     generate_base64_encode_simdround(src, dst, v0, 8);
5922     __ sub(length, length, 24);
5923 
5924     __ BIND(SIMDExit);
5925     __ cbz(length, Exit);
5926 
5927     __ BIND(Process3B);
5928     //  3 src bytes, 24 bits
5929     __ ldrb(r10, __ post(src, 1));
5930     __ ldrb(r11, __ post(src, 1));
5931     __ ldrb(r12, __ post(src, 1));
5932     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5933     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5934     // codec index
5935     __ ubfmw(r15, r12, 18, 23);
5936     __ ubfmw(r14, r12, 12, 17);
5937     __ ubfmw(r13, r12, 6,  11);
5938     __ andw(r12,  r12, 63);
5939     // get the code based on the codec
5940     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5941     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5942     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5943     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5944     __ strb(r15, __ post(dst, 1));
5945     __ strb(r14, __ post(dst, 1));
5946     __ strb(r13, __ post(dst, 1));
5947     __ strb(r12, __ post(dst, 1));
5948     __ sub(length, length, 3);
5949     __ cbnz(length, Process3B);
5950 
5951     __ BIND(Exit);
5952     __ ret(lr);
5953 
5954     return start;
5955   }
5956 
5957   void generate_base64_decode_simdround(Register src, Register dst,
5958         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5959 
5960     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5961     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5962 
5963     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5964     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5965 
5966     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5967 
5968     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5969 
5970     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5971 
5972     // We need an unsigned saturating subtract to make sure all input values
5973     // in the range [0, 63] end up as 0 for the higher-half lookup.
5974     __ uqsubv(decH0, __ T16B, in0, v27);
5975     __ uqsubv(decH1, __ T16B, in1, v27);
5976     __ uqsubv(decH2, __ T16B, in2, v27);
5977     __ uqsubv(decH3, __ T16B, in3, v27);
5978 
5979     // lower half lookup
5980     __ tbl(decL0, arrangement, codecL, 4, in0);
5981     __ tbl(decL1, arrangement, codecL, 4, in1);
5982     __ tbl(decL2, arrangement, codecL, 4, in2);
5983     __ tbl(decL3, arrangement, codecL, 4, in3);
5984 
5985     // higher half lookup
5986     __ tbx(decH0, arrangement, codecH, 4, decH0);
5987     __ tbx(decH1, arrangement, codecH, 4, decH1);
5988     __ tbx(decH2, arrangement, codecH, 4, decH2);
5989     __ tbx(decH3, arrangement, codecH, 4, decH3);
5990 
5991     // combine lower and higher
5992     __ orr(decL0, arrangement, decL0, decH0);
5993     __ orr(decL1, arrangement, decL1, decH1);
5994     __ orr(decL2, arrangement, decL2, decH2);
5995     __ orr(decL3, arrangement, decL3, decH3);
5996 
5997     // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
5998     __ cmhi(decH0, arrangement, decL0, v27);
5999     __ cmhi(decH1, arrangement, decL1, v27);
6000     __ cmhi(decH2, arrangement, decL2, v27);
6001     __ cmhi(decH3, arrangement, decL3, v27);
6002     __ orr(in0, arrangement, decH0, decH1);
6003     __ orr(in1, arrangement, decH2, decH3);
6004     __ orr(in2, arrangement, in0,   in1);
6005     __ umaxv(in3, arrangement, in2);
6006     __ umov(rscratch2, in3, __ B, 0);
6007 
6008     // get the data to output
6009     __ shl(out0,  arrangement, decL0, 2);
6010     __ ushr(out1, arrangement, decL1, 4);
6011     __ orr(out0,  arrangement, out0,  out1);
6012     __ shl(out1,  arrangement, decL1, 4);
6013     __ ushr(out2, arrangement, decL2, 2);
6014     __ orr(out1,  arrangement, out1,  out2);
6015     __ shl(out2,  arrangement, decL2, 6);
6016     __ orr(out2,  arrangement, out2,  decL3);
6017 
6018     __ cbz(rscratch2, NoIllegalData);
6019 
6020     // handle illegal input
6021     __ umov(r10, in2, __ D, 0);
6022     if (size == 16) {
6023       __ cbnz(r10, ErrorInLowerHalf);
6024 
6025       // illegal input is in higher half, store the lower half now.
6026       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6027 
6028       __ umov(r10, in2,  __ D, 1);
6029       __ umov(r11, out0, __ D, 1);
6030       __ umov(r12, out1, __ D, 1);
6031       __ umov(r13, out2, __ D, 1);
6032       __ b(StoreLegalData);
6033 
6034       __ BIND(ErrorInLowerHalf);
6035     }
6036     __ umov(r11, out0, __ D, 0);
6037     __ umov(r12, out1, __ D, 0);
6038     __ umov(r13, out2, __ D, 0);
6039 
6040     __ BIND(StoreLegalData);
6041     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6042     __ strb(r11, __ post(dst, 1));
6043     __ strb(r12, __ post(dst, 1));
6044     __ strb(r13, __ post(dst, 1));
6045     __ lsr(r10, r10, 8);
6046     __ lsr(r11, r11, 8);
6047     __ lsr(r12, r12, 8);
6048     __ lsr(r13, r13, 8);
6049     __ b(StoreLegalData);
6050 
6051     __ BIND(NoIllegalData);
6052     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6053   }
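  // The recombination above is the inverse of the encoder's index split; per
  // quadruple of decoded 6-bit values (d0, d1, d2, d3) it is, in scalar terms
  // (a sketch; the 8-bit SIMD lanes discard the shifted-out high bits):
  //
  //   b0 = (d0 << 2) | (d1 >> 4);
  //   b1 = (d1 << 4) | (d2 >> 2);
  //   b2 = (d2 << 6) | d3;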
6054 
6055 
6056    /**
6057    *  Arguments:
6058    *
6059    *  Input:
6060    *  c_rarg0   - src_start
6061    *  c_rarg1   - src_offset
6062    *  c_rarg2   - src_length
6063    *  c_rarg3   - dest_start
6064    *  c_rarg4   - dest_offset
6065    *  c_rarg5   - isURL
6066    *  c_rarg6   - isMIME
6067    *
6068    */
6069   address generate_base64_decodeBlock() {
6070 
6071     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6072     // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
6073     // section titled "Base64 decoding".
6074 
6075     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
6076     // java.util.Base64, except that the trailing character '=' is also treated as an
6077     // illegal value here: java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255.
6078     static const uint8_t fromBase64ForNoSIMD[256] = {
6079       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6080       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6081       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6082        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6083       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6084        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6085       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6086        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6088       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6089       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6090       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6091       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6092       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6093       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6095     };
6096 
6097     static const uint8_t fromBase64URLForNoSIMD[256] = {
6098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6101        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6102       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6103        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6104       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6105        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6106       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6107       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6108       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6109       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6110       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6111       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6112       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6114     };
6115 
6116     // A legal Base64 code value is in the range [0, 127]. We need two lookups
6117     // with tbl/tbx and combine them to get the decoded data. The first table
6118     // lookup uses tbl: out-of-range indices are set to 0 in the destination.
6119     // The second table lookup uses tbx: out-of-range indices leave the
6120     // destination unchanged. Input values [64, 126] are mapped to indices
6121     // [65, 127] in the second lookup. The value at index 64 is set to 0, so that
6122     // we know the first lookup already produced the decoded data.
6123     static const uint8_t fromBase64ForSIMD[128] = {
6124       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6125       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6126       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6127        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6128         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6129        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6130       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6131        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6132     };
6133 
6134     static const uint8_t fromBase64URLForSIMD[128] = {
6135       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6136       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6137       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6138        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6139         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6140        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6141        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6142        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6143     };
6144 
6145     __ align(CodeEntryAlignment);
6146     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6147     address start = __ pc();
6148 
6149     Register src    = c_rarg0;  // source array
6150     Register soff   = c_rarg1;  // source start offset
6151     Register send   = c_rarg2;  // source end offset
6152     Register dst    = c_rarg3;  // dest array
6153     Register doff   = c_rarg4;  // position for writing to dest array
6154     Register isURL  = c_rarg5;  // Base64 or URL character set
6155     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6156 
6157     Register length = send;    // reuse send as length of source data to process
6158 
6159     Register simd_codec   = c_rarg6;
6160     Register nosimd_codec = c_rarg7;
6161 
6162     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6163 
6164     __ enter();
6165 
6166     __ add(src, src, soff);
6167     __ add(dst, dst, doff);
6168 
6169     __ mov(doff, dst);
6170 
6171     __ sub(length, send, soff);
6172     __ bfm(length, zr, 0, 1);
6173 
6174     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6175     __ cbz(isURL, ProcessData);
6176     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6177 
6178     __ BIND(ProcessData);
6179     __ mov(rscratch1, length);
6180     __ cmp(length, (u1)144); // 144 = 80 + 64
6181     __ br(Assembler::LT, Process4B);
6182 
6183     // In the MIME case, the line length cannot be more than 76
6184     // bytes (see RFC 2045). This is too short a block for SIMD
6185     // to be worthwhile, so we use non-SIMD here.
6186     __ movw(rscratch1, 79);
6187 
6188     __ BIND(Process4B);
6189     __ ldrw(r14, __ post(src, 4));
6190     __ ubfxw(r10, r14, 0,  8);
6191     __ ubfxw(r11, r14, 8,  8);
6192     __ ubfxw(r12, r14, 16, 8);
6193     __ ubfxw(r13, r14, 24, 8);
6194     // look up the decoded values
6195     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6196     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6197     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6198     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6199     // error detection, 255u indicates an illegal input
6200     __ orrw(r14, r10, r11);
6201     __ orrw(r15, r12, r13);
6202     __ orrw(r14, r14, r15);
6203     __ tbnz(r14, 7, Exit);
6204     // recover the data
6205     __ lslw(r14, r10, 10);
6206     __ bfiw(r14, r11, 4, 6);
6207     __ bfmw(r14, r12, 2, 5);
6208     __ rev16w(r14, r14);
6209     __ bfiw(r13, r12, 6, 2);
6210     __ strh(r14, __ post(dst, 2));
6211     __ strb(r13, __ post(dst, 1));
6212     // non-simd loop
6213     __ subsw(rscratch1, rscratch1, 4);
6214     __ br(Assembler::GT, Process4B);
6215 
6216     // if exiting from PreProcess80B, rscratch1 == -1;
6217     // otherwise, rscratch1 == 0.
6218     __ cbzw(rscratch1, Exit);
6219     __ sub(length, length, 80);
6220 
6221     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6222     __ cbz(isURL, SIMDEnter);
6223     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6224 
6225     __ BIND(SIMDEnter);
6226     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6227     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6228     __ mov(rscratch1, 63);
6229     __ dup(v27, __ T16B, rscratch1);
6230 
6231     __ BIND(Process64B);
6232     __ cmp(length, (u1)64);
6233     __ br(Assembler::LT, Process32B);
6234     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6235     __ sub(length, length, 64);
6236     __ b(Process64B);
6237 
6238     __ BIND(Process32B);
6239     __ cmp(length, (u1)32);
6240     __ br(Assembler::LT, SIMDExit);
6241     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6242     __ sub(length, length, 32);
6243     __ b(Process32B);
6244 
6245     __ BIND(SIMDExit);
6246     __ cbz(length, Exit);
6247     __ movw(rscratch1, length);
6248     __ b(Process4B);
6249 
6250     __ BIND(Exit);
6251     __ sub(c_rarg0, dst, doff);
6252 
6253     __ leave();
6254     __ ret(lr);
6255 
6256     return start;
6257   }
6258 
6259   // Support for spin waits.
6260   address generate_spin_wait() {
6261     __ align(CodeEntryAlignment);
6262     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6263     address start = __ pc();
6264 
6265     __ spin_wait();
6266     __ ret(lr);
6267 
6268     return start;
6269   }
6270 
6271 #ifdef LINUX
6272 
6273   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6274   //
6275   // If LSE is in use, generate LSE versions of all the stubs. The
6276   // non-LSE versions are in atomic_aarch64.S.
6277 
6278   // class AtomicStubMark records the entry point of a stub and the
6279   // stub pointer which will point to it. The stub pointer is set to
6280   // the entry point when ~AtomicStubMark() is called, which must be
6281   // after ICache::invalidate_range. This ensures safe publication of
6282   // the generated code.
6283   class AtomicStubMark {
6284     address _entry_point;
6285     aarch64_atomic_stub_t *_stub;
6286     MacroAssembler *_masm;
6287   public:
6288     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6289       _masm = masm;
6290       __ align(32);
6291       _entry_point = __ pc();
6292       _stub = stub;
6293     }
6294     ~AtomicStubMark() {
6295       *_stub = (aarch64_atomic_stub_t)_entry_point;
6296     }
6297   };
6298 
6299   // NB: For memory_order_conservative we need a trailing membar after
6300   // LSE atomic operations but not a leading membar.
6301   //
6302   // We don't need a leading membar because a clause in the Arm ARM
6303   // says:
6304   //
6305   //   Barrier-ordered-before
6306   //
6307   //   Barrier instructions order prior Memory effects before subsequent
6308   //   Memory effects generated by the same Observer. A read or a write
6309   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
6310   //   Observer if and only if RW1 appears in program order before RW 2
6311   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
6312   //   instruction with both Acquire and Release semantics.
6313   //
6314   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6315   // and Release semantics, therefore we don't need a leading
6316   // barrier. However, there is no corresponding Barrier-ordered-after
6317   // relationship, therefore we need a trailing membar to prevent a
6318   // later store or load from being reordered with the store in an
6319   // atomic instruction.
6320   //
6321   // This was checked by using the herd7 consistency model simulator
6322   // (http://diy.inria.fr/) with this test case:
6323   //
6324   // AArch64 LseCas
6325   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6326   // P0 | P1;
6327   // LDR W4, [X2] | MOV W3, #0;
6328   // DMB LD       | MOV W4, #1;
6329   // LDR W3, [X1] | CASAL W3, W4, [X1];
6330   //              | DMB ISH;
6331   //              | STR W4, [X2];
6332   // exists
6333   // (0:X3=0 /\ 0:X4=1)
6334   //
6335   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6336   // with the store to x in P1. Without the DMB in P1 this may happen.
6337   //
6338   // At the time of writing we don't know of any AArch64 hardware that
6339   // reorders stores in this way, but the Reference Manual permits it.
6340 
6341   void gen_cas_entry(Assembler::operand_size size,
6342                      atomic_memory_order order) {
6343     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6344       exchange_val = c_rarg2;
6345     bool acquire, release;
6346     switch (order) {
6347       case memory_order_relaxed:
6348         acquire = false;
6349         release = false;
6350         break;
6351       case memory_order_release:
6352         acquire = false;
6353         release = true;
6354         break;
6355       default:
6356         acquire = true;
6357         release = true;
6358         break;
6359     }
6360     __ mov(prev, compare_val);
6361     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6362     if (order == memory_order_conservative) {
6363       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6364     }
6365     if (size == Assembler::xword) {
6366       __ mov(r0, prev);
6367     } else {
6368       __ movw(r0, prev);
6369     }
6370     __ ret(lr);
6371   }
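  // In C terms, the generated CAS entry behaves approximately like (a sketch):
  //
  //   T cas(T *ptr, T compare_val, T exchange_val) {
  //     T prev = *ptr;
  //     if (prev == compare_val) *ptr = exchange_val;  // done atomically by the LSE CAS
  //     return prev;                                   // returned in r0
  //   }
  //
  // plus the trailing membar for memory_order_conservative discussed above.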
6372 
6373   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6374     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6375     // If not relaxed, then default to conservative.  Relaxed is the only
6376     // case we use enough to be worth specializing.
6377     if (order == memory_order_relaxed) {
6378       __ ldadd(size, incr, prev, addr);
6379     } else {
6380       __ ldaddal(size, incr, prev, addr);
6381       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6382     }
6383     if (size == Assembler::xword) {
6384       __ mov(r0, prev);
6385     } else {
6386       __ movw(r0, prev);
6387     }
6388     __ ret(lr);
6389   }
6390 
6391   void gen_swpal_entry(Assembler::operand_size size) {
6392     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6393     __ swpal(size, incr, prev, addr);
6394     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6395     if (size == Assembler::xword) {
6396       __ mov(r0, prev);
6397     } else {
6398       __ movw(r0, prev);
6399     }
6400     __ ret(lr);
6401   }
6402 
6403   void generate_atomic_entry_points() {
6404     if (! UseLSE) {
6405       return;
6406     }
6407 
6408     __ align(CodeEntryAlignment);
6409     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6410     address first_entry = __ pc();
6411 
6412     // ADD, memory_order_conservative
6413     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6414     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6415     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6416     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6417 
6418     // ADD, memory_order_relaxed
6419     AtomicStubMark mark_fetch_add_4_relaxed
6420       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6421     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6422     AtomicStubMark mark_fetch_add_8_relaxed
6423       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6424     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6425 
6426     // XCHG, memory_order_conservative
6427     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6428     gen_swpal_entry(Assembler::word);
6429     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6430     gen_swpal_entry(Assembler::xword);
6431 
6432     // CAS, memory_order_conservative
6433     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6434     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6435     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6436     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6437     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6438     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6439 
6440     // CAS, memory_order_relaxed
6441     AtomicStubMark mark_cmpxchg_1_relaxed
6442       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6443     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6444     AtomicStubMark mark_cmpxchg_4_relaxed
6445       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6446     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6447     AtomicStubMark mark_cmpxchg_8_relaxed
6448       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6449     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6450 
6451     AtomicStubMark mark_cmpxchg_4_release
6452       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6453     gen_cas_entry(MacroAssembler::word, memory_order_release);
6454     AtomicStubMark mark_cmpxchg_8_release
6455       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6456     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6457 
6458     AtomicStubMark mark_cmpxchg_4_seq_cst
6459       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6460     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6461     AtomicStubMark mark_cmpxchg_8_seq_cst
6462       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6463     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6464 
6465     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6466   }
6467 #endif // LINUX
6468 
6469   // Continuation point for throwing of implicit exceptions that are
6470   // not handled in the current activation. Fabricates an exception
6471   // oop and initiates normal exception dispatching in this
6472   // frame. Since we need to preserve callee-saved values (currently
6473   // only for C2, but done for C1 as well) we need a callee-saved oop
6474   // map and therefore have to make these stubs into RuntimeStubs
6475   // rather than BufferBlobs.  If the compiler needs all registers to
6476   // be preserved between the fault point and the exception handler
6477   // then it must assume responsibility for that in
6478   // AbstractCompiler::continuation_for_implicit_null_exception or
6479   // continuation_for_implicit_division_by_zero_exception. All other
6480   // implicit exceptions (e.g., NullPointerException or
6481   // AbstractMethodError on entry) are either at call sites or
6482   // otherwise assume that stack unwinding will be initiated, so
6483   // caller saved registers were assumed volatile in the compiler.
6484 
6485 #undef __
6486 #define __ masm->
6487 
6488   address generate_throw_exception(const char* name,
6489                                    address runtime_entry,
6490                                    Register arg1 = noreg,
6491                                    Register arg2 = noreg) {
6492     // Information about frame layout at time of blocking runtime call.
6493     // Note that we only have to preserve callee-saved registers since
6494     // the compilers are responsible for supplying a continuation point
6495     // if they expect all registers to be preserved.
6496     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6497     enum layout {
6498       rfp_off = 0,
6499       rfp_off2,
6500       return_off,
6501       return_off2,
6502       framesize // inclusive of return address
6503     };
6504 
6505     int insts_size = 512;
6506     int locs_size  = 64;
6507 
6508     CodeBuffer code(name, insts_size, locs_size);
6509     OopMapSet* oop_maps  = new OopMapSet();
6510     MacroAssembler* masm = new MacroAssembler(&code);
6511 
6512     address start = __ pc();
6513 
6514     // This is an inlined and slightly modified version of call_VM
6515     // which has the ability to fetch the return PC out of
6516     // thread-local storage and also sets up last_Java_sp slightly
6517     // differently than the real call_VM
6518 
6519     __ enter(); // Save FP and LR before call
6520 
6521     assert(is_even(framesize/2), "sp not 16-byte aligned");
6522 
6523     // lr and fp are already in place
6524     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6525 
6526     int frame_complete = __ pc() - start;
6527 
6528     // Set up last_Java_sp and last_Java_fp
6529     address the_pc = __ pc();
6530     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6531 
6532     // Call runtime
6533     if (arg1 != noreg) {
6534       assert(arg2 != c_rarg1, "clobbered");
6535       __ mov(c_rarg1, arg1);
6536     }
6537     if (arg2 != noreg) {
6538       __ mov(c_rarg2, arg2);
6539     }
6540     __ mov(c_rarg0, rthread);
6541     BLOCK_COMMENT("call runtime_entry");
6542     __ mov(rscratch1, runtime_entry);
6543     __ blr(rscratch1);
6544 
6545     // Generate oop map
6546     OopMap* map = new OopMap(framesize, 0);
6547 
6548     oop_maps->add_gc_map(the_pc - start, map);
6549 
6550     __ reset_last_Java_frame(true);
6551 
6552     // Reinitialize the ptrue predicate register, in case the external runtime
6553     // call clobbers ptrue reg, as we may return to SVE compiled code.
6554     __ reinitialize_ptrue();
6555 
6556     __ leave();
6557 
6558     // check for pending exceptions
6559 #ifdef ASSERT
6560     Label L;
6561     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6562     __ cbnz(rscratch1, L);
6563     __ should_not_reach_here();
6564     __ bind(L);
6565 #endif // ASSERT
6566     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6567 
6568 
6569     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6570     RuntimeStub* stub =
6571       RuntimeStub::new_runtime_stub(name,
6572                                     &code,
6573                                     frame_complete,
6574                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6575                                     oop_maps, false);
6576     return stub->entry_point();
6577   }
6578 
6579   class MontgomeryMultiplyGenerator : public MacroAssembler {
6580 
6581     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6582       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6583 
6584     RegSet _toSave;
6585     bool _squaring;
6586 
6587   public:
6588     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6589       : MacroAssembler(as->code()), _squaring(squaring) {
6590 
6591       // Register allocation
6592 
6593       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6594       Pa_base = *regs;       // Argument registers
6595       if (squaring)
6596         Pb_base = Pa_base;
6597       else
6598         Pb_base = *++regs;
6599       Pn_base = *++regs;
6600       Rlen= *++regs;
6601       inv = *++regs;
6602       Pm_base = *++regs;
6603 
6604                           // Working registers:
6605       Ra =  *++regs;        // The current digit of a, b, n, and m.
6606       Rb =  *++regs;
6607       Rm =  *++regs;
6608       Rn =  *++regs;
6609 
6610       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6611       Pb =  *++regs;
6612       Pm =  *++regs;
6613       Pn =  *++regs;
6614 
6615       t0 =  *++regs;        // Three registers which form a
6616       t1 =  *++regs;        // triple-precision accumulator.
6617       t2 =  *++regs;
6618 
6619       Ri =  *++regs;        // Inner and outer loop indexes.
6620       Rj =  *++regs;
6621 
6622       Rhi_ab = *++regs;     // Product registers: low and high parts
6623       Rlo_ab = *++regs;     // of a*b and m*n.
6624       Rhi_mn = *++regs;
6625       Rlo_mn = *++regs;
6626 
6627       // r19 and up are callee-saved.
6628       _toSave = RegSet::range(r19, *regs) + Pm_base;
6629     }
6630 
6631   private:
6632     void save_regs() {
6633       push(_toSave, sp);
6634     }
6635 
6636     void restore_regs() {
6637       pop(_toSave, sp);
6638     }
6639 
6640     template <typename T>
6641     void unroll_2(Register count, T block) {
6642       Label loop, end, odd;
6643       tbnz(count, 0, odd);
6644       cbz(count, end);
6645       align(16);
6646       bind(loop);
6647       (this->*block)();
6648       bind(odd);
6649       (this->*block)();
6650       subs(count, count, 2);
6651       br(Assembler::GT, loop);
6652       bind(end);
6653     }
6654 
6655     template <typename T>
6656     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6657       Label loop, end, odd;
6658       tbnz(count, 0, odd);
6659       cbz(count, end);
6660       align(16);
6661       bind(loop);
6662       (this->*block)(d, s, tmp);
6663       bind(odd);
6664       (this->*block)(d, s, tmp);
6665       subs(count, count, 2);
6666       br(Assembler::GT, loop);
6667       bind(end);
6668     }
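    // Both unroll_2 overloads emit the block twice per loop iteration; an odd
    // count enters at the second copy, so the block executes exactly `count`
    // times. Roughly (a sketch):
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //   loop: block(); odd: block(); count -= 2; if (count > 0) goto loop;
    //   end: ;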
6669 
6670     void pre1(RegisterOrConstant i) {
6671       block_comment("pre1");
6672       // Pa = Pa_base;
6673       // Pb = Pb_base + i;
6674       // Pm = Pm_base;
6675       // Pn = Pn_base + i;
6676       // Ra = *Pa;
6677       // Rb = *Pb;
6678       // Rm = *Pm;
6679       // Rn = *Pn;
6680       ldr(Ra, Address(Pa_base));
6681       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6682       ldr(Rm, Address(Pm_base));
6683       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6684       lea(Pa, Address(Pa_base));
6685       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6686       lea(Pm, Address(Pm_base));
6687       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6688 
6689       // Zero the m*n result.
6690       mov(Rhi_mn, zr);
6691       mov(Rlo_mn, zr);
6692     }
6693 
6694     // The core multiply-accumulate step of a Montgomery
6695     // multiplication.  The idea is to schedule operations as a
6696     // pipeline so that instructions with long latencies (loads and
6697     // multiplies) have time to complete before their results are
6698     // used.  This most benefits in-order implementations of the
6699     // architecture but out-of-order ones also benefit.
6700     void step() {
6701       block_comment("step");
6702       // MACC(Ra, Rb, t0, t1, t2);
6703       // Ra = *++Pa;
6704       // Rb = *--Pb;
6705       umulh(Rhi_ab, Ra, Rb);
6706       mul(Rlo_ab, Ra, Rb);
6707       ldr(Ra, pre(Pa, wordSize));
6708       ldr(Rb, pre(Pb, -wordSize));
6709       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6710                                        // previous iteration.
6711       // MACC(Rm, Rn, t0, t1, t2);
6712       // Rm = *++Pm;
6713       // Rn = *--Pn;
6714       umulh(Rhi_mn, Rm, Rn);
6715       mul(Rlo_mn, Rm, Rn);
6716       ldr(Rm, pre(Pm, wordSize));
6717       ldr(Rn, pre(Pn, -wordSize));
6718       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6719     }
6720 
6721     void post1() {
6722       block_comment("post1");
6723 
6724       // MACC(Ra, Rb, t0, t1, t2);
6725       // Ra = *++Pa;
6726       // Rb = *--Pb;
6727       umulh(Rhi_ab, Ra, Rb);
6728       mul(Rlo_ab, Ra, Rb);
6729       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6730       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6731 
6732       // *Pm = Rm = t0 * inv;
6733       mul(Rm, t0, inv);
6734       str(Rm, Address(Pm));
6735 
6736       // MACC(Rm, Rn, t0, t1, t2);
6737       // t0 = t1; t1 = t2; t2 = 0;
6738       umulh(Rhi_mn, Rm, Rn);
6739 
6740 #ifndef PRODUCT
6741       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6742       {
6743         mul(Rlo_mn, Rm, Rn);
6744         add(Rlo_mn, t0, Rlo_mn);
6745         Label ok;
6746         cbz(Rlo_mn, ok); {
6747           stop("broken Montgomery multiply");
6748         } bind(ok);
6749       }
6750 #endif
6751       // We have very carefully set things up so that
6752       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6753       // the lower half of Rm * Rn because we know the result already:
6754       // it must be -t0.  t0 + (-t0) must generate a carry iff
6755       // t0 != 0.  So, rather than do a mul and an adds we just set
6756       // the carry flag iff t0 is nonzero.
6757       //
6758       // mul(Rlo_mn, Rm, Rn);
6759       // adds(zr, t0, Rlo_mn);
6760       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6761       adcs(t0, t1, Rhi_mn);
6762       adc(t1, t2, zr);
6763       mov(t2, zr);
6764     }
6765 
6766     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6767       block_comment("pre2");
6768       // Pa = Pa_base + i-len;
6769       // Pb = Pb_base + len;
6770       // Pm = Pm_base + i-len;
6771       // Pn = Pn_base + len;
6772 
6773       if (i.is_register()) {
6774         sub(Rj, i.as_register(), len);
6775       } else {
6776         mov(Rj, i.as_constant());
6777         sub(Rj, Rj, len);
6778       }
6779       // Rj == i-len
6780 
6781       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6782       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6783       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6784       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6785 
6786       // Ra = *++Pa;
6787       // Rb = *--Pb;
6788       // Rm = *++Pm;
6789       // Rn = *--Pn;
6790       ldr(Ra, pre(Pa, wordSize));
6791       ldr(Rb, pre(Pb, -wordSize));
6792       ldr(Rm, pre(Pm, wordSize));
6793       ldr(Rn, pre(Pn, -wordSize));
6794 
6795       mov(Rhi_mn, zr);
6796       mov(Rlo_mn, zr);
6797     }
6798 
6799     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6800       block_comment("post2");
6801       if (i.is_constant()) {
6802         mov(Rj, i.as_constant()-len.as_constant());
6803       } else {
6804         sub(Rj, i.as_register(), len);
6805       }
6806 
6807       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6808 
6809       // As soon as we know the least significant digit of our result,
6810       // store it.
6811       // Pm_base[i-len] = t0;
6812       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6813 
6814       // t0 = t1; t1 = t2; t2 = 0;
6815       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6816       adc(t1, t2, zr);
6817       mov(t2, zr);
6818     }
6819 
6820     // A carry in t0 after Montgomery multiplication means that we
6821     // should subtract multiples of n from our result in m.  We'll
6822     // keep doing that until there is no carry.
6823     void normalize(RegisterOrConstant len) {
6824       block_comment("normalize");
6825       // while (t0)
6826       //   t0 = sub(Pm_base, Pn_base, t0, len);
6827       Label loop, post, again;
6828       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6829       cbz(t0, post); {
6830         bind(again); {
6831           mov(i, zr);
6832           mov(cnt, len);
6833           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6834           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6835           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6836           align(16);
6837           bind(loop); {
6838             sbcs(Rm, Rm, Rn);
6839             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6840             add(i, i, 1);
6841             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6842             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6843             sub(cnt, cnt, 1);
6844           } cbnz(cnt, loop);
6845           sbc(t0, t0, zr);
6846         } cbnz(t0, again);
6847       } bind(post);
6848     }
6849 
6850     // Move memory at s to d, reversing words.
6851     //    Increments d to end of copied memory
6852     //    Destroys tmp1, tmp2
6853     //    Preserves len
6854     //    Leaves s pointing to the address which was in d at start
6855     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6856       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6857 
6858       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6859       mov(tmp1, len);
6860       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6861       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6862     }
6863     // where
6864     void reverse1(Register d, Register s, Register tmp) {
6865       ldr(tmp, pre(s, -wordSize));
6866       ror(tmp, tmp, 32);
6867       str(tmp, post(d, wordSize));
6868     }
6869 
6870     void step_squaring() {
6871       // An extra ACC
6872       step();
6873       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6874     }
6875 
6876     void last_squaring(RegisterOrConstant i) {
6877       Label dont;
6878       // if ((i & 1) == 0) {
6879       tbnz(i.as_register(), 0, dont); {
6880         // MACC(Ra, Rb, t0, t1, t2);
6881         // Ra = *++Pa;
6882         // Rb = *--Pb;
6883         umulh(Rhi_ab, Ra, Rb);
6884         mul(Rlo_ab, Ra, Rb);
6885         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6886       } bind(dont);
6887     }
6888 
6889     void extra_step_squaring() {
6890       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6891 
6892       // MACC(Rm, Rn, t0, t1, t2);
6893       // Rm = *++Pm;
6894       // Rn = *--Pn;
6895       umulh(Rhi_mn, Rm, Rn);
6896       mul(Rlo_mn, Rm, Rn);
6897       ldr(Rm, pre(Pm, wordSize));
6898       ldr(Rn, pre(Pn, -wordSize));
6899     }
6900 
6901     void post1_squaring() {
6902       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6903 
6904       // *Pm = Rm = t0 * inv;
6905       mul(Rm, t0, inv);
6906       str(Rm, Address(Pm));
6907 
6908       // MACC(Rm, Rn, t0, t1, t2);
6909       // t0 = t1; t1 = t2; t2 = 0;
6910       umulh(Rhi_mn, Rm, Rn);
6911 
6912 #ifndef PRODUCT
6913       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6914       {
6915         mul(Rlo_mn, Rm, Rn);
6916         add(Rlo_mn, t0, Rlo_mn);
6917         Label ok;
6918         cbz(Rlo_mn, ok); {
6919           stop("broken Montgomery multiply");
6920         } bind(ok);
6921       }
6922 #endif
6923       // We have very carefully set things up so that
6924       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6925       // the lower half of Rm * Rn because we know the result already:
6926       // it must be -t0.  t0 + (-t0) must generate a carry iff
6927       // t0 != 0.  So, rather than do a mul and an adds we just set
6928       // the carry flag iff t0 is nonzero.
6929       //
6930       // mul(Rlo_mn, Rm, Rn);
6931       // adds(zr, t0, Rlo_mn);
6932       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6933       adcs(t0, t1, Rhi_mn);
6934       adc(t1, t2, zr);
6935       mov(t2, zr);
6936     }
6937 
6938     void acc(Register Rhi, Register Rlo,
6939              Register t0, Register t1, Register t2) {
6940       adds(t0, t0, Rlo);
6941       adcs(t1, t1, Rhi);
6942       adc(t2, t2, zr);
6943     }
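    // acc() folds the 128-bit product (Rhi:Rlo) into the 192-bit accumulator
    // (t2:t1:t0); approximately: t0 += Rlo; t1 += Rhi + carry; t2 += carry.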
6944 
6945   public:
6946     /**
6947      * Fast Montgomery multiplication.  The derivation of the
6948      * algorithm is in A Cryptographic Library for the Motorola
6949      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6950      *
6951      * Arguments:
6952      *
6953      * Inputs for multiplication:
6954      *   c_rarg0   - int array elements a
6955      *   c_rarg1   - int array elements b
6956      *   c_rarg2   - int array elements n (the modulus)
6957      *   c_rarg3   - int length
6958      *   c_rarg4   - int inv
6959      *   c_rarg5   - int array elements m (the result)
6960      *
6961      * Inputs for squaring:
6962      *   c_rarg0   - int array elements a
6963      *   c_rarg1   - int array elements n (the modulus)
6964      *   c_rarg2   - int length
6965      *   c_rarg3   - int inv
6966      *   c_rarg4   - int array elements m (the result)
6967      *
6968      */
6969     address generate_multiply() {
6970       Label argh, nothing;
6971       bind(argh);
6972       stop("MontgomeryMultiply total_allocation must be <= 8192");
6973 
6974       align(CodeEntryAlignment);
6975       address entry = pc();
6976 
6977       cbzw(Rlen, nothing);
6978 
6979       enter();
6980 
6981       // Make room.
6982       cmpw(Rlen, 512);
6983       br(Assembler::HI, argh);
6984       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6985       andr(sp, Ra, -2 * wordSize);
6986 
6987       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6988 
6989       {
6990         // Copy input args, reversing as we go.  We use Ra as a
6991         // temporary variable.
6992         reverse(Ra, Pa_base, Rlen, t0, t1);
6993         if (!_squaring)
6994           reverse(Ra, Pb_base, Rlen, t0, t1);
6995         reverse(Ra, Pn_base, Rlen, t0, t1);
6996       }
6997 
6998       // Push all call-saved registers and also Pm_base which we'll need
6999       // at the end.
7000       save_regs();
7001 
7002 #ifndef PRODUCT
7003       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7004       {
7005         ldr(Rn, Address(Pn_base, 0));
7006         mul(Rlo_mn, Rn, inv);
7007         subs(zr, Rlo_mn, -1);
7008         Label ok;
7009         br(EQ, ok); {
7010           stop("broken inverse in Montgomery multiply");
7011         } bind(ok);
7012       }
7013 #endif
7014 
7015       mov(Pm_base, Ra);
7016 
7017       mov(t0, zr);
7018       mov(t1, zr);
7019       mov(t2, zr);
7020 
7021       block_comment("for (int i = 0; i < len; i++) {");
7022       mov(Ri, zr); {
7023         Label loop, end;
7024         cmpw(Ri, Rlen);
7025         br(Assembler::GE, end);
7026 
7027         bind(loop);
7028         pre1(Ri);
7029 
7030         block_comment("  for (j = i; j; j--) {"); {
7031           movw(Rj, Ri);
7032           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7033         } block_comment("  } // j");
7034 
7035         post1();
7036         addw(Ri, Ri, 1);
7037         cmpw(Ri, Rlen);
7038         br(Assembler::LT, loop);
7039         bind(end);
7040         block_comment("} // i");
7041       }
7042 
7043       block_comment("for (int i = len; i < 2*len; i++) {");
7044       mov(Ri, Rlen); {
7045         Label loop, end;
7046         cmpw(Ri, Rlen, Assembler::LSL, 1);
7047         br(Assembler::GE, end);
7048 
7049         bind(loop);
7050         pre2(Ri, Rlen);
7051 
7052         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7053           lslw(Rj, Rlen, 1);
7054           subw(Rj, Rj, Ri);
7055           subw(Rj, Rj, 1);
7056           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7057         } block_comment("  } // j");
7058 
7059         post2(Ri, Rlen);
7060         addw(Ri, Ri, 1);
7061         cmpw(Ri, Rlen, Assembler::LSL, 1);
7062         br(Assembler::LT, loop);
7063         bind(end);
7064       }
7065       block_comment("} // i");
7066 
7067       normalize(Rlen);
7068 
7069       mov(Ra, Pm_base);  // Save Pm_base in Ra
7070       restore_regs();  // Restore caller's Pm_base
7071 
7072       // Copy our result into caller's Pm_base
7073       reverse(Pm_base, Ra, Rlen, t0, t1);
7074 
7075       leave();
7076       bind(nothing);
7077       ret(lr);
7078 
7079       return entry;
7080     }
7081     // In C, approximately:
7082 
7083     // void
7084     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7085     //                     julong Pn_base[], julong Pm_base[],
7086     //                     julong inv, int len) {
7087     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7088     //   julong *Pa, *Pb, *Pn, *Pm;
7089     //   julong Ra, Rb, Rn, Rm;
7090 
7091     //   int i;
7092 
7093     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7094 
7095     //   for (i = 0; i < len; i++) {
7096     //     int j;
7097 
7098     //     Pa = Pa_base;
7099     //     Pb = Pb_base + i;
7100     //     Pm = Pm_base;
7101     //     Pn = Pn_base + i;
7102 
7103     //     Ra = *Pa;
7104     //     Rb = *Pb;
7105     //     Rm = *Pm;
7106     //     Rn = *Pn;
7107 
7108     //     int iters = i;
7109     //     for (j = 0; iters--; j++) {
7110     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7111     //       MACC(Ra, Rb, t0, t1, t2);
7112     //       Ra = *++Pa;
7113     //       Rb = *--Pb;
7114     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7115     //       MACC(Rm, Rn, t0, t1, t2);
7116     //       Rm = *++Pm;
7117     //       Rn = *--Pn;
7118     //     }
7119 
7120     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7121     //     MACC(Ra, Rb, t0, t1, t2);
7122     //     *Pm = Rm = t0 * inv;
7123     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7124     //     MACC(Rm, Rn, t0, t1, t2);
7125 
7126     //     assert(t0 == 0, "broken Montgomery multiply");
7127 
7128     //     t0 = t1; t1 = t2; t2 = 0;
7129     //   }
7130 
7131     //   for (i = len; i < 2*len; i++) {
7132     //     int j;
7133 
7134     //     Pa = Pa_base + i-len;
7135     //     Pb = Pb_base + len;
7136     //     Pm = Pm_base + i-len;
7137     //     Pn = Pn_base + len;
7138 
7139     //     Ra = *++Pa;
7140     //     Rb = *--Pb;
7141     //     Rm = *++Pm;
7142     //     Rn = *--Pn;
7143 
7144     //     int iters = len*2-i-1;
7145     //     for (j = i-len+1; iters--; j++) {
7146     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7147     //       MACC(Ra, Rb, t0, t1, t2);
7148     //       Ra = *++Pa;
7149     //       Rb = *--Pb;
7150     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7151     //       MACC(Rm, Rn, t0, t1, t2);
7152     //       Rm = *++Pm;
7153     //       Rn = *--Pn;
7154     //     }
7155 
7156     //     Pm_base[i-len] = t0;
7157     //     t0 = t1; t1 = t2; t2 = 0;
7158     //   }
7159 
7160     //   while (t0)
7161     //     t0 = sub(Pm_base, Pn_base, t0, len);
7162     // }
7163 
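    // In the C sketches above and below, MACC(A, B, T0, T1, T2) denotes
    // a multiply-accumulate into the triple-precision accumulator
    // T0:T1:T2 (cf. the acc() helper above): the full 128-bit product
    // A*B is added to T0:T1 and any carry propagates into T2.  MACC2
    // adds the product twice.  A minimal sketch, assuming a compiler
    // that provides unsigned __int128 (these macros are not defined in
    // this file):
    //
    //   #define MACC(A, B, T0, T1, T2)                                    \
    //     do {                                                            \
    //       unsigned __int128 p = (unsigned __int128)(A) * (B);           \
    //       unsigned __int128 s = (unsigned __int128)(T0) + (julong)p;    \
    //       (T0) = (julong)s;                                             \
    //       s = (unsigned __int128)(T1) + (julong)(p >> 64)               \
    //           + (julong)(s >> 64);                                      \
    //       (T1) = (julong)s;                                             \
    //       (T2) += (julong)(s >> 64);                                    \
    //     } while (0)
    //
    //   #define MACC2(A, B, T0, T1, T2)                                   \
    //     do { MACC(A, B, T0, T1, T2); MACC(A, B, T0, T1, T2); } while (0)
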
7164     /**
7165      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7166      * multiplies than Montgomery multiplication, so it should be up to
7167      * 25% faster.  However, its loop control is more complex and it
7168      * may actually run slower on some machines.
7169      *
7170      * Arguments:
7171      *
7172      * Inputs:
7173      *   c_rarg0   - int array elements a
7174      *   c_rarg1   - int array elements n (the modulus)
7175      *   c_rarg2   - int length
7176      *   c_rarg3   - int inv
7177      *   c_rarg4   - int array elements m (the result)
7178      *
7179      */
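    // A rough count behind the "25% fewer" claim: a Montgomery multiply
    // of len words performs about len^2 word multiplies for a*b plus
    // another len^2 for the m*n reduction.  When squaring, each cross
    // term a[j]*a[i-j] (j != i-j) is computed once and accumulated
    // twice (MACC2 in the sketch below), so the a-side drops to roughly
    // len^2/2 multiplies while the reduction side is unchanged: about
    // 1.5*len^2 versus 2*len^2, i.e. asymptotically 25% fewer.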
7180     address generate_square() {
7181       Label argh;
7182       bind(argh);
7183       stop("MontgomeryMultiply total_allocation must be <= 8192");
7184 
7185       align(CodeEntryAlignment);
7186       address entry = pc();
7187 
7188       enter();
7189 
7190       // Make room.
7191       cmpw(Rlen, 512);
7192       br(Assembler::HI, argh);
7193       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7194       andr(sp, Ra, -2 * wordSize);
7195 
7196       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7197 
7198       {
7199         // Copy input args, reversing as we go.  We use Ra as a
7200         // temporary variable.
7201         reverse(Ra, Pa_base, Rlen, t0, t1);
7202         reverse(Ra, Pn_base, Rlen, t0, t1);
7203       }
7204 
7205       // Push all call-saved registers, and also Pm_base, which we'll
7206       // need at the end.
7207       save_regs();
7208 
7209       mov(Pm_base, Ra);
7210 
7211       mov(t0, zr);
7212       mov(t1, zr);
7213       mov(t2, zr);
7214 
7215       block_comment("for (int i = 0; i < len; i++) {");
7216       mov(Ri, zr); {
7217         Label loop, end;
7218         bind(loop);
7219         cmp(Ri, Rlen);
7220         br(Assembler::GE, end);
7221 
7222         pre1(Ri);
7223 
7224         block_comment("for (j = (i+1)/2; j; j--) {"); {
7225           add(Rj, Ri, 1);
7226           lsr(Rj, Rj, 1);
7227           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7228         } block_comment("  } // j");
7229 
7230         last_squaring(Ri);
7231 
7232         block_comment("  for (j = i/2; j; j--) {"); {
7233           lsr(Rj, Ri, 1);
7234           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7235         } block_comment("  } // j");
7236 
7237         post1_squaring();
7238         add(Ri, Ri, 1);
7239         cmp(Ri, Rlen);
7240         br(Assembler::LT, loop);
7241 
7242         bind(end);
7243         block_comment("} // i");
7244       }
7245 
7246       block_comment("for (int i = len; i < 2*len; i++) {");
7247       mov(Ri, Rlen); {
7248         Label loop, end;
7249         bind(loop);
7250         cmp(Ri, Rlen, Assembler::LSL, 1);
7251         br(Assembler::GE, end);
7252 
7253         pre2(Ri, Rlen);
7254 
7255         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7256           lsl(Rj, Rlen, 1);
7257           sub(Rj, Rj, Ri);
7258           sub(Rj, Rj, 1);
7259           lsr(Rj, Rj, 1);
7260           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7261         } block_comment("  } // j");
7262 
7263         last_squaring(Ri);
7264 
7265         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7266           lsl(Rj, Rlen, 1);
7267           sub(Rj, Rj, Ri);
7268           lsr(Rj, Rj, 1);
7269           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7270         } block_comment("  } // j");
7271 
7272         post2(Ri, Rlen);
7273         add(Ri, Ri, 1);
7274         cmp(Ri, Rlen, Assembler::LSL, 1);
7275 
7276         br(Assembler::LT, loop);
7277         bind(end);
7278         block_comment("} // i");
7279       }
7280 
7281       normalize(Rlen);
7282 
7283       mov(Ra, Pm_base);  // Save Pm_base in Ra
7284       restore_regs();  // Restore caller's Pm_base
7285 
7286       // Copy our result into caller's Pm_base
7287       reverse(Pm_base, Ra, Rlen, t0, t1);
7288 
7289       leave();
7290       ret(lr);
7291 
7292       return entry;
7293     }
7294     // In C, approximately:
7295 
7296     // void
7297     // montgomery_square(julong Pa_base[], julong Pn_base[],
7298     //                   julong Pm_base[], julong inv, int len) {
7299     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7300     //   julong *Pa, *Pb, *Pn, *Pm;
7301     //   julong Ra, Rb, Rn, Rm;
7302 
7303     //   int i;
7304 
7305     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7306 
7307     //   for (i = 0; i < len; i++) {
7308     //     int j;
7309 
7310     //     Pa = Pa_base;
7311     //     Pb = Pa_base + i;
7312     //     Pm = Pm_base;
7313     //     Pn = Pn_base + i;
7314 
7315     //     Ra = *Pa;
7316     //     Rb = *Pb;
7317     //     Rm = *Pm;
7318     //     Rn = *Pn;
7319 
7320     //     int iters = (i+1)/2;
7321     //     for (j = 0; iters--; j++) {
7322     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7323     //       MACC2(Ra, Rb, t0, t1, t2);
7324     //       Ra = *++Pa;
7325     //       Rb = *--Pb;
7326     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7327     //       MACC(Rm, Rn, t0, t1, t2);
7328     //       Rm = *++Pm;
7329     //       Rn = *--Pn;
7330     //     }
7331     //     if ((i & 1) == 0) {
7332     //       assert(Ra == Pa_base[j], "must be");
7333     //       MACC(Ra, Ra, t0, t1, t2);
7334     //     }
7335     //     iters = i/2;
7336     //     assert(iters == i-j, "must be");
7337     //     for (; iters--; j++) {
7338     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7339     //       MACC(Rm, Rn, t0, t1, t2);
7340     //       Rm = *++Pm;
7341     //       Rn = *--Pn;
7342     //     }
7343 
7344     //     *Pm = Rm = t0 * inv;
7345     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7346     //     MACC(Rm, Rn, t0, t1, t2);
7347 
7348     //     assert(t0 == 0, "broken Montgomery multiply");
7349 
7350     //     t0 = t1; t1 = t2; t2 = 0;
7351     //   }
7352 
7353     //   for (i = len; i < 2*len; i++) {
7354     //     int start = i-len+1;
7355     //     int end = start + (len - start)/2;
7356     //     int j;
7357 
7358     //     Pa = Pa_base + i-len;
7359     //     Pb = Pa_base + len;
7360     //     Pm = Pm_base + i-len;
7361     //     Pn = Pn_base + len;
7362 
7363     //     Ra = *++Pa;
7364     //     Rb = *--Pb;
7365     //     Rm = *++Pm;
7366     //     Rn = *--Pn;
7367 
7368     //     int iters = (2*len-i-1)/2;
7369     //     assert(iters == end-start, "must be");
7370     //     for (j = start; iters--; j++) {
7371     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7372     //       MACC2(Ra, Rb, t0, t1, t2);
7373     //       Ra = *++Pa;
7374     //       Rb = *--Pb;
7375     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7376     //       MACC(Rm, Rn, t0, t1, t2);
7377     //       Rm = *++Pm;
7378     //       Rn = *--Pn;
7379     //     }
7380     //     if ((i & 1) == 0) {
7381     //       assert(Ra == Pa_base[j], "must be");
7382     //       MACC(Ra, Ra, t0, t1, t2);
7383     //     }
7384     //     iters =  (2*len-i)/2;
7385     //     assert(iters == len-j, "must be");
7386     //     for (; iters--; j++) {
7387     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7388     //       MACC(Rm, Rn, t0, t1, t2);
7389     //       Rm = *++Pm;
7390     //       Rn = *--Pn;
7391     //     }
7392     //     Pm_base[i-len] = t0;
7393     //     t0 = t1; t1 = t2; t2 = 0;
7394     //   }
7395 
7396     //   while (t0)
7397     //     t0 = sub(Pm_base, Pn_base, t0, len);
7398     // }
7399   };
7400 
7401 
7402   // Initialization
7403   void generate_initial() {
7404     // Generate initial stubs and initialize the entry points.
7405 
7406     // Entry points that exist on all platforms.  Note: this is code
7407     // that could be shared among different platforms; however, the
7408     // benefit seems to be smaller than the disadvantage of having a
7409     // much more complicated generator structure.  See also the comment
7410     // in stubRoutines.hpp.
7411 
7412     StubRoutines::_forward_exception_entry = generate_forward_exception();
7413 
7414     StubRoutines::_call_stub_entry =
7415       generate_call_stub(StubRoutines::_call_stub_return_address);
7416 
7417     // This entry is referenced by megamorphic calls.
7418     StubRoutines::_catch_exception_entry = generate_catch_exception();
7419 
7420     // Build this early so it's available for the interpreter.
7421     StubRoutines::_throw_StackOverflowError_entry =
7422       generate_throw_exception("StackOverflowError throw_exception",
7423                                CAST_FROM_FN_PTR(address,
7424                                                 SharedRuntime::throw_StackOverflowError));
7425     StubRoutines::_throw_delayed_StackOverflowError_entry =
7426       generate_throw_exception("delayed StackOverflowError throw_exception",
7427                                CAST_FROM_FN_PTR(address,
7428                                                 SharedRuntime::throw_delayed_StackOverflowError));
7429     if (UseCRC32Intrinsics) {
7430       // Set the table address before generating the stubs, which use it.
7431       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7432       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7433     }
7434 
7435     if (UseCRC32CIntrinsics) {
7436       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7437     }
7438 
7439     // Disabled until JDK-8210858 is fixed
7440     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7441     //   StubRoutines::_dlog = generate_dlog();
7442     // }
7443 
7444     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7445       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7446     }
7447 
7448     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7449       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7450     }
7451   }
7452 
7453   void generate_all() {
7454     // support for verify_oop (must happen after universe_init)
7455     if (VerifyOops) {
7456       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7457     }
7458     StubRoutines::_throw_AbstractMethodError_entry =
7459       generate_throw_exception("AbstractMethodError throw_exception",
7460                                CAST_FROM_FN_PTR(address,
7461                                                 SharedRuntime::
7462                                                 throw_AbstractMethodError));
7463 
7464     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7465       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7466                                CAST_FROM_FN_PTR(address,
7467                                                 SharedRuntime::
7468                                                 throw_IncompatibleClassChangeError));
7469 
7470     StubRoutines::_throw_NullPointerException_at_call_entry =
7471       generate_throw_exception("NullPointerException at call throw_exception",
7472                                CAST_FROM_FN_PTR(address,
7473                                                 SharedRuntime::
7474                                                 throw_NullPointerException_at_call));
7475 
7476     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7477 
7478     // arraycopy stubs used by compilers
7479     generate_arraycopy_stubs();
7480 
7481     // countPositives stub for large arrays.
7482     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7483 
7484     // array equals stub for large arrays.
7485     if (!UseSimpleArrayEquals) {
7486       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7487     }
7488 
7489     generate_compare_long_strings();
7490 
7491     generate_string_indexof_stubs();
7492 
7493     // byte_array_inflate stub for large arrays.
7494     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7495 
7496     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7497     if (bs_nm != NULL) {
7498       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7499     }
7500 #ifdef COMPILER2
7501     if (UseMultiplyToLenIntrinsic) {
7502       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7503     }
7504 
7505     if (UseSquareToLenIntrinsic) {
7506       StubRoutines::_squareToLen = generate_squareToLen();
7507     }
7508 
7509     if (UseMulAddIntrinsic) {
7510       StubRoutines::_mulAdd = generate_mulAdd();
7511     }
7512 
7513     if (UseSIMDForBigIntegerShiftIntrinsics) {
7514       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7515       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7516     }
7517 
7518     if (UseMontgomeryMultiplyIntrinsic) {
7519       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7520       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7521       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7522     }
7523 
7524     if (UseMontgomerySquareIntrinsic) {
7525       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7526       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7527       // We use generate_multiply() rather than generate_square()
7528       // because it's faster for the sizes of modulus we care about.
7529       StubRoutines::_montgomerySquare = g.generate_multiply();
7530     }
7531 #endif // COMPILER2
7532 
7533     if (UseBASE64Intrinsics) {
7534         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7535         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7536     }
7537 
7538     // data cache line writeback
7539     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7540     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7541 
7542     if (UseAESIntrinsics) {
7543       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7544       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7545       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7546       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7547       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7548     }
7549     if (UseGHASHIntrinsics) {
7550       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7551       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7552     }
7553     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7554       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7555     }
7556 
7557     if (UseMD5Intrinsics) {
7558       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7559       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7560     }
7561     if (UseSHA1Intrinsics) {
7562       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7563       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7564     }
7565     if (UseSHA256Intrinsics) {
7566       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7567       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7568     }
7569     if (UseSHA512Intrinsics) {
7570       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7571       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7572     }
7573     if (UseSHA3Intrinsics) {
7574       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7575       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7576     }
7577 
7578     // generate Adler32 intrinsics code
7579     if (UseAdler32Intrinsics) {
7580       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7581     }
7582 
7583     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7584 
7585 #ifdef LINUX
7586 
7587     generate_atomic_entry_points();
7588 
7589 #endif // LINUX
7590 
7591     StubRoutines::aarch64::set_completed();
7592   }
7593 
7594  public:
7595   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7596     if (all) {
7597       generate_all();
7598     } else {
7599       generate_initial();
7600     }
7601   }
7602 }; // end class declaration
7603 
7604 #define UCM_TABLE_MAX_ENTRIES 8
7605 void StubGenerator_generate(CodeBuffer* code, bool all) {
7606   if (UnsafeCopyMemory::_table == NULL) {
7607     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7608   }
7609   StubGenerator g(code, all);
7610 }
7611 
7612 
7613 #ifdef LINUX
7614 
7615 // Define pointers to atomic stubs and initialize them to point to the
7616 // code in atomic_aarch64.S.
7617 
7618 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7619   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7620     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7621   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7622     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
7623 
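// As a concrete illustration, mechanically expanding the macro above,
// DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) produces approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// i.e. a declaration of the default implementation provided by
// atomic_aarch64.S and a stub pointer initialized to it, which the
// entry points generated by generate_atomic_entry_points() presumably
// replace at stub-generation time.
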
7624 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7625 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7626 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7627 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7628 DEFAULT_ATOMIC_OP(xchg, 4, )
7629 DEFAULT_ATOMIC_OP(xchg, 8, )
7630 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7631 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7632 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7633 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7634 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7635 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7636 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7637 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7638 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7639 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7640 
7641 #undef DEFAULT_ATOMIC_OP
7642 
7643 #endif // LINUX
--- EOF ---