1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 
  70 #ifdef PRODUCT
  71 #define BLOCK_COMMENT(str) /* nothing */
  72 #else
  73 #define BLOCK_COMMENT(str) __ block_comment(str)
  74 #endif
  75 
  76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  77 
  78 // Stub Code definitions
  79 
  80 class StubGenerator: public StubCodeGenerator {
  81  private:
  82 
  83 #ifdef PRODUCT
  84 #define inc_counter_np(counter) ((void)0)
  85 #else
  86   void inc_counter_np_(int& counter) {
  87     __ lea(rscratch2, ExternalAddress((address)&counter));
  88     __ ldrw(rscratch1, Address(rscratch2));
  89     __ addw(rscratch1, rscratch1, 1);
  90     __ strw(rscratch1, Address(rscratch2));
  91   }
  92 #define inc_counter_np(counter) \
  93   BLOCK_COMMENT("inc_counter " #counter); \
  94   inc_counter_np_(counter);
  95 #endif
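  // For example, the array-copy stubs later in this file bump their
  // SharedRuntime counters with something like
  //   inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
  // which, in non-PRODUCT builds, expands to a load/increment/store of
  // the counter through rscratch1/rscratch2 as above.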
  96 
  97   // Call stubs are used to call Java from C
  98   //
  99   // Arguments:
 100   //    c_rarg0:   call wrapper address                   address
 101   //    c_rarg1:   result                                 address
 102   //    c_rarg2:   result type                            BasicType
 103   //    c_rarg3:   method                                 Method*
 104   //    c_rarg4:   (interpreter) entry point              address
 105   //    c_rarg5:   parameters                             intptr_t*
 106   //    c_rarg6:   parameter size (in words)              int
 107   //    c_rarg7:   thread                                 Thread*
 108   //
 109   // There is no return from the stub itself as any Java result
 110   // is written to result
 111   //
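  // For reference, the C++ caller reaches this stub through a function
  // pointer whose parameter list matches the arguments above; roughly
  // (see the CallStub typedef in stubRoutines.hpp, which may differ in
  // detail):
  //
  //   typedef void (*CallStub)(
  //     address   link,               // c_rarg0: call wrapper
  //     intptr_t* result,             // c_rarg1: result address
  //     BasicType result_type,        // c_rarg2
  //     Method*   method,             // c_rarg3
  //     address   entry_point,        // c_rarg4: interpreter entry
  //     intptr_t* parameters,         // c_rarg5
  //     int       size_of_parameters, // c_rarg6: parameter size in words
  //     TRAPS);                       // c_rarg7: current thread
  //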
 112   // we save r30 (lr) as the return PC at the base of the frame and
 113   // link r29 (fp) below it as the frame pointer installing sp (r31)
 114   // into fp.
 115   //
 116   // we save r0-r7, which accounts for all the c arguments.
 117   //
 118   // TODO: strictly do we need to save them all? they are treated as
 119   // volatile by C so could we omit saving the ones we are going to
 120   // place in global registers (thread? method?) or those we only use
 121   // during setup of the Java call?
 122   //
 123   // we don't need to save r8 which C uses as an indirect result location
 124   // return register.
 125   //
 126   // we don't need to save r9-r15 which both C and Java treat as
 127   // volatile
 128   //
 129   // we don't need to save r16-18 because Java does not use them
 130   //
 131   // we save r19-r28 which Java uses as scratch registers and C
 132   // expects to be callee-save
 133   //
 134   // we save the bottom 64 bits of each value stored in v8-v15; it is
 135   // the responsibility of the caller to preserve larger values.
 136   //
 137   // so the stub frame looks like this when we enter Java code
 138   //
 139   //     [ return_from_Java     ] <--- sp
 140   //     [ argument word n      ]
 141   //      ...
 142   // -27 [ argument word 1      ]
 143   // -26 [ saved v15            ] <--- sp_after_call
 144   // -25 [ saved v14            ]
 145   // -24 [ saved v13            ]
 146   // -23 [ saved v12            ]
 147   // -22 [ saved v11            ]
 148   // -21 [ saved v10            ]
 149   // -20 [ saved v9             ]
 150   // -19 [ saved v8             ]
 151   // -18 [ saved r28            ]
 152   // -17 [ saved r27            ]
 153   // -16 [ saved r26            ]
 154   // -15 [ saved r25            ]
 155   // -14 [ saved r24            ]
 156   // -13 [ saved r23            ]
 157   // -12 [ saved r22            ]
 158   // -11 [ saved r21            ]
 159   // -10 [ saved r20            ]
 160   //  -9 [ saved r19            ]
 161   //  -8 [ call wrapper    (r0) ]
 162   //  -7 [ result          (r1) ]
 163   //  -6 [ result type     (r2) ]
 164   //  -5 [ method          (r3) ]
 165   //  -4 [ entry point     (r4) ]
 166   //  -3 [ parameters      (r5) ]
 167   //  -2 [ parameter size  (r6) ]
 168   //  -1 [ thread (r7)          ]
 169   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 170   //   1 [ saved lr       (r30) ]
 171 
 172   // Call stub stack layout word offsets from fp
 173   enum call_stub_layout {
 174     sp_after_call_off = -26,
 175 
 176     d15_off            = -26,
 177     d13_off            = -24,
 178     d11_off            = -22,
 179     d9_off             = -20,
 180 
 181     r28_off            = -18,
 182     r26_off            = -16,
 183     r24_off            = -14,
 184     r22_off            = -12,
 185     r20_off            = -10,
 186     call_wrapper_off   =  -8,
 187     result_off         =  -7,
 188     result_type_off    =  -6,
 189     method_off         =  -5,
 190     entry_point_off    =  -4,
 191     parameter_size_off =  -2,
 192     thread_off         =  -1,
 193     fp_f               =   0,
 194     retaddr_off        =   1,
 195   };
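  // A quick check of the offset arithmetic (wordSize == 8 on aarch64):
  // the thread slot lives at [rfp + thread_off * wordSize] == [rfp - 8],
  // the lowest save slot (d15) at [rfp - 26 * 8], and the 26 slots below
  // rfp plus the saved fp at offset 0 give the 27 words that the assert
  // in generate_call_stub below checks against
  // frame::entry_frame_after_call_words.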
 196 
 197   address generate_call_stub(address& return_address) {
 198     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 199            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 200            "adjust this code");
 201 
 202     StubCodeMark mark(this, "StubRoutines", "call_stub");
 203     address start = __ pc();
 204 
 205     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 206 
 207     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 208     const Address result        (rfp, result_off         * wordSize);
 209     const Address result_type   (rfp, result_type_off    * wordSize);
 210     const Address method        (rfp, method_off         * wordSize);
 211     const Address entry_point   (rfp, entry_point_off    * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d13_save      (rfp, d13_off * wordSize);
 218     const Address d11_save      (rfp, d11_off * wordSize);
 219     const Address d9_save       (rfp, d9_off * wordSize);
 220 
 221     const Address r28_save      (rfp, r28_off * wordSize);
 222     const Address r26_save      (rfp, r26_off * wordSize);
 223     const Address r24_save      (rfp, r24_off * wordSize);
 224     const Address r22_save      (rfp, r22_off * wordSize);
 225     const Address r20_save      (rfp, r20_off * wordSize);
 226 
 227     // stub code
 228 
 229     address aarch64_entry = __ pc();
 230 
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (u1)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
 294     // call Java entry -- passing Method* and current sp
 295     //      rmethod: Method*
 296     //      r19_sender_sp: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r19_sender_sp, sp);
 299     __ blr(c_rarg4);
 300 
 301     // we do this here because the notify will already have been done
 302     // if we get to the next instruction via an exception
 303     //
 304     // n.b. adding this instruction here affects the calculation of
 305     // whether or not a routine returns to the call stub (used when
 306     // doing stack walks) since the normal test is to check the return
 307     // pc against the address saved below. so we may need to allow for
 308     // this extra instruction in the check.
 309 
 310     // save current address for use by exception handling code
 311 
 312     return_address = __ pc();
 313 
 314     // store result depending on type (everything that is not
 315     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 316     // n.b. this assumes Java returns an integral result in r0
 317     // and a floating result in j_farg0
 318     __ ldr(j_rarg2, result);
 319     Label is_long, is_float, is_double, exit;
 320     __ ldr(j_rarg1, result_type);
 321     __ cmp(j_rarg1, (u1)T_OBJECT);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_LONG);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, (u1)T_FLOAT);
 326     __ br(Assembler::EQ, is_float);
 327     __ cmp(j_rarg1, (u1)T_DOUBLE);
 328     __ br(Assembler::EQ, is_double);
 329 
 330     // handle T_INT case
 331     __ strw(r0, Address(j_rarg2));
 332 
 333     __ BIND(exit);
 334 
 335     // pop parameters
 336     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 337 
 338 #ifdef ASSERT
 339     // verify that threads correspond
 340     {
 341       Label L, S;
 342       __ ldr(rscratch1, thread);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::NE, S);
 345       __ get_thread(rscratch1);
 346       __ cmp(rthread, rscratch1);
 347       __ br(Assembler::EQ, L);
 348       __ BIND(S);
 349       __ stop("StubRoutines::call_stub: threads must correspond");
 350       __ BIND(L);
 351     }
 352 #endif
 353 
 354     __ pop_cont_fastpath(rthread);
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374     // leave frame and return to caller
 375     __ leave();
 376     __ ret(lr);
 377 
 378     // handle return types different from T_INT
 379 
 380     __ BIND(is_long);
 381     __ str(r0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_float);
 385     __ strs(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     __ BIND(is_double);
 389     __ strd(j_farg0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     return start;
 393   }
 394 
 395   // Return point for a Java call if there's an exception thrown in
 396   // Java code.  The exception is caught and transformed into a
 397   // pending exception stored in JavaThread that can be tested from
 398   // within the VM.
 399   //
 400   // Note: Usually the parameters are removed by the callee. In case
 401   // of an exception crossing an activation frame boundary, that is
 402   // not the case if the callee is compiled code => we need to set up
 403   // the stack pointer.
 404   //
 405   // r0: exception oop
 406 
 407   address generate_catch_exception() {
 408     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 409     address start = __ pc();
 410 
 411     // same as in generate_call_stub():
 412     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 413     const Address thread        (rfp, thread_off         * wordSize);
 414 
 415 #ifdef ASSERT
 416     // verify that threads correspond
 417     {
 418       Label L, S;
 419       __ ldr(rscratch1, thread);
 420       __ cmp(rthread, rscratch1);
 421       __ br(Assembler::NE, S);
 422       __ get_thread(rscratch1);
 423       __ cmp(rthread, rscratch1);
 424       __ br(Assembler::EQ, L);
 425       __ bind(S);
 426       __ stop("StubRoutines::catch_exception: threads must correspond");
 427       __ bind(L);
 428     }
 429 #endif
 430 
 431     // set pending exception
 432     __ verify_oop(r0);
 433 
 434     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 435     __ mov(rscratch1, (address)__FILE__);
 436     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 437     __ movw(rscratch1, (int)__LINE__);
 438     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 439 
 440     // complete return to VM
 441     assert(StubRoutines::_call_stub_return_address != NULL,
 442            "_call_stub_return_address must have been generated before");
 443     __ b(StubRoutines::_call_stub_return_address);
 444 
 445     return start;
 446   }
 447 
 448   // Continuation point for runtime calls returning with a pending
 449   // exception.  The pending exception check happened in the runtime
 450   // or native call stub.  The pending exception in Thread is
 451   // converted into a Java-level exception.
 452   //
 453   // Contract with Java-level exception handlers:
 454   // r0: exception
 455   // r3: throwing pc
 456   //
 457   // NOTE: At entry of this stub, exception-pc must be in LR !!
 458 
 459   // NOTE: this is always used as a jump target within generated code
 460   // so it just needs to be generated code with no prolog
 461 
 462   address generate_forward_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "forward exception");
 464     address start = __ pc();
 465 
 466     // Upon entry, LR points to the return address returning into
 467     // Java (interpreted or compiled) code; i.e., the return address
 468     // becomes the throwing pc.
 469     //
 470     // Arguments pushed before the runtime call are still on the stack
 471     // but the exception handler will reset the stack pointer ->
 472     // ignore them.  A potential result in registers can be ignored as
 473     // well.
 474 
 475 #ifdef ASSERT
 476     // make sure this code is only executed if there is a pending exception
 477     {
 478       Label L;
 479       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 480       __ cbnz(rscratch1, L);
 481       __ stop("StubRoutines::forward exception: no pending exception (1)");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // compute exception handler into r19
 487 
 488     // call the VM to find the handler address associated with the
 489     // caller address. pass thread in r0 and caller pc (ret address)
 490     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 491     // the stack.
 492     __ mov(c_rarg1, lr);
 493     // lr will be trashed by the VM call so we move it to R19
 494     // (callee-saved) because we also need to pass it to the handler
 495     // returned by this call.
 496     __ mov(r19, lr);
 497     BLOCK_COMMENT("call exception_handler_for_return_address");
 498     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 499                          SharedRuntime::exception_handler_for_return_address),
 500                     rthread, c_rarg1);
 501     // Reinitialize the ptrue predicate register, in case the external runtime
 502     // call clobbers ptrue reg, as we may return to SVE compiled code.
 503     __ reinitialize_ptrue();
 504 
 505     // we should not really care that lr is no longer the callee
 506     // address. we saved the value the handler needs in r19 so we can
 507     // just copy it to r3. however, the C2 handler will push its own
 508     // frame and then calls into the VM and the VM code asserts that
 509     // the PC for the frame above the handler belongs to a compiled
 510     // Java method. So, we restore lr here to satisfy that assert.
 511     __ mov(lr, r19);
 512     // setup r0 & r3 & clear pending exception
 513     __ mov(r3, r19);
 514     __ mov(r19, r0);
 515     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 516     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 517 
 518 #ifdef ASSERT
 519     // make sure exception is set
 520     {
 521       Label L;
 522       __ cbnz(r0, L);
 523       __ stop("StubRoutines::forward exception: no pending exception (2)");
 524       __ bind(L);
 525     }
 526 #endif
 527 
 528     // continue at exception handler
 529     // r0: exception
 530     // r3: throwing pc
 531     // r19: exception handler
 532     __ verify_oop(r0);
 533     __ br(r19);
 534 
 535     return start;
 536   }
 537 
 538   // Non-destructive plausibility checks for oops
 539   //
 540   // Arguments:
 541   //    r0: oop to verify
 542   //    rscratch1: error message
 543   //
 544   // Stack after saving c_rarg3:
 545   //    [tos + 0]: saved c_rarg3
 546   //    [tos + 1]: saved c_rarg2
 547   //    [tos + 2]: saved lr
 548   //    [tos + 3]: saved rscratch2
 549   //    [tos + 4]: saved r0
 550   //    [tos + 5]: saved rscratch1
 551   address generate_verify_oop() {
 552 
 553     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 554     address start = __ pc();
 555 
 556     Label exit, error;
 557 
 558     // save c_rarg2 and c_rarg3
 559     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 560 
 561     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 562     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 563     __ ldr(c_rarg3, Address(c_rarg2));
 564     __ add(c_rarg3, c_rarg3, 1);
 565     __ str(c_rarg3, Address(c_rarg2));
 566 
 567     // object is in r0
 568     // make sure object is 'reasonable'
 569     __ cbz(r0, exit); // if obj is NULL it is OK
 570 
 571     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 572     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 573 
 574     // return if everything seems ok
 575     __ bind(exit);
 576 
 577     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 578     __ ret(lr);
 579 
 580     // handle errors
 581     __ bind(error);
 582     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 583 
 584     __ push(RegSet::range(r0, r29), sp);
 585     // debug(char* msg, int64_t pc, int64_t regs[])
 586     __ mov(c_rarg0, rscratch1);      // pass address of error message
 587     __ mov(c_rarg1, lr);             // pass return address
 588     __ mov(c_rarg2, sp);             // pass address of regs on stack
 589 #ifndef PRODUCT
 590     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 591 #endif
 592     BLOCK_COMMENT("call MacroAssembler::debug");
 593     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 594     __ blr(rscratch1);
 595     __ hlt(0);
 596 
 597     return start;
 598   }
 599 
 600   // Generate indices for iota vector.
 601   address generate_iota_indices(const char *stub_name) {
 602     __ align(CodeEntryAlignment);
 603     StubCodeMark mark(this, "StubRoutines", stub_name);
 604     address start = __ pc();
 605     // B
 606     __ emit_data64(0x0706050403020100, relocInfo::none);
 607     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 608     // H
 609     __ emit_data64(0x0003000200010000, relocInfo::none);
 610     __ emit_data64(0x0007000600050004, relocInfo::none);
 611     // S
 612     __ emit_data64(0x0000000100000000, relocInfo::none);
 613     __ emit_data64(0x0000000300000002, relocInfo::none);
 614     // D
 615     __ emit_data64(0x0000000000000000, relocInfo::none);
 616     __ emit_data64(0x0000000000000001, relocInfo::none);
 617     // S - FP
 618     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 619     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 620     // D - FP
 621     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 622     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 623     return start;
 624   }
 625 
 626   // The inner part of zero_words().  This is the bulk operation,
 627   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 628   // caller is responsible for zeroing the last few words.
 629   //
 630   // Inputs:
 631   // r10: the HeapWord-aligned base address of an array to zero.
 632   // r11: the count in HeapWords, r11 > 0.
 633   //
 634   // Returns r10 and r11, adjusted for the caller to clear.
 635   // r10: the base address of the tail of words left to clear.
 636   // r11: the number of words in the tail.
 637   //      r11 < MacroAssembler::zero_words_block_size.
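  // For example (a sketch assuming UseBlockZeroing is disabled and
  // MacroAssembler::zero_words_block_size == 8): a call with r11 == 1003
  // clears 1000 words with the unrolled stp loop below and returns with
  // r10 advanced by 1000 words and r11 == 3, leaving the last 3 words
  // for the caller.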
 638 
 639   address generate_zero_blocks() {
 640     Label done;
 641     Label base_aligned;
 642 
 643     Register base = r10, cnt = r11;
 644 
 645     __ align(CodeEntryAlignment);
 646     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 647     address start = __ pc();
 648 
 649     if (UseBlockZeroing) {
 650       int zva_length = VM_Version::zva_length();
 651 
 652       // Ensure ZVA length can be divided by 16. This is required by
 653       // the subsequent operations.
 654       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 655 
 656       __ tbz(base, 3, base_aligned);
 657       __ str(zr, Address(__ post(base, 8)));
 658       __ sub(cnt, cnt, 1);
 659       __ bind(base_aligned);
 660 
 661       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 662       // alignment.
 663       Label small;
 664       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 665       __ subs(rscratch1, cnt, low_limit >> 3);
 666       __ br(Assembler::LT, small);
 667       __ zero_dcache_blocks(base, cnt);
 668       __ bind(small);
 669     }
 670 
 671     {
 672       // Number of stp instructions we'll unroll
 673       const int unroll =
 674         MacroAssembler::zero_words_block_size / 2;
 675       // Clear the remaining blocks.
 676       Label loop;
 677       __ subs(cnt, cnt, unroll * 2);
 678       __ br(Assembler::LT, done);
 679       __ bind(loop);
 680       for (int i = 0; i < unroll; i++)
 681         __ stp(zr, zr, __ post(base, 16));
 682       __ subs(cnt, cnt, unroll * 2);
 683       __ br(Assembler::GE, loop);
 684       __ bind(done);
 685       __ add(cnt, cnt, unroll * 2);
 686     }
 687 
 688     __ ret(lr);
 689 
 690     return start;
 691   }
 692 
 693 
 694   typedef enum {
 695     copy_forwards = 1,
 696     copy_backwards = -1
 697   } copy_direction;
 698 
 699   // Bulk copy of blocks of 8 words.
 700   //
 701   // count is a count of words.
 702   //
 703   // Precondition: count >= 8
 704   //
 705   // Postconditions:
 706   //
 707   // The least significant bit of count contains the remaining count
 708   // of words to copy.  The rest of count is trash.
 709   //
 710   // s and d are adjusted to point to the remaining words to copy
 711   //
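  // For example (forward copy, a sketch): with count == 12 the initial
  // 8-register fill is written out by the drain code (8 words), bit 2 of
  // the decremented count then triggers the 4-word tail copy, and bits 1
  // and 0 are clear, so the caller has nothing left to do; with
  // count == 13 the same 12 words are copied and bit 0 is left set,
  // telling the caller one word remains.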
 712   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 713                            copy_direction direction) {
 714     int unit = wordSize * direction;
 715     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 716 
 717     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 718       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 719     const Register stride = r13;
 720 
 721     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 722     assert_different_registers(s, d, count, rscratch1);
 723 
 724     Label again, drain;
 725     const char *stub_name;
 726     if (direction == copy_forwards)
 727       stub_name = "forward_copy_longs";
 728     else
 729       stub_name = "backward_copy_longs";
 730 
 731     __ align(CodeEntryAlignment);
 732 
 733     StubCodeMark mark(this, "StubRoutines", stub_name);
 734 
 735     __ bind(start);
 736 
 737     Label unaligned_copy_long;
 738     if (AvoidUnalignedAccesses) {
 739       __ tbnz(d, 3, unaligned_copy_long);
 740     }
 741 
 742     if (direction == copy_forwards) {
 743       __ sub(s, s, bias);
 744       __ sub(d, d, bias);
 745     }
 746 
 747 #ifdef ASSERT
 748     // Make sure we are never given < 8 words
 749     {
 750       Label L;
 751       __ cmp(count, (u1)8);
 752       __ br(Assembler::GE, L);
 753       __ stop("generate_copy_longs called with < 8 words");
 754       __ bind(L);
 755     }
 756 #endif
 757 
 758     // Fill 8 registers
 759     if (UseSIMDForMemoryOps) {
 760       __ ldpq(v0, v1, Address(s, 4 * unit));
 761       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 762     } else {
 763       __ ldp(t0, t1, Address(s, 2 * unit));
 764       __ ldp(t2, t3, Address(s, 4 * unit));
 765       __ ldp(t4, t5, Address(s, 6 * unit));
 766       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 767     }
 768 
 769     __ subs(count, count, 16);
 770     __ br(Assembler::LO, drain);
 771 
 772     int prefetch = PrefetchCopyIntervalInBytes;
 773     bool use_stride = false;
 774     if (direction == copy_backwards) {
 775        use_stride = prefetch > 256;
 776        prefetch = -prefetch;
 777        if (use_stride) __ mov(stride, prefetch);
 778     }
 779 
 780     __ bind(again);
 781 
 782     if (PrefetchCopyIntervalInBytes > 0)
 783       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 784 
 785     if (UseSIMDForMemoryOps) {
 786       __ stpq(v0, v1, Address(d, 4 * unit));
 787       __ ldpq(v0, v1, Address(s, 4 * unit));
 788       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 789       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 790     } else {
 791       __ stp(t0, t1, Address(d, 2 * unit));
 792       __ ldp(t0, t1, Address(s, 2 * unit));
 793       __ stp(t2, t3, Address(d, 4 * unit));
 794       __ ldp(t2, t3, Address(s, 4 * unit));
 795       __ stp(t4, t5, Address(d, 6 * unit));
 796       __ ldp(t4, t5, Address(s, 6 * unit));
 797       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 798       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 799     }
 800 
 801     __ subs(count, count, 8);
 802     __ br(Assembler::HS, again);
 803 
 804     // Drain
 805     __ bind(drain);
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 809     } else {
 810       __ stp(t0, t1, Address(d, 2 * unit));
 811       __ stp(t2, t3, Address(d, 4 * unit));
 812       __ stp(t4, t5, Address(d, 6 * unit));
 813       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 814     }
 815 
 816     {
 817       Label L1, L2;
 818       __ tbz(count, exact_log2(4), L1);
 819       if (UseSIMDForMemoryOps) {
 820         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 821         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 822       } else {
 823         __ ldp(t0, t1, Address(s, 2 * unit));
 824         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 825         __ stp(t0, t1, Address(d, 2 * unit));
 826         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 827       }
 828       __ bind(L1);
 829 
 830       if (direction == copy_forwards) {
 831         __ add(s, s, bias);
 832         __ add(d, d, bias);
 833       }
 834 
 835       __ tbz(count, 1, L2);
 836       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 837       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 838       __ bind(L2);
 839     }
 840 
 841     __ ret(lr);
 842 
 843     if (AvoidUnalignedAccesses) {
 844       Label drain, again;
 845       // Register order for storing. Order is different for backward copy.
 846 
 847       __ bind(unaligned_copy_long);
 848 
 849       // source address is even aligned, target odd aligned
 850       //
 851       // when forward copying word pairs we read long pairs at offsets
 852       // {0, 2, 4, 6} (in long words). when backwards copying we read
 853       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 854       // address by -2 in the forwards case so we can compute the
 855       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 856       // or -1.
 857       //
 858       // when forward copying we need to store 1 word, 3 pairs and
 859       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 860       // zero offset we adjust the destination by -1, which means we
 861       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 862       //
 863       // when backwards copying we need to store 1 word, 3 pairs and
 864       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 865       // offsets {1, 3, 5, 7, 8} * unit.
 866 
 867       if (direction == copy_forwards) {
 868         __ sub(s, s, 16);
 869         __ sub(d, d, 8);
 870       }
 871 
 872       // Fill 8 registers
 873       //
 874       // for forwards copy s was offset by -16 from the original input
 875       // value of s so the register contents are at these offsets
 876       // relative to the 64 byte block addressed by that original input
 877       // and so on for each successive 64 byte block when s is updated
 878       //
 879       // t0 at offset 0,  t1 at offset 8
 880       // t2 at offset 16, t3 at offset 24
 881       // t4 at offset 32, t5 at offset 40
 882       // t6 at offset 48, t7 at offset 56
 883 
 884       // for backwards copy s was not offset so the register contents
 885       // are at these offsets into the preceding 64 byte block
 886       // relative to that original input and so on for each successive
 887       // preceding 64 byte block when s is updated. this explains the
 888       // slightly counter-intuitive looking pattern of register usage
 889       // in the stp instructions for backwards copy.
 890       //
 891       // t0 at offset -16, t1 at offset -8
 892       // t2 at offset -32, t3 at offset -24
 893       // t4 at offset -48, t5 at offset -40
 894       // t6 at offset -64, t7 at offset -56
 895 
 896       __ ldp(t0, t1, Address(s, 2 * unit));
 897       __ ldp(t2, t3, Address(s, 4 * unit));
 898       __ ldp(t4, t5, Address(s, 6 * unit));
 899       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 900 
 901       __ subs(count, count, 16);
 902       __ br(Assembler::LO, drain);
 903 
 904       int prefetch = PrefetchCopyIntervalInBytes;
 905       bool use_stride = false;
 906       if (direction == copy_backwards) {
 907          use_stride = prefetch > 256;
 908          prefetch = -prefetch;
 909          if (use_stride) __ mov(stride, prefetch);
 910       }
 911 
 912       __ bind(again);
 913 
 914       if (PrefetchCopyIntervalInBytes > 0)
 915         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 916 
 917       if (direction == copy_forwards) {
 918        // allowing for the offset of -8 the store instructions place
 919        // registers into the target 64 byte block at the following
 920        // offsets
 921        //
 922        // t0 at offset 0
 923        // t1 at offset 8,  t2 at offset 16
 924        // t3 at offset 24, t4 at offset 32
 925        // t5 at offset 40, t6 at offset 48
 926        // t7 at offset 56
 927 
 928         __ str(t0, Address(d, 1 * unit));
 929         __ stp(t1, t2, Address(d, 2 * unit));
 930         __ ldp(t0, t1, Address(s, 2 * unit));
 931         __ stp(t3, t4, Address(d, 4 * unit));
 932         __ ldp(t2, t3, Address(s, 4 * unit));
 933         __ stp(t5, t6, Address(d, 6 * unit));
 934         __ ldp(t4, t5, Address(s, 6 * unit));
 935         __ str(t7, Address(__ pre(d, 8 * unit)));
 936         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 937       } else {
 938        // d was not offset when we started so the registers are
 939        // written into the 64 byte block preceding d with the following
 940        // offsets
 941        //
 942        // t1 at offset -8
 943        // t3 at offset -24, t0 at offset -16
 944        // t5 at offset -40, t2 at offset -32
 945        // t7 at offset -56, t4 at offset -48
 946        //                   t6 at offset -64
 947        //
 948        // note that this matches the offsets previously noted for the
 949        // loads
 950 
 951         __ str(t1, Address(d, 1 * unit));
 952         __ stp(t3, t0, Address(d, 3 * unit));
 953         __ ldp(t0, t1, Address(s, 2 * unit));
 954         __ stp(t5, t2, Address(d, 5 * unit));
 955         __ ldp(t2, t3, Address(s, 4 * unit));
 956         __ stp(t7, t4, Address(d, 7 * unit));
 957         __ ldp(t4, t5, Address(s, 6 * unit));
 958         __ str(t6, Address(__ pre(d, 8 * unit)));
 959         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 960       }
 961 
 962       __ subs(count, count, 8);
 963       __ br(Assembler::HS, again);
 964 
 965       // Drain
 966       //
 967       // this uses the same pattern of offsets and register arguments
 968       // as above
 969       __ bind(drain);
 970       if (direction == copy_forwards) {
 971         __ str(t0, Address(d, 1 * unit));
 972         __ stp(t1, t2, Address(d, 2 * unit));
 973         __ stp(t3, t4, Address(d, 4 * unit));
 974         __ stp(t5, t6, Address(d, 6 * unit));
 975         __ str(t7, Address(__ pre(d, 8 * unit)));
 976       } else {
 977         __ str(t1, Address(d, 1 * unit));
 978         __ stp(t3, t0, Address(d, 3 * unit));
 979         __ stp(t5, t2, Address(d, 5 * unit));
 980         __ stp(t7, t4, Address(d, 7 * unit));
 981         __ str(t6, Address(__ pre(d, 8 * unit)));
 982       }
 983       // now we need to copy any remaining partial block, which may
 984       // include a 4 word subblock and/or a 2 word subblock.
 985       // bits 2 and 1 of the count tell us whether we have each such
 986       // subblock.
 987       {
 988         Label L1, L2;
 989         __ tbz(count, exact_log2(4), L1);
 990        // this is the same as above but copying only 4 longs hence
 991        // with only one intervening stp between the str instructions
 992        // but note that the offsets and registers still follow the
 993        // same pattern
 994         __ ldp(t0, t1, Address(s, 2 * unit));
 995         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 996         if (direction == copy_forwards) {
 997           __ str(t0, Address(d, 1 * unit));
 998           __ stp(t1, t2, Address(d, 2 * unit));
 999           __ str(t3, Address(__ pre(d, 4 * unit)));
1000         } else {
1001           __ str(t1, Address(d, 1 * unit));
1002           __ stp(t3, t0, Address(d, 3 * unit));
1003           __ str(t2, Address(__ pre(d, 4 * unit)));
1004         }
1005         __ bind(L1);
1006 
1007         __ tbz(count, 1, L2);
1008        // this is the same as above but copying only 2 longs hence
1009        // there is no intervening stp between the str instructions
1010        // but note that the offset and register patterns are still
1011        // the same
1012         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1013         if (direction == copy_forwards) {
1014           __ str(t0, Address(d, 1 * unit));
1015           __ str(t1, Address(__ pre(d, 2 * unit)));
1016         } else {
1017           __ str(t1, Address(d, 1 * unit));
1018           __ str(t0, Address(__ pre(d, 2 * unit)));
1019         }
1020         __ bind(L2);
1021 
1022        // for forwards copy we need to re-adjust the offsets we
1023        // applied so that s and d follow the last words written
1024 
1025        if (direction == copy_forwards) {
1026          __ add(s, s, 16);
1027          __ add(d, d, 8);
1028        }
1029 
1030       }
1031 
1032       __ ret(lr);
1033     }
1034   }
1035 
1036   // Small copy: less than 16 bytes.
1037   //
1038   // NB: Ignores all of the bits of count which represent more than 15
1039   // bytes, so a caller doesn't have to mask them.
1040 
1041   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1042     bool is_backwards = step < 0;
1043     size_t granularity = uabs(step);
1044     int direction = is_backwards ? -1 : 1;
1045     int unit = wordSize * direction;
1046 
1047     Label Lword, Lint, Lshort, Lbyte;
1048 
1049     assert(granularity
1050            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1051 
1052     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1053 
1054     // ??? I don't know if this bit-test-and-branch is the right thing
1055     // to do.  It does a lot of jumping, resulting in several
1056     // mispredicted branches.  It might make more sense to do this
1057     // with something like Duff's device with a single computed branch.
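    // To illustrate the bit tests (for a byte copy, granularity == 1):
    // count == 13 is 0b1101, so the bit 3 test copies 8 bytes, the bit 2
    // test copies 4 more, the bit 1 test copies nothing, and the bit 0
    // test copies the final byte -- 13 bytes in at most four small
    // load/store pairs.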
1058 
1059     __ tbz(count, 3 - exact_log2(granularity), Lword);
1060     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1061     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1062     __ bind(Lword);
1063 
1064     if (granularity <= sizeof (jint)) {
1065       __ tbz(count, 2 - exact_log2(granularity), Lint);
1066       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1067       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1068       __ bind(Lint);
1069     }
1070 
1071     if (granularity <= sizeof (jshort)) {
1072       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1073       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1074       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1075       __ bind(Lshort);
1076     }
1077 
1078     if (granularity <= sizeof (jbyte)) {
1079       __ tbz(count, 0, Lbyte);
1080       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1081       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1082       __ bind(Lbyte);
1083     }
1084   }
1085 
1086   Label copy_f, copy_b;
1087 
1088   // All-singing all-dancing memory copy.
1089   //
1090   // Copy count units of memory from s to d.  The size of a unit is
1091   // step, which can be positive or negative depending on the direction
1092   // of copy.  If is_aligned is false, we align the source address.
1093   //
1094 
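  // As a concrete illustration of the inline cases below: a 50 byte copy
  // takes the 33..64 branch, which loads the first 32 bytes from s and
  // the last 32 bytes ending at send (overlapping by 14 bytes) and stores
  // them at d and ending at dend, so no length-dependent loop is needed.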
1095   void copy_memory(bool is_aligned, Register s, Register d,
1096                    Register count, Register tmp, int step) {
1097     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1098     bool is_backwards = step < 0;
1099     unsigned int granularity = uabs(step);
1100     const Register t0 = r3, t1 = r4;
1101 
1102     // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction doesn't
1103     // matter because we always load all of the data before writing any of it.
1104     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1105     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1106     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1107     const Register send = r17, dend = r16;
1108 
1109     if (PrefetchCopyIntervalInBytes > 0)
1110       __ prfm(Address(s, 0), PLDL1KEEP);
1111     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1112     __ br(Assembler::HI, copy_big);
1113 
1114     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1115     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1116 
1117     __ cmp(count, u1(16/granularity));
1118     __ br(Assembler::LS, copy16);
1119 
1120     __ cmp(count, u1(64/granularity));
1121     __ br(Assembler::HI, copy80);
1122 
1123     __ cmp(count, u1(32/granularity));
1124     __ br(Assembler::LS, copy32);
1125 
1126     // 33..64 bytes
1127     if (UseSIMDForMemoryOps) {
1128       __ ldpq(v0, v1, Address(s, 0));
1129       __ ldpq(v2, v3, Address(send, -32));
1130       __ stpq(v0, v1, Address(d, 0));
1131       __ stpq(v2, v3, Address(dend, -32));
1132     } else {
1133       __ ldp(t0, t1, Address(s, 0));
1134       __ ldp(t2, t3, Address(s, 16));
1135       __ ldp(t4, t5, Address(send, -32));
1136       __ ldp(t6, t7, Address(send, -16));
1137 
1138       __ stp(t0, t1, Address(d, 0));
1139       __ stp(t2, t3, Address(d, 16));
1140       __ stp(t4, t5, Address(dend, -32));
1141       __ stp(t6, t7, Address(dend, -16));
1142     }
1143     __ b(finish);
1144 
1145     // 17..32 bytes
1146     __ bind(copy32);
1147     __ ldp(t0, t1, Address(s, 0));
1148     __ ldp(t2, t3, Address(send, -16));
1149     __ stp(t0, t1, Address(d, 0));
1150     __ stp(t2, t3, Address(dend, -16));
1151     __ b(finish);
1152 
1153     // 65..80/96 bytes
1154     // (96 bytes if SIMD because we do 32 bytes per instruction)
1155     __ bind(copy80);
1156     if (UseSIMDForMemoryOps) {
1157       __ ldpq(v0, v1, Address(s, 0));
1158       __ ldpq(v2, v3, Address(s, 32));
1159       // Unaligned pointers can be an issue for copying.
1160       // The issue is more likely to occur when the granularity of the data
1161       // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
1162       // least 4 byte aligned; pointers for arrays of jlong are 8 byte aligned.
1163       // The biggest performance drop has been seen for the range 65-80 bytes.
1164       // For such cases, using a pair of ldp/stp instead of the third pair of
1165       // ldpq/stpq fixes the performance issue.
1166       if (granularity < sizeof (jint)) {
1167         Label copy96;
1168         __ cmp(count, u1(80/granularity));
1169         __ br(Assembler::HI, copy96);
1170         __ ldp(t0, t1, Address(send, -16));
1171 
1172         __ stpq(v0, v1, Address(d, 0));
1173         __ stpq(v2, v3, Address(d, 32));
1174         __ stp(t0, t1, Address(dend, -16));
1175         __ b(finish);
1176 
1177         __ bind(copy96);
1178       }
1179       __ ldpq(v4, v5, Address(send, -32));
1180 
1181       __ stpq(v0, v1, Address(d, 0));
1182       __ stpq(v2, v3, Address(d, 32));
1183       __ stpq(v4, v5, Address(dend, -32));
1184     } else {
1185       __ ldp(t0, t1, Address(s, 0));
1186       __ ldp(t2, t3, Address(s, 16));
1187       __ ldp(t4, t5, Address(s, 32));
1188       __ ldp(t6, t7, Address(s, 48));
1189       __ ldp(t8, t9, Address(send, -16));
1190 
1191       __ stp(t0, t1, Address(d, 0));
1192       __ stp(t2, t3, Address(d, 16));
1193       __ stp(t4, t5, Address(d, 32));
1194       __ stp(t6, t7, Address(d, 48));
1195       __ stp(t8, t9, Address(dend, -16));
1196     }
1197     __ b(finish);
1198 
1199     // 0..16 bytes
1200     __ bind(copy16);
1201     __ cmp(count, u1(8/granularity));
1202     __ br(Assembler::LO, copy8);
1203 
1204     // 8..16 bytes
1205     __ ldr(t0, Address(s, 0));
1206     __ ldr(t1, Address(send, -8));
1207     __ str(t0, Address(d, 0));
1208     __ str(t1, Address(dend, -8));
1209     __ b(finish);
1210 
1211     if (granularity < 8) {
1212       // 4..7 bytes
1213       __ bind(copy8);
1214       __ tbz(count, 2 - exact_log2(granularity), copy4);
1215       __ ldrw(t0, Address(s, 0));
1216       __ ldrw(t1, Address(send, -4));
1217       __ strw(t0, Address(d, 0));
1218       __ strw(t1, Address(dend, -4));
1219       __ b(finish);
1220       if (granularity < 4) {
1221         // 0..3 bytes
1222         __ bind(copy4);
1223         __ cbz(count, finish); // get rid of 0 case
1224         if (granularity == 2) {
1225           __ ldrh(t0, Address(s, 0));
1226           __ strh(t0, Address(d, 0));
1227         } else { // granularity == 1
1228           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1229           // the first and last byte.
1230           // Handle the 3 byte case by also copying the byte at base + count/2
1231           // (count == 1: (s+0)->(d+0); count == 2,3: (s+1)->(d+1)).
1232           // This means that in the 1 byte case we load/store the same
1233           // byte 3 times.
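          // For instance, with count == 3: after the lsr below count == 1,
          // so we copy s[0]->d[0], s[2]->d[2] (via send/dend - 1) and
          // s[1]->d[1] (via base + count); with count == 1 all three
          // load/store pairs hit the same byte, which is harmless.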
1234           __ lsr(count, count, 1);
1235           __ ldrb(t0, Address(s, 0));
1236           __ ldrb(t1, Address(send, -1));
1237           __ ldrb(t2, Address(s, count));
1238           __ strb(t0, Address(d, 0));
1239           __ strb(t1, Address(dend, -1));
1240           __ strb(t2, Address(d, count));
1241         }
1242         __ b(finish);
1243       }
1244     }
1245 
1246     __ bind(copy_big);
1247     if (is_backwards) {
1248       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1249       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1250     }
1251 
1252     // Now that we've got the small case out of the way, we can align the
1253     // source address on a 2-word boundary.
1254 
1255     Label aligned;
1256 
1257     if (is_aligned) {
1258       // We may have to adjust by 1 word to get s 2-word-aligned.
1259       __ tbz(s, exact_log2(wordSize), aligned);
1260       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1261       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1262       __ sub(count, count, wordSize/granularity);
1263     } else {
1264       if (is_backwards) {
1265         __ andr(rscratch2, s, 2 * wordSize - 1);
1266       } else {
1267         __ neg(rscratch2, s);
1268         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1269       }
1270       // rscratch2 is the byte adjustment needed to align s.
1271       __ cbz(rscratch2, aligned);
1272       int shift = exact_log2(granularity);
1273       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1274       __ sub(count, count, rscratch2);
1275 
1276 #if 0
1277       // ?? This code is only correct for a disjoint copy.  It may or
1278       // may not make sense to use it in that case.
1279 
1280       // Copy the first pair; s and d may not be aligned.
1281       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1282       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1283 
1284       // Align s and d, adjust count
1285       if (is_backwards) {
1286         __ sub(s, s, rscratch2);
1287         __ sub(d, d, rscratch2);
1288       } else {
1289         __ add(s, s, rscratch2);
1290         __ add(d, d, rscratch2);
1291       }
1292 #else
1293       copy_memory_small(s, d, rscratch2, rscratch1, step);
1294 #endif
1295     }
1296 
1297     __ bind(aligned);
1298 
1299     // s is now 2-word-aligned.
1300 
1301     // We have a count of units and some trailing bytes.  Adjust the
1302     // count and do a bulk copy of words.
1303     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1304     if (direction == copy_forwards)
1305       __ bl(copy_f);
1306     else
1307       __ bl(copy_b);
1308 
1309     // And the tail.
1310     copy_memory_small(s, d, count, tmp, step);
1311 
1312     if (granularity >= 8) __ bind(copy8);
1313     if (granularity >= 4) __ bind(copy4);
1314     __ bind(finish);
1315   }
1316 
1317 
1318   void clobber_registers() {
1319 #ifdef ASSERT
1320     RegSet clobbered
1321       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1322     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1323     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1324     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1325       __ mov(*it, rscratch1);
1326     }
1327 #endif
1328 
1329   }
1330 
1331   // Scan over array at a for count oops, verifying each one.
1332   // Preserves a and count, clobbers rscratch1 and rscratch2.
1333   void verify_oop_array (int size, Register a, Register count, Register temp) {
1334     Label loop, end;
1335     __ mov(rscratch1, a);
1336     __ mov(rscratch2, zr);
1337     __ bind(loop);
1338     __ cmp(rscratch2, count);
1339     __ br(Assembler::HS, end);
1340     if (size == wordSize) {
1341       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1342       __ verify_oop(temp);
1343     } else {
1344       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1345       __ decode_heap_oop(temp); // calls verify_oop
1346     }
1347     __ add(rscratch2, rscratch2, 1);
1348     __ b(loop);
1349     __ bind(end);
1350   }
1351 
1352   // Arguments:
1353   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1354   //             ignored
1355   //   is_oop  - true => oop array, so generate store check code
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1364   // the hardware handle it.  The two dwords within qwords that span
1365   // cache line boundaries will still be loaded and stored atomically.
1366   //
1367   // Side Effects:
1368   //   disjoint_int_copy_entry is set to the no-overlap entry point
1369   //   used by generate_conjoint_int_oop_copy().
1370   //
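  // A typical (hypothetical) instantiation, patterned on
  // generate_disjoint_byte_copy() further down, would be
  //
  //   address entry;
  //   generate_disjoint_copy(sizeof (jint), /*aligned*/ false,
  //                          /*is_oop*/ false, &entry,
  //                          "jint_disjoint_arraycopy");
  //
  // where the returned address is the full stub entry and *entry is the
  // post-frame-setup entry used as the no-overlap target by
  // generate_conjoint_copy().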
1371   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1372                                   const char *name, bool dest_uninitialized = false) {
1373     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1374     RegSet saved_reg = RegSet::of(s, d, count);
1375     __ align(CodeEntryAlignment);
1376     StubCodeMark mark(this, "StubRoutines", name);
1377     address start = __ pc();
1378     __ enter();
1379 
1380     if (entry != NULL) {
1381       *entry = __ pc();
1382       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1383       BLOCK_COMMENT("Entry:");
1384     }
1385 
1386     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1387     if (dest_uninitialized) {
1388       decorators |= IS_DEST_UNINITIALIZED;
1389     }
1390     if (aligned) {
1391       decorators |= ARRAYCOPY_ALIGNED;
1392     }
1393 
1394     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1395     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1396 
1397     if (is_oop) {
1398       // save regs before copy_memory
1399       __ push(RegSet::of(d, count), sp);
1400     }
1401     {
1402       // UnsafeCopyMemory page error: continue after ucm
1403       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1404       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1405       copy_memory(aligned, s, d, count, rscratch1, size);
1406     }
1407 
1408     if (is_oop) {
1409       __ pop(RegSet::of(d, count), sp);
1410       if (VerifyOops)
1411         verify_oop_array(size, d, count, r16);
1412     }
1413 
1414     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1415 
1416     __ leave();
1417     __ mov(r0, zr); // return 0
1418     __ ret(lr);
1419     return start;
1420   }
1421 
1422   // Arguments:
1423   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1424   //             ignored
1425   //   is_oop  - true => oop array, so generate store check code
1426   //   name    - stub name string
1427   //
1428   // Inputs:
1429   //   c_rarg0   - source array address
1430   //   c_rarg1   - destination array address
1431   //   c_rarg2   - element count, treated as ssize_t, can be zero
1432   //
1433   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1434   // the hardware handle it.  The two dwords within qwords that span
1435   // cache line boundaries will still be loaded and stored atomically.
1436   //
1437   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1438                                  address *entry, const char *name,
1439                                  bool dest_uninitialized = false) {
1440     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1441     RegSet saved_regs = RegSet::of(s, d, count);
1442     StubCodeMark mark(this, "StubRoutines", name);
1443     address start = __ pc();
1444     __ enter();
1445 
1446     if (entry != NULL) {
1447       *entry = __ pc();
1448       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1449       BLOCK_COMMENT("Entry:");
1450     }
1451 
1452     // use fwd copy when (d-s) above_equal (count*size)
1453     __ sub(rscratch1, d, s);
1454     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1455     __ br(Assembler::HS, nooverlap_target);
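    // Worked example (int copy, size == 4): with s == 0x1000, d == 0x1008
    // and count == 4, d - s == 8 is below count << 2 == 16, so the regions
    // overlap and we fall through to the backward copy; with d == 0x1010
    // the difference is 16, the HS branch is taken and the forward
    // (no-overlap) stub runs instead. The compare is unsigned, so a
    // destination below the source also takes the forward path.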
1456 
1457     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1458     if (dest_uninitialized) {
1459       decorators |= IS_DEST_UNINITIALIZED;
1460     }
1461     if (aligned) {
1462       decorators |= ARRAYCOPY_ALIGNED;
1463     }
1464 
1465     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1466     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1467 
1468     if (is_oop) {
1469       // save regs before copy_memory
1470       __ push(RegSet::of(d, count), sp);
1471     }
1472     {
1473       // UnsafeCopyMemory page error: continue after ucm
1474       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1475       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1476       copy_memory(aligned, s, d, count, rscratch1, -size);
1477     }
1478     if (is_oop) {
1479       __ pop(RegSet::of(d, count), sp);
1480       if (VerifyOops)
1481         verify_oop_array(size, d, count, r16);
1482     }
1483     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1484     __ leave();
1485     __ mov(r0, zr); // return 0
1486     __ ret(lr);
1487     return start;
1488 }
1489 
1490   // Arguments:
1491   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1492   //             ignored
1493   //   name    - stub name string
1494   //
1495   // Inputs:
1496   //   c_rarg0   - source array address
1497   //   c_rarg1   - destination array address
1498   //   c_rarg2   - element count, treated as ssize_t, can be zero
1499   //
1500   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1501   // we let the hardware handle it.  The one to eight bytes within words,
1502   // dwords or qwords that span cache line boundaries will still be loaded
1503   // and stored atomically.
1504   //
1505   // Side Effects:
1513   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1514   //   used by generate_conjoint_byte_copy().
1515   //
1516   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1517     const bool not_oop = false;
1518     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1519   }
1520 
1521   // Arguments:
1522   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1523   //             ignored
1524   //   name    - stub name string
1525   //
1526   // Inputs:
1527   //   c_rarg0   - source array address
1528   //   c_rarg1   - destination array address
1529   //   c_rarg2   - element count, treated as ssize_t, can be zero
1530   //
1531   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1532   // we let the hardware handle it.  The one to eight bytes within words,
1533   // dwords or qwords that span cache line boundaries will still be loaded
1534   // and stored atomically.
1535   //
1536   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1537                                       address* entry, const char *name) {
1538     const bool not_oop = false;
1539     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1540   }
1541 
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   name    - stub name string
1546   //
1547   // Inputs:
1548   //   c_rarg0   - source array address
1549   //   c_rarg1   - destination array address
1550   //   c_rarg2   - element count, treated as ssize_t, can be zero
1551   //
1552   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1553   // let the hardware handle it.  The two or four words within dwords
1554   // or qwords that span cache line boundaries will still be loaded
1555   // and stored atomically.
1556   //
1557   // Side Effects:
1558   //   disjoint_short_copy_entry is set to the no-overlap entry point
1559   //   used by generate_conjoint_short_copy().
1560   //
1561   address generate_disjoint_short_copy(bool aligned,
1562                                        address* entry, const char *name) {
1563     const bool not_oop = false;
1564     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1565   }
1566 
1567   // Arguments:
1568   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1569   //             ignored
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1578   // let the hardware handle it.  The two or four words within dwords
1579   // or qwords that span cache line boundaries will still be loaded
1580   // and stored atomically.
1581   //
1582   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1583                                        address *entry, const char *name) {
1584     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1589   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1590   //             ignored
1591   //   name    - stub name string
1592   //
1593   // Inputs:
1594   //   c_rarg0   - source array address
1595   //   c_rarg1   - destination array address
1596   //   c_rarg2   - element count, treated as ssize_t, can be zero
1597   //
1598   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1599   // the hardware handle it.  The two dwords within qwords that span
1600   // cache line boundaries will still be loaded and stored atomically.
1601   //
1602   // Side Effects:
1603   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1605   //
1606   address generate_disjoint_int_copy(bool aligned, address *entry,
1607                                          const char *name, bool dest_uninitialized = false) {
1608     const bool not_oop = false;
1609     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1610   }
1611 
1612   // Arguments:
1613   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1614   //             ignored
1615   //   name    - stub name string
1616   //
1617   // Inputs:
1618   //   c_rarg0   - source array address
1619   //   c_rarg1   - destination array address
1620   //   c_rarg2   - element count, treated as ssize_t, can be zero
1621   //
1622   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1623   // the hardware handle it.  The two dwords within qwords that span
1624   // cache line boundaries will still be loaded and stored atomically.
1625   //
1626   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1627                                      address *entry, const char *name,
1628                                      bool dest_uninitialized = false) {
1629     const bool not_oop = false;
1630     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1631   }
1632 
1633 
1634   // Arguments:
1635   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1636   //             ignored
1637   //   name    - stub name string
1638   //
1639   // Inputs:
1640   //   c_rarg0   - source array address
1641   //   c_rarg1   - destination array address
1642   //   c_rarg2   - element count, treated as size_t, can be zero
1643   //
1644   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1647   //
1648   address generate_disjoint_long_copy(bool aligned, address *entry,
1649                                           const char *name, bool dest_uninitialized = false) {
1650     const bool not_oop = false;
1651     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1652   }
1653 
1654   // Arguments:
1655   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1656   //             ignored
1657   //   name    - stub name string
1658   //
1659   // Inputs:
1660   //   c_rarg0   - source array address
1661   //   c_rarg1   - destination array address
1662   //   c_rarg2   - element count, treated as size_t, can be zero
1663   //
1664   address generate_conjoint_long_copy(bool aligned,
1665                                       address nooverlap_target, address *entry,
1666                                       const char *name, bool dest_uninitialized = false) {
1667     const bool not_oop = false;
1668     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1669   }
1670 
1671   // Arguments:
1672   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1673   //             ignored
1674   //   name    - stub name string
1675   //
1676   // Inputs:
1677   //   c_rarg0   - source array address
1678   //   c_rarg1   - destination array address
1679   //   c_rarg2   - element count, treated as size_t, can be zero
1680   //
1681   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1684   //
1685   address generate_disjoint_oop_copy(bool aligned, address *entry,
1686                                      const char *name, bool dest_uninitialized) {
1687     const bool is_oop = true;
1688     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1689     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1690   }
1691 
1692   // Arguments:
1693   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1694   //             ignored
1695   //   name    - stub name string
1696   //
1697   // Inputs:
1698   //   c_rarg0   - source array address
1699   //   c_rarg1   - destination array address
1700   //   c_rarg2   - element count, treated as size_t, can be zero
1701   //
1702   address generate_conjoint_oop_copy(bool aligned,
1703                                      address nooverlap_target, address *entry,
1704                                      const char *name, bool dest_uninitialized) {
1705     const bool is_oop = true;
1706     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1707     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1708                                   name, dest_uninitialized);
1709   }
1710 
1711 
1712   // Helper for generating a dynamic type check.
1713   // Smashes rscratch1, rscratch2.
1714   void generate_type_check(Register sub_klass,
1715                            Register super_check_offset,
1716                            Register super_klass,
1717                            Label& L_success) {
1718     assert_different_registers(sub_klass, super_check_offset, super_klass);
1719 
1720     BLOCK_COMMENT("type_check:");
1721 
1722     Label L_miss;
1723 
1724     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1725                                      super_check_offset);
1726     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1727 
1728     // Fall through on failure!
1729     __ BIND(L_miss);
1730   }
1731 
1732   //
1733   //  Generate checkcasting array copy stub
1734   //
1735   //  Input:
1736   //    c_rarg0   - source array address
1737   //    c_rarg1   - destination array address
1738   //    c_rarg2   - element count, treated as ssize_t, can be zero
1739   //    c_rarg3   - size_t ckoff (super_check_offset)
1740   //    c_rarg4   - oop ckval (super_klass)
1741   //
1742   //  Output:
1743   //    r0 ==  0  -  success
1744   //    r0 == -1^K - failure, where K is partial transfer count
1745   //
1746   address generate_checkcast_copy(const char *name, address *entry,
1747                                   bool dest_uninitialized = false) {
1748 
1749     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1750 
1751     // Input registers (after setup_arg_regs)
1752     const Register from        = c_rarg0;   // source array address
1753     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1755     const Register ckoff       = c_rarg3;   // super_check_offset
1756     const Register ckval       = c_rarg4;   // super_klass
1757 
1758     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1759     RegSet wb_post_saved_regs = RegSet::of(count);
1760 
1761     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1762     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1764     const Register start_to    = r20;       // destination array start address
1765     const Register r19_klass   = r19;       // oop._klass
1766 
1767     //---------------------------------------------------------------
1768     // Assembler stub will be used for this call to arraycopy
1769     // if the two arrays are subtypes of Object[] but the
1770     // destination array type is not equal to or a supertype
1771     // of the source type.  Each element must be separately
1772     // checked.
1773 
1774     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1775                                copied_oop, r19_klass, count_save);
1776 
1777     __ align(CodeEntryAlignment);
1778     StubCodeMark mark(this, "StubRoutines", name);
1779     address start = __ pc();
1780 
1781     __ enter(); // required for proper stackwalking of RuntimeStub frame
1782 
1783 #ifdef ASSERT
1784     // caller guarantees that the arrays really are different
1785     // otherwise, we would have to make conjoint checks
1786     { Label L;
1787       __ b(L);                  // conjoint check not yet implemented
1788       __ stop("checkcast_copy within a single array");
1789       __ bind(L);
1790     }
1791 #endif //ASSERT
1792 
1793     // Caller of this entry point must set up the argument registers.
1794     if (entry != NULL) {
1795       *entry = __ pc();
1796       BLOCK_COMMENT("Entry:");
1797     }
1798 
1799      // Empty array:  Nothing to do.
1800     __ cbz(count, L_done);
1801     __ push(RegSet::of(r19, r20, r21, r22), sp);
1802 
1803 #ifdef ASSERT
1804     BLOCK_COMMENT("assert consistent ckoff/ckval");
1805     // The ckoff and ckval must be mutually consistent,
1806     // even though caller generates both.
1807     { Label L;
1808       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1809       __ ldrw(start_to, Address(ckval, sco_offset));
1810       __ cmpw(ckoff, start_to);
1811       __ br(Assembler::EQ, L);
1812       __ stop("super_check_offset inconsistent");
1813       __ bind(L);
1814     }
1815 #endif //ASSERT
1816 
1817     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1818     bool is_oop = true;
1819     if (dest_uninitialized) {
1820       decorators |= IS_DEST_UNINITIALIZED;
1821     }
1822 
1823     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1824     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1825 
1826     // save the original count
1827     __ mov(count_save, count);
1828 
1829     // Copy from low to high addresses
1830     __ mov(start_to, to);              // Save destination array start address
1831     __ b(L_load_element);
1832 
1833     // ======== begin loop ========
1834     // (Loop is rotated; its entry is L_load_element.)
1835     // Loop control:
1836     //   for (; count != 0; count--) {
1837     //     copied_oop = load_heap_oop(from++);
1838     //     ... generate_type_check ...;
1839     //     store_heap_oop(to++, copied_oop);
1840     //   }
1841     __ align(OptoLoopAlignment);
1842 
1843     __ BIND(L_store_element);
1844     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1845     __ sub(count, count, 1);
1846     __ cbz(count, L_do_card_marks);
1847 
1848     // ======== loop entry is here ========
1849     __ BIND(L_load_element);
1850     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1851     __ cbz(copied_oop, L_store_element);
1852 
1853     __ load_klass(r19_klass, copied_oop);// query the object klass
1854     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1855     // ======== end loop ========
1856 
1857     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1859     // Emit GC store barriers for the oops we have copied and report
1860     // their number to the caller.
1861 
1862     __ subs(count, count_save, count);     // K = partially copied oop count
1863     __ eon(count, count, zr);                   // report (-1^K) to caller
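    // eon with zr is a bitwise NOT, so the caller sees r0 == ~K; e.g. if 2
    // oops were stored before the failing type check, r0 == ~2 == -3 and
    // the caller recovers K as ~r0.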
1864     __ br(Assembler::EQ, L_done_pop);
1865 
1866     __ BIND(L_do_card_marks);
1867     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1868 
1869     __ bind(L_done_pop);
1870     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1871     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1872 
1873     __ bind(L_done);
1874     __ mov(r0, count);
1875     __ leave();
1876     __ ret(lr);
1877 
1878     return start;
1879   }
1880 
1881   // Perform range checks on the proposed arraycopy.
1882   // Kills temp, but nothing else.
1883   // Also, clean the sign bits of src_pos and dst_pos.
1884   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1885                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1887                               Register dst_pos, // destination position (c_rarg3)
1888                               Register length,
1889                               Register temp,
1890                               Label& L_failed) {
1891     BLOCK_COMMENT("arraycopy_range_checks:");
1892 
1893     assert_different_registers(rscratch1, temp);
1894 
1895     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1896     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1897     __ addw(temp, length, src_pos);
1898     __ cmpw(temp, rscratch1);
1899     __ br(Assembler::HI, L_failed);
1900 
1901     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1902     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1903     __ addw(temp, length, dst_pos);
1904     __ cmpw(temp, rscratch1);
1905     __ br(Assembler::HI, L_failed);
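    // The HI (unsigned) compares above also reject a sum that wraps the
    // 32-bit range; in C terms, roughly:
    //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
    //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;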
1906 
1907     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1908     __ movw(src_pos, src_pos);
1909     __ movw(dst_pos, dst_pos);
1910 
1911     BLOCK_COMMENT("arraycopy_range_checks done");
1912   }
1913 
1914   // These stubs get called from some dumb test routine.
1915   // I'll write them properly when they're called from
1916   // something that's actually doing something.
1917   static void fake_arraycopy_stub(address src, address dst, int count) {
1918     assert(count == 0, "huh?");
1919   }
1920 
1921 
1922   //
1923   //  Generate 'unsafe' array copy stub
1924   //  Though just as safe as the other stubs, it takes an unscaled
1925   //  size_t argument instead of an element count.
1926   //
1927   //  Input:
1928   //    c_rarg0   - source array address
1929   //    c_rarg1   - destination array address
1930   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1931   //
1932   // Examines the alignment of the operands and dispatches
1933   // to a long, int, short, or byte copy loop.
1934   //
1935   address generate_unsafe_copy(const char *name,
1936                                address byte_copy_entry,
1937                                address short_copy_entry,
1938                                address int_copy_entry,
1939                                address long_copy_entry) {
1940     Label L_long_aligned, L_int_aligned, L_short_aligned;
1941     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1942 
1943     __ align(CodeEntryAlignment);
1944     StubCodeMark mark(this, "StubRoutines", name);
1945     address start = __ pc();
1946     __ enter(); // required for proper stackwalking of RuntimeStub frame
1947 
1948     // bump this on entry, not on exit:
1949     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1950 
1951     __ orr(rscratch1, s, d);
1952     __ orr(rscratch1, rscratch1, count);
1953 
1954     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1955     __ cbz(rscratch1, L_long_aligned);
1956     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1957     __ cbz(rscratch1, L_int_aligned);
1958     __ tbz(rscratch1, 0, L_short_aligned);
1959     __ b(RuntimeAddress(byte_copy_entry));
1960 
1961     __ BIND(L_short_aligned);
1962     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1963     __ b(RuntimeAddress(short_copy_entry));
1964     __ BIND(L_int_aligned);
1965     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1966     __ b(RuntimeAddress(int_copy_entry));
1967     __ BIND(L_long_aligned);
1968     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1969     __ b(RuntimeAddress(long_copy_entry));
1970 
1971     return start;
1972   }
1973 
1974   //
1975   //  Generate generic array copy stubs
1976   //
1977   //  Input:
1978   //    c_rarg0    -  src oop
1979   //    c_rarg1    -  src_pos (32-bits)
1980   //    c_rarg2    -  dst oop
1981   //    c_rarg3    -  dst_pos (32-bits)
1982   //    c_rarg4    -  element count (32-bits)
1983   //
1984   //  Output:
1985   //    r0 ==  0  -  success
1986   //    r0 == -1^K - failure, where K is partial transfer count
1987   //
1988   address generate_generic_copy(const char *name,
1989                                 address byte_copy_entry, address short_copy_entry,
1990                                 address int_copy_entry, address oop_copy_entry,
1991                                 address long_copy_entry, address checkcast_copy_entry) {
1992 
1993     Label L_failed, L_objArray;
1994     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1995 
1996     // Input registers
1997     const Register src        = c_rarg0;  // source array oop
1998     const Register src_pos    = c_rarg1;  // source position
1999     const Register dst        = c_rarg2;  // destination array oop
2000     const Register dst_pos    = c_rarg3;  // destination position
2001     const Register length     = c_rarg4;
2002 
2003 
2004     // Registers used as temps
2005     const Register dst_klass  = c_rarg5;
2006 
2007     __ align(CodeEntryAlignment);
2008 
2009     StubCodeMark mark(this, "StubRoutines", name);
2010 
2011     address start = __ pc();
2012 
2013     __ enter(); // required for proper stackwalking of RuntimeStub frame
2014 
2015     // bump this on entry, not on exit:
2016     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2017 
2018     //-----------------------------------------------------------------------
2019     // Assembler stub will be used for this call to arraycopy
2020     // if the following conditions are met:
2021     //
2022     // (1) src and dst must not be null.
2023     // (2) src_pos must not be negative.
2024     // (3) dst_pos must not be negative.
2025     // (4) length  must not be negative.
2026     // (5) src klass and dst klass should be the same and not NULL.
2027     // (6) src and dst should be arrays.
2028     // (7) src_pos + length must not exceed length of src.
2029     // (8) dst_pos + length must not exceed length of dst.
2030     //
2031 
2032     //  if (src == NULL) return -1;
2033     __ cbz(src, L_failed);
2034 
2035     //  if (src_pos < 0) return -1;
2036     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2037 
2038     //  if (dst == NULL) return -1;
2039     __ cbz(dst, L_failed);
2040 
2041     //  if (dst_pos < 0) return -1;
2042     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2043 
2044     // registers used as temp
2045     const Register scratch_length    = r16; // elements count to copy
2046     const Register scratch_src_klass = r17; // array klass
2047     const Register lh                = r15; // layout helper
2048 
2049     //  if (length < 0) return -1;
2050     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2051     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2052 
2053     __ load_klass(scratch_src_klass, src);
2054 #ifdef ASSERT
2055     //  assert(src->klass() != NULL);
2056     {
2057       BLOCK_COMMENT("assert klasses not null {");
2058       Label L1, L2;
2059       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2060       __ bind(L1);
2061       __ stop("broken null klass");
2062       __ bind(L2);
2063       __ load_klass(rscratch1, dst);
2064       __ cbz(rscratch1, L1);     // this would be broken also
2065       BLOCK_COMMENT("} assert klasses not null done");
2066     }
2067 #endif
2068 
2069     // Load layout helper (32-bits)
2070     //
2071     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2072     // 32        30    24            16              8     2                 0
2073     //
2074     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2075     //
2076 
2077     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2078 
2079     // Handle objArrays completely differently...
2080     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2081     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2082     __ movw(rscratch1, objArray_lh);
2083     __ eorw(rscratch2, lh, rscratch1);
2084     __ cbzw(rscratch2, L_objArray);
2085 
2086     //  if (src->klass() != dst->klass()) return -1;
2087     __ load_klass(rscratch2, dst);
2088     __ eor(rscratch2, rscratch2, scratch_src_klass);
2089     __ cbnz(rscratch2, L_failed);
2090 
2091     //  if (!src->is_Array()) return -1;
2092     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2093 
2094     // At this point, it is known to be a typeArray (array_tag 0x3).
2095 #ifdef ASSERT
2096     {
2097       BLOCK_COMMENT("assert primitive array {");
2098       Label L;
2099       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2100       __ cmpw(lh, rscratch2);
2101       __ br(Assembler::GE, L);
2102       __ stop("must be a primitive array");
2103       __ bind(L);
2104       BLOCK_COMMENT("} assert primitive array done");
2105     }
2106 #endif
2107 
2108     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2109                            rscratch2, L_failed);
2110 
2111     // TypeArrayKlass
2112     //
2113     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2114     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2115     //
2116 
2117     const Register rscratch1_offset = rscratch1;    // array offset
2118     const Register r15_elsize = lh; // element size
2119 
2120     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2121            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2122     __ add(src, src, rscratch1_offset);           // src array offset
2123     __ add(dst, dst, rscratch1_offset);           // dst array offset
2124     BLOCK_COMMENT("choose copy loop based on element size");
2125 
2126     // next registers should be set before the jump to corresponding stub
2127     const Register from     = c_rarg0;  // source array address
2128     const Register to       = c_rarg1;  // destination array address
2129     const Register count    = c_rarg2;  // elements count
2130 
    // 'from', 'to' and 'count' must be set in this order, since they occupy
    // the same registers as 'src', 'src_pos' and 'dst' respectively.
2133 
2134     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2135 
2136     // The possible values of elsize are 0-3, i.e. exact_log2(element
2137     // size in bytes).  We do a simple bitwise binary search.
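    // In effect (sketch): elsize 0 -> byte copy, 1 -> short copy,
    // 2 -> int copy, 3 -> long copy; bit 1 is tested first, then bit 0.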
2138   __ BIND(L_copy_bytes);
2139     __ tbnz(r15_elsize, 1, L_copy_ints);
2140     __ tbnz(r15_elsize, 0, L_copy_shorts);
2141     __ lea(from, Address(src, src_pos));// src_addr
2142     __ lea(to,   Address(dst, dst_pos));// dst_addr
2143     __ movw(count, scratch_length); // length
2144     __ b(RuntimeAddress(byte_copy_entry));
2145 
2146   __ BIND(L_copy_shorts);
2147     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2148     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2149     __ movw(count, scratch_length); // length
2150     __ b(RuntimeAddress(short_copy_entry));
2151 
2152   __ BIND(L_copy_ints);
2153     __ tbnz(r15_elsize, 0, L_copy_longs);
2154     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2155     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2156     __ movw(count, scratch_length); // length
2157     __ b(RuntimeAddress(int_copy_entry));
2158 
2159   __ BIND(L_copy_longs);
2160 #ifdef ASSERT
2161     {
2162       BLOCK_COMMENT("assert long copy {");
2163       Label L;
2164       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2165       __ cmpw(r15_elsize, LogBytesPerLong);
2166       __ br(Assembler::EQ, L);
2167       __ stop("must be long copy, but elsize is wrong");
2168       __ bind(L);
2169       BLOCK_COMMENT("} assert long copy done");
2170     }
2171 #endif
2172     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2173     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2174     __ movw(count, scratch_length); // length
2175     __ b(RuntimeAddress(long_copy_entry));
2176 
2177     // ObjArrayKlass
2178   __ BIND(L_objArray);
2179     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2180 
2181     Label L_plain_copy, L_checkcast_copy;
2182     //  test array classes for subtyping
2183     __ load_klass(r15, dst);
2184     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2185     __ br(Assembler::NE, L_checkcast_copy);
2186 
2187     // Identically typed arrays can be copied without element-wise checks.
2188     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2189                            rscratch2, L_failed);
2190 
2191     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2192     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2193     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2194     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195     __ movw(count, scratch_length); // length
2196   __ BIND(L_plain_copy);
2197     __ b(RuntimeAddress(oop_copy_entry));
2198 
2199   __ BIND(L_checkcast_copy);
2200     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2201     {
2202       // Before looking at dst.length, make sure dst is also an objArray.
2203       __ ldrw(rscratch1, Address(r15, lh_offset));
2204       __ movw(rscratch2, objArray_lh);
2205       __ eorw(rscratch1, rscratch1, rscratch2);
2206       __ cbnzw(rscratch1, L_failed);
2207 
2208       // It is safe to examine both src.length and dst.length.
2209       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2210                              r15, L_failed);
2211 
2212       __ load_klass(dst_klass, dst); // reload
2213 
2214       // Marshal the base address arguments now, freeing registers.
2215       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2216       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2217       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2218       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219       __ movw(count, length);           // length (reloaded)
2220       Register sco_temp = c_rarg3;      // this register is free now
2221       assert_different_registers(from, to, count, sco_temp,
2222                                  dst_klass, scratch_src_klass);
2223       // assert_clean_int(count, sco_temp);
2224 
2225       // Generate the type check.
2226       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2227       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2228 
2229       // Smashes rscratch1, rscratch2
2230       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2231 
2232       // Fetch destination element klass from the ObjArrayKlass header.
2233       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2234       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2235       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2236 
2237       // the checkcast_copy loop needs two extra arguments:
2238       assert(c_rarg3 == sco_temp, "#3 already in place");
2239       // Set up arguments for checkcast_copy_entry.
2240       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2241       __ b(RuntimeAddress(checkcast_copy_entry));
2242     }
2243 
2244   __ BIND(L_failed);
2245     __ mov(r0, -1);
2246     __ leave();   // required for proper stackwalking of RuntimeStub frame
2247     __ ret(lr);
2248 
2249     return start;
2250   }
2251 
2252   //
2253   // Generate stub for array fill. If "aligned" is true, the
2254   // "to" address is assumed to be heapword aligned.
2255   //
2256   // Arguments for generated stub:
2257   //   to:    c_rarg0
2258   //   value: c_rarg1
2259   //   count: c_rarg2 treated as signed
2260   //
2261   address generate_fill(BasicType t, bool aligned, const char *name) {
2262     __ align(CodeEntryAlignment);
2263     StubCodeMark mark(this, "StubRoutines", name);
2264     address start = __ pc();
2265 
2266     BLOCK_COMMENT("Entry:");
2267 
    const Register to        = c_rarg0;  // destination array address
2269     const Register value     = c_rarg1;  // value
2270     const Register count     = c_rarg2;  // elements count
2271 
2272     const Register bz_base = r10;        // base for block_zero routine
2273     const Register cnt_words = r11;      // temp register
2274 
2275     __ enter();
2276 
    Label L_fill_elements;
2278 
2279     int shift = -1;
2280     switch (t) {
2281       case T_BYTE:
2282         shift = 0;
2283         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2284         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2285         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2286         __ br(Assembler::LO, L_fill_elements);
2287         break;
2288       case T_SHORT:
2289         shift = 1;
2290         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2291         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2292         __ br(Assembler::LO, L_fill_elements);
2293         break;
2294       case T_INT:
2295         shift = 2;
2296         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2297         __ br(Assembler::LO, L_fill_elements);
2298         break;
2299       default: ShouldNotReachHere();
2300     }
2301 
    // Align the destination address to an 8-byte boundary.
2303     Label L_skip_align1, L_skip_align2, L_skip_align4;
2304     if (!aligned) {
2305       switch (t) {
2306         case T_BYTE:
2307           // One byte misalignment happens only for byte arrays.
2308           __ tbz(to, 0, L_skip_align1);
2309           __ strb(value, Address(__ post(to, 1)));
2310           __ subw(count, count, 1);
2311           __ bind(L_skip_align1);
2312           // Fallthrough
2313         case T_SHORT:
2314           // Two bytes misalignment happens only for byte and short (char) arrays.
2315           __ tbz(to, 1, L_skip_align2);
2316           __ strh(value, Address(__ post(to, 2)));
2317           __ subw(count, count, 2 >> shift);
2318           __ bind(L_skip_align2);
2319           // Fallthrough
2320         case T_INT:
2321           // Align to 8 bytes, we know we are 4 byte aligned to start.
2322           __ tbz(to, 2, L_skip_align4);
2323           __ strw(value, Address(__ post(to, 4)));
2324           __ subw(count, count, 4 >> shift);
2325           __ bind(L_skip_align4);
2326           break;
2327         default: ShouldNotReachHere();
2328       }
2329     }
2330 
2331     //
2332     //  Fill large chunks
2333     //
2334     __ lsrw(cnt_words, count, 3 - shift); // number of words
2335     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2336     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2337     if (UseBlockZeroing) {
2338       Label non_block_zeroing, rest;
2339       // If the fill value is zero we can use the fast zero_words().
2340       __ cbnz(value, non_block_zeroing);
2341       __ mov(bz_base, to);
2342       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2343       address tpc = __ zero_words(bz_base, cnt_words);
2344       if (tpc == nullptr) {
2345         fatal("CodeCache is full at generate_fill");
2346       }
2347       __ b(rest);
2348       __ bind(non_block_zeroing);
2349       __ fill_words(to, cnt_words, value);
2350       __ bind(rest);
2351     } else {
2352       __ fill_words(to, cnt_words, value);
2353     }
2354 
2355     // Remaining count is less than 8 bytes. Fill it by a single store.
2356     // Note that the total length is no less than 8 bytes.
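    // e.g. with 5 trailing bytes the 8-byte store below lands at (end - 8),
    // rewriting the last 3 bytes already filled with the same pattern; this
    // is why the total length must be at least 8 bytes.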
2357     if (t == T_BYTE || t == T_SHORT) {
2358       Label L_exit1;
2359       __ cbzw(count, L_exit1);
2360       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2361       __ str(value, Address(to, -8));    // overwrite some elements
2362       __ bind(L_exit1);
2363       __ leave();
2364       __ ret(lr);
2365     }
2366 
2367     // Handle copies less than 8 bytes.
2368     Label L_fill_2, L_fill_4, L_exit2;
2369     __ bind(L_fill_elements);
2370     switch (t) {
2371       case T_BYTE:
2372         __ tbz(count, 0, L_fill_2);
2373         __ strb(value, Address(__ post(to, 1)));
2374         __ bind(L_fill_2);
2375         __ tbz(count, 1, L_fill_4);
2376         __ strh(value, Address(__ post(to, 2)));
2377         __ bind(L_fill_4);
2378         __ tbz(count, 2, L_exit2);
2379         __ strw(value, Address(to));
2380         break;
2381       case T_SHORT:
2382         __ tbz(count, 0, L_fill_4);
2383         __ strh(value, Address(__ post(to, 2)));
2384         __ bind(L_fill_4);
2385         __ tbz(count, 1, L_exit2);
2386         __ strw(value, Address(to));
2387         break;
2388       case T_INT:
2389         __ cbzw(count, L_exit2);
2390         __ strw(value, Address(to));
2391         break;
2392       default: ShouldNotReachHere();
2393     }
2394     __ bind(L_exit2);
2395     __ leave();
2396     __ ret(lr);
2397     return start;
2398   }
2399 
2400   address generate_data_cache_writeback() {
2401     const Register line        = c_rarg0;  // address of line to write back
2402 
2403     __ align(CodeEntryAlignment);
2404 
2405     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2406 
2407     address start = __ pc();
2408     __ enter();
2409     __ cache_wb(Address(line, 0));
2410     __ leave();
2411     __ ret(lr);
2412 
2413     return start;
2414   }
2415 
2416   address generate_data_cache_writeback_sync() {
2417     const Register is_pre     = c_rarg0;  // pre or post sync
2418 
2419     __ align(CodeEntryAlignment);
2420 
2421     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2422 
2423     // pre wbsync is a no-op
    // post wbsync emits a memory barrier (see MacroAssembler::cache_wbsync)
2425 
2426     Label skip;
2427     address start = __ pc();
2428     __ enter();
2429     __ cbnz(is_pre, skip);
2430     __ cache_wbsync(false);
2431     __ bind(skip);
2432     __ leave();
2433     __ ret(lr);
2434 
2435     return start;
2436   }
2437 
2438   void generate_arraycopy_stubs() {
2439     address entry;
2440     address entry_jbyte_arraycopy;
2441     address entry_jshort_arraycopy;
2442     address entry_jint_arraycopy;
2443     address entry_oop_arraycopy;
2444     address entry_jlong_arraycopy;
2445     address entry_checkcast_arraycopy;
2446 
2447     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2448     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2449 
2450     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2451 
2452     //*** jbyte
2453     // Always need aligned and unaligned versions
2454     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2455                                                                                   "jbyte_disjoint_arraycopy");
2456     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2457                                                                                   &entry_jbyte_arraycopy,
2458                                                                                   "jbyte_arraycopy");
2459     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2460                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2461     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2462                                                                                   "arrayof_jbyte_arraycopy");
2463 
2464     //*** jshort
2465     // Always need aligned and unaligned versions
2466     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2467                                                                                     "jshort_disjoint_arraycopy");
2468     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2469                                                                                     &entry_jshort_arraycopy,
2470                                                                                     "jshort_arraycopy");
2471     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2472                                                                                     "arrayof_jshort_disjoint_arraycopy");
2473     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2474                                                                                     "arrayof_jshort_arraycopy");
2475 
2476     //*** jint
2477     // Aligned versions
2478     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2479                                                                                 "arrayof_jint_disjoint_arraycopy");
2480     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2481                                                                                 "arrayof_jint_arraycopy");
2482     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2483     // entry_jint_arraycopy always points to the unaligned version
2484     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2485                                                                                 "jint_disjoint_arraycopy");
2486     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2487                                                                                 &entry_jint_arraycopy,
2488                                                                                 "jint_arraycopy");
2489 
2490     //*** jlong
2491     // It is always aligned
2492     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2493                                                                                   "arrayof_jlong_disjoint_arraycopy");
2494     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2495                                                                                   "arrayof_jlong_arraycopy");
2496     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2497     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2498 
2499     //*** oops
2500     {
2501       // With compressed oops we need unaligned versions; notice that
2502       // we overwrite entry_oop_arraycopy.
2503       bool aligned = !UseCompressedOops;
2504 
2505       StubRoutines::_arrayof_oop_disjoint_arraycopy
2506         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2507                                      /*dest_uninitialized*/false);
2508       StubRoutines::_arrayof_oop_arraycopy
2509         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2510                                      /*dest_uninitialized*/false);
2511       // Aligned versions without pre-barriers
2512       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2513         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2514                                      /*dest_uninitialized*/true);
2515       StubRoutines::_arrayof_oop_arraycopy_uninit
2516         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2517                                      /*dest_uninitialized*/true);
2518     }
2519 
2520     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2521     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2522     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2523     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2524 
2525     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2526     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2527                                                                         /*dest_uninitialized*/true);
2528 
2529     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2530                                                               entry_jbyte_arraycopy,
2531                                                               entry_jshort_arraycopy,
2532                                                               entry_jint_arraycopy,
2533                                                               entry_jlong_arraycopy);
2534 
2535     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2536                                                                entry_jbyte_arraycopy,
2537                                                                entry_jshort_arraycopy,
2538                                                                entry_jint_arraycopy,
2539                                                                entry_oop_arraycopy,
2540                                                                entry_jlong_arraycopy,
2541                                                                entry_checkcast_arraycopy);
2542 
2543     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2544     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2545     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2546     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2547     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2548     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2549   }
2550 
2551   void generate_math_stubs() { Unimplemented(); }
2552 
2553   // Arguments:
2554   //
2555   // Inputs:
2556   //   c_rarg0   - source byte array address
2557   //   c_rarg1   - destination byte array address
2558   //   c_rarg2   - K (key) in little endian int array
2559   //
2560   address generate_aescrypt_encryptBlock() {
2561     __ align(CodeEntryAlignment);
2562     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2563 
2564     const Register from        = c_rarg0;  // source array address
2565     const Register to          = c_rarg1;  // destination array address
2566     const Register key         = c_rarg2;  // key array address
2567     const Register keylen      = rscratch1;
2568 
2569     address start = __ pc();
2570     __ enter();
2571 
2572     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
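    // 'key' addresses element 0 of the int[] key schedule, so the array
    // length (44, 52 or 60 ints for AES-128/192/256) is read at a small
    // back-offset from it.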
2573 
2574     __ aesenc_loadkeys(key, keylen);
2575     __ aesecb_encrypt(from, to, keylen);
2576 
2577     __ mov(r0, 0);
2578 
2579     __ leave();
2580     __ ret(lr);
2581 
2582     return start;
2583   }
2584 
2585   // Arguments:
2586   //
2587   // Inputs:
2588   //   c_rarg0   - source byte array address
2589   //   c_rarg1   - destination byte array address
2590   //   c_rarg2   - K (key) in little endian int array
2591   //
2592   address generate_aescrypt_decryptBlock() {
2593     assert(UseAES, "need AES cryptographic extension support");
2594     __ align(CodeEntryAlignment);
2595     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2596     Label L_doLast;
2597 
2598     const Register from        = c_rarg0;  // source array address
2599     const Register to          = c_rarg1;  // destination array address
2600     const Register key         = c_rarg2;  // key array address
2601     const Register keylen      = rscratch1;
2602 
2603     address start = __ pc();
2604     __ enter(); // required for proper stackwalking of RuntimeStub frame
2605 
2606     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2607 
2608     __ aesecb_decrypt(from, to, key, keylen);
2609 
2610     __ mov(r0, 0);
2611 
2612     __ leave();
2613     __ ret(lr);
2614 
2615     return start;
2616   }
2617 
2618   // Arguments:
2619   //
2620   // Inputs:
2621   //   c_rarg0   - source byte array address
2622   //   c_rarg1   - destination byte array address
2623   //   c_rarg2   - K (key) in little endian int array
2624   //   c_rarg3   - r vector byte array address
2625   //   c_rarg4   - input length
2626   //
2627   // Output:
2628   //   x0        - input length
2629   //
2630   address generate_cipherBlockChaining_encryptAESCrypt() {
2631     assert(UseAES, "need AES cryptographic extension support");
2632     __ align(CodeEntryAlignment);
2633     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2634 
2635     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2636 
2637     const Register from        = c_rarg0;  // source array address
2638     const Register to          = c_rarg1;  // destination array address
2639     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address
                                           // and left holding the last ciphertext block on exit
2642     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2643     const Register keylen      = rscratch1;
2644 
2645     address start = __ pc();
2646 
2647       __ enter();
2648 
2649       __ movw(rscratch2, len_reg);
2650 
2651       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2652 
2653       __ ld1(v0, __ T16B, rvec);
2654 
2655       __ cmpw(keylen, 52);
2656       __ br(Assembler::CC, L_loadkeys_44);
2657       __ br(Assembler::EQ, L_loadkeys_52);
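      // keylen counts 32-bit words of the expanded key: 44 (AES-128),
      // 52 (AES-192) or 60 (AES-256).  CC took the 44-word path, EQ the
      // 52-word path; falling through loads the two extra round keys that
      // only a 256-bit key needs.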
2658 
2659       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2660       __ rev32(v17, __ T16B, v17);
2661       __ rev32(v18, __ T16B, v18);
2662     __ BIND(L_loadkeys_52);
2663       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2664       __ rev32(v19, __ T16B, v19);
2665       __ rev32(v20, __ T16B, v20);
2666     __ BIND(L_loadkeys_44);
2667       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2668       __ rev32(v21, __ T16B, v21);
2669       __ rev32(v22, __ T16B, v22);
2670       __ rev32(v23, __ T16B, v23);
2671       __ rev32(v24, __ T16B, v24);
2672       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2673       __ rev32(v25, __ T16B, v25);
2674       __ rev32(v26, __ T16B, v26);
2675       __ rev32(v27, __ T16B, v27);
2676       __ rev32(v28, __ T16B, v28);
2677       __ ld1(v29, v30, v31, __ T16B, key);
2678       __ rev32(v29, __ T16B, v29);
2679       __ rev32(v30, __ T16B, v30);
2680       __ rev32(v31, __ T16B, v31);
2681 
2682     __ BIND(L_aes_loop);
2683       __ ld1(v1, __ T16B, __ post(from, 16));
2684       __ eor(v0, __ T16B, v0, v1);
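      // CBC chaining: v0 holds the IV for the first block and the previous
      // ciphertext thereafter, so this XOR folds the chain value into the
      // plaintext before it goes through the AES rounds.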
2685 
2686       __ br(Assembler::CC, L_rounds_44);
2687       __ br(Assembler::EQ, L_rounds_52);
2688 
2689       __ aese(v0, v17); __ aesmc(v0, v0);
2690       __ aese(v0, v18); __ aesmc(v0, v0);
2691     __ BIND(L_rounds_52);
2692       __ aese(v0, v19); __ aesmc(v0, v0);
2693       __ aese(v0, v20); __ aesmc(v0, v0);
2694     __ BIND(L_rounds_44);
2695       __ aese(v0, v21); __ aesmc(v0, v0);
2696       __ aese(v0, v22); __ aesmc(v0, v0);
2697       __ aese(v0, v23); __ aesmc(v0, v0);
2698       __ aese(v0, v24); __ aesmc(v0, v0);
2699       __ aese(v0, v25); __ aesmc(v0, v0);
2700       __ aese(v0, v26); __ aesmc(v0, v0);
2701       __ aese(v0, v27); __ aesmc(v0, v0);
2702       __ aese(v0, v28); __ aesmc(v0, v0);
2703       __ aese(v0, v29); __ aesmc(v0, v0);
2704       __ aese(v0, v30);
2705       __ eor(v0, __ T16B, v0, v31);
2706 
2707       __ st1(v0, __ T16B, __ post(to, 16));
2708 
2709       __ subw(len_reg, len_reg, 16);
2710       __ cbnzw(len_reg, L_aes_loop);
2711 
2712       __ st1(v0, __ T16B, rvec);
2713 
2714       __ mov(r0, rscratch2);
2715 
2716       __ leave();
2717       __ ret(lr);
2718 
2719       return start;
2720   }
2721 
2722   // Arguments:
2723   //
2724   // Inputs:
2725   //   c_rarg0   - source byte array address
2726   //   c_rarg1   - destination byte array address
2727   //   c_rarg2   - K (key) in little endian int array
2728   //   c_rarg3   - r vector byte array address
2729   //   c_rarg4   - input length
2730   //
2731   // Output:
2732   //   r0        - input length
2733   //
2734   address generate_cipherBlockChaining_decryptAESCrypt() {
2735     assert(UseAES, "need AES cryptographic extension support");
2736     __ align(CodeEntryAlignment);
2737     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2738 
2739     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2740 
2741     const Register from        = c_rarg0;  // source array address
2742     const Register to          = c_rarg1;  // destination array address
2743     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address
                                           // and left holding the last ciphertext block consumed on exit
2746     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2747     const Register keylen      = rscratch1;
2748 
2749     address start = __ pc();
2750 
2751       __ enter();
2752 
2753       __ movw(rscratch2, len_reg);
2754 
2755       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2756 
2757       __ ld1(v2, __ T16B, rvec);
2758 
2759       __ ld1(v31, __ T16B, __ post(key, 16));
2760       __ rev32(v31, __ T16B, v31);
2761 
2762       __ cmpw(keylen, 52);
2763       __ br(Assembler::CC, L_loadkeys_44);
2764       __ br(Assembler::EQ, L_loadkeys_52);
2765 
2766       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2767       __ rev32(v17, __ T16B, v17);
2768       __ rev32(v18, __ T16B, v18);
2769     __ BIND(L_loadkeys_52);
2770       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2771       __ rev32(v19, __ T16B, v19);
2772       __ rev32(v20, __ T16B, v20);
2773     __ BIND(L_loadkeys_44);
2774       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2775       __ rev32(v21, __ T16B, v21);
2776       __ rev32(v22, __ T16B, v22);
2777       __ rev32(v23, __ T16B, v23);
2778       __ rev32(v24, __ T16B, v24);
2779       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2780       __ rev32(v25, __ T16B, v25);
2781       __ rev32(v26, __ T16B, v26);
2782       __ rev32(v27, __ T16B, v27);
2783       __ rev32(v28, __ T16B, v28);
2784       __ ld1(v29, v30, __ T16B, key);
2785       __ rev32(v29, __ T16B, v29);
2786       __ rev32(v30, __ T16B, v30);
2787 
2788     __ BIND(L_aes_loop);
2789       __ ld1(v0, __ T16B, __ post(from, 16));
2790       __ orr(v1, __ T16B, v0, v0);
2791 
2792       __ br(Assembler::CC, L_rounds_44);
2793       __ br(Assembler::EQ, L_rounds_52);
2794 
2795       __ aesd(v0, v17); __ aesimc(v0, v0);
2796       __ aesd(v0, v18); __ aesimc(v0, v0);
2797     __ BIND(L_rounds_52);
2798       __ aesd(v0, v19); __ aesimc(v0, v0);
2799       __ aesd(v0, v20); __ aesimc(v0, v0);
2800     __ BIND(L_rounds_44);
2801       __ aesd(v0, v21); __ aesimc(v0, v0);
2802       __ aesd(v0, v22); __ aesimc(v0, v0);
2803       __ aesd(v0, v23); __ aesimc(v0, v0);
2804       __ aesd(v0, v24); __ aesimc(v0, v0);
2805       __ aesd(v0, v25); __ aesimc(v0, v0);
2806       __ aesd(v0, v26); __ aesimc(v0, v0);
2807       __ aesd(v0, v27); __ aesimc(v0, v0);
2808       __ aesd(v0, v28); __ aesimc(v0, v0);
2809       __ aesd(v0, v29); __ aesimc(v0, v0);
2810       __ aesd(v0, v30);
2811       __ eor(v0, __ T16B, v0, v31);
2812       __ eor(v0, __ T16B, v0, v2);
2813 
2814       __ st1(v0, __ T16B, __ post(to, 16));
2815       __ orr(v2, __ T16B, v1, v1);
2816 
2817       __ subw(len_reg, len_reg, 16);
2818       __ cbnzw(len_reg, L_aes_loop);
2819 
2820       __ st1(v2, __ T16B, rvec);
2821 
2822       __ mov(r0, rscratch2);
2823 
2824       __ leave();
2825       __ ret(lr);
2826 
2827     return start;
2828   }
2829 
2830   // CTR AES crypt.
2831   // Arguments:
2832   //
2833   // Inputs:
2834   //   c_rarg0   - source byte array address
2835   //   c_rarg1   - destination byte array address
2836   //   c_rarg2   - K (key) in little endian int array
2837   //   c_rarg3   - counter vector byte array address
2838   //   c_rarg4   - input length
2839   //   c_rarg5   - saved encryptedCounter start
2840   //   c_rarg6   - saved used length
2841   //
2842   // Output:
2843   //   r0       - input length
2844   //
2845   address generate_counterMode_AESCrypt() {
2846     const Register in = c_rarg0;
2847     const Register out = c_rarg1;
2848     const Register key = c_rarg2;
2849     const Register counter = c_rarg3;
2850     const Register saved_len = c_rarg4, len = r10;
2851     const Register saved_encrypted_ctr = c_rarg5;
2852     const Register used_ptr = c_rarg6, used = r12;
2853 
2854     const Register offset = r7;
2855     const Register keylen = r11;
2856 
2857     const unsigned char block_size = 16;
2858     const int bulk_width = 4;
2859     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2860     // performance with larger data sizes, but it also means that the
2861     // fast path isn't used until you have at least 8 blocks, and up
2862     // to 127 bytes of data will be processed on the slow path. For
2863     // that reason, and also so as not to blow away too much icache, 4
2864     // blocks seems like a sensible compromise.
2865 
2866     // Algorithm:
2867     //
2868     //    if (len == 0) {
2869     //        goto DONE;
2870     //    }
2871     //    int result = len;
2872     //    do {
2873     //        if (used >= blockSize) {
2874     //            if (len >= bulk_width * blockSize) {
2875     //                CTR_large_block();
2876     //                if (len == 0)
2877     //                    goto DONE;
2878     //            }
2879     //            for (;;) {
2880     //                16ByteVector v0 = counter;
2881     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2882     //                used = 0;
2883     //                if (len < blockSize)
2884     //                    break;    /* goto NEXT */
2885     //                16ByteVector v1 = load16Bytes(in, offset);
2886     //                v1 = v1 ^ encryptedCounter;
2887     //                store16Bytes(v1, out, offset);
2888     //                used = blockSize;
2889     //                offset += blockSize;
2890     //                len -= blockSize;
2891     //                if (len == 0)
2892     //                    goto DONE;
2893     //            }
2894     //        }
2895     //      NEXT:
2896     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2897     //        len--;
2898     //    } while (len != 0);
2899     //  DONE:
2900     //    return result;
2901     //
2902     // CTR_large_block()
2903     //    Wide bulk encryption of whole blocks.
2904 
2905     __ align(CodeEntryAlignment);
2906     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2907     const address start = __ pc();
2908     __ enter();
2909 
2910     Label DONE, CTR_large_block, large_block_return;
2911     __ ldrw(used, Address(used_ptr));
2912     __ cbzw(saved_len, DONE);
2913 
2914     __ mov(len, saved_len);
2915     __ mov(offset, 0);
2916 
2917     // Compute #rounds for AES based on the length of the key array
2918     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2919 
2920     __ aesenc_loadkeys(key, keylen);
2921 
2922     {
2923       Label L_CTR_loop, NEXT;
2924 
2925       __ bind(L_CTR_loop);
2926 
2927       __ cmp(used, block_size);
2928       __ br(__ LO, NEXT);
2929 
2930       // Maybe we have a lot of data
2931       __ subsw(rscratch1, len, bulk_width * block_size);
2932       __ br(__ HS, CTR_large_block);
2933       __ BIND(large_block_return);
2934       __ cbzw(len, DONE);
2935 
2936       // Setup the counter
2937       __ movi(v4, __ T4S, 0);
2938       __ movi(v5, __ T4S, 1);
2939       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2940 
2941       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2942       __ rev32(v16, __ T16B, v0);
2943       __ addv(v16, __ T4S, v16, v4);
2944       __ rev32(v16, __ T16B, v16);
2945       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2946 
2947       {
2948         // We have fewer than bulk_width blocks of data left. Encrypt
2949         // them one by one until there is less than a full block
2950         // remaining, being careful to save both the encrypted counter
2951         // and the counter.
2952 
2953         Label inner_loop;
2954         __ bind(inner_loop);
2955         // Counter to encrypt is in v0
2956         __ aesecb_encrypt(noreg, noreg, keylen);
2957         __ st1(v0, __ T16B, saved_encrypted_ctr);
2958 
2959         // Do we have a remaining full block?
2960 
2961         __ mov(used, 0);
2962         __ cmp(len, block_size);
2963         __ br(__ LO, NEXT);
2964 
2965         // Yes, we have a full block
2966         __ ldrq(v1, Address(in, offset));
2967         __ eor(v1, __ T16B, v1, v0);
2968         __ strq(v1, Address(out, offset));
2969         __ mov(used, block_size);
2970         __ add(offset, offset, block_size);
2971 
2972         __ subw(len, len, block_size);
2973         __ cbzw(len, DONE);
2974 
2975         // Increment the counter, store it back
2976         __ orr(v0, __ T16B, v16, v16);
2977         __ rev32(v16, __ T16B, v16);
2978         __ addv(v16, __ T4S, v16, v4);
2979         __ rev32(v16, __ T16B, v16);
2980         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2981 
2982         __ b(inner_loop);
2983       }
2984 
2985       __ BIND(NEXT);
2986 
2987       // Encrypt a single byte, and loop.
2988       // We expect this to be a rare event.
2989       __ ldrb(rscratch1, Address(in, offset));
2990       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2991       __ eor(rscratch1, rscratch1, rscratch2);
2992       __ strb(rscratch1, Address(out, offset));
2993       __ add(offset, offset, 1);
2994       __ add(used, used, 1);
2995       __ subw(len, len, 1);
2996       __ cbnzw(len, L_CTR_loop);
2997     }
2998 
2999     __ bind(DONE);
3000     __ strw(used, Address(used_ptr));
3001     __ mov(r0, saved_len);
3002 
3003     __ leave(); // required for proper stackwalking of RuntimeStub frame
3004     __ ret(lr);
3005 
3006     // Bulk encryption
3007 
3008     __ BIND(CTR_large_block);
3009     assert(bulk_width == 4 || bulk_width == 8, "must be");
3010 
3011     if (bulk_width == 8) {
3012       __ sub(sp, sp, 4 * 16);
3013       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3014     }
3015     __ sub(sp, sp, 4 * 16);
3016     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3017     RegSet saved_regs = (RegSet::of(in, out, offset)
3018                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3019     __ push(saved_regs, sp);
3020     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3021     __ add(in, in, offset);
3022     __ add(out, out, offset);
3023 
3024     // Keys should already be loaded into the correct registers
3025 
3026     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3027     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3028 
3029     // AES/CTR loop
3030     {
3031       Label L_CTR_loop;
3032       __ BIND(L_CTR_loop);
3033 
3034       // Setup the counters
3035       __ movi(v8, __ T4S, 0);
3036       __ movi(v9, __ T4S, 1);
3037       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3038 
3039       for (int i = 0; i < bulk_width; i++) {
3040         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3041         __ rev32(v0_ofs, __ T16B, v16);
3042         __ addv(v16, __ T4S, v16, v8);
3043       }
3044 
3045       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3046 
3047       // Encrypt the counters
3048       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3049 
3050       if (bulk_width == 8) {
3051         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3052       }
3053 
3054       // XOR the encrypted counters with the inputs
3055       for (int i = 0; i < bulk_width; i++) {
3056         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3057         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3058         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3059       }
3060 
3061       // Write the encrypted data
3062       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3063       if (bulk_width == 8) {
3064         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3065       }
3066 
3067       __ subw(len, len, 16 * bulk_width);
3068       __ cbnzw(len, L_CTR_loop);
3069     }
3070 
3071     // Save the counter back where it goes
3072     __ rev32(v16, __ T16B, v16);
3073     __ st1(v16, __ T16B, counter);
3074 
3075     __ pop(saved_regs, sp);
3076 
3077     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3078     if (bulk_width == 8) {
3079       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3080     }
3081 
3082     __ andr(rscratch1, len, -16 * bulk_width);
3083     __ sub(len, len, rscratch1);
3084     __ add(offset, offset, rscratch1);
3085     __ mov(used, 16);
3086     __ strw(used, Address(used_ptr));
3087     __ b(large_block_return);
3088 
3089     return start;
3090   }
3091 
3092   // Vector AES Galois Counter Mode implementation. Parameters:
3093   //
3094   // in = c_rarg0
3095   // len = c_rarg1
3096   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3097   // out = c_rarg3
3098   // key = c_rarg4
3099   // state = c_rarg5 - GHASH.state
3100   // subkeyHtbl = c_rarg6 - powers of H
3101   // counter = c_rarg7 - 16 bytes of CTR
3102   // return - number of processed bytes
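       // For reference: the stub below processes the input in batches of 8 AES blocks
       // (len is rounded down to a multiple of 128 bytes), first producing the CTR
       // keystream/ciphertext and then folding the ciphertext into GHASH.state via
       // ghash_processBlocks_wide; any remaining tail is left to the caller.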
3103   address generate_galoisCounterMode_AESCrypt() {
3104     address ghash_polynomial = __ pc();
3105     __ emit_int64(0x87);  // The low-order bits of the field
3106                           // polynomial (i.e. p = z^7+z^2+z+1)
3107                           // repeated in the low and high parts of a
3108                           // 128-bit vector
3109     __ emit_int64(0x87);
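         // For reference, GHASH arithmetic is in GF(2^128) modulo
         //   z^128 + z^7 + z^2 + z + 1,
         // so reduction only needs the low-order terms 0x87 = z^7 + z^2 + z + 1,
         // replicated here into both 64-bit halves of a vector.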
3110 
3111     __ align(CodeEntryAlignment);
3112     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3113     address start = __ pc();
3114     __ enter();
3115 
3116     const Register in = c_rarg0;
3117     const Register len = c_rarg1;
3118     const Register ct = c_rarg2;
3119     const Register out = c_rarg3;
3120     // and updated with the incremented counter in the end
3121 
3122     const Register key = c_rarg4;
3123     const Register state = c_rarg5;
3124 
3125     const Register subkeyHtbl = c_rarg6;
3126 
3127     const Register counter = c_rarg7;
3128 
3129     const Register keylen = r10;
3130     // Save state before entering routine
3131     __ sub(sp, sp, 4 * 16);
3132     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3133     __ sub(sp, sp, 4 * 16);
3134     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3135 
3136     // __ andr(len, len, -512);
3137     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3138     __ str(len, __ pre(sp, -2 * wordSize));
3139 
3140     Label DONE;
3141     __ cbz(len, DONE);
3142 
3143     // Compute #rounds for AES based on the length of the key array
3144     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3145 
3146     __ aesenc_loadkeys(key, keylen);
3147     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3148     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3149 
3150     // AES/CTR loop
3151     {
3152       Label L_CTR_loop;
3153       __ BIND(L_CTR_loop);
3154 
3155       // Setup the counters
3156       __ movi(v8, __ T4S, 0);
3157       __ movi(v9, __ T4S, 1);
3158       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3159 
3160       assert(v0->encoding() < v8->encoding(), "");
3161       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3162         FloatRegister f = as_FloatRegister(i);
3163         __ rev32(f, __ T16B, v16);
3164         __ addv(v16, __ T4S, v16, v8);
3165       }
3166 
3167       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3168 
3169       // Encrypt the counters
3170       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3171 
3172       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3173 
3174       // XOR the encrypted counters with the inputs
3175       for (int i = 0; i < 8; i++) {
3176         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3177         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3178         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3179       }
3180       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3181       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3182 
3183       __ subw(len, len, 16 * 8);
3184       __ cbnzw(len, L_CTR_loop);
3185     }
3186 
3187     __ rev32(v16, __ T16B, v16);
3188     __ st1(v16, __ T16B, counter);
3189 
3190     __ ldr(len, Address(sp));
3191     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3192 
3193     // GHASH/CTR loop
3194     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3195                                 len, /*unrolls*/4);
3196 
3197 #ifdef ASSERT
3198     { Label L;
3199       __ cmp(len, (unsigned char)0);
3200       __ br(Assembler::EQ, L);
3201       __ stop("stubGenerator: abort");
3202       __ bind(L);
3203     }
3204 #endif
3205 
3206     __ bind(DONE);
3207     // Return the number of bytes processed
3208     __ ldr(r0, __ post(sp, 2 * wordSize));
3209 
3210     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3211     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3212 
3213     __ leave(); // required for proper stackwalking of RuntimeStub frame
3214     __ ret(lr);
3215     return start;
3216   }
3217 
3218   // Utility routines for md5.
3219   // Clobbers r10 and r11.
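       // For reference (RFC 1321), each step below computes
       //   a = b + rotl32(a + f(b, c, d) + X[k] + t, s)
       // where the auxiliary function f is, per helper,
       //   md5_FF: F(b, c, d) = (b & c) | (~b & d)
       //   md5_GG: G(b, c, d) = (b & d) | (c & ~d)
       //   md5_HH: H(b, c, d) = b ^ c ^ d
       //   md5_II: I(b, c, d) = c ^ (b | ~d)
       // The instruction order interleaves the load of X[k] with the boolean
       // function to reduce dependency stalls.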
3220   void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3221               int k, int s, int t) {
3222     Register rscratch3 = r10;
3223     Register rscratch4 = r11;
3224 
3225     __ eorw(rscratch3, r3, r4);
3226     __ movw(rscratch2, t);
3227     __ andw(rscratch3, rscratch3, r2);
3228     __ addw(rscratch4, r1, rscratch2);
3229     __ ldrw(rscratch1, Address(buf, k*4));
3230     __ eorw(rscratch3, rscratch3, r4);
3231     __ addw(rscratch4, rscratch4, rscratch1);
3232     __ addw(rscratch3, rscratch3, rscratch4);
3233     __ rorw(rscratch2, rscratch3, 32 - s);
3234     __ addw(r1, rscratch2, r2);
3235   }
3236 
3237   void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3238               int k, int s, int t) {
3239     Register rscratch3 = r10;
3240     Register rscratch4 = r11;
3241 
3242     __ andw(rscratch3, r2, r4);
3243     __ bicw(rscratch4, r3, r4);
3244     __ ldrw(rscratch1, Address(buf, k*4));
3245     __ movw(rscratch2, t);
3246     __ orrw(rscratch3, rscratch3, rscratch4);
3247     __ addw(rscratch4, r1, rscratch2);
3248     __ addw(rscratch4, rscratch4, rscratch1);
3249     __ addw(rscratch3, rscratch3, rscratch4);
3250     __ rorw(rscratch2, rscratch3, 32 - s);
3251     __ addw(r1, rscratch2, r2);
3252   }
3253 
3254   void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3255               int k, int s, int t) {
3256     Register rscratch3 = r10;
3257     Register rscratch4 = r11;
3258 
3259     __ eorw(rscratch3, r3, r4);
3260     __ movw(rscratch2, t);
3261     __ addw(rscratch4, r1, rscratch2);
3262     __ ldrw(rscratch1, Address(buf, k*4));
3263     __ eorw(rscratch3, rscratch3, r2);
3264     __ addw(rscratch4, rscratch4, rscratch1);
3265     __ addw(rscratch3, rscratch3, rscratch4);
3266     __ rorw(rscratch2, rscratch3, 32 - s);
3267     __ addw(r1, rscratch2, r2);
3268   }
3269 
3270   void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3271               int k, int s, int t) {
3272     Register rscratch3 = r10;
3273     Register rscratch4 = r11;
3274 
3275     __ movw(rscratch3, t);
3276     __ ornw(rscratch2, r2, r4);
3277     __ addw(rscratch4, r1, rscratch3);
3278     __ ldrw(rscratch1, Address(buf, k*4));
3279     __ eorw(rscratch3, rscratch2, r3);
3280     __ addw(rscratch4, rscratch4, rscratch1);
3281     __ addw(rscratch3, rscratch3, rscratch4);
3282     __ rorw(rscratch2, rscratch3, 32 - s);
3283     __ addw(r1, rscratch2, r2);
3284   }
3285 
3286   // Arguments:
3287   //
3288   // Inputs:
3289   //   c_rarg0   - byte[]  source+offset
3290   //   c_rarg1   - int[]   MD5.state
3291   //   c_rarg2   - int     offset
3292   //   c_rarg3   - int     limit
3293   //
3294   address generate_md5_implCompress(bool multi_block, const char *name) {
3295     __ align(CodeEntryAlignment);
3296     StubCodeMark mark(this, "StubRoutines", name);
3297     address start = __ pc();
3298 
3299     Register buf       = c_rarg0;
3300     Register state     = c_rarg1;
3301     Register ofs       = c_rarg2;
3302     Register limit     = c_rarg3;
3303     Register a         = r4;
3304     Register b         = r5;
3305     Register c         = r6;
3306     Register d         = r7;
3307     Register rscratch3 = r10;
3308     Register rscratch4 = r11;
3309 
3310     Label md5_loop;
3311     __ BIND(md5_loop);
3312 
3313     // Save hash values for addition after rounds
3314     __ ldrw(a, Address(state,  0));
3315     __ ldrw(b, Address(state,  4));
3316     __ ldrw(c, Address(state,  8));
3317     __ ldrw(d, Address(state, 12));
3318 
3319     // Round 1
3320     md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
3321     md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
3322     md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
3323     md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
3324     md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
3325     md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
3326     md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
3327     md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
3328     md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
3329     md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
3330     md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3331     md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3332     md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
3333     md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3334     md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3335     md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3336 
3337     // Round 2
3338     md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
3339     md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
3340     md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3341     md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
3342     md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
3343     md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
3344     md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3345     md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
3346     md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
3347     md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
3348     md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
3349     md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
3350     md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
3351     md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
3352     md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
3353     md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3354 
3355     // Round 3
3356     md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
3357     md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
3358     md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3359     md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3360     md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
3361     md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
3362     md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
3363     md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3364     md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
3365     md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
3366     md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
3367     md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
3368     md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
3369     md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3370     md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3371     md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);
3372 
3373     // Round 4
3374     md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
3375     md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
3376     md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3377     md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
3378     md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
3379     md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
3380     md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3381     md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
3382     md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
3383     md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3384     md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
3385     md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3386     md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
3387     md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3388     md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
3389     md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);
3390 
3391     // write hash values back in the correct order
3392     __ ldrw(rscratch1, Address(state,  0));
3393     __ addw(rscratch1, rscratch1, a);
3394     __ strw(rscratch1, Address(state,  0));
3395 
3396     __ ldrw(rscratch2, Address(state,  4));
3397     __ addw(rscratch2, rscratch2, b);
3398     __ strw(rscratch2, Address(state,  4));
3399 
3400     __ ldrw(rscratch3, Address(state,  8));
3401     __ addw(rscratch3, rscratch3, c);
3402     __ strw(rscratch3, Address(state,  8));
3403 
3404     __ ldrw(rscratch4, Address(state, 12));
3405     __ addw(rscratch4, rscratch4, d);
3406     __ strw(rscratch4, Address(state, 12));
3407 
3408     if (multi_block) {
3409       __ add(buf, buf, 64);
3410       __ add(ofs, ofs, 64);
3411       __ cmp(ofs, limit);
3412       __ br(Assembler::LE, md5_loop);
3413       __ mov(c_rarg0, ofs); // return ofs
3414     }
3415 
3416     __ ret(lr);
3417 
3418     return start;
3419   }
3420 
3421   // Arguments:
3422   //
3423   // Inputs:
3424   //   c_rarg0   - byte[]  source+offset
3425   //   c_rarg1   - int[]   SHA.state
3426   //   c_rarg2   - int     offset
3427   //   c_rarg3   - int     limit
3428   //
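       // For reference (FIPS 180-4): sha1c, sha1p and sha1m each advance the hash
       // state by four rounds, using the Ch, Parity and Maj functions respectively,
       // so the 20-iteration loop below covers all 80 rounds:
       //   rounds  0-19 -> sha1c (Ch), 20-39 -> sha1p (Parity),
       //   rounds 40-59 -> sha1m (Maj), 60-79 -> sha1p (Parity).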
3429   address generate_sha1_implCompress(bool multi_block, const char *name) {
3430     __ align(CodeEntryAlignment);
3431     StubCodeMark mark(this, "StubRoutines", name);
3432     address start = __ pc();
3433 
3434     Register buf   = c_rarg0;
3435     Register state = c_rarg1;
3436     Register ofs   = c_rarg2;
3437     Register limit = c_rarg3;
3438 
3439     Label keys;
3440     Label sha1_loop;
3441 
3442     // load the keys into v0..v3
3443     __ adr(rscratch1, keys);
3444     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3445     // load the 5-word state into v6, v7
3446     __ ldrq(v6, Address(state, 0));
3447     __ ldrs(v7, Address(state, 16));
3448 
3449 
3450     __ BIND(sha1_loop);
3451     // load 64 bytes of data into v16..v19
3452     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3453     __ rev32(v16, __ T16B, v16);
3454     __ rev32(v17, __ T16B, v17);
3455     __ rev32(v18, __ T16B, v18);
3456     __ rev32(v19, __ T16B, v19);
3457 
3458     // do the sha1
3459     __ addv(v4, __ T4S, v16, v0);
3460     __ orr(v20, __ T16B, v6, v6);
3461 
3462     FloatRegister d0 = v16;
3463     FloatRegister d1 = v17;
3464     FloatRegister d2 = v18;
3465     FloatRegister d3 = v19;
3466 
3467     for (int round = 0; round < 20; round++) {
3468       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3469       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3470       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3471       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3472       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3473 
3474       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3475       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3476       __ sha1h(tmp2, __ T4S, v20);
3477       if (round < 5)
3478         __ sha1c(v20, __ T4S, tmp3, tmp4);
3479       else if (round < 10 || round >= 15)
3480         __ sha1p(v20, __ T4S, tmp3, tmp4);
3481       else
3482         __ sha1m(v20, __ T4S, tmp3, tmp4);
3483       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3484 
3485       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3486     }
3487 
3488     __ addv(v7, __ T2S, v7, v21);
3489     __ addv(v6, __ T4S, v6, v20);
3490 
3491     if (multi_block) {
3492       __ add(ofs, ofs, 64);
3493       __ cmp(ofs, limit);
3494       __ br(Assembler::LE, sha1_loop);
3495       __ mov(c_rarg0, ofs); // return ofs
3496     }
3497 
3498     __ strq(v6, Address(state, 0));
3499     __ strs(v7, Address(state, 16));
3500 
3501     __ ret(lr);
3502 
3503     __ bind(keys);
3504     __ emit_int32(0x5a827999);
3505     __ emit_int32(0x6ed9eba1);
3506     __ emit_int32(0x8f1bbcdc);
3507     __ emit_int32(0xca62c1d6);
3508 
3509     return start;
3510   }
3511 
3512 
3513   // Arguments:
3514   //
3515   // Inputs:
3516   //   c_rarg0   - byte[]  source+offset
3517   //   c_rarg1   - int[]   SHA.state
3518   //   c_rarg2   - int     offset
3519   //   c_rarg3   - int     limit
3520   //
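       // For reference (FIPS 180-4): each sha256h/sha256h2 pair advances the hash
       // state by four rounds, so the 16-iteration loop below covers all 64 rounds;
       // the 64 round constants stay resident in v16..v31 and the message schedule
       // is extended in place with sha256su0/sha256su1.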
3521   address generate_sha256_implCompress(bool multi_block, const char *name) {
3522     static const uint32_t round_consts[64] = {
3523       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3524       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3525       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3526       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3527       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3528       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3529       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3530       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3531       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3532       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3533       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3534       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3535       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3536       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3537       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3538       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3539     };
3540     __ align(CodeEntryAlignment);
3541     StubCodeMark mark(this, "StubRoutines", name);
3542     address start = __ pc();
3543 
3544     Register buf   = c_rarg0;
3545     Register state = c_rarg1;
3546     Register ofs   = c_rarg2;
3547     Register limit = c_rarg3;
3548 
3549     Label sha256_loop;
3550 
3551     __ stpd(v8, v9, __ pre(sp, -32));
3552     __ stpd(v10, v11, Address(sp, 16));
3553 
3554 // dga == v0
3555 // dgb == v1
3556 // dg0 == v2
3557 // dg1 == v3
3558 // dg2 == v4
3559 // t0 == v6
3560 // t1 == v7
3561 
3562     // load the 64 round constants into v16..v31
3563     __ lea(rscratch1, ExternalAddress((address)round_consts));
3564     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3565     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3566     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3567     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3568 
3569     // load 8 words (256 bits) state
3570     __ ldpq(v0, v1, state);
3571 
3572     __ BIND(sha256_loop);
3573     // load 64 bytes of data into v8..v11
3574     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3575     __ rev32(v8, __ T16B, v8);
3576     __ rev32(v9, __ T16B, v9);
3577     __ rev32(v10, __ T16B, v10);
3578     __ rev32(v11, __ T16B, v11);
3579 
3580     __ addv(v6, __ T4S, v8, v16);
3581     __ orr(v2, __ T16B, v0, v0);
3582     __ orr(v3, __ T16B, v1, v1);
3583 
3584     FloatRegister d0 = v8;
3585     FloatRegister d1 = v9;
3586     FloatRegister d2 = v10;
3587     FloatRegister d3 = v11;
3588 
3589 
3590     for (int round = 0; round < 16; round++) {
3591       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3592       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3593       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3594       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3595 
3596       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3597        __ orr(v4, __ T16B, v2, v2);
3598       if (round < 15)
3599         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3600       __ sha256h(v2, __ T4S, v3, tmp2);
3601       __ sha256h2(v3, __ T4S, v4, tmp2);
3602       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3603 
3604       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3605     }
3606 
3607     __ addv(v0, __ T4S, v0, v2);
3608     __ addv(v1, __ T4S, v1, v3);
3609 
3610     if (multi_block) {
3611       __ add(ofs, ofs, 64);
3612       __ cmp(ofs, limit);
3613       __ br(Assembler::LE, sha256_loop);
3614       __ mov(c_rarg0, ofs); // return ofs
3615     }
3616 
3617     __ ldpd(v10, v11, Address(sp, 16));
3618     __ ldpd(v8, v9, __ post(sp, 32));
3619 
3620     __ stpq(v0, v1, state);
3621 
3622     __ ret(lr);
3623 
3624     return start;
3625   }
3626 
3627   // Double rounds for SHA-512: each call performs two of the 80 rounds using sha512h/sha512h2.
3628   void sha512_dround(int dr,
3629                      FloatRegister vi0, FloatRegister vi1,
3630                      FloatRegister vi2, FloatRegister vi3,
3631                      FloatRegister vi4, FloatRegister vrc0,
3632                      FloatRegister vrc1, FloatRegister vin0,
3633                      FloatRegister vin1, FloatRegister vin2,
3634                      FloatRegister vin3, FloatRegister vin4) {
3635       if (dr < 36) {
3636         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3637       }
3638       __ addv(v5, __ T2D, vrc0, vin0);
3639       __ ext(v6, __ T16B, vi2, vi3, 8);
3640       __ ext(v5, __ T16B, v5, v5, 8);
3641       __ ext(v7, __ T16B, vi1, vi2, 8);
3642       __ addv(vi3, __ T2D, vi3, v5);
3643       if (dr < 32) {
3644         __ ext(v5, __ T16B, vin3, vin4, 8);
3645         __ sha512su0(vin0, __ T2D, vin1);
3646       }
3647       __ sha512h(vi3, __ T2D, v6, v7);
3648       if (dr < 32) {
3649         __ sha512su1(vin0, __ T2D, vin2, v5);
3650       }
3651       __ addv(vi4, __ T2D, vi1, vi3);
3652       __ sha512h2(vi3, __ T2D, vi1, vi0);
3653   }
3654 
3655   // Arguments:
3656   //
3657   // Inputs:
3658   //   c_rarg0   - byte[]  source+offset
3659   //   c_rarg1   - int[]   SHA.state
3660   //   c_rarg2   - int     offset
3661   //   c_rarg3   - int     limit
3662   //
3663   address generate_sha512_implCompress(bool multi_block, const char *name) {
3664     static const uint64_t round_consts[80] = {
3665       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3666       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3667       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3668       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3669       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3670       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3671       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3672       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3673       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3674       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3675       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3676       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3677       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3678       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3679       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3680       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3681       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3682       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3683       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3684       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3685       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3686       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3687       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3688       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3689       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3690       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3691       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3692     };
3693 
3694     __ align(CodeEntryAlignment);
3695     StubCodeMark mark(this, "StubRoutines", name);
3696     address start = __ pc();
3697 
3698     Register buf   = c_rarg0;
3699     Register state = c_rarg1;
3700     Register ofs   = c_rarg2;
3701     Register limit = c_rarg3;
3702 
3703     __ stpd(v8, v9, __ pre(sp, -64));
3704     __ stpd(v10, v11, Address(sp, 16));
3705     __ stpd(v12, v13, Address(sp, 32));
3706     __ stpd(v14, v15, Address(sp, 48));
3707 
3708     Label sha512_loop;
3709 
3710     // load state
3711     __ ld1(v8, v9, v10, v11, __ T2D, state);
3712 
3713     // load first 4 round constants
3714     __ lea(rscratch1, ExternalAddress((address)round_consts));
3715     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3716 
3717     __ BIND(sha512_loop);
3718     // load 128B of data into v12..v19
3719     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3720     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3721     __ rev64(v12, __ T16B, v12);
3722     __ rev64(v13, __ T16B, v13);
3723     __ rev64(v14, __ T16B, v14);
3724     __ rev64(v15, __ T16B, v15);
3725     __ rev64(v16, __ T16B, v16);
3726     __ rev64(v17, __ T16B, v17);
3727     __ rev64(v18, __ T16B, v18);
3728     __ rev64(v19, __ T16B, v19);
3729 
3730     __ mov(rscratch2, rscratch1);
3731 
3732     __ mov(v0, __ T16B, v8);
3733     __ mov(v1, __ T16B, v9);
3734     __ mov(v2, __ T16B, v10);
3735     __ mov(v3, __ T16B, v11);
3736 
3737     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3738     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3739     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3740     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3741     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3742     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3743     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3744     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3745     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3746     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3747     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3748     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3749     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3750     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3751     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3752     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3753     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3754     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3755     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3756     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3757     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3758     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3759     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3760     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3761     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3762     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3763     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3764     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3765     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3766     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3767     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3768     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3769     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3770     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3771     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3772     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3773     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3774     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3775     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3776     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3777 
3778     __ addv(v8, __ T2D, v8, v0);
3779     __ addv(v9, __ T2D, v9, v1);
3780     __ addv(v10, __ T2D, v10, v2);
3781     __ addv(v11, __ T2D, v11, v3);
3782 
3783     if (multi_block) {
3784       __ add(ofs, ofs, 128);
3785       __ cmp(ofs, limit);
3786       __ br(Assembler::LE, sha512_loop);
3787       __ mov(c_rarg0, ofs); // return ofs
3788     }
3789 
3790     __ st1(v8, v9, v10, v11, __ T2D, state);
3791 
3792     __ ldpd(v14, v15, Address(sp, 48));
3793     __ ldpd(v12, v13, Address(sp, 32));
3794     __ ldpd(v10, v11, Address(sp, 16));
3795     __ ldpd(v8, v9, __ post(sp, 64));
3796 
3797     __ ret(lr);
3798 
3799     return start;
3800   }
3801 
3802   // Arguments:
3803   //
3804   // Inputs:
3805   //   c_rarg0   - byte[]  source+offset
3806   //   c_rarg1   - byte[]  SHA.state
3807   //   c_rarg2   - int     block_size
3808   //   c_rarg3   - int     offset
3809   //   c_rarg4   - int     limit
3810   //
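       // For reference, block_size is the Keccak rate in bytes and selects the
       // variant absorbed below:
       //    72 -> SHA3-512,  104 -> SHA3-384,  136 -> SHA3-256 / SHAKE-256,
       //   144 -> SHA3-224,  168 -> SHAKE-128.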
3811   address generate_sha3_implCompress(bool multi_block, const char *name) {
3812     static const uint64_t round_consts[24] = {
3813       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3814       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3815       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3816       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3817       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3818       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3819       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3820       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3821     };
3822 
3823     __ align(CodeEntryAlignment);
3824     StubCodeMark mark(this, "StubRoutines", name);
3825     address start = __ pc();
3826 
3827     Register buf           = c_rarg0;
3828     Register state         = c_rarg1;
3829     Register block_size    = c_rarg2;
3830     Register ofs           = c_rarg3;
3831     Register limit         = c_rarg4;
3832 
3833     Label sha3_loop, rounds24_loop;
3834     Label sha3_512_or_sha3_384, shake128;
3835 
3836     __ stpd(v8, v9, __ pre(sp, -64));
3837     __ stpd(v10, v11, Address(sp, 16));
3838     __ stpd(v12, v13, Address(sp, 32));
3839     __ stpd(v14, v15, Address(sp, 48));
3840 
3841     // load state
3842     __ add(rscratch1, state, 32);
3843     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3844     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3845     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3846     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3847     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3848     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3849     __ ld1(v24, __ T1D, rscratch1);
3850 
3851     __ BIND(sha3_loop);
3852 
3853     // 24 keccak rounds
3854     __ movw(rscratch2, 24);
3855 
3856     // load round_constants base
3857     __ lea(rscratch1, ExternalAddress((address) round_consts));
3858 
3859     // load input
3860     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3861     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3862     __ eor(v0, __ T8B, v0, v25);
3863     __ eor(v1, __ T8B, v1, v26);
3864     __ eor(v2, __ T8B, v2, v27);
3865     __ eor(v3, __ T8B, v3, v28);
3866     __ eor(v4, __ T8B, v4, v29);
3867     __ eor(v5, __ T8B, v5, v30);
3868     __ eor(v6, __ T8B, v6, v31);
3869 
3870     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
3871     __ tbz(block_size, 7, sha3_512_or_sha3_384);
3872 
3873     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3874     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3875     __ eor(v7, __ T8B, v7, v25);
3876     __ eor(v8, __ T8B, v8, v26);
3877     __ eor(v9, __ T8B, v9, v27);
3878     __ eor(v10, __ T8B, v10, v28);
3879     __ eor(v11, __ T8B, v11, v29);
3880     __ eor(v12, __ T8B, v12, v30);
3881     __ eor(v13, __ T8B, v13, v31);
3882 
3883     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
3884     __ eor(v14, __ T8B, v14, v25);
3885     __ eor(v15, __ T8B, v15, v26);
3886     __ eor(v16, __ T8B, v16, v27);
3887 
3888     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
3889     __ andw(c_rarg5, block_size, 48);
3890     __ cbzw(c_rarg5, rounds24_loop);
3891 
3892     __ tbnz(block_size, 5, shake128);
3893     // block_size == 144, bit5 == 0, SHA3-224
3894     __ ldrd(v28, __ post(buf, 8));
3895     __ eor(v17, __ T8B, v17, v28);
3896     __ b(rounds24_loop);
3897 
3898     __ BIND(shake128);
3899     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
3900     __ eor(v17, __ T8B, v17, v28);
3901     __ eor(v18, __ T8B, v18, v29);
3902     __ eor(v19, __ T8B, v19, v30);
3903     __ eor(v20, __ T8B, v20, v31);
3904     __ b(rounds24_loop); // block_size == 168, SHAKE128
3905 
3906     __ BIND(sha3_512_or_sha3_384);
3907     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3908     __ eor(v7, __ T8B, v7, v25);
3909     __ eor(v8, __ T8B, v8, v26);
3910     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
3911 
3912     // SHA3-384
3913     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
3914     __ eor(v9,  __ T8B, v9,  v27);
3915     __ eor(v10, __ T8B, v10, v28);
3916     __ eor(v11, __ T8B, v11, v29);
3917     __ eor(v12, __ T8B, v12, v30);
3918 
3919     __ BIND(rounds24_loop);
3920     __ subw(rscratch2, rscratch2, 1);
3921 
3922     __ eor3(v29, __ T16B, v4, v9, v14);
3923     __ eor3(v26, __ T16B, v1, v6, v11);
3924     __ eor3(v28, __ T16B, v3, v8, v13);
3925     __ eor3(v25, __ T16B, v0, v5, v10);
3926     __ eor3(v27, __ T16B, v2, v7, v12);
3927     __ eor3(v29, __ T16B, v29, v19, v24);
3928     __ eor3(v26, __ T16B, v26, v16, v21);
3929     __ eor3(v28, __ T16B, v28, v18, v23);
3930     __ eor3(v25, __ T16B, v25, v15, v20);
3931     __ eor3(v27, __ T16B, v27, v17, v22);
3932 
3933     __ rax1(v30, __ T2D, v29, v26);
3934     __ rax1(v26, __ T2D, v26, v28);
3935     __ rax1(v28, __ T2D, v28, v25);
3936     __ rax1(v25, __ T2D, v25, v27);
3937     __ rax1(v27, __ T2D, v27, v29);
3938 
3939     __ eor(v0, __ T16B, v0, v30);
3940     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3941     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3942     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3943     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3944     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3945     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3946     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3947     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3948     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3949     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3950     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3951     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3952     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3953     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3954     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3955     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3956     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3957     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3958     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3959     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3960     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3961     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3962     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3963     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3964 
3965     __ bcax(v20, __ T16B, v31, v22, v8);
3966     __ bcax(v21, __ T16B, v8,  v23, v22);
3967     __ bcax(v22, __ T16B, v22, v24, v23);
3968     __ bcax(v23, __ T16B, v23, v31, v24);
3969     __ bcax(v24, __ T16B, v24, v8,  v31);
3970 
3971     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3972 
3973     __ bcax(v17, __ T16B, v25, v19, v3);
3974     __ bcax(v18, __ T16B, v3,  v15, v19);
3975     __ bcax(v19, __ T16B, v19, v16, v15);
3976     __ bcax(v15, __ T16B, v15, v25, v16);
3977     __ bcax(v16, __ T16B, v16, v3,  v25);
3978 
3979     __ bcax(v10, __ T16B, v29, v12, v26);
3980     __ bcax(v11, __ T16B, v26, v13, v12);
3981     __ bcax(v12, __ T16B, v12, v14, v13);
3982     __ bcax(v13, __ T16B, v13, v29, v14);
3983     __ bcax(v14, __ T16B, v14, v26, v29);
3984 
3985     __ bcax(v7, __ T16B, v30, v9,  v4);
3986     __ bcax(v8, __ T16B, v4,  v5,  v9);
3987     __ bcax(v9, __ T16B, v9,  v6,  v5);
3988     __ bcax(v5, __ T16B, v5,  v30, v6);
3989     __ bcax(v6, __ T16B, v6,  v4,  v30);
3990 
3991     __ bcax(v3, __ T16B, v27, v0,  v28);
3992     __ bcax(v4, __ T16B, v28, v1,  v0);
3993     __ bcax(v0, __ T16B, v0,  v2,  v1);
3994     __ bcax(v1, __ T16B, v1,  v27, v2);
3995     __ bcax(v2, __ T16B, v2,  v28, v27);
3996 
3997     __ eor(v0, __ T16B, v0, v31);
3998 
3999     __ cbnzw(rscratch2, rounds24_loop);
4000 
4001     if (multi_block) {
4002       __ add(ofs, ofs, block_size);
4003       __ cmp(ofs, limit);
4004       __ br(Assembler::LE, sha3_loop);
4005       __ mov(c_rarg0, ofs); // return ofs
4006     }
4007 
4008     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4009     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4010     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4011     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4012     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4013     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4014     __ st1(v24, __ T1D, state);
4015 
4016     __ ldpd(v14, v15, Address(sp, 48));
4017     __ ldpd(v12, v13, Address(sp, 32));
4018     __ ldpd(v10, v11, Address(sp, 16));
4019     __ ldpd(v8, v9, __ post(sp, 64));
4020 
4021     __ ret(lr);
4022 
4023     return start;
4024   }
4025 
4026   /**
4027    *  Arguments:
4028    *
4029    * Inputs:
4030    *   c_rarg0   - int crc
4031    *   c_rarg1   - byte* buf
4032    *   c_rarg2   - int length
4033    *
4034    * Output:
4035    *       r0    - int crc result
4036    */
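       // For reference, java.util.zip.CRC32 is the reflected CRC-32 with
       // polynomial 0x04C11DB7 (bit-reversed: 0xEDB88320); the actual work is
       // done by MacroAssembler::kernel_crc32 below.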
4037   address generate_updateBytesCRC32() {
4038     assert(UseCRC32Intrinsics, "what are we doing here?");
4039 
4040     __ align(CodeEntryAlignment);
4041     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4042 
4043     address start = __ pc();
4044 
4045     const Register crc   = c_rarg0;  // crc
4046     const Register buf   = c_rarg1;  // source java byte array address
4047     const Register len   = c_rarg2;  // length
4048     const Register table0 = c_rarg3; // crc_table address
4049     const Register table1 = c_rarg4;
4050     const Register table2 = c_rarg5;
4051     const Register table3 = c_rarg6;
4052     const Register tmp3 = c_rarg7;
4053 
4054     BLOCK_COMMENT("Entry:");
4055     __ enter(); // required for proper stackwalking of RuntimeStub frame
4056 
4057     __ kernel_crc32(crc, buf, len,
4058               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4059 
4060     __ leave(); // required for proper stackwalking of RuntimeStub frame
4061     __ ret(lr);
4062 
4063     return start;
4064   }
4065 
4066   // ChaCha20 block function.  This version parallelizes by loading
4067   // individual 32-bit state elements into vectors for four blocks
4068   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4069   //
4070   // state (int[16]) = c_rarg0
4071   // keystream (byte[1024]) = c_rarg1
4072   // return - number of bytes of keystream (always 256)
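       // For reference (RFC 8439), a ChaCha20 quarter round on words (a, b, c, d) is
       //   a += b; d ^= a; d = rotl32(d, 16);
       //   c += d; b ^= c; b = rotl32(b, 12);
       //   a += b; d ^= a; d = rotl32(d,  8);
       //   c += d; b ^= c; b = rotl32(b,  7);
       // cc20_quarter_round applies this to four blocks at once, one block per
       // SIMD lane, using the tbl constant below for the 8-bit rotation.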
4073   address generate_chacha20Block_blockpar() {
4074     Label L_twoRounds, L_cc20_const;
4075     // The constant data is broken into two 128-bit segments to be loaded
4076     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4077     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4078     // The second 128 bits are a table constant used for 8-bit left rotations.
4079     __ BIND(L_cc20_const);
4080     __ emit_int64(0x0000000100000000UL);
4081     __ emit_int64(0x0000000300000002UL);
4082     __ emit_int64(0x0605040702010003UL);
4083     __ emit_int64(0x0E0D0C0F0A09080BUL);
4084 
4085     __ align(CodeEntryAlignment);
4086     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4087     address start = __ pc();
4088     __ enter();
4089 
4090     int i, j;
4091     const Register state = c_rarg0;
4092     const Register keystream = c_rarg1;
4093     const Register loopCtr = r10;
4094     const Register tmpAddr = r11;
4095 
4096     const FloatRegister stateFirst = v0;
4097     const FloatRegister stateSecond = v1;
4098     const FloatRegister stateThird = v2;
4099     const FloatRegister stateFourth = v3;
4100     const FloatRegister origCtrState = v28;
4101     const FloatRegister scratch = v29;
4102     const FloatRegister lrot8Tbl = v30;
4103 
4104     // Organize SIMD registers in an array that facilitates
4105     // putting repetitive opcodes into loop structures.  It is
4106     // important that each grouping of 4 registers is monotonically
4107     // increasing to support the requirements of multi-register
4108     // instructions (e.g. ld4r, st4, etc.)
4109     const FloatRegister workSt[16] = {
4110          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4111         v20, v21, v22, v23, v24, v25, v26, v27
4112     };
4113 
4114     // Load from memory and interlace across 16 SIMD registers,
4115     // with each word from memory being broadcast to all lanes of
4116     // each successive SIMD register.
4117     //      Addr(0) -> All lanes in workSt[i]
4118     //      Addr(4) -> All lanes workSt[i + 1], etc.
4119     __ mov(tmpAddr, state);
4120     for (i = 0; i < 16; i += 4) {
4121       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4122           __ post(tmpAddr, 16));
4123     }
4124 
4125     // Pull in constant data.  The first 16 bytes are the add overlay
4126     // which is applied to the vector holding the counter (state[12]).
4127     // The second 16 bytes is the index register for the 8-bit left
4128     // rotation tbl instruction.
4129     __ adr(tmpAddr, L_cc20_const);
4130     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4131     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4132 
4133     // Set up the 10 iteration loop and perform all 8 quarter round ops
4134     __ mov(loopCtr, 10);
4135     __ BIND(L_twoRounds);
4136 
4137     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4138         scratch, lrot8Tbl);
4139     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4140         scratch, lrot8Tbl);
4141     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4142         scratch, lrot8Tbl);
4143     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4144         scratch, lrot8Tbl);
4145 
4146     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4147         scratch, lrot8Tbl);
4148     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4149         scratch, lrot8Tbl);
4150     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4151         scratch, lrot8Tbl);
4152     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4153         scratch, lrot8Tbl);
4154 
4155     // Decrement and iterate
4156     __ sub(loopCtr, loopCtr, 1);
4157     __ cbnz(loopCtr, L_twoRounds);
4158 
4159     __ mov(tmpAddr, state);
4160 
4161     // Add the starting state back to the post-loop keystream
4162     // state.  We read/interlace the state array from memory into
4163     // 4 registers similar to what we did in the beginning.  Then
4164     // add the counter overlay onto workSt[12] at the end.
4165     for (i = 0; i < 16; i += 4) {
4166       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4167           __ post(tmpAddr, 16));
4168       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4169       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4170       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4171       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4172     }
4173     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4174 
4175     // Write to key stream, storing the same element out of workSt[0..15]
4176     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4177     // for the next element position.
4178     for (i = 0; i < 4; i++) {
4179       for (j = 0; j < 16; j += 4) {
4180         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4181             __ post(keystream, 16));
4182       }
4183     }
4184 
4185     __ mov(r0, 256);             // Return length of output keystream
4186     __ leave();
4187     __ ret(lr);
4188 
4189     return start;
4190   }
4191 
4192   /**
4193    *  Arguments:
4194    *
4195    * Inputs:
4196    *   c_rarg0   - int crc
4197    *   c_rarg1   - byte* buf
4198    *   c_rarg2   - int length
4199    *   c_rarg3   - int* table
4200    *
4201    * Output:
4202    *       r0   - int crc result
4203    */
4204   address generate_updateBytesCRC32C() {
4205     assert(UseCRC32CIntrinsics, "what are we doing here?");
4206 
4207     __ align(CodeEntryAlignment);
4208     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4209 
4210     address start = __ pc();
4211 
4212     const Register crc   = c_rarg0;  // crc
4213     const Register buf   = c_rarg1;  // source java byte array address
4214     const Register len   = c_rarg2;  // length
4215     const Register table0 = c_rarg3; // crc_table address
4216     const Register table1 = c_rarg4;
4217     const Register table2 = c_rarg5;
4218     const Register table3 = c_rarg6;
4219     const Register tmp3 = c_rarg7;
4220 
4221     BLOCK_COMMENT("Entry:");
4222     __ enter(); // required for proper stackwalking of RuntimeStub frame
4223 
4224     __ kernel_crc32c(crc, buf, len,
4225               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4226 
4227     __ leave(); // required for proper stackwalking of RuntimeStub frame
4228     __ ret(lr);
4229 
4230     return start;
4231   }
4232 
4233   /***
4234    *  Arguments:
4235    *
4236    *  Inputs:
4237    *   c_rarg0   - int   adler
4238    *   c_rarg1   - byte* buff
4239    *   c_rarg2   - int   len
4240    *
4241    * Output:
4242    *   c_rarg0   - int adler result
4243    */
4244   address generate_updateBytesAdler32() {
4245     __ align(CodeEntryAlignment);
4246     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4247     address start = __ pc();
4248 
4249     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4250 
4251     // Aliases
4252     Register adler  = c_rarg0;
4253     Register s1     = c_rarg0;
4254     Register s2     = c_rarg3;
4255     Register buff   = c_rarg1;
4256     Register len    = c_rarg2;
4257     Register nmax  = r4;
4258     Register base  = r5;
4259     Register count = r6;
4260     Register temp0 = rscratch1;
4261     Register temp1 = rscratch2;
4262     FloatRegister vbytes = v0;
4263     FloatRegister vs1acc = v1;
4264     FloatRegister vs2acc = v2;
4265     FloatRegister vtable = v3;
4266 
4267     // Max number of bytes we can process before having to take the mod
4268     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4269     uint64_t BASE = 0xfff1;
4270     uint64_t NMAX = 0x15B0;
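    // Sanity check on NMAX (worked out here for clarity): for n = 5552 the bound
    // 255*n*(n+1)/2 + (n+1)*(BASE-1) evaluates to 4,294,690,200 <= 2^32-1, while
    // n = 5553 already exceeds 2^32-1, so up to NMAX bytes can be accumulated in
    // 32-bit registers before a reduction mod BASE is required.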
4271 
4272     __ mov(base, BASE);
4273     __ mov(nmax, NMAX);
4274 
4275     // Load accumulation coefficients for the upper 16 bits
4276     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4277     __ ld1(vtable, __ T16B, Address(temp0));
4278 
4279     // s1 is initialized to the lower 16 bits of adler
4280     // s2 is initialized to the upper 16 bits of adler
4281     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4282     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4283 
4284     // The pipelined loop needs at least 16 elements for one iteration.
4285     // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
4286     __ cmp(len, (u1)16);
4287     __ br(Assembler::HS, L_nmax);
4288     __ cbz(len, L_combine);
4289 
4290     __ bind(L_simple_by1_loop);
4291     __ ldrb(temp0, Address(__ post(buff, 1)));
4292     __ add(s1, s1, temp0);
4293     __ add(s2, s2, s1);
4294     __ subs(len, len, 1);
4295     __ br(Assembler::HI, L_simple_by1_loop);
4296 
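    // Reduction note: on this short path s1 < 2 * BASE (at most 15 bytes of at
    // most 255 each were added to a 16-bit value), so one conditional subtract
    // is enough.  s2 below is folded first using 2^16 mod 65521 == 15, i.e.
    // conceptually (illustrative scalar form):
    //
    //   s = (s >> 16) * 15 + (s & 0xffff);   // 15*x computed as (x << 4) - x
    //
    // followed by a conditional subtract of BASE.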
4297     // s1 = s1 % BASE
4298     __ subs(temp0, s1, base);
4299     __ csel(s1, temp0, s1, Assembler::HS);
4300 
4301     // s2 = s2 % BASE
4302     __ lsr(temp0, s2, 16);
4303     __ lsl(temp1, temp0, 4);
4304     __ sub(temp1, temp1, temp0);
4305     __ add(s2, temp1, s2, ext::uxth);
4306 
4307     __ subs(temp0, s2, base);
4308     __ csel(s2, temp0, s2, Assembler::HS);
4309 
4310     __ b(L_combine);
4311 
4312     __ bind(L_nmax);
4313     __ subs(len, len, nmax);
4314     __ sub(count, nmax, 16);
4315     __ br(Assembler::LO, L_by16);
4316 
4317     __ bind(L_nmax_loop);
4318 
4319     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4320                                       vbytes, vs1acc, vs2acc, vtable);
4321 
4322     __ subs(count, count, 16);
4323     __ br(Assembler::HS, L_nmax_loop);
4324 
4325     // s1 = s1 % BASE
4326     __ lsr(temp0, s1, 16);
4327     __ lsl(temp1, temp0, 4);
4328     __ sub(temp1, temp1, temp0);
4329     __ add(temp1, temp1, s1, ext::uxth);
4330 
4331     __ lsr(temp0, temp1, 16);
4332     __ lsl(s1, temp0, 4);
4333     __ sub(s1, s1, temp0);
4334     __ add(s1, s1, temp1, ext::uxth);
4335 
4336     __ subs(temp0, s1, base);
4337     __ csel(s1, temp0, s1, Assembler::HS);
4338 
4339     // s2 = s2 % BASE
4340     __ lsr(temp0, s2, 16);
4341     __ lsl(temp1, temp0, 4);
4342     __ sub(temp1, temp1, temp0);
4343     __ add(temp1, temp1, s2, ext::uxth);
4344 
4345     __ lsr(temp0, temp1, 16);
4346     __ lsl(s2, temp0, 4);
4347     __ sub(s2, s2, temp0);
4348     __ add(s2, s2, temp1, ext::uxth);
4349 
4350     __ subs(temp0, s2, base);
4351     __ csel(s2, temp0, s2, Assembler::HS);
4352 
4353     __ subs(len, len, nmax);
4354     __ sub(count, nmax, 16);
4355     __ br(Assembler::HS, L_nmax_loop);
4356 
4357     __ bind(L_by16);
4358     __ adds(len, len, count);
4359     __ br(Assembler::LO, L_by1);
4360 
4361     __ bind(L_by16_loop);
4362 
4363     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4364                                       vbytes, vs1acc, vs2acc, vtable);
4365 
4366     __ subs(len, len, 16);
4367     __ br(Assembler::HS, L_by16_loop);
4368 
4369     __ bind(L_by1);
4370     __ adds(len, len, 15);
4371     __ br(Assembler::LO, L_do_mod);
4372 
4373     __ bind(L_by1_loop);
4374     __ ldrb(temp0, Address(__ post(buff, 1)));
4375     __ add(s1, temp0, s1);
4376     __ add(s2, s2, s1);
4377     __ subs(len, len, 1);
4378     __ br(Assembler::HS, L_by1_loop);
4379 
4380     __ bind(L_do_mod);
4381     // s1 = s1 % BASE
4382     __ lsr(temp0, s1, 16);
4383     __ lsl(temp1, temp0, 4);
4384     __ sub(temp1, temp1, temp0);
4385     __ add(temp1, temp1, s1, ext::uxth);
4386 
4387     __ lsr(temp0, temp1, 16);
4388     __ lsl(s1, temp0, 4);
4389     __ sub(s1, s1, temp0);
4390     __ add(s1, s1, temp1, ext::uxth);
4391 
4392     __ subs(temp0, s1, base);
4393     __ csel(s1, temp0, s1, Assembler::HS);
4394 
4395     // s2 = s2 % BASE
4396     __ lsr(temp0, s2, 16);
4397     __ lsl(temp1, temp0, 4);
4398     __ sub(temp1, temp1, temp0);
4399     __ add(temp1, temp1, s2, ext::uxth);
4400 
4401     __ lsr(temp0, temp1, 16);
4402     __ lsl(s2, temp0, 4);
4403     __ sub(s2, s2, temp0);
4404     __ add(s2, s2, temp1, ext::uxth);
4405 
4406     __ subs(temp0, s2, base);
4407     __ csel(s2, temp0, s2, Assembler::HS);
4408 
4409     // Combine lower bits and higher bits
4410     __ bind(L_combine);
4411     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4412 
4413     __ ret(lr);
4414 
4415     return start;
4416   }
4417 
4418   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4419           Register temp0, Register temp1, FloatRegister vbytes,
4420           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4421     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4422     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4423     // In non-vectorized code, we update s1 and s2 as:
4424     //   s1 <- s1 + b1
4425     //   s2 <- s2 + s1
4426     //   s1 <- s1 + b2
4427     //   s2 <- s2 + s1
4428     //   ...
4429     //   s1 <- s1 + b16
4430     //   s2 <- s2 + s1
4431     // Putting above assignments together, we have:
4432     //   s1_new = s1 + b1 + b2 + ... + b16
4433     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4434     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4435     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
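    // For reference, the scalar equivalent of one 16-byte step is simply
    // (illustrative only):
    //
    //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
    //
    // which the vector code below evaluates using the closed forms above.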
4436     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4437 
4438     // s2 = s2 + s1 * 16
4439     __ add(s2, s2, s1, Assembler::LSL, 4);
4440 
4441     // vs1acc = b1 + b2 + b3 + ... + b16
4442     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4443     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4444     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4445     __ uaddlv(vs1acc, __ T16B, vbytes);
4446     __ uaddlv(vs2acc, __ T8H, vs2acc);
4447 
4448     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4449     __ fmovd(temp0, vs1acc);
4450     __ fmovd(temp1, vs2acc);
4451     __ add(s1, s1, temp0);
4452     __ add(s2, s2, temp1);
4453   }
4454 
4455   /**
4456    *  Arguments:
4457    *
4458    *  Input:
4459    *    c_rarg0   - x address
4460    *    c_rarg1   - x length
4461    *    c_rarg2   - y address
4462    *    c_rarg3   - y length
4463    *    c_rarg4   - z address
4464    *    c_rarg5   - z length
4465    */
4466   address generate_multiplyToLen() {
4467     __ align(CodeEntryAlignment);
4468     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4469 
4470     address start = __ pc();
4471     const Register x     = r0;
4472     const Register xlen  = r1;
4473     const Register y     = r2;
4474     const Register ylen  = r3;
4475     const Register z     = r4;
4476     const Register zlen  = r5;
4477 
4478     const Register tmp1  = r10;
4479     const Register tmp2  = r11;
4480     const Register tmp3  = r12;
4481     const Register tmp4  = r13;
4482     const Register tmp5  = r14;
4483     const Register tmp6  = r15;
4484     const Register tmp7  = r16;
4485 
4486     BLOCK_COMMENT("Entry:");
4487     __ enter(); // required for proper stackwalking of RuntimeStub frame
4488     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4489     __ leave(); // required for proper stackwalking of RuntimeStub frame
4490     __ ret(lr);
4491 
4492     return start;
4493   }
4494 
4495   address generate_squareToLen() {
4496     // The squareToLen algorithm for sizes 1..127 described in the Java code is
4497     // faster than multiply_to_len on some CPUs and slower on others, but
4498     // multiply_to_len shows slightly better results overall.
4499     __ align(CodeEntryAlignment);
4500     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4501     address start = __ pc();
4502 
4503     const Register x     = r0;
4504     const Register xlen  = r1;
4505     const Register z     = r2;
4506     const Register zlen  = r3;
4507     const Register y     = r4; // == x
4508     const Register ylen  = r5; // == xlen
4509 
4510     const Register tmp1  = r10;
4511     const Register tmp2  = r11;
4512     const Register tmp3  = r12;
4513     const Register tmp4  = r13;
4514     const Register tmp5  = r14;
4515     const Register tmp6  = r15;
4516     const Register tmp7  = r16;
4517 
4518     RegSet spilled_regs = RegSet::of(y, ylen);
4519     BLOCK_COMMENT("Entry:");
4520     __ enter();
4521     __ push(spilled_regs, sp);
4522     __ mov(y, x);
4523     __ mov(ylen, xlen);
4524     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4525     __ pop(spilled_regs, sp);
4526     __ leave();
4527     __ ret(lr);
4528     return start;
4529   }
4530 
4531   address generate_mulAdd() {
4532     __ align(CodeEntryAlignment);
4533     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4534 
4535     address start = __ pc();
4536 
4537     const Register out     = r0;
4538     const Register in      = r1;
4539     const Register offset  = r2;
4540     const Register len     = r3;
4541     const Register k       = r4;
4542 
4543     BLOCK_COMMENT("Entry:");
4544     __ enter();
4545     __ mul_add(out, in, offset, len, k);
4546     __ leave();
4547     __ ret(lr);
4548 
4549     return start;
4550   }
4551 
4552   // Arguments:
4553   //
4554   // Input:
4555   //   c_rarg0   - newArr address
4556   //   c_rarg1   - oldArr address
4557   //   c_rarg2   - newIdx
4558   //   c_rarg3   - shiftCount
4559   //   c_rarg4   - numIter
4560   //
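  // Conceptually the stub computes, for i in [0, numIter), the equivalent of
  // the following Java-style sketch (illustrative only, assuming
  // 0 < shiftCount < 32; it matches the SIMD loop and scalar tail below):
  //
  //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                      | (oldArr[i]     <<  (32 - shiftCount));
  //
  // i.e. a word-level right shift of a BigInteger magnitude array.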
4561   address generate_bigIntegerRightShift() {
4562     __ align(CodeEntryAlignment);
4563     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4564     address start = __ pc();
4565 
4566     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4567 
4568     Register newArr        = c_rarg0;
4569     Register oldArr        = c_rarg1;
4570     Register newIdx        = c_rarg2;
4571     Register shiftCount    = c_rarg3;
4572     Register numIter       = c_rarg4;
4573     Register idx           = numIter;
4574 
4575     Register newArrCur     = rscratch1;
4576     Register shiftRevCount = rscratch2;
4577     Register oldArrCur     = r13;
4578     Register oldArrNext    = r14;
4579 
4580     FloatRegister oldElem0        = v0;
4581     FloatRegister oldElem1        = v1;
4582     FloatRegister newElem         = v2;
4583     FloatRegister shiftVCount     = v3;
4584     FloatRegister shiftVRevCount  = v4;
4585 
4586     __ cbz(idx, Exit);
4587 
4588     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4589 
4590     // left shift count
4591     __ movw(shiftRevCount, 32);
4592     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4593 
4594     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4595     __ cmp(numIter, (u1)4);
4596     __ br(Assembler::LT, ShiftThree);
4597 
4598     __ dup(shiftVCount,    __ T4S, shiftCount);
4599     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4600     __ negr(shiftVCount,   __ T4S, shiftVCount);
4601 
4602     __ BIND(ShiftSIMDLoop);
4603 
4604     // Calculate the load addresses
4605     __ sub(idx, idx, 4);
4606     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4607     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4608     __ add(oldArrCur,  oldArrNext, 4);
4609 
4610     // Load 4 words and process
4611     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4612     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4613     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4614     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4615     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4616     __ st1(newElem,   __ T4S,  Address(newArrCur));
4617 
4618     __ cmp(idx, (u1)4);
4619     __ br(Assembler::LT, ShiftTwoLoop);
4620     __ b(ShiftSIMDLoop);
4621 
4622     __ BIND(ShiftTwoLoop);
4623     __ cbz(idx, Exit);
4624     __ cmp(idx, (u1)1);
4625     __ br(Assembler::EQ, ShiftOne);
4626 
4627     // Calculate the load addresses
4628     __ sub(idx, idx, 2);
4629     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4630     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4631     __ add(oldArrCur,  oldArrNext, 4);
4632 
4633     // Load 2 words and process
4634     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4635     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4636     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4637     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4638     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4639     __ st1(newElem,   __ T2S, Address(newArrCur));
4640     __ b(ShiftTwoLoop);
4641 
4642     __ BIND(ShiftThree);
4643     __ tbz(idx, 1, ShiftOne);
4644     __ tbz(idx, 0, ShiftTwo);
4645     __ ldrw(r10,  Address(oldArr, 12));
4646     __ ldrw(r11,  Address(oldArr, 8));
4647     __ lsrvw(r10, r10, shiftCount);
4648     __ lslvw(r11, r11, shiftRevCount);
4649     __ orrw(r12,  r10, r11);
4650     __ strw(r12,  Address(newArr, 8));
4651 
4652     __ BIND(ShiftTwo);
4653     __ ldrw(r10,  Address(oldArr, 8));
4654     __ ldrw(r11,  Address(oldArr, 4));
4655     __ lsrvw(r10, r10, shiftCount);
4656     __ lslvw(r11, r11, shiftRevCount);
4657     __ orrw(r12,  r10, r11);
4658     __ strw(r12,  Address(newArr, 4));
4659 
4660     __ BIND(ShiftOne);
4661     __ ldrw(r10,  Address(oldArr, 4));
4662     __ ldrw(r11,  Address(oldArr));
4663     __ lsrvw(r10, r10, shiftCount);
4664     __ lslvw(r11, r11, shiftRevCount);
4665     __ orrw(r12,  r10, r11);
4666     __ strw(r12,  Address(newArr));
4667 
4668     __ BIND(Exit);
4669     __ ret(lr);
4670 
4671     return start;
4672   }
4673 
4674   // Arguments:
4675   //
4676   // Input:
4677   //   c_rarg0   - newArr address
4678   //   c_rarg1   - oldArr address
4679   //   c_rarg2   - newIdx
4680   //   c_rarg3   - shiftCount
4681   //   c_rarg4   - numIter
4682   //
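  // Conceptually the stub computes, for i in [0, numIter), the equivalent of
  // the following Java-style sketch (illustrative only, assuming
  // 0 < shiftCount < 32; it matches the SIMD loop and scalar tail below):
  //
  //   newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount));
  //
  // i.e. a word-level left shift of a BigInteger magnitude array.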
4683   address generate_bigIntegerLeftShift() {
4684     __ align(CodeEntryAlignment);
4685     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4686     address start = __ pc();
4687 
4688     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4689 
4690     Register newArr        = c_rarg0;
4691     Register oldArr        = c_rarg1;
4692     Register newIdx        = c_rarg2;
4693     Register shiftCount    = c_rarg3;
4694     Register numIter       = c_rarg4;
4695 
4696     Register shiftRevCount = rscratch1;
4697     Register oldArrNext    = rscratch2;
4698 
4699     FloatRegister oldElem0        = v0;
4700     FloatRegister oldElem1        = v1;
4701     FloatRegister newElem         = v2;
4702     FloatRegister shiftVCount     = v3;
4703     FloatRegister shiftVRevCount  = v4;
4704 
4705     __ cbz(numIter, Exit);
4706 
4707     __ add(oldArrNext, oldArr, 4);
4708     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4709 
4710     // right shift count
4711     __ movw(shiftRevCount, 32);
4712     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4713 
4714     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4715     __ cmp(numIter, (u1)4);
4716     __ br(Assembler::LT, ShiftThree);
4717 
4718     __ dup(shiftVCount,     __ T4S, shiftCount);
4719     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4720     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4721 
4722     __ BIND(ShiftSIMDLoop);
4723 
4724     // load 4 words and process
4725     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4726     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4727     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4728     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4729     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4730     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4731     __ sub(numIter,   numIter, 4);
4732 
4733     __ cmp(numIter, (u1)4);
4734     __ br(Assembler::LT, ShiftTwoLoop);
4735     __ b(ShiftSIMDLoop);
4736 
4737     __ BIND(ShiftTwoLoop);
4738     __ cbz(numIter, Exit);
4739     __ cmp(numIter, (u1)1);
4740     __ br(Assembler::EQ, ShiftOne);
4741 
4742     // load 2 words and process
4743     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4744     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4745     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4746     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4747     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4748     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4749     __ sub(numIter,   numIter, 2);
4750     __ b(ShiftTwoLoop);
4751 
4752     __ BIND(ShiftThree);
4753     __ ldrw(r10,  __ post(oldArr, 4));
4754     __ ldrw(r11,  __ post(oldArrNext, 4));
4755     __ lslvw(r10, r10, shiftCount);
4756     __ lsrvw(r11, r11, shiftRevCount);
4757     __ orrw(r12,  r10, r11);
4758     __ strw(r12,  __ post(newArr, 4));
4759     __ tbz(numIter, 1, Exit);
4760     __ tbz(numIter, 0, ShiftOne);
4761 
4762     __ BIND(ShiftTwo);
4763     __ ldrw(r10,  __ post(oldArr, 4));
4764     __ ldrw(r11,  __ post(oldArrNext, 4));
4765     __ lslvw(r10, r10, shiftCount);
4766     __ lsrvw(r11, r11, shiftRevCount);
4767     __ orrw(r12,  r10, r11);
4768     __ strw(r12,  __ post(newArr, 4));
4769 
4770     __ BIND(ShiftOne);
4771     __ ldrw(r10,  Address(oldArr));
4772     __ ldrw(r11,  Address(oldArrNext));
4773     __ lslvw(r10, r10, shiftCount);
4774     __ lsrvw(r11, r11, shiftRevCount);
4775     __ orrw(r12,  r10, r11);
4776     __ strw(r12,  Address(newArr));
4777 
4778     __ BIND(Exit);
4779     __ ret(lr);
4780 
4781     return start;
4782   }
4783 
4784   address generate_count_positives(address &count_positives_long) {
4785     const u1 large_loop_size = 64;
4786     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4787     int dcache_line = VM_Version::dcache_line_size();
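    // A byte is "negative" exactly when its sign bit (0x80) is set, so OR-ing
    // loaded words together and testing against UPPER_BIT_MASK detects whether
    // any byte in the covered range is negative.  Scalar sketch (illustrative):
    //
    //   bool any_negative(uint64_t w) { return (w & 0x8080808080808080ULL) != 0; }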
4788 
4789     Register ary1 = r1, len = r2, result = r0;
4790 
4791     __ align(CodeEntryAlignment);
4792 
4793     StubCodeMark mark(this, "StubRoutines", "count_positives");
4794 
4795     address entry = __ pc();
4796 
4797     __ enter();
4798     // precondition: a copy of len is already in result
4799     // __ mov(result, len);
4800 
4801   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4802         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4803 
4804   __ cmp(len, (u1)15);
4805   __ br(Assembler::GT, LEN_OVER_15);
4806   // Execution only falls into this code when the pointer is near the end of a
4807   // memory page and we must avoid reading past it into the next page.
4808   __ add(ary1, ary1, len);
4809   __ subs(len, len, 8);
4810   __ br(Assembler::GT, LEN_OVER_8);
4811   __ ldr(rscratch2, Address(ary1, -8));
4812   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4813   __ lsrv(rscratch2, rscratch2, rscratch1);
4814   __ tst(rscratch2, UPPER_BIT_MASK);
4815   __ csel(result, zr, result, Assembler::NE);
4816   __ leave();
4817   __ ret(lr);
4818   __ bind(LEN_OVER_8);
4819   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4820   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
4821   __ tst(rscratch2, UPPER_BIT_MASK);
4822   __ br(Assembler::NE, RET_NO_POP);
4823   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4824   __ lsrv(rscratch1, rscratch1, rscratch2);
4825   __ tst(rscratch1, UPPER_BIT_MASK);
4826   __ bind(RET_NO_POP);
4827   __ csel(result, zr, result, Assembler::NE);
4828   __ leave();
4829   __ ret(lr);
4830 
4831   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4832   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4833 
4834   count_positives_long = __ pc(); // 2nd entry point
4835 
4836   __ enter();
4837 
4838   __ bind(LEN_OVER_15);
4839     __ push(spilled_regs, sp);
4840     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4841     __ cbz(rscratch2, ALIGNED);
4842     __ ldp(tmp6, tmp1, Address(ary1));
4843     __ mov(tmp5, 16);
4844     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4845     __ add(ary1, ary1, rscratch1);
4846     __ orr(tmp6, tmp6, tmp1);
4847     __ tst(tmp6, UPPER_BIT_MASK);
4848     __ br(Assembler::NE, RET_ADJUST);
4849     __ sub(len, len, rscratch1);
4850 
4851   __ bind(ALIGNED);
4852     __ cmp(len, large_loop_size);
4853     __ br(Assembler::LT, CHECK_16);
4854     // Perform a 16-byte load as an early-return check in the pre-loop to handle
4855     // the case where an initially aligned large array has negative values in its
4856     // starting bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1
4857     // in the worst case, which is slower. Cases with negative bytes further ahead
4858     // are not affected much; in fact they get faster thanks to the early loads and
4859     // the fewer instructions and branches in LARGE_LOOP.
4860     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4861     __ sub(len, len, 16);
4862     __ orr(tmp6, tmp6, tmp1);
4863     __ tst(tmp6, UPPER_BIT_MASK);
4864     __ br(Assembler::NE, RET_ADJUST_16);
4865     __ cmp(len, large_loop_size);
4866     __ br(Assembler::LT, CHECK_16);
4867 
4868     if (SoftwarePrefetchHintDistance >= 0
4869         && SoftwarePrefetchHintDistance >= dcache_line) {
4870       // initial prefetch
4871       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4872     }
4873   __ bind(LARGE_LOOP);
4874     if (SoftwarePrefetchHintDistance >= 0) {
4875       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4876     }
4877     // Issue the load instructions first, since that can save a few CPU/memory
4878     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per
4879     // ldp) it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
4880     // saves 3 instructions per iteration and has fewer branches. The drawback is that
4881     // early return is disabled, so all 64 bytes are loaded and checked every time.
4882     __ ldp(tmp2, tmp3, Address(ary1));
4883     __ ldp(tmp4, tmp5, Address(ary1, 16));
4884     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4885     __ ldp(tmp6, tmp1, Address(ary1, 48));
4886     __ add(ary1, ary1, large_loop_size);
4887     __ sub(len, len, large_loop_size);
4888     __ orr(tmp2, tmp2, tmp3);
4889     __ orr(tmp4, tmp4, tmp5);
4890     __ orr(rscratch1, rscratch1, rscratch2);
4891     __ orr(tmp6, tmp6, tmp1);
4892     __ orr(tmp2, tmp2, tmp4);
4893     __ orr(rscratch1, rscratch1, tmp6);
4894     __ orr(tmp2, tmp2, rscratch1);
4895     __ tst(tmp2, UPPER_BIT_MASK);
4896     __ br(Assembler::NE, RET_ADJUST_LONG);
4897     __ cmp(len, large_loop_size);
4898     __ br(Assembler::GE, LARGE_LOOP);
4899 
4900   __ bind(CHECK_16); // small 16-byte load pre-loop
4901     __ cmp(len, (u1)16);
4902     __ br(Assembler::LT, POST_LOOP16);
4903 
4904   __ bind(LOOP16); // small 16-byte load loop
4905     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4906     __ sub(len, len, 16);
4907     __ orr(tmp2, tmp2, tmp3);
4908     __ tst(tmp2, UPPER_BIT_MASK);
4909     __ br(Assembler::NE, RET_ADJUST_16);
4910     __ cmp(len, (u1)16);
4911     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4912 
4913   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4914     __ cmp(len, (u1)8);
4915     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4916     __ ldr(tmp3, Address(__ post(ary1, 8)));
4917     __ tst(tmp3, UPPER_BIT_MASK);
4918     __ br(Assembler::NE, RET_ADJUST);
4919     __ sub(len, len, 8);
4920 
4921   __ bind(POST_LOOP16_LOAD_TAIL);
4922     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4923     __ ldr(tmp1, Address(ary1));
4924     __ mov(tmp2, 64);
4925     __ sub(tmp4, tmp2, len, __ LSL, 3);
4926     __ lslv(tmp1, tmp1, tmp4);
4927     __ tst(tmp1, UPPER_BIT_MASK);
4928     __ br(Assembler::NE, RET_ADJUST);
4929     // Fallthrough
4930 
4931   __ bind(RET_LEN);
4932     __ pop(spilled_regs, sp);
4933     __ leave();
4934     __ ret(lr);
4935 
4936     // The difference (result - len) is the count of bytes guaranteed
4937     // to be positive.
4938 
4939   __ bind(RET_ADJUST_LONG);
4940     __ add(len, len, (u1)(large_loop_size - 16));
4941   __ bind(RET_ADJUST_16);
4942     __ add(len, len, 16);
4943   __ bind(RET_ADJUST);
4944     __ pop(spilled_regs, sp);
4945     __ leave();
4946     __ sub(result, result, len);
4947     __ ret(lr);
4948 
4949     return entry;
4950   }
4951 
4952   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4953         bool usePrefetch, Label &NOT_EQUAL) {
4954     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4955         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4956         tmp7 = r12, tmp8 = r13;
4957     Label LOOP;
4958 
4959     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4960     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4961     __ bind(LOOP);
4962     if (usePrefetch) {
4963       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4964       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4965     }
4966     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4967     __ eor(tmp1, tmp1, tmp2);
4968     __ eor(tmp3, tmp3, tmp4);
4969     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4970     __ orr(tmp1, tmp1, tmp3);
4971     __ cbnz(tmp1, NOT_EQUAL);
4972     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4973     __ eor(tmp5, tmp5, tmp6);
4974     __ eor(tmp7, tmp7, tmp8);
4975     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4976     __ orr(tmp5, tmp5, tmp7);
4977     __ cbnz(tmp5, NOT_EQUAL);
4978     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4979     __ eor(tmp1, tmp1, tmp2);
4980     __ eor(tmp3, tmp3, tmp4);
4981     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4982     __ orr(tmp1, tmp1, tmp3);
4983     __ cbnz(tmp1, NOT_EQUAL);
4984     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4985     __ eor(tmp5, tmp5, tmp6);
4986     __ sub(cnt1, cnt1, 8 * wordSize);
4987     __ eor(tmp7, tmp7, tmp8);
4988     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4989     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4990     // cmp) because subs allows an unlimited range of immediate operands.
4991     __ subs(tmp6, cnt1, loopThreshold);
4992     __ orr(tmp5, tmp5, tmp7);
4993     __ cbnz(tmp5, NOT_EQUAL);
4994     __ br(__ GE, LOOP);
4995     // post-loop
4996     __ eor(tmp1, tmp1, tmp2);
4997     __ eor(tmp3, tmp3, tmp4);
4998     __ orr(tmp1, tmp1, tmp3);
4999     __ sub(cnt1, cnt1, 2 * wordSize);
5000     __ cbnz(tmp1, NOT_EQUAL);
5001   }
5002 
5003   void generate_large_array_equals_loop_simd(int loopThreshold,
5004         bool usePrefetch, Label &NOT_EQUAL) {
5005     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5006         tmp2 = rscratch2;
5007     Label LOOP;
5008 
5009     __ bind(LOOP);
5010     if (usePrefetch) {
5011       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5012       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5013     }
5014     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5015     __ sub(cnt1, cnt1, 8 * wordSize);
5016     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5017     __ subs(tmp1, cnt1, loopThreshold);
5018     __ eor(v0, __ T16B, v0, v4);
5019     __ eor(v1, __ T16B, v1, v5);
5020     __ eor(v2, __ T16B, v2, v6);
5021     __ eor(v3, __ T16B, v3, v7);
5022     __ orr(v0, __ T16B, v0, v1);
5023     __ orr(v1, __ T16B, v2, v3);
5024     __ orr(v0, __ T16B, v0, v1);
5025     __ umov(tmp1, v0, __ D, 0);
5026     __ umov(tmp2, v0, __ D, 1);
5027     __ orr(tmp1, tmp1, tmp2);
5028     __ cbnz(tmp1, NOT_EQUAL);
5029     __ br(__ GE, LOOP);
5030   }
5031 
5032   // a1 = r1 - array1 address
5033   // a2 = r2 - array2 address
5034   // result = r0 - return value. Already contains "false"
5035   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5036   // r3-r5 are reserved temporary registers
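  // The loops below rely on the fact that two arrays are equal iff the bitwise
  // OR of (a1[i] ^ a2[i]) over all words is zero; the stub evaluates that
  // reduction several words (or a full 64-byte block) at a time, with optional
  // software prefetch.  Scalar sketch (illustrative only):
  //
  //   uint64_t diff = 0;
  //   for (size_t i = 0; i < nwords; i++) diff |= a1[i] ^ a2[i];
  //   return diff == 0;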
5037   address generate_large_array_equals() {
5038     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5039         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5040         tmp7 = r12, tmp8 = r13;
5041     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5042         SMALL_LOOP, POST_LOOP;
5043     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5044     // threshold chosen so that at least 32 of the prefetched bytes are actually used
5045     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5046     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5047     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5048     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5049         tmp5, tmp6, tmp7, tmp8);
5050 
5051     __ align(CodeEntryAlignment);
5052 
5053     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5054 
5055     address entry = __ pc();
5056     __ enter();
5057     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5058     // also advance pointers to use post-increment instead of pre-increment
5059     __ add(a1, a1, wordSize);
5060     __ add(a2, a2, wordSize);
5061     if (AvoidUnalignedAccesses) {
5062       // Both implementations (SIMD and non-SIMD) use relatively large load
5063       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5064       // time) on some CPUs when the address is not at least 16-byte aligned.
5065       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
5066       // load on the first address to make it 16-byte aligned.
5067       Label ALIGNED16;
5068       __ tbz(a1, 3, ALIGNED16);
5069       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5070       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5071       __ sub(cnt1, cnt1, wordSize);
5072       __ eor(tmp1, tmp1, tmp2);
5073       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5074       __ bind(ALIGNED16);
5075     }
5076     if (UseSIMDForArrayEquals) {
5077       if (SoftwarePrefetchHintDistance >= 0) {
5078         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5079         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5080         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5081             /* prfm = */ true, NOT_EQUAL);
5082         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5083         __ br(__ LT, TAIL);
5084       }
5085       __ bind(NO_PREFETCH_LARGE_LOOP);
5086       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5087           /* prfm = */ false, NOT_EQUAL);
5088     } else {
5089       __ push(spilled_regs, sp);
5090       if (SoftwarePrefetchHintDistance >= 0) {
5091         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5092         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5093         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5094             /* prfm = */ true, NOT_EQUAL);
5095         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5096         __ br(__ LT, TAIL);
5097       }
5098       __ bind(NO_PREFETCH_LARGE_LOOP);
5099       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5100           /* prfm = */ false, NOT_EQUAL);
5101     }
5102     __ bind(TAIL);
5103       __ cbz(cnt1, EQUAL);
5104       __ subs(cnt1, cnt1, wordSize);
5105       __ br(__ LE, POST_LOOP);
5106     __ bind(SMALL_LOOP);
5107       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5108       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5109       __ subs(cnt1, cnt1, wordSize);
5110       __ eor(tmp1, tmp1, tmp2);
5111       __ cbnz(tmp1, NOT_EQUAL);
5112       __ br(__ GT, SMALL_LOOP);
5113     __ bind(POST_LOOP);
5114       __ ldr(tmp1, Address(a1, cnt1));
5115       __ ldr(tmp2, Address(a2, cnt1));
5116       __ eor(tmp1, tmp1, tmp2);
5117       __ cbnz(tmp1, NOT_EQUAL);
5118     __ bind(EQUAL);
5119       __ mov(result, true);
5120     __ bind(NOT_EQUAL);
5121       if (!UseSIMDForArrayEquals) {
5122         __ pop(spilled_regs, sp);
5123       }
5124     __ bind(NOT_EQUAL_NO_POP);
5125     __ leave();
5126     __ ret(lr);
5127     return entry;
5128   }
5129 
5130   address generate_dsin_dcos(bool isCos) {
5131     __ align(CodeEntryAlignment);
5132     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5133     address start = __ pc();
5134     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5135         (address)StubRoutines::aarch64::_two_over_pi,
5136         (address)StubRoutines::aarch64::_pio2,
5137         (address)StubRoutines::aarch64::_dsin_coef,
5138         (address)StubRoutines::aarch64::_dcos_coef);
5139     return start;
5140   }
5141 
5142   address generate_dlog() {
5143     __ align(CodeEntryAlignment);
5144     StubCodeMark mark(this, "StubRoutines", "dlog");
5145     address entry = __ pc();
5146     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5147         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5148     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5149     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5150         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5151     return entry;
5152   }
5153 
5154 
5155   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
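  // The Latin1 bytes are widened to UTF-16 by interleaving them with an
  // all-zero register: zip1/zip2 turn each byte into a zero-extended
  // little-endian halfword (e.g. 'A' = 0x41 becomes 0x0041), after which the
  // two strings can be compared 8 bytes (4 characters) at a time.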
5156   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5157       Label &DIFF2) {
5158     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5159     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5160 
5161     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5162     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5163     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5164     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5165 
5166     __ fmovd(tmpL, vtmp3);
5167     __ eor(rscratch2, tmp3, tmpL);
5168     __ cbnz(rscratch2, DIFF2);
5169 
5170     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5171     __ umov(tmpL, vtmp3, __ D, 1);
5172     __ eor(rscratch2, tmpU, tmpL);
5173     __ cbnz(rscratch2, DIFF1);
5174 
5175     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5176     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5177     __ fmovd(tmpL, vtmp);
5178     __ eor(rscratch2, tmp3, tmpL);
5179     __ cbnz(rscratch2, DIFF2);
5180 
5181     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5182     __ umov(tmpL, vtmp, __ D, 1);
5183     __ eor(rscratch2, tmpU, tmpL);
5184     __ cbnz(rscratch2, DIFF1);
5185   }
5186 
5187   // r0  = result
5188   // r1  = str1
5189   // r2  = cnt1
5190   // r3  = str2
5191   // r4  = cnt2
5192   // r10 = tmp1
5193   // r11 = tmp2
5194   address generate_compare_long_string_different_encoding(bool isLU) {
5195     __ align(CodeEntryAlignment);
5196     StubCodeMark mark(this, "StubRoutines", isLU
5197         ? "compare_long_string_different_encoding LU"
5198         : "compare_long_string_different_encoding UL");
5199     address entry = __ pc();
5200     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5201         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5202         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5203     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5204         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5205     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5206     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5207 
5208     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5209 
5210     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5211     // cnt2 == number of characters left to compare
5212     // Check the already-loaded first 4 characters (vtmp and tmp2 (LU) / tmp1 (UL))
5213     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5214     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5215     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5216     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5217     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5218     __ eor(rscratch2, tmp1, tmp2);
5219     __ mov(rscratch1, tmp2);
5220     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5221     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5222              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5223     __ push(spilled_regs, sp);
5224     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5225     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5226 
5227     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5228 
5229     if (SoftwarePrefetchHintDistance >= 0) {
5230       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5231       __ br(__ LT, NO_PREFETCH);
5232       __ bind(LARGE_LOOP_PREFETCH);
5233         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5234         __ mov(tmp4, 2);
5235         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5236         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5237           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5238           __ subs(tmp4, tmp4, 1);
5239           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5240           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5241           __ mov(tmp4, 2);
5242         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5243           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5244           __ subs(tmp4, tmp4, 1);
5245           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5246           __ sub(cnt2, cnt2, 64);
5247           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5248           __ br(__ GE, LARGE_LOOP_PREFETCH);
5249     }
5250     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5251     __ bind(NO_PREFETCH);
5252     __ subs(cnt2, cnt2, 16);
5253     __ br(__ LT, TAIL);
5254     __ align(OptoLoopAlignment);
5255     __ bind(SMALL_LOOP); // smaller loop
5256       __ subs(cnt2, cnt2, 16);
5257       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5258       __ br(__ GE, SMALL_LOOP);
5259       __ cmn(cnt2, (u1)16);
5260       __ br(__ EQ, LOAD_LAST);
5261     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5262       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5263       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5264       __ ldr(tmp3, Address(cnt1, -8));
5265       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5266       __ b(LOAD_LAST);
5267     __ bind(DIFF2);
5268       __ mov(tmpU, tmp3);
5269     __ bind(DIFF1);
5270       __ pop(spilled_regs, sp);
5271       __ b(CALCULATE_DIFFERENCE);
5272     __ bind(LOAD_LAST);
5273       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
5274       // compare_string_16_x_LU, so there is no need to load them again.
5275       __ mov(tmpU, tmp3);
5276       __ pop(spilled_regs, sp);
5277 
5278       // tmp2 points to the address of the last 4 Latin1 characters right now
5279       __ ldrs(vtmp, Address(tmp2));
5280       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5281       __ fmovd(tmpL, vtmp);
5282 
5283       __ eor(rscratch2, tmpU, tmpL);
5284       __ cbz(rscratch2, DONE);
5285 
5286     // Find the first different characters in the longwords and
5287     // compute their difference.
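    // Technique: rscratch2 holds the XOR of the two 8-byte chunks being
    // compared.  rev reverses the byte order so that clz counts from the low
    // (first-in-memory) end of the little-endian string data; andr(-16) rounds
    // the bit index down to a 16-bit character boundary, and lsrv/uxthw then
    // extract the first differing UTF-16 character from each side.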
5288     __ bind(CALCULATE_DIFFERENCE);
5289       __ rev(rscratch2, rscratch2);
5290       __ clz(rscratch2, rscratch2);
5291       __ andr(rscratch2, rscratch2, -16);
5292       __ lsrv(tmp1, tmp1, rscratch2);
5293       __ uxthw(tmp1, tmp1);
5294       __ lsrv(rscratch1, rscratch1, rscratch2);
5295       __ uxthw(rscratch1, rscratch1);
5296       __ subw(result, tmp1, rscratch1);
5297     __ bind(DONE);
5298       __ ret(lr);
5299     return entry;
5300   }
5301 
5302   address generate_method_entry_barrier() {
5303     __ align(CodeEntryAlignment);
5304     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5305 
5306     Label deoptimize_label;
5307 
5308     address start = __ pc();
5309 
5310     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5311 
5312     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5313       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5314       // We can get here despite the nmethod being good, if we have not
5315       // yet applied our cross modification fence (or data fence).
5316       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5317       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5318       __ ldrw(rscratch2, rscratch2);
5319       __ strw(rscratch2, thread_epoch_addr);
5320       __ isb();
5321       __ membar(__ LoadLoad);
5322     }
5323 
5324     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5325 
5326     __ enter();
5327     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5328 
5329     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5330 
5331     __ push_call_clobbered_registers();
5332 
5333     __ mov(c_rarg0, rscratch2);
5334     __ call_VM_leaf
5335          (CAST_FROM_FN_PTR
5336           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5337 
5338     __ reset_last_Java_frame(true);
5339 
5340     __ mov(rscratch1, r0);
5341 
5342     __ pop_call_clobbered_registers();
5343 
5344     __ cbnz(rscratch1, deoptimize_label);
5345 
5346     __ leave();
5347     __ ret(lr);
5348 
5349     __ BIND(deoptimize_label);
5350 
5351     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5352     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5353 
5354     __ mov(sp, rscratch1);
5355     __ br(rscratch2);
5356 
5357     return start;
5358   }
5359 
5360   address generate_check_lock_stack() {
5361     __ align(CodeEntryAlignment);
5362     StubCodeMark mark(this, "StubRoutines", "check_lock_stack");
5363 
5364     address start = __ pc();
5365 
5366     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5367     __ enter();
5368     __ push_call_clobbered_registers();
5369 
5370     __ mov(c_rarg0, r9);
5371     __ call_VM_leaf(CAST_FROM_FN_PTR(address, LockStack::ensure_lock_stack_size), 1);
5372 
5373 
5374     __ pop_call_clobbered_registers();
5375     __ leave();
5376     __ reset_last_Java_frame(true);
5377 
5378     __ ret(lr);
5379 
5380     return start;
5381   }
5382 
5383   // r0  = result
5384   // r1  = str1
5385   // r2  = cnt1
5386   // r3  = str2
5387   // r4  = cnt2
5388   // r10 = tmp1
5389   // r11 = tmp2
5390   address generate_compare_long_string_same_encoding(bool isLL) {
5391     __ align(CodeEntryAlignment);
5392     StubCodeMark mark(this, "StubRoutines", isLL
5393         ? "compare_long_string_same_encoding LL"
5394         : "compare_long_string_same_encoding UU");
5395     address entry = __ pc();
5396     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5397         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5398 
5399     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5400 
5401     // Exit the large loop when fewer than 64 bytes are left to read or we are
5402     // about to prefetch memory beyond the array boundary.
5403     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5404 
5405     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly.
5406     __ eor(rscratch2, tmp1, tmp2);
5407     __ cbnz(rscratch2, CAL_DIFFERENCE);
5408 
5409     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5410     // update pointers, because of previous read
5411     __ add(str1, str1, wordSize);
5412     __ add(str2, str2, wordSize);
5413     if (SoftwarePrefetchHintDistance >= 0) {
5414       __ align(OptoLoopAlignment);
5415       __ bind(LARGE_LOOP_PREFETCH);
5416         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5417         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5418 
5419         for (int i = 0; i < 4; i++) {
5420           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5421           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5422           __ cmp(tmp1, tmp2);
5423           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5424           __ br(Assembler::NE, DIFF);
5425         }
5426         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5427         __ add(str1, str1, 64);
5428         __ add(str2, str2, 64);
5429         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5430         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5431         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5432     }
5433 
5434     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5435     __ br(Assembler::LE, LESS16);
5436     __ align(OptoLoopAlignment);
5437     __ bind(LOOP_COMPARE16);
5438       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5439       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5440       __ cmp(tmp1, tmp2);
5441       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5442       __ br(Assembler::NE, DIFF);
5443       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5444       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5445       __ br(Assembler::LT, LESS16);
5446 
5447       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5448       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5449       __ cmp(tmp1, tmp2);
5450       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5451       __ br(Assembler::NE, DIFF);
5452       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5453       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5454       __ br(Assembler::GE, LOOP_COMPARE16);
5455       __ cbz(cnt2, LENGTH_DIFF);
5456 
5457     __ bind(LESS16);
5458       // each 8 compare
5459       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5460       __ br(Assembler::LE, LESS8);
5461       __ ldr(tmp1, Address(__ post(str1, 8)));
5462       __ ldr(tmp2, Address(__ post(str2, 8)));
5463       __ eor(rscratch2, tmp1, tmp2);
5464       __ cbnz(rscratch2, CAL_DIFFERENCE);
5465       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5466 
5467     __ bind(LESS8); // directly load last 8 bytes
5468       if (!isLL) {
5469         __ add(cnt2, cnt2, cnt2);
5470       }
5471       __ ldr(tmp1, Address(str1, cnt2));
5472       __ ldr(tmp2, Address(str2, cnt2));
5473       __ eor(rscratch2, tmp1, tmp2);
5474       __ cbz(rscratch2, LENGTH_DIFF);
5475       __ b(CAL_DIFFERENCE);
5476 
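    // At DIFF the ccmp above only tells us that one of the two 8-byte halves
    // differed, not which one.  The cmp/csel pair below keeps the low half if
    // it is the one that differs and otherwise falls back to the high half, so
    // CAL_DIFFERENCE only ever has to examine a single 8-byte pair.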
5477     __ bind(DIFF);
5478       __ cmp(tmp1, tmp2);
5479       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5480       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5481       // reuse rscratch2 register for the result of eor instruction
5482       __ eor(rscratch2, tmp1, tmp2);
5483 
5484     __ bind(CAL_DIFFERENCE);
5485       __ rev(rscratch2, rscratch2);
5486       __ clz(rscratch2, rscratch2);
5487       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5488       __ lsrv(tmp1, tmp1, rscratch2);
5489       __ lsrv(tmp2, tmp2, rscratch2);
5490       if (isLL) {
5491         __ uxtbw(tmp1, tmp1);
5492         __ uxtbw(tmp2, tmp2);
5493       } else {
5494         __ uxthw(tmp1, tmp1);
5495         __ uxthw(tmp2, tmp2);
5496       }
5497       __ subw(result, tmp1, tmp2);
5498 
5499     __ bind(LENGTH_DIFF);
5500       __ ret(lr);
5501     return entry;
5502   }
5503 
5504   enum string_compare_mode {
5505     LL,
5506     LU,
5507     UL,
5508     UU,
5509   };
5510 
5511   // The following registers are declared in aarch64.ad
5512   // r0  = result
5513   // r1  = str1
5514   // r2  = cnt1
5515   // r3  = str2
5516   // r4  = cnt2
5517   // r10 = tmp1
5518   // r11 = tmp2
5519   // z0  = ztmp1
5520   // z1  = ztmp2
5521   // p0  = pgtmp1
5522   // p1  = pgtmp2
5523   address generate_compare_long_string_sve(string_compare_mode mode) {
5524     __ align(CodeEntryAlignment);
5525     address entry = __ pc();
5526     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5527              tmp1 = r10, tmp2 = r11;
5528 
5529     Label LOOP, DONE, MISMATCH;
5530     Register vec_len = tmp1;
5531     Register idx = tmp2;
5532     // The minimum of the string lengths has been stored in cnt2.
5533     Register cnt = cnt2;
5534     FloatRegister ztmp1 = z0, ztmp2 = z1;
5535     PRegister pgtmp1 = p0, pgtmp2 = p1;
5536 
5537 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5538     switch (mode) {                                                            \
5539       case LL:                                                                 \
5540         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5541         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5542         break;                                                                 \
5543       case LU:                                                                 \
5544         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5545         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5546         break;                                                                 \
5547       case UL:                                                                 \
5548         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5549         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5550         break;                                                                 \
5551       case UU:                                                                 \
5552         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5553         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5554         break;                                                                 \
5555       default:                                                                 \
5556         ShouldNotReachHere();                                                  \
5557     }
5558 
5559     const char* stubname;
5560     switch (mode) {
5561       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5562       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5563       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5564       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5565       default: ShouldNotReachHere();
5566     }
5567 
5568     StubCodeMark mark(this, "StubRoutines", stubname);
5569 
5570     __ mov(idx, 0);
5571     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5572 
5573     if (mode == LL) {
5574       __ sve_cntb(vec_len);
5575     } else {
5576       __ sve_cnth(vec_len);
5577     }
5578 
5579     __ sub(rscratch1, cnt, vec_len);
5580 
5581     __ bind(LOOP);
5582 
5583       // main loop
5584       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5585       __ add(idx, idx, vec_len);
5586       // Compare strings.
5587       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5588       __ br(__ NE, MISMATCH);
5589       __ cmp(idx, rscratch1);
5590       __ br(__ LT, LOOP);
5591 
5592     // post loop, last iteration
5593     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5594 
5595     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5596     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5597     __ br(__ EQ, DONE);
5598 
5599     __ bind(MISMATCH);
5600 
5601     // Truncate the predicate to the elements preceding the first mismatch.
5602     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5603     // Extract the first different characters of each string.
5604     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5605     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5606 
5607     // Compute the difference of the first different characters.
5608     __ sub(result, rscratch1, rscratch2);
5609 
5610     __ bind(DONE);
5611     __ ret(lr);
5612 #undef LOAD_PAIR
5613     return entry;
5614   }
5615 
5616   void generate_compare_long_strings() {
5617     if (UseSVE == 0) {
5618       StubRoutines::aarch64::_compare_long_string_LL
5619           = generate_compare_long_string_same_encoding(true);
5620       StubRoutines::aarch64::_compare_long_string_UU
5621           = generate_compare_long_string_same_encoding(false);
5622       StubRoutines::aarch64::_compare_long_string_LU
5623           = generate_compare_long_string_different_encoding(true);
5624       StubRoutines::aarch64::_compare_long_string_UL
5625           = generate_compare_long_string_different_encoding(false);
5626     } else {
5627       StubRoutines::aarch64::_compare_long_string_LL
5628           = generate_compare_long_string_sve(LL);
5629       StubRoutines::aarch64::_compare_long_string_UU
5630           = generate_compare_long_string_sve(UU);
5631       StubRoutines::aarch64::_compare_long_string_LU
5632           = generate_compare_long_string_sve(LU);
5633       StubRoutines::aarch64::_compare_long_string_UL
5634           = generate_compare_long_string_sve(UL);
5635     }
5636   }
5637 
5638   // R0 = result
5639   // R1 = str2
5640   // R2 = cnt1
5641   // R3 = str1
5642   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (this helps on systems
  // with 1 ld pipeline)
  // 2) we can use a "fast" algorithm for finding the first character of the
  // pattern with fewer branches (1 branch per loaded register instead of a
  // branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (a worked example follows this comment)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a "simpler-but-slower" implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
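  // A short worked example of the zero-byte trick behind idea 2) above
  // (byte-per-character case; the halfword case is analogous, using
  // 0x0001...0001 and 0x7fff...7fff):
  //   first = first_pattern_char * 0x0101010101010101   // replicated char
  //   ch2   = first ^ loaded_str2_word   // a byte is 0 iff str2 byte matches
  //   tmp2  = ch2 - 0x0101010101010101
  //   tmp2 & ~(ch2 | 0x7f7f7f7f7f7f7f7f) != 0
  // holds iff some byte of ch2 is zero, i.e. the first pattern character
  // occurs somewhere in the loaded word. The code below computes this with
  // SUB, ORR and BICS (BICS also sets the flags for the following branch).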
5657   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5658     const char* stubName = str1_isL
5659         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5660         : "indexof_linear_uu";
5661     __ align(CodeEntryAlignment);
5662     StubCodeMark mark(this, "StubRoutines", stubName);
5663     address entry = __ pc();
5664 
5665     int str1_chr_size = str1_isL ? 1 : 2;
5666     int str2_chr_size = str2_isL ? 1 : 2;
5667     int str1_chr_shift = str1_isL ? 0 : 1;
5668     int str2_chr_shift = str2_isL ? 0 : 1;
5669     bool isL = str1_isL && str2_isL;
    // parameters
5671     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5672     // temporary registers
5673     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5674     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5675     // redefinitions
5676     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5677 
5678     __ push(spilled_regs, sp);
5679     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5680         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5681         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5682         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5683         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5684         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. This is safe because length >= 8 here.
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2. This is safe because length >= 8 here.
    __ ldr(ch2, Address(str2));
5689     __ sub(cnt2, cnt2, cnt1);
5690     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5691     if (str1_isL != str2_isL) {
5692       __ eor(v0, __ T16B, v0, v0);
5693     }
5694     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5695     __ mul(first, first, tmp1);
5696     // check if we have less than 1 register to check
5697     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5698     if (str1_isL != str2_isL) {
5699       __ fmovd(v1, ch1);
5700     }
5701     __ br(__ LE, L_SMALL);
5702     __ eor(ch2, first, ch2);
5703     if (str1_isL != str2_isL) {
5704       __ zip1(v1, __ T16B, v1, v0);
5705     }
5706     __ sub(tmp2, ch2, tmp1);
5707     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5708     __ bics(tmp2, tmp2, ch2);
5709     if (str1_isL != str2_isL) {
5710       __ fmovd(ch1, v1);
5711     }
5712     __ br(__ NE, L_HAS_ZERO);
5713     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5714     __ add(result, result, wordSize/str2_chr_size);
5715     __ add(str2, str2, wordSize);
5716     __ br(__ LT, L_POST_LOOP);
5717     __ BIND(L_LOOP);
5718       __ ldr(ch2, Address(str2));
5719       __ eor(ch2, first, ch2);
5720       __ sub(tmp2, ch2, tmp1);
5721       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5722       __ bics(tmp2, tmp2, ch2);
5723       __ br(__ NE, L_HAS_ZERO);
5724     __ BIND(L_LOOP_PROCEED);
5725       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5726       __ add(str2, str2, wordSize);
5727       __ add(result, result, wordSize/str2_chr_size);
5728       __ br(__ GE, L_LOOP);
5729     __ BIND(L_POST_LOOP);
5730       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5731       __ br(__ LE, NOMATCH);
5732       __ ldr(ch2, Address(str2));
5733       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5734       __ eor(ch2, first, ch2);
5735       __ sub(tmp2, ch2, tmp1);
5736       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5737       __ mov(tmp4, -1); // all bits set
5738       __ b(L_SMALL_PROCEED);
5739     __ align(OptoLoopAlignment);
5740     __ BIND(L_SMALL);
5741       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5742       __ eor(ch2, first, ch2);
5743       if (str1_isL != str2_isL) {
5744         __ zip1(v1, __ T16B, v1, v0);
5745       }
5746       __ sub(tmp2, ch2, tmp1);
5747       __ mov(tmp4, -1); // all bits set
5748       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5749       if (str1_isL != str2_isL) {
5750         __ fmovd(ch1, v1); // move converted 4 symbols
5751       }
5752     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zero out the useless (out-of-range) bits
5754       __ bic(tmp2, tmp2, ch2);
5755       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5756       __ rbit(tmp2, tmp2);
5757       __ br(__ EQ, NOMATCH);
5758     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5760       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5761       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5762       if (str2_isL) { // LL
5763         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5764         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5765         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5766         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5767         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5768       } else {
        __ mov(ch2, 0xE); // 0b1110: mask used to round the byte offset down to a halfword boundary
5770         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5771         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5772         __ lslv(tmp2, tmp2, tmp4);
5773         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5774         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5775         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5776         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5777       }
5778       __ cmp(ch1, ch2);
5779       __ mov(tmp4, wordSize/str2_chr_size);
5780       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5781     __ BIND(L_SMALL_CMP_LOOP);
5782       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5783                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5784       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5785                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5786       __ add(tmp4, tmp4, 1);
5787       __ cmp(tmp4, cnt1);
5788       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5789       __ cmp(first, ch2);
5790       __ br(__ EQ, L_SMALL_CMP_LOOP);
5791     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5792       __ cbz(tmp2, NOMATCH); // no more matches. exit
5793       __ clz(tmp4, tmp2);
5794       __ add(result, result, 1); // advance index
5795       __ add(str2, str2, str2_chr_size); // advance pointer
5796       __ b(L_SMALL_HAS_ZERO_LOOP);
5797     __ align(OptoLoopAlignment);
5798     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5799       __ cmp(first, ch2);
5800       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5801       __ b(DONE);
5802     __ align(OptoLoopAlignment);
5803     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5804       if (str2_isL) { // LL
5805         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5806         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5807         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5808         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5809         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5810       } else {
        __ mov(ch2, 0xE); // 0b1110: mask used to round the byte offset down to a halfword boundary
5812         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5813         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5814         __ lslv(tmp2, tmp2, tmp4);
5815         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5816         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5817         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5818         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5819       }
5820       __ cmp(ch1, ch2);
5821       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5822       __ b(DONE);
5823     __ align(OptoLoopAlignment);
5824     __ BIND(L_HAS_ZERO);
5825       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; we just restore them on exit, so cnt1 can be re-used here.
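      // After the ORR below the packed layout is (as a sketch):
      //   cnt2 = [ cnt1 (bits 63..32) | original cnt2 (bits 31..0) ]
      // cnt1 is recovered later with LSR, and cnt2 with a 32-bit register
      // move that drops the upper half (see L_HAS_ZERO_LOOP_NOMATCH).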
5830       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5831       __ sub(result, result, 1);
5832     __ BIND(L_HAS_ZERO_LOOP);
5833       __ mov(cnt1, wordSize/str2_chr_size);
5834       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5835       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5836       if (str2_isL) {
5837         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5838         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5839         __ lslv(tmp2, tmp2, tmp4);
5840         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5841         __ add(tmp4, tmp4, 1);
5842         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5843         __ lsl(tmp2, tmp2, 1);
5844         __ mov(tmp4, wordSize/str2_chr_size);
5845       } else {
5846         __ mov(ch2, 0xE);
5847         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5848         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5849         __ lslv(tmp2, tmp2, tmp4);
5850         __ add(tmp4, tmp4, 1);
5851         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5852         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5853         __ lsl(tmp2, tmp2, 1);
5854         __ mov(tmp4, wordSize/str2_chr_size);
5855         __ sub(str2, str2, str2_chr_size);
5856       }
5857       __ cmp(ch1, ch2);
5858       __ mov(tmp4, wordSize/str2_chr_size);
5859       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5860     __ BIND(L_CMP_LOOP);
5861       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5862                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5863       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5864                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5865       __ add(tmp4, tmp4, 1);
5866       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5867       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5868       __ cmp(cnt1, ch2);
5869       __ br(__ EQ, L_CMP_LOOP);
5870     __ BIND(L_CMP_LOOP_NOMATCH);
      // the characters did not match here
5872       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5873       __ clz(tmp4, tmp2);
5874       __ add(str2, str2, str2_chr_size); // advance pointer
5875       __ b(L_HAS_ZERO_LOOP);
5876     __ align(OptoLoopAlignment);
5877     __ BIND(L_CMP_LOOP_LAST_CMP);
5878       __ cmp(cnt1, ch2);
5879       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5880       __ b(DONE);
5881     __ align(OptoLoopAlignment);
5882     __ BIND(L_CMP_LOOP_LAST_CMP2);
5883       if (str2_isL) {
5884         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5885         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5886         __ lslv(tmp2, tmp2, tmp4);
5887         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5888         __ add(tmp4, tmp4, 1);
5889         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5890         __ lsl(tmp2, tmp2, 1);
5891       } else {
5892         __ mov(ch2, 0xE);
5893         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5894         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5895         __ lslv(tmp2, tmp2, tmp4);
5896         __ add(tmp4, tmp4, 1);
5897         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5898         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5899         __ lsl(tmp2, tmp2, 1);
5900         __ sub(str2, str2, str2_chr_size);
5901       }
5902       __ cmp(ch1, ch2);
5903       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5904       __ b(DONE);
5905     __ align(OptoLoopAlignment);
5906     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5907       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
5908       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
5909       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
5910       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
5911       // result by analyzed characters value, so, we can just reset lower bits
5912       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
5913       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
5914       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
5915       // index of last analyzed substring inside current octet. So, str2 in at
5916       // respective start address. We need to advance it to next octet
5917       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5918       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5919       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5920       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5921       __ movw(cnt2, cnt2);
5922       __ b(L_LOOP_PROCEED);
5923     __ align(OptoLoopAlignment);
5924     __ BIND(NOMATCH);
5925       __ mov(result, -1);
5926     __ BIND(DONE);
5927       __ pop(spilled_regs, sp);
5928       __ ret(lr);
5929     return entry;
5930   }
5931 
5932   void generate_string_indexof_stubs() {
5933     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5934     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5935     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5936   }
5937 
5938   void inflate_and_store_2_fp_registers(bool generatePrfm,
5939       FloatRegister src1, FloatRegister src2) {
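    // ZIP1/ZIP2 against the zero register v0 interleave each source byte with
    // a zero byte, which on little-endian widens Latin-1 bytes to UTF-16
    // chars, e.g. src1 = [b0 b1 b2 ...] -> v1 = [b0 00 b1 00 ...].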
5940     Register dst = r1;
5941     __ zip1(v1, __ T16B, src1, v0);
5942     __ zip2(v2, __ T16B, src1, v0);
5943     if (generatePrfm) {
5944       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5945     }
5946     __ zip1(v3, __ T16B, src2, v0);
5947     __ zip2(v4, __ T16B, src2, v0);
5948     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5949   }
5950 
5951   // R0 = src
5952   // R1 = dst
5953   // R2 = len
5954   // R3 = len >> 3
5955   // V0 = 0
5956   // v1 = loaded 8 bytes
5957   address generate_large_byte_array_inflate() {
5958     __ align(CodeEntryAlignment);
5959     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5960     address entry = __ pc();
5961     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5962     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5963     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5964 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction below.
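    // The first 8 source bytes are already in v1 on entry (see the register
    // map above), so together with the extra read below two octets are
    // consumed before the loop -- hence octetCounter is decremented by 2.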
5967     __ ldrd(v2, __ post(src, 8));
5968     __ sub(octetCounter, octetCounter, 2);
5969     __ zip1(v1, __ T16B, v1, v0);
5970     __ zip1(v2, __ T16B, v2, v0);
5971     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5972     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5973     __ subs(rscratch1, octetCounter, large_loop_threshold);
5974     __ br(__ LE, LOOP_START);
5975     __ b(LOOP_PRFM_START);
5976     __ bind(LOOP_PRFM);
5977       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5978     __ bind(LOOP_PRFM_START);
5979       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5980       __ sub(octetCounter, octetCounter, 8);
5981       __ subs(rscratch1, octetCounter, large_loop_threshold);
5982       inflate_and_store_2_fp_registers(true, v3, v4);
5983       inflate_and_store_2_fp_registers(true, v5, v6);
5984       __ br(__ GT, LOOP_PRFM);
5985       __ cmp(octetCounter, (u1)8);
5986       __ br(__ LT, DONE);
5987     __ bind(LOOP);
5988       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5989       __ bind(LOOP_START);
5990       __ sub(octetCounter, octetCounter, 8);
5991       __ cmp(octetCounter, (u1)8);
5992       inflate_and_store_2_fp_registers(false, v3, v4);
5993       inflate_and_store_2_fp_registers(false, v5, v6);
5994       __ br(__ GE, LOOP);
5995     __ bind(DONE);
5996       __ ret(lr);
5997     return entry;
5998   }
5999 
6000   /**
6001    *  Arguments:
6002    *
6003    *  Input:
6004    *  c_rarg0   - current state address
6005    *  c_rarg1   - H key address
6006    *  c_rarg2   - data address
6007    *  c_rarg3   - number of blocks
6008    *
6009    *  Output:
6010    *  Updated state at c_rarg0
6011    */
6012   address generate_ghash_processBlocks() {
6013     // Bafflingly, GCM uses little-endian for the byte order, but
6014     // big-endian for the bit order.  For example, the polynomial 1 is
6015     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6016     //
6017     // So, we must either reverse the bytes in each word and do
6018     // everything big-endian or reverse the bits in each byte and do
6019     // it little-endian.  On AArch64 it's more idiomatic to reverse
6020     // the bits in each byte (we have an instruction, RBIT, to do
6021     // that) and keep the data in little-endian bit order through the
6022     // calculation, bit-reversing the inputs and outputs.
6023 
6024     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6025     __ align(wordSize * 2);
6026     address p = __ pc();
6027     __ emit_int64(0x87);  // The low-order bits of the field
6028                           // polynomial (i.e. p = z^7+z^2+z+1)
6029                           // repeated in the low and high parts of a
6030                           // 128-bit vector
6031     __ emit_int64(0x87);
6032 
6033     __ align(CodeEntryAlignment);
6034     address start = __ pc();
6035 
6036     Register state   = c_rarg0;
6037     Register subkeyH = c_rarg1;
6038     Register data    = c_rarg2;
6039     Register blocks  = c_rarg3;
6040 
6041     FloatRegister vzr = v30;
6042     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6043 
6044     __ ldrq(v24, p);    // The field polynomial
6045 
6046     __ ldrq(v0, Address(state));
6047     __ ldrq(v1, Address(subkeyH));
6048 
6049     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6050     __ rbit(v0, __ T16B, v0);
6051     __ rev64(v1, __ T16B, v1);
6052     __ rbit(v1, __ T16B, v1);
6053 
6054     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6055     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
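    // Karatsuba sketch: writing A = A1:A0 and B = B1:B0 as 64-bit halves,
    //   A*B = A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0
    // in GF(2), where + is XOR, so only three 64x64 carry-less multiplies
    // are needed; v4 supplies the (A1+A0) term for the subkey.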
6056 
6057     {
6058       Label L_ghash_loop;
6059       __ bind(L_ghash_loop);
6060 
6061       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6062                                                  // reversing each byte
6063       __ rbit(v2, __ T16B, v2);
6064       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6065 
6066       // Multiply state in v2 by subkey in v1
6067       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6068                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6069                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6070       // Reduce v7:v5 by the field polynomial
6071       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6072 
6073       __ sub(blocks, blocks, 1);
6074       __ cbnz(blocks, L_ghash_loop);
6075     }
6076 
6077     // The bit-reversed result is at this point in v0
6078     __ rev64(v0, __ T16B, v0);
6079     __ rbit(v0, __ T16B, v0);
6080 
6081     __ st1(v0, __ T16B, state);
6082     __ ret(lr);
6083 
6084     return start;
6085   }
6086 
6087   address generate_ghash_processBlocks_wide() {
6088     address small = generate_ghash_processBlocks();
6089 
6090     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6091     __ align(wordSize * 2);
6092     address p = __ pc();
6093     __ emit_int64(0x87);  // The low-order bits of the field
6094                           // polynomial (i.e. p = z^7+z^2+z+1)
6095                           // repeated in the low and high parts of a
6096                           // 128-bit vector
6097     __ emit_int64(0x87);
6098 
6099     __ align(CodeEntryAlignment);
6100     address start = __ pc();
6101 
6102     Register state   = c_rarg0;
6103     Register subkeyH = c_rarg1;
6104     Register data    = c_rarg2;
6105     Register blocks  = c_rarg3;
6106 
6107     const int unroll = 4;
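    // With fewer than unroll * 2 blocks the unrolled path is not worthwhile,
    // so tail-call the single-block stub generated above.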
6108 
6109     __ cmp(blocks, (unsigned char)(unroll * 2));
6110     __ br(__ LT, small);
6111 
6112     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
6114       __ sub(sp, sp, 4 * 16);
6115       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6116       __ sub(sp, sp, 4 * 16);
6117       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6118     }
6119 
6120     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6121 
6122     if (unroll > 1) {
6123       // And restore state
6124       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6125       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6126     }
6127 
6128     __ cmp(blocks, (unsigned char)0);
6129     __ br(__ GT, small);
6130 
6131     __ ret(lr);
6132 
6133     return start;
6134   }
6135 
6136   void generate_base64_encode_simdround(Register src, Register dst,
6137         FloatRegister codec, u8 size) {
6138 
6139     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6140     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6141     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6142 
6143     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6144 
6145     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
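    // The shift/or sequence below splits each 3-byte group (b0, b1, b2) into
    // four 6-bit indices, i.e.
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3f
    // which TBL then maps to output characters through the codec table.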
6146 
6147     __ ushr(ind0, arrangement, in0,  2);
6148 
6149     __ ushr(ind1, arrangement, in1,  2);
6150     __ shl(in0,   arrangement, in0,  6);
6151     __ orr(ind1,  arrangement, ind1, in0);
6152     __ ushr(ind1, arrangement, ind1, 2);
6153 
6154     __ ushr(ind2, arrangement, in2,  4);
6155     __ shl(in1,   arrangement, in1,  4);
6156     __ orr(ind2,  arrangement, in1,  ind2);
6157     __ ushr(ind2, arrangement, ind2, 2);
6158 
6159     __ shl(ind3,  arrangement, in2,  2);
6160     __ ushr(ind3, arrangement, ind3, 2);
6161 
6162     __ tbl(out0,  arrangement, codec,  4, ind0);
6163     __ tbl(out1,  arrangement, codec,  4, ind1);
6164     __ tbl(out2,  arrangement, codec,  4, ind2);
6165     __ tbl(out3,  arrangement, codec,  4, ind3);
6166 
6167     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6168   }
6169 
6170    /**
6171    *  Arguments:
6172    *
6173    *  Input:
6174    *  c_rarg0   - src_start
6175    *  c_rarg1   - src_offset
6176    *  c_rarg2   - src_length
6177    *  c_rarg3   - dest_start
6178    *  c_rarg4   - dest_offset
6179    *  c_rarg5   - isURL
6180    *
6181    */
6182   address generate_base64_encodeBlock() {
6183 
6184     static const char toBase64[64] = {
6185       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6186       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6187       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6188       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6189       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6190     };
6191 
6192     static const char toBase64URL[64] = {
6193       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6194       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6195       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6196       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6197       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6198     };
6199 
6200     __ align(CodeEntryAlignment);
6201     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6202     address start = __ pc();
6203 
6204     Register src   = c_rarg0;  // source array
6205     Register soff  = c_rarg1;  // source start offset
6206     Register send  = c_rarg2;  // source end offset
6207     Register dst   = c_rarg3;  // dest array
6208     Register doff  = c_rarg4;  // position for writing to dest array
6209     Register isURL = c_rarg5;  // Base64 or URL character set
6210 
6211     // c_rarg6 and c_rarg7 are free to use as temps
6212     Register codec  = c_rarg6;
6213     Register length = c_rarg7;
6214 
6215     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6216 
6217     __ add(src, src, soff);
6218     __ add(dst, dst, doff);
6219     __ sub(length, send, soff);
6220 
6221     // load the codec base address
6222     __ lea(codec, ExternalAddress((address) toBase64));
6223     __ cbz(isURL, ProcessData);
6224     __ lea(codec, ExternalAddress((address) toBase64URL));
6225 
6226     __ BIND(ProcessData);
6227 
    // too short to be worth a SIMD loop; fall back to the scalar 3-byte path
6229     __ cmp(length, (u1)24);
6230     __ br(Assembler::LT, Process3B);
6231 
6232     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6233 
6234     __ BIND(Process48B);
6235     __ cmp(length, (u1)48);
6236     __ br(Assembler::LT, Process24B);
6237     generate_base64_encode_simdround(src, dst, v0, 16);
6238     __ sub(length, length, 48);
6239     __ b(Process48B);
6240 
6241     __ BIND(Process24B);
6242     __ cmp(length, (u1)24);
6243     __ br(Assembler::LT, SIMDExit);
6244     generate_base64_encode_simdround(src, dst, v0, 8);
6245     __ sub(length, length, 24);
6246 
6247     __ BIND(SIMDExit);
6248     __ cbz(length, Exit);
6249 
6250     __ BIND(Process3B);
6251     //  3 src bytes, 24 bits
6252     __ ldrb(r10, __ post(src, 1));
6253     __ ldrb(r11, __ post(src, 1));
6254     __ ldrb(r12, __ post(src, 1));
6255     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6256     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6257     // codec index
6258     __ ubfmw(r15, r12, 18, 23);
6259     __ ubfmw(r14, r12, 12, 17);
6260     __ ubfmw(r13, r12, 6,  11);
6261     __ andw(r12,  r12, 63);
6262     // get the code based on the codec
6263     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6264     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6265     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6266     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6267     __ strb(r15, __ post(dst, 1));
6268     __ strb(r14, __ post(dst, 1));
6269     __ strb(r13, __ post(dst, 1));
6270     __ strb(r12, __ post(dst, 1));
6271     __ sub(length, length, 3);
6272     __ cbnz(length, Process3B);
6273 
6274     __ BIND(Exit);
6275     __ ret(lr);
6276 
6277     return start;
6278   }
6279 
6280   void generate_base64_decode_simdround(Register src, Register dst,
6281         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6282 
6283     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6284     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6285 
6286     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6287     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6288 
6289     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6290 
6291     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6292 
6293     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6294 
    // we need an unsigned saturating subtract to make sure all input values
    // in the range [0, 63] map to 0 in the higher-half lookup
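    // (v27 is pre-loaded with 63 by generate_base64_decodeBlock below)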
6297     __ uqsubv(decH0, __ T16B, in0, v27);
6298     __ uqsubv(decH1, __ T16B, in1, v27);
6299     __ uqsubv(decH2, __ T16B, in2, v27);
6300     __ uqsubv(decH3, __ T16B, in3, v27);
6301 
6302     // lower half lookup
6303     __ tbl(decL0, arrangement, codecL, 4, in0);
6304     __ tbl(decL1, arrangement, codecL, 4, in1);
6305     __ tbl(decL2, arrangement, codecL, 4, in2);
6306     __ tbl(decL3, arrangement, codecL, 4, in3);
6307 
6308     // higher half lookup
6309     __ tbx(decH0, arrangement, codecH, 4, decH0);
6310     __ tbx(decH1, arrangement, codecH, 4, decH1);
6311     __ tbx(decH2, arrangement, codecH, 4, decH2);
6312     __ tbx(decH3, arrangement, codecH, 4, decH3);
6313 
6314     // combine lower and higher
6315     __ orr(decL0, arrangement, decL0, decH0);
6316     __ orr(decL1, arrangement, decL1, decH1);
6317     __ orr(decL2, arrangement, decL2, decH2);
6318     __ orr(decL3, arrangement, decL3, decH3);
6319 
6320     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6321     __ cmhi(decH0, arrangement, decL0, v27);
6322     __ cmhi(decH1, arrangement, decL1, v27);
6323     __ cmhi(decH2, arrangement, decL2, v27);
6324     __ cmhi(decH3, arrangement, decL3, v27);
6325     __ orr(in0, arrangement, decH0, decH1);
6326     __ orr(in1, arrangement, decH2, decH3);
6327     __ orr(in2, arrangement, in0,   in1);
6328     __ umaxv(in3, arrangement, in2);
6329     __ umov(rscratch2, in3, __ B, 0);
6330 
6331     // get the data to output
6332     __ shl(out0,  arrangement, decL0, 2);
6333     __ ushr(out1, arrangement, decL1, 4);
6334     __ orr(out0,  arrangement, out0,  out1);
6335     __ shl(out1,  arrangement, decL1, 4);
6336     __ ushr(out2, arrangement, decL2, 2);
6337     __ orr(out1,  arrangement, out1,  out2);
6338     __ shl(out2,  arrangement, decL2, 6);
6339     __ orr(out2,  arrangement, out2,  decL3);
6340 
6341     __ cbz(rscratch2, NoIllegalData);
6342 
6343     // handle illegal input
6344     __ umov(r10, in2, __ D, 0);
6345     if (size == 16) {
6346       __ cbnz(r10, ErrorInLowerHalf);
6347 
6348       // illegal input is in higher half, store the lower half now.
6349       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6350 
6351       __ umov(r10, in2,  __ D, 1);
6352       __ umov(r11, out0, __ D, 1);
6353       __ umov(r12, out1, __ D, 1);
6354       __ umov(r13, out2, __ D, 1);
6355       __ b(StoreLegalData);
6356 
6357       __ BIND(ErrorInLowerHalf);
6358     }
6359     __ umov(r11, out0, __ D, 0);
6360     __ umov(r12, out1, __ D, 0);
6361     __ umov(r13, out2, __ D, 0);
6362 
6363     __ BIND(StoreLegalData);
6364     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6365     __ strb(r11, __ post(dst, 1));
6366     __ strb(r12, __ post(dst, 1));
6367     __ strb(r13, __ post(dst, 1));
6368     __ lsr(r10, r10, 8);
6369     __ lsr(r11, r11, 8);
6370     __ lsr(r12, r12, 8);
6371     __ lsr(r13, r13, 8);
6372     __ b(StoreLegalData);
6373 
6374     __ BIND(NoIllegalData);
6375     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6376   }
6377 
6378 
6379    /**
6380    *  Arguments:
6381    *
6382    *  Input:
6383    *  c_rarg0   - src_start
6384    *  c_rarg1   - src_offset
6385    *  c_rarg2   - src_length
6386    *  c_rarg3   - dest_start
6387    *  c_rarg4   - dest_offset
6388    *  c_rarg5   - isURL
6389    *  c_rarg6   - isMIME
6390    *
6391    */
6392   address generate_base64_decodeBlock() {
6393 
6394     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6395     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6396     // titled "Base64 decoding".
6397 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
6401     static const uint8_t fromBase64ForNoSIMD[256] = {
6402       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6403       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6404       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6405        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6406       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6407        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6408       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6409        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6410       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6411       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6412       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6413       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6414       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6415       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6416       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6417       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6418     };
6419 
6420     static const uint8_t fromBase64URLForNoSIMD[256] = {
6421       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6422       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6423       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6424        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6425       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6426        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6427       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6428        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6429       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6430       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6431       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6432       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6433       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6434       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6435       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6436       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6437     };
6438 
    // A legal value of a base64 code is in the range [0, 127].  We need two
    // lookups with tbl/tbx and combine them to get the decoded data. The 1st
    // table vector lookup uses tbl; out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx; out-of-range indices
    // are left unchanged in the destination. Input [64..126] is mapped to
    // index [65, 127] in the second lookup. The value at index 64 is set to 0,
    // so that we know we already got the decoded data with the 1st lookup.
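    // Worked example against the table below: for input 'A' (65) the first
    // 64-byte lookup is out of range, so tbl yields 0; the saturating subtract
    // gives 65 - 63 = 2 and tbx reads table position 64 + 2 = 66, which is 0,
    // the decoded value of 'A'. For input '0' (48) the first lookup already
    // yields 52, the second reads position 64 (value 0), and 52 is kept.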
6446     static const uint8_t fromBase64ForSIMD[128] = {
6447       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6448       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6449       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6450        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6451         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6452        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6453       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6454        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6455     };
6456 
6457     static const uint8_t fromBase64URLForSIMD[128] = {
6458       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6459       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6460       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6461        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6462         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6463        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6464        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6465        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6466     };
6467 
6468     __ align(CodeEntryAlignment);
6469     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6470     address start = __ pc();
6471 
6472     Register src    = c_rarg0;  // source array
6473     Register soff   = c_rarg1;  // source start offset
6474     Register send   = c_rarg2;  // source end offset
6475     Register dst    = c_rarg3;  // dest array
6476     Register doff   = c_rarg4;  // position for writing to dest array
6477     Register isURL  = c_rarg5;  // Base64 or URL character set
6478     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6479 
6480     Register length = send;    // reuse send as length of source data to process
6481 
6482     Register simd_codec   = c_rarg6;
6483     Register nosimd_codec = c_rarg7;
6484 
6485     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6486 
6487     __ enter();
6488 
6489     __ add(src, src, soff);
6490     __ add(dst, dst, doff);
6491 
6492     __ mov(doff, dst);
6493 
6494     __ sub(length, send, soff);
6495     __ bfm(length, zr, 0, 1);
6496 
6497     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6498     __ cbz(isURL, ProcessData);
6499     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6500 
6501     __ BIND(ProcessData);
6502     __ mov(rscratch1, length);
6503     __ cmp(length, (u1)144); // 144 = 80 + 64
6504     __ br(Assembler::LT, Process4B);
6505 
6506     // In the MIME case, the line length cannot be more than 76
6507     // bytes (see RFC 2045). This is too short a block for SIMD
6508     // to be worthwhile, so we use non-SIMD here.
6509     __ movw(rscratch1, 79);
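    // Starting from 79 and subtracting 4 per iteration, the 4B loop below
    // consumes 80 bytes and exits with rscratch1 == -1; the check after the
    // loop uses this to tell the pre-processing pass apart from the final
    // scalar tail, where rscratch1 reaches 0.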
6510 
6511     __ BIND(Process4B);
6512     __ ldrw(r14, __ post(src, 4));
6513     __ ubfxw(r10, r14, 0,  8);
6514     __ ubfxw(r11, r14, 8,  8);
6515     __ ubfxw(r12, r14, 16, 8);
6516     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6518     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6519     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6520     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6521     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6522     // error detection, 255u indicates an illegal input
6523     __ orrw(r14, r10, r11);
6524     __ orrw(r15, r12, r13);
6525     __ orrw(r14, r14, r15);
6526     __ tbnz(r14, 7, Exit);
6527     // recover the data
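    // With the decoded 6-bit values d0..d3 in r10..r13, the three output
    // bytes are
    //   B0 = (d0 << 2) | (d1 >> 4)
    //   B1 = ((d1 & 0xf) << 4) | (d2 >> 2)
    //   B2 = ((d2 & 0x3) << 6) | d3
    // r14 is assembled as B0:B1 (then byte-swapped for the little-endian strh)
    // and r13 becomes B2.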
6528     __ lslw(r14, r10, 10);
6529     __ bfiw(r14, r11, 4, 6);
6530     __ bfmw(r14, r12, 2, 5);
6531     __ rev16w(r14, r14);
6532     __ bfiw(r13, r12, 6, 2);
6533     __ strh(r14, __ post(dst, 2));
6534     __ strb(r13, __ post(dst, 1));
6535     // non-simd loop
6536     __ subsw(rscratch1, rscratch1, 4);
6537     __ br(Assembler::GT, Process4B);
6538 
    // if exiting from the 80-byte pre-processing pass above, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
6541     __ cbzw(rscratch1, Exit);
6542     __ sub(length, length, 80);
6543 
6544     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6545     __ cbz(isURL, SIMDEnter);
6546     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6547 
6548     __ BIND(SIMDEnter);
6549     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6550     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6551     __ mov(rscratch1, 63);
6552     __ dup(v27, __ T16B, rscratch1);
6553 
6554     __ BIND(Process64B);
6555     __ cmp(length, (u1)64);
6556     __ br(Assembler::LT, Process32B);
6557     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6558     __ sub(length, length, 64);
6559     __ b(Process64B);
6560 
6561     __ BIND(Process32B);
6562     __ cmp(length, (u1)32);
6563     __ br(Assembler::LT, SIMDExit);
6564     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6565     __ sub(length, length, 32);
6566     __ b(Process32B);
6567 
6568     __ BIND(SIMDExit);
6569     __ cbz(length, Exit);
6570     __ movw(rscratch1, length);
6571     __ b(Process4B);
6572 
6573     __ BIND(Exit);
6574     __ sub(c_rarg0, dst, doff);
6575 
6576     __ leave();
6577     __ ret(lr);
6578 
6579     return start;
6580   }
6581 
6582   // Support for spin waits.
6583   address generate_spin_wait() {
6584     __ align(CodeEntryAlignment);
6585     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6586     address start = __ pc();
6587 
6588     __ spin_wait();
6589     __ ret(lr);
6590 
6591     return start;
6592   }
6593 
6594 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6595 
6596   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6597   //
6598   // If LSE is in use, generate LSE versions of all the stubs. The
6599   // non-LSE versions are in atomic_aarch64.S.
6600 
6601   // class AtomicStubMark records the entry point of a stub and the
6602   // stub pointer which will point to it. The stub pointer is set to
6603   // the entry point when ~AtomicStubMark() is called, which must be
6604   // after ICache::invalidate_range. This ensures safe publication of
6605   // the generated code.
6606   class AtomicStubMark {
6607     address _entry_point;
6608     aarch64_atomic_stub_t *_stub;
6609     MacroAssembler *_masm;
6610   public:
6611     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6612       _masm = masm;
6613       __ align(32);
6614       _entry_point = __ pc();
6615       _stub = stub;
6616     }
6617     ~AtomicStubMark() {
6618       *_stub = (aarch64_atomic_stub_t)_entry_point;
6619     }
6620   };
6621 
6622   // NB: For memory_order_conservative we need a trailing membar after
6623   // LSE atomic operations but not a leading membar.
6624   //
6625   // We don't need a leading membar because a clause in the Arm ARM
6626   // says:
6627   //
6628   //   Barrier-ordered-before
6629   //
6630   //   Barrier instructions order prior Memory effects before subsequent
6631   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6635   //   instruction with both Acquire and Release semantics.
6636   //
6637   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6638   // and Release semantics, therefore we don't need a leading
6639   // barrier. However, there is no corresponding Barrier-ordered-after
6640   // relationship, therefore we need a trailing membar to prevent a
6641   // later store or load from being reordered with the store in an
6642   // atomic instruction.
6643   //
6644   // This was checked by using the herd7 consistency model simulator
6645   // (http://diy.inria.fr/) with this test case:
6646   //
6647   // AArch64 LseCas
6648   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6649   // P0 | P1;
6650   // LDR W4, [X2] | MOV W3, #0;
6651   // DMB LD       | MOV W4, #1;
6652   // LDR W3, [X1] | CASAL W3, W4, [X1];
6653   //              | DMB ISH;
6654   //              | STR W4, [X2];
6655   // exists
6656   // (0:X3=0 /\ 0:X4=1)
6657   //
6658   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6659   // with the store to x in P1. Without the DMB in P1 this may happen.
6660   //
6661   // At the time of writing we don't know of any AArch64 hardware that
6662   // reorders stores in this way, but the Reference Manual permits it.
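  //
  // Roughly, the conservative 8-byte CAS entry generated below behaves like
  // this pseudocode sketch (argument order follows the c_rarg0..c_rarg2 use
  // in gen_cas_entry; the stub returns the previous memory value in r0):
  //
  //   uint64_t cmpxchg_8(volatile uint64_t* ptr,     // c_rarg0
  //                      uint64_t compare_value,     // c_rarg1
  //                      uint64_t exchange_value) {  // c_rarg2
  //     uint64_t prev = compare_value;
  //     CASAL(prev, exchange_value, ptr);  // acquire+release CAS, prev <- old value
  //     DMB_ISH();                         // trailing barrier for memory_order_conservative
  //     return prev;
  //   }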
6663 
6664   void gen_cas_entry(Assembler::operand_size size,
6665                      atomic_memory_order order) {
6666     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6667       exchange_val = c_rarg2;
6668     bool acquire, release;
6669     switch (order) {
6670       case memory_order_relaxed:
6671         acquire = false;
6672         release = false;
6673         break;
6674       case memory_order_release:
6675         acquire = false;
6676         release = true;
6677         break;
6678       default:
6679         acquire = true;
6680         release = true;
6681         break;
6682     }
6683     __ mov(prev, compare_val);
6684     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6685     if (order == memory_order_conservative) {
6686       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6687     }
6688     if (size == Assembler::xword) {
6689       __ mov(r0, prev);
6690     } else {
6691       __ movw(r0, prev);
6692     }
6693     __ ret(lr);
6694   }
6695 
6696   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6697     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6698     // If not relaxed, then default to conservative.  Relaxed is the only
6699     // case we use enough to be worth specializing.
6700     if (order == memory_order_relaxed) {
6701       __ ldadd(size, incr, prev, addr);
6702     } else {
6703       __ ldaddal(size, incr, prev, addr);
6704       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6705     }
6706     if (size == Assembler::xword) {
6707       __ mov(r0, prev);
6708     } else {
6709       __ movw(r0, prev);
6710     }
6711     __ ret(lr);
6712   }
6713 
6714   void gen_swpal_entry(Assembler::operand_size size) {
6715     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6716     __ swpal(size, incr, prev, addr);
6717     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6718     if (size == Assembler::xword) {
6719       __ mov(r0, prev);
6720     } else {
6721       __ movw(r0, prev);
6722     }
6723     __ ret(lr);
6724   }
6725 
6726   void generate_atomic_entry_points() {
6727     if (! UseLSE) {
6728       return;
6729     }
6730 
6731     __ align(CodeEntryAlignment);
6732     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6733     address first_entry = __ pc();
6734 
6735     // ADD, memory_order_conservative
6736     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6737     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6738     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6739     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6740 
6741     // ADD, memory_order_relaxed
6742     AtomicStubMark mark_fetch_add_4_relaxed
6743       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6744     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6745     AtomicStubMark mark_fetch_add_8_relaxed
6746       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6747     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6748 
6749     // XCHG, memory_order_conservative
6750     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6751     gen_swpal_entry(Assembler::word);
6752     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6753     gen_swpal_entry(Assembler::xword);
6754 
6755     // CAS, memory_order_conservative
6756     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6757     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6758     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6759     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6760     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6761     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6762 
6763     // CAS, memory_order_relaxed
6764     AtomicStubMark mark_cmpxchg_1_relaxed
6765       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6766     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6767     AtomicStubMark mark_cmpxchg_4_relaxed
6768       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6769     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6770     AtomicStubMark mark_cmpxchg_8_relaxed
6771       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6772     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6773 
6774     AtomicStubMark mark_cmpxchg_4_release
6775       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6776     gen_cas_entry(MacroAssembler::word, memory_order_release);
6777     AtomicStubMark mark_cmpxchg_8_release
6778       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6779     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6780 
6781     AtomicStubMark mark_cmpxchg_4_seq_cst
6782       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6783     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6784     AtomicStubMark mark_cmpxchg_8_seq_cst
6785       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6786     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6787 
6788     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6789   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
6791 
6792   address generate_cont_thaw(Continuation::thaw_kind kind) {
6793     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6794     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6795 
6796     address start = __ pc();
6797 
6798     if (return_barrier) {
6799       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6800       __ mov(sp, rscratch1);
6801     }
6802     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6803 
6804     if (return_barrier) {
6805       // preserve possible return value from a method returning to the return barrier
6806       __ fmovd(rscratch1, v0);
6807       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6808     }
6809 
6810     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6811     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6812     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6813 
6814     if (return_barrier) {
6815       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6816       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6817       __ fmovd(v0, rscratch1);
6818     }
6819     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6820 
6821 
6822     Label thaw_success;
6823     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6824     __ cbnz(rscratch2, thaw_success);
6825     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6826     __ br(rscratch1);
6827     __ bind(thaw_success);
6828 
6829     // make room for the thawed frames
6830     __ sub(rscratch1, sp, rscratch2);
6831     __ andr(rscratch1, rscratch1, -16); // align
6832     __ mov(sp, rscratch1);
6833 
6834     if (return_barrier) {
6835       // save original return value -- again
6836       __ fmovd(rscratch1, v0);
6837       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6838     }
6839 
6840     // If we want, we can templatize thaw by kind, and have three different entries
6841     __ movw(c_rarg1, (uint32_t)kind);
6842 
6843     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6844     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6845 
6846     if (return_barrier) {
6847       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6848       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6849       __ fmovd(v0, rscratch1);
6850     } else {
6851       __ mov(r0, zr); // return 0 (success) from doYield
6852     }
6853 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
6855     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6856     __ mov(rfp, sp);
6857 
6858     if (return_barrier_exception) {
6859       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6860       __ verify_oop(r0);
      __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6862 
6863       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6864 
6865       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6866       // __ reinitialize_ptrue();
6867 
6868       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6869 
6870       __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
6872       __ verify_oop(r0);
6873 
6874       __ leave();
6875       __ mov(r3, lr);
6876       __ br(r1); // the exception handler
6877     } else {
6878       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6879       __ leave();
6880       __ ret(lr);
6881     }
6882 
6883     return start;
6884   }
6885 
6886   address generate_cont_thaw() {
6887     if (!Continuations::enabled()) return nullptr;
6888 
6889     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6890     address start = __ pc();
6891     generate_cont_thaw(Continuation::thaw_top);
6892     return start;
6893   }
6894 
6895   address generate_cont_returnBarrier() {
6896     if (!Continuations::enabled()) return nullptr;
6897 
6898     // TODO: will probably need multiple return barriers depending on return type
6899     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6900     address start = __ pc();
6901 
6902     generate_cont_thaw(Continuation::thaw_return_barrier);
6903 
6904     return start;
6905   }
6906 
6907   address generate_cont_returnBarrier_exception() {
6908     if (!Continuations::enabled()) return nullptr;
6909 
6910     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6911     address start = __ pc();
6912 
6913     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
6914 
6915     return start;
6916   }
6917 
6918 #if INCLUDE_JFR
6919 
6920   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
6921     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6922     __ mov(c_rarg0, thread);
6923   }
6924 
6925   // The handle is dereferenced through a load barrier.
6926   static void jfr_epilogue(MacroAssembler* _masm) {
6927     __ reset_last_Java_frame(true);
6928     __ resolve_global_jobject(r0, rscratch1, rscratch2);
6929   }
6930 
6931   // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
6932   // It returns a jobject handle to the event writer.
6933   // The handle is dereferenced and the return value is the event writer oop.
6934   static RuntimeStub* generate_jfr_write_checkpoint() {
6935     enum layout {
6936       rbp_off,
6937       rbpH_off,
6938       return_off,
6939       return_off2,
6940       framesize // inclusive of return address
6941     };
6942 
6943     int insts_size = 1024;
6944     int locs_size = 64;
6945     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
6946     OopMapSet* oop_maps = new OopMapSet();
6947     MacroAssembler* masm = new MacroAssembler(&code);
6948     MacroAssembler* _masm = masm;
6949 
6950     address start = __ pc();
6951     __ enter();
6952     int frame_complete = __ pc() - start;
6953     address the_pc = __ pc();
6954     jfr_prologue(the_pc, _masm, rthread);
6955     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
6956     jfr_epilogue(_masm);
6957     __ leave();
6958     __ ret(lr);
6959 
6960     OopMap* map = new OopMap(framesize, 1); // rfp
6961     oop_maps->add_gc_map(the_pc - start, map);
6962 
6963     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6964       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
6965                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6966                                     oop_maps, false);
6967     return stub;
6968   }
6969 
6970 #endif // INCLUDE_JFR
6971 
6972   // Continuation point for throwing of implicit exceptions that are
6973   // not handled in the current activation. Fabricates an exception
6974   // oop and initiates normal exception dispatching in this
6975   // frame. Since we need to preserve callee-saved values (currently
6976   // only for C2, but done for C1 as well) we need a callee-saved oop
6977   // map and therefore have to make these stubs into RuntimeStubs
6978   // rather than BufferBlobs.  If the compiler needs all registers to
6979   // be preserved between the fault point and the exception handler
6980   // then it must assume responsibility for that in
6981   // AbstractCompiler::continuation_for_implicit_null_exception or
6982   // continuation_for_implicit_division_by_zero_exception. All other
6983   // implicit exceptions (e.g., NullPointerException or
6984   // AbstractMethodError on entry) are either at call sites or
6985   // otherwise assume that stack unwinding will be initiated, so
6986   // caller saved registers were assumed volatile in the compiler.
6987 
6988 #undef __
6989 #define __ masm->
6990 
6991   address generate_throw_exception(const char* name,
6992                                    address runtime_entry,
6993                                    Register arg1 = noreg,
6994                                    Register arg2 = noreg) {
6995     // Information about frame layout at time of blocking runtime call.
6996     // Note that we only have to preserve callee-saved registers since
6997     // the compilers are responsible for supplying a continuation point
6998     // if they expect all registers to be preserved.
6999     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7000     enum layout {
7001       rfp_off = 0,
7002       rfp_off2,
7003       return_off,
7004       return_off2,
7005       framesize // inclusive of return address
7006     };
7007 
7008     int insts_size = 512;
7009     int locs_size  = 64;
7010 
7011     CodeBuffer code(name, insts_size, locs_size);
7012     OopMapSet* oop_maps  = new OopMapSet();
7013     MacroAssembler* masm = new MacroAssembler(&code);
7014 
7015     address start = __ pc();
7016 
7017     // This is an inlined and slightly modified version of call_VM
7018     // which has the ability to fetch the return PC out of
7019     // thread-local storage and also sets up last_Java_sp slightly
7020     // differently than the real call_VM
7021 
7022     __ enter(); // Save FP and LR before call
7023 
7024     assert(is_even(framesize/2), "sp not 16-byte aligned");
7025 
7026     // lr and fp are already in place
7027     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7028 
7029     int frame_complete = __ pc() - start;
7030 
7031     // Set up last_Java_sp and last_Java_fp
7032     address the_pc = __ pc();
7033     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7034 
7035     // Call runtime
7036     if (arg1 != noreg) {
7037       assert(arg2 != c_rarg1, "clobbered");
7038       __ mov(c_rarg1, arg1);
7039     }
7040     if (arg2 != noreg) {
7041       __ mov(c_rarg2, arg2);
7042     }
7043     __ mov(c_rarg0, rthread);
7044     BLOCK_COMMENT("call runtime_entry");
7045     __ mov(rscratch1, runtime_entry);
7046     __ blr(rscratch1);
7047 
7048     // Generate oop map
7049     OopMap* map = new OopMap(framesize, 0);
7050 
7051     oop_maps->add_gc_map(the_pc - start, map);
7052 
7053     __ reset_last_Java_frame(true);
7054 
7055     // Reinitialize the ptrue predicate register, in case the external runtime
7056     // call clobbers ptrue reg, as we may return to SVE compiled code.
7057     __ reinitialize_ptrue();
7058 
7059     __ leave();
7060 
7061     // check for pending exceptions
7062 #ifdef ASSERT
7063     Label L;
7064     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7065     __ cbnz(rscratch1, L);
7066     __ should_not_reach_here();
7067     __ bind(L);
7068 #endif // ASSERT
7069     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7070 
7071     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7072     RuntimeStub* stub =
7073       RuntimeStub::new_runtime_stub(name,
7074                                     &code,
7075                                     frame_complete,
7076                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7077                                     oop_maps, false);
7078     return stub->entry_point();
7079   }
7080 
7081   class MontgomeryMultiplyGenerator : public MacroAssembler {
7082 
7083     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7084       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7085 
7086     RegSet _toSave;
7087     bool _squaring;
7088 
7089   public:
7090     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7091       : MacroAssembler(as->code()), _squaring(squaring) {
7092 
7093       // Register allocation
7094 
7095       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7096       Pa_base = *regs;       // Argument registers
7097       if (squaring)
7098         Pb_base = Pa_base;
7099       else
7100         Pb_base = *++regs;
7101       Pn_base = *++regs;
7102       Rlen = *++regs;
7103       inv = *++regs;
7104       Pm_base = *++regs;
7105 
7106                           // Working registers:
7107       Ra =  *++regs;        // The current digit of a, b, n, and m.
7108       Rb =  *++regs;
7109       Rm =  *++regs;
7110       Rn =  *++regs;
7111 
7112       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7113       Pb =  *++regs;
7114       Pm =  *++regs;
7115       Pn =  *++regs;
7116 
7117       t0 =  *++regs;        // Three registers which form a
7118       t1 =  *++regs;        // triple-precision accumulator.
7119       t2 =  *++regs;
7120 
7121       Ri =  *++regs;        // Inner and outer loop indexes.
7122       Rj =  *++regs;
7123 
7124       Rhi_ab = *++regs;     // Product registers: low and high parts
7125       Rlo_ab = *++regs;     // of a*b and m*n.
7126       Rhi_mn = *++regs;
7127       Rlo_mn = *++regs;
7128 
7129       // r19 and up are callee-saved.
7130       _toSave = RegSet::range(r19, *regs) + Pm_base;
7131     }
7132 
7133   private:
7134     void save_regs() {
7135       push(_toSave, sp);
7136     }
7137 
7138     void restore_regs() {
7139       pop(_toSave, sp);
7140     }
7141 
7142     template <typename T>
7143     void unroll_2(Register count, T block) {
7144       Label loop, end, odd;
7145       tbnz(count, 0, odd);
7146       cbz(count, end);
7147       align(16);
7148       bind(loop);
7149       (this->*block)();
7150       bind(odd);
7151       (this->*block)();
7152       subs(count, count, 2);
7153       br(Assembler::GT, loop);
7154       bind(end);
7155     }
7156 
7157     template <typename T>
7158     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7159       Label loop, end, odd;
7160       tbnz(count, 0, odd);
7161       cbz(count, end);
7162       align(16);
7163       bind(loop);
7164       (this->*block)(d, s, tmp);
7165       bind(odd);
7166       (this->*block)(d, s, tmp);
7167       subs(count, count, 2);
7168       br(Assembler::GT, loop);
7169       bind(end);
7170     }
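         // In C, both unroll_2 overloads amount to the sketch below (an
         // illustration only, not generated code): the block is emitted twice
         // per iteration, and an odd count branches straight into the second
         // copy, so exactly `count` invocations of the block are executed.
         //
         //   while (count > 0) {
         //     block();
         //     count--;
         //   }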
7171 
7172     void pre1(RegisterOrConstant i) {
7173       block_comment("pre1");
7174       // Pa = Pa_base;
7175       // Pb = Pb_base + i;
7176       // Pm = Pm_base;
7177       // Pn = Pn_base + i;
7178       // Ra = *Pa;
7179       // Rb = *Pb;
7180       // Rm = *Pm;
7181       // Rn = *Pn;
7182       ldr(Ra, Address(Pa_base));
7183       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7184       ldr(Rm, Address(Pm_base));
7185       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7186       lea(Pa, Address(Pa_base));
7187       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7188       lea(Pm, Address(Pm_base));
7189       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7190 
7191       // Zero the m*n result.
7192       mov(Rhi_mn, zr);
7193       mov(Rlo_mn, zr);
7194     }
7195 
7196     // The core multiply-accumulate step of a Montgomery
7197     // multiplication.  The idea is to schedule operations as a
7198     // pipeline so that instructions with long latencies (loads and
7199     // multiplies) have time to complete before their results are
7200     // used.  This benefits in-order implementations of the architecture
7201     // the most, but out-of-order ones also benefit.
7202     void step() {
7203       block_comment("step");
7204       // MACC(Ra, Rb, t0, t1, t2);
7205       // Ra = *++Pa;
7206       // Rb = *--Pb;
7207       umulh(Rhi_ab, Ra, Rb);
7208       mul(Rlo_ab, Ra, Rb);
7209       ldr(Ra, pre(Pa, wordSize));
7210       ldr(Rb, pre(Pb, -wordSize));
7211       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7212                                        // previous iteration.
7213       // MACC(Rm, Rn, t0, t1, t2);
7214       // Rm = *++Pm;
7215       // Rn = *--Pn;
7216       umulh(Rhi_mn, Rm, Rn);
7217       mul(Rlo_mn, Rm, Rn);
7218       ldr(Rm, pre(Pm, wordSize));
7219       ldr(Rn, pre(Pn, -wordSize));
7220       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7221     }
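         // The MACC(a, b, t0, t1, t2) referred to in the comments above adds
         // the 128-bit product a*b into the triple-precision accumulator
         // (t0, t1, t2).  A sketch of it in C++ (an illustration assuming
         // __int128 support, not the actual runtime helper) is:
         //
         //   static void MACC(julong a, julong b,
         //                    julong &t0, julong &t1, julong &t2) {
         //     unsigned __int128 p = (unsigned __int128)a * b;
         //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p; // low half
         //     t0 = (julong)s;
         //     s = (s >> 64) + t1 + (julong)(p >> 64);                  // high half
         //     t1 = (julong)s;
         //     t2 += (julong)(s >> 64);                                 // carry out
         //   }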
7222 
7223     void post1() {
7224       block_comment("post1");
7225 
7226       // MACC(Ra, Rb, t0, t1, t2);
7227       // Ra = *++Pa;
7228       // Rb = *--Pb;
7229       umulh(Rhi_ab, Ra, Rb);
7230       mul(Rlo_ab, Ra, Rb);
7231       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7232       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7233 
7234       // *Pm = Rm = t0 * inv;
7235       mul(Rm, t0, inv);
7236       str(Rm, Address(Pm));
7237 
7238       // MACC(Rm, Rn, t0, t1, t2);
7239       // t0 = t1; t1 = t2; t2 = 0;
7240       umulh(Rhi_mn, Rm, Rn);
7241 
7242 #ifndef PRODUCT
7243       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7244       {
7245         mul(Rlo_mn, Rm, Rn);
7246         add(Rlo_mn, t0, Rlo_mn);
7247         Label ok;
7248         cbz(Rlo_mn, ok); {
7249           stop("broken Montgomery multiply");
7250         } bind(ok);
7251       }
7252 #endif
7253       // We have very carefully set things up so that
7254       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7255       // the lower half of Rm * Rn because we know the result already:
7256       // it must be -t0.  t0 + (-t0) must generate a carry iff
7257       // t0 != 0.  So, rather than do a mul and an adds we just set
7258       // the carry flag iff t0 is nonzero.
7259       //
7260       // mul(Rlo_mn, Rm, Rn);
7261       // adds(zr, t0, Rlo_mn);
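           // For example, if t0 == 5 the low half of Rm * Rn must be 2^64 - 5;
           // adding it to t0 wraps to zero and produces a carry, whereas t0 == 0
           // gives a zero sum with no carry.  SUBS zr, t0, #1 sets the carry flag
           // (meaning "no borrow") exactly when t0 >= 1, i.e. when t0 is nonzero.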
7262       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7263       adcs(t0, t1, Rhi_mn);
7264       adc(t1, t2, zr);
7265       mov(t2, zr);
7266     }
7267 
7268     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7269       block_comment("pre2");
7270       // Pa = Pa_base + i-len;
7271       // Pb = Pb_base + len;
7272       // Pm = Pm_base + i-len;
7273       // Pn = Pn_base + len;
7274 
7275       if (i.is_register()) {
7276         sub(Rj, i.as_register(), len);
7277       } else {
7278         mov(Rj, i.as_constant());
7279         sub(Rj, Rj, len);
7280       }
7281       // Rj == i-len
7282 
7283       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7284       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7285       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7286       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7287 
7288       // Ra = *++Pa;
7289       // Rb = *--Pb;
7290       // Rm = *++Pm;
7291       // Rn = *--Pn;
7292       ldr(Ra, pre(Pa, wordSize));
7293       ldr(Rb, pre(Pb, -wordSize));
7294       ldr(Rm, pre(Pm, wordSize));
7295       ldr(Rn, pre(Pn, -wordSize));
7296 
7297       mov(Rhi_mn, zr);
7298       mov(Rlo_mn, zr);
7299     }
7300 
7301     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7302       block_comment("post2");
7303       if (i.is_constant()) {
7304         mov(Rj, i.as_constant()-len.as_constant());
7305       } else {
7306         sub(Rj, i.as_register(), len);
7307       }
7308 
7309       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7310 
7311       // As soon as we know the least significant digit of our result,
7312       // store it.
7313       // Pm_base[i-len] = t0;
7314       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7315 
7316       // t0 = t1; t1 = t2; t2 = 0;
7317       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7318       adc(t1, t2, zr);
7319       mov(t2, zr);
7320     }
7321 
7322     // A carry in t0 after Montgomery multiplication means that we
7323     // should subtract multiples of n from our result in m.  We'll
7324     // keep doing that until there is no carry.
7325     void normalize(RegisterOrConstant len) {
7326       block_comment("normalize");
7327       // while (t0)
7328       //   t0 = sub(Pm_base, Pn_base, t0, len);
7329       Label loop, post, again;
7330       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7331       cbz(t0, post); {
7332         bind(again); {
7333           mov(i, zr);
7334           mov(cnt, len);
7335           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7336           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7337           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7338           align(16);
7339           bind(loop); {
7340             sbcs(Rm, Rm, Rn);
7341             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7342             add(i, i, 1);
7343             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7344             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7345             sub(cnt, cnt, 1);
7346           } cbnz(cnt, loop);
7347           sbc(t0, t0, zr);
7348         } cbnz(t0, again);
7349       } bind(post);
7350     }
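         // A C-style sketch of the sub(Pm_base, Pn_base, t0, len) step named
         // above (an assumed illustration, not the actual runtime helper):
         // subtract n from m once, propagating the borrow, and fold the final
         // borrow into the carry word t0.
         //
         //   static julong sub(julong m[], julong n[], julong t0, int len) {
         //     julong borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       julong mi = m[i];
         //       julong d  = mi - n[i] - borrow;
         //       borrow = (mi < n[i] || mi - n[i] < borrow) ? 1 : 0;
         //       m[i] = d;
         //     }
         //     return t0 - borrow;
         //   }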
7351 
7352     // Move memory at s to d, reversing words.
7353     //    Increments d to end of copied memory
7354     //    Destroys tmp1, tmp2
7355     //    Preserves len
7356     //    Leaves s pointing to the address which was in d at start
7357     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7358       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7359       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7360 
7361       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7362       mov(tmp1, len);
7363       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7364       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7365     }
7366     // where reverse1 is defined as:
7367     void reverse1(Register d, Register s, Register tmp) {
7368       ldr(tmp, pre(s, -wordSize));
7369       ror(tmp, tmp, 32);
7370       str(tmp, post(d, wordSize));
7371     }
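         // In C, reverse() plus reverse1() amount to the sketch below (an
         // illustration only, ignoring the register side effects on d and s
         // described above): the input is viewed as 64-bit words which are
         // copied in reverse order, swapping the 32-bit halves of each word.
         //
         //   static void reverse(julong *d, const julong *s, int len) {
         //     for (int i = 0; i < len; i++) {
         //       julong w = s[len - 1 - i];
         //       d[i] = (w << 32) | (w >> 32);   // rotate by 32 bits
         //     }
         //   }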
7372 
7373     void step_squaring() {
7374       // An extra ACC
7375       step();
7376       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7377     }
7378 
7379     void last_squaring(RegisterOrConstant i) {
7380       Label dont;
7381       // if ((i & 1) == 0) {
7382       tbnz(i.as_register(), 0, dont); {
7383         // MACC(Ra, Rb, t0, t1, t2);
7384         // Ra = *++Pa;
7385         // Rb = *--Pb;
7386         umulh(Rhi_ab, Ra, Rb);
7387         mul(Rlo_ab, Ra, Rb);
7388         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7389       } bind(dont);
7390     }
7391 
7392     void extra_step_squaring() {
7393       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7394 
7395       // MACC(Rm, Rn, t0, t1, t2);
7396       // Rm = *++Pm;
7397       // Rn = *--Pn;
7398       umulh(Rhi_mn, Rm, Rn);
7399       mul(Rlo_mn, Rm, Rn);
7400       ldr(Rm, pre(Pm, wordSize));
7401       ldr(Rn, pre(Pn, -wordSize));
7402     }
7403 
7404     void post1_squaring() {
7405       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7406 
7407       // *Pm = Rm = t0 * inv;
7408       mul(Rm, t0, inv);
7409       str(Rm, Address(Pm));
7410 
7411       // MACC(Rm, Rn, t0, t1, t2);
7412       // t0 = t1; t1 = t2; t2 = 0;
7413       umulh(Rhi_mn, Rm, Rn);
7414 
7415 #ifndef PRODUCT
7416       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7417       {
7418         mul(Rlo_mn, Rm, Rn);
7419         add(Rlo_mn, t0, Rlo_mn);
7420         Label ok;
7421         cbz(Rlo_mn, ok); {
7422           stop("broken Montgomery multiply");
7423         } bind(ok);
7424       }
7425 #endif
7426       // We have very carefully set things up so that
7427       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7428       // the lower half of Rm * Rn because we know the result already:
7429       // it must be -t0.  t0 + (-t0) must generate a carry iff
7430       // t0 != 0.  So, rather than do a mul and an adds we just set
7431       // the carry flag iff t0 is nonzero.
7432       //
7433       // mul(Rlo_mn, Rm, Rn);
7434       // adds(zr, t0, Rlo_mn);
7435       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7436       adcs(t0, t1, Rhi_mn);
7437       adc(t1, t2, zr);
7438       mov(t2, zr);
7439     }
7440 
7441     void acc(Register Rhi, Register Rlo,
7442              Register t0, Register t1, Register t2) {
7443       adds(t0, t0, Rlo);
7444       adcs(t1, t1, Rhi);
7445       adc(t2, t2, zr);
7446     }
7447 
7448   public:
7449     /**
7450      * Fast Montgomery multiplication.  The derivation of the
7451      * algorithm is in A Cryptographic Library for the Motorola
7452      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7453      *
7454      * Arguments:
7455      *
7456      * Inputs for multiplication:
7457      *   c_rarg0   - int array elements a
7458      *   c_rarg1   - int array elements b
7459      *   c_rarg2   - int array elements n (the modulus)
7460      *   c_rarg3   - int length
7461      *   c_rarg4   - int inv
7462      *   c_rarg5   - int array elements m (the result)
7463      *
7464      * Inputs for squaring:
7465      *   c_rarg0   - int array elements a
7466      *   c_rarg1   - int array elements n (the modulus)
7467      *   c_rarg2   - int length
7468      *   c_rarg3   - int inv
7469      *   c_rarg4   - int array elements m (the result)
7470      *
7471      */
7472     address generate_multiply() {
7473       Label argh, nothing;
7474       bind(argh);
7475       stop("MontgomeryMultiply total_allocation must be <= 8192");
7476 
7477       align(CodeEntryAlignment);
7478       address entry = pc();
7479 
7480       cbzw(Rlen, nothing);
7481 
7482       enter();
7483 
7484       // Make room.
7485       cmpw(Rlen, 512);
7486       br(Assembler::HI, argh);
7487       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7488       andr(sp, Ra, -2 * wordSize);
7489 
7490       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7491 
7492       {
7493         // Copy input args, reversing as we go.  We use Ra as a
7494         // temporary variable.
7495         reverse(Ra, Pa_base, Rlen, t0, t1);
7496         if (!_squaring)
7497           reverse(Ra, Pb_base, Rlen, t0, t1);
7498         reverse(Ra, Pn_base, Rlen, t0, t1);
7499       }
7500 
7501       // Push all callee-saved registers and also Pm_base, which we'll need
7502       // at the end.
7503       save_regs();
7504 
7505 #ifndef PRODUCT
7506       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7507       {
7508         ldr(Rn, Address(Pn_base, 0));
7509         mul(Rlo_mn, Rn, inv);
7510         subs(zr, Rlo_mn, -1);
7511         Label ok;
7512         br(EQ, ok); {
7513           stop("broken inverse in Montgomery multiply");
7514         } bind(ok);
7515       }
7516 #endif
7517 
7518       mov(Pm_base, Ra);
7519 
7520       mov(t0, zr);
7521       mov(t1, zr);
7522       mov(t2, zr);
7523 
7524       block_comment("for (int i = 0; i < len; i++) {");
7525       mov(Ri, zr); {
7526         Label loop, end;
7527         cmpw(Ri, Rlen);
7528         br(Assembler::GE, end);
7529 
7530         bind(loop);
7531         pre1(Ri);
7532 
7533         block_comment("  for (j = i; j; j--) {"); {
7534           movw(Rj, Ri);
7535           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7536         } block_comment("  } // j");
7537 
7538         post1();
7539         addw(Ri, Ri, 1);
7540         cmpw(Ri, Rlen);
7541         br(Assembler::LT, loop);
7542         bind(end);
7543         block_comment("} // i");
7544       }
7545 
7546       block_comment("for (int i = len; i < 2*len; i++) {");
7547       mov(Ri, Rlen); {
7548         Label loop, end;
7549         cmpw(Ri, Rlen, Assembler::LSL, 1);
7550         br(Assembler::GE, end);
7551 
7552         bind(loop);
7553         pre2(Ri, Rlen);
7554 
7555         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7556           lslw(Rj, Rlen, 1);
7557           subw(Rj, Rj, Ri);
7558           subw(Rj, Rj, 1);
7559           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7560         } block_comment("  } // j");
7561 
7562         post2(Ri, Rlen);
7563         addw(Ri, Ri, 1);
7564         cmpw(Ri, Rlen, Assembler::LSL, 1);
7565         br(Assembler::LT, loop);
7566         bind(end);
7567       }
7568       block_comment("} // i");
7569 
7570       normalize(Rlen);
7571 
7572       mov(Ra, Pm_base);  // Save Pm_base in Ra
7573       restore_regs();  // Restore caller's Pm_base
7574 
7575       // Copy our result into caller's Pm_base
7576       reverse(Pm_base, Ra, Rlen, t0, t1);
7577 
7578       leave();
7579       bind(nothing);
7580       ret(lr);
7581 
7582       return entry;
7583     }
7584     // In C, approximately:
7585 
7586     // void
7587     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7588     //                     julong Pn_base[], julong Pm_base[],
7589     //                     julong inv, int len) {
7590     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7591     //   julong *Pa, *Pb, *Pn, *Pm;
7592     //   julong Ra, Rb, Rn, Rm;
7593 
7594     //   int i;
7595 
7596     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7597 
7598     //   for (i = 0; i < len; i++) {
7599     //     int j;
7600 
7601     //     Pa = Pa_base;
7602     //     Pb = Pb_base + i;
7603     //     Pm = Pm_base;
7604     //     Pn = Pn_base + i;
7605 
7606     //     Ra = *Pa;
7607     //     Rb = *Pb;
7608     //     Rm = *Pm;
7609     //     Rn = *Pn;
7610 
7611     //     int iters = i;
7612     //     for (j = 0; iters--; j++) {
7613     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7614     //       MACC(Ra, Rb, t0, t1, t2);
7615     //       Ra = *++Pa;
7616     //       Rb = *--Pb;
7617     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7618     //       MACC(Rm, Rn, t0, t1, t2);
7619     //       Rm = *++Pm;
7620     //       Rn = *--Pn;
7621     //     }
7622 
7623     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7624     //     MACC(Ra, Rb, t0, t1, t2);
7625     //     *Pm = Rm = t0 * inv;
7626     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7627     //     MACC(Rm, Rn, t0, t1, t2);
7628 
7629     //     assert(t0 == 0, "broken Montgomery multiply");
7630 
7631     //     t0 = t1; t1 = t2; t2 = 0;
7632     //   }
7633 
7634     //   for (i = len; i < 2*len; i++) {
7635     //     int j;
7636 
7637     //     Pa = Pa_base + i-len;
7638     //     Pb = Pb_base + len;
7639     //     Pm = Pm_base + i-len;
7640     //     Pn = Pn_base + len;
7641 
7642     //     Ra = *++Pa;
7643     //     Rb = *--Pb;
7644     //     Rm = *++Pm;
7645     //     Rn = *--Pn;
7646 
7647     //     int iters = len*2-i-1;
7648     //     for (j = i-len+1; iters--; j++) {
7649     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7650     //       MACC(Ra, Rb, t0, t1, t2);
7651     //       Ra = *++Pa;
7652     //       Rb = *--Pb;
7653     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7654     //       MACC(Rm, Rn, t0, t1, t2);
7655     //       Rm = *++Pm;
7656     //       Rn = *--Pn;
7657     //     }
7658 
7659     //     Pm_base[i-len] = t0;
7660     //     t0 = t1; t1 = t2; t2 = 0;
7661     //   }
7662 
7663     //   while (t0)
7664     //     t0 = sub(Pm_base, Pn_base, t0, len);
7665     // }
7666 
7667     /**
7668      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7669      * multiplies than Montgomery multiplication so it should be up to
7670      * 25% faster.  However, its loop control is more complex and it
7671      * may actually run slower on some machines.
7672      *
7673      * Arguments:
7674      *
7675      * Inputs:
7676      *   c_rarg0   - int array elements a
7677      *   c_rarg1   - int array elements n (the modulus)
7678      *   c_rarg2   - int length
7679      *   c_rarg3   - int inv
7680      *   c_rarg4   - int array elements m (the result)
7681      *
7682      */
7683     address generate_square() {
7684       Label argh;
7685       bind(argh);
7686       stop("MontgomeryMultiply total_allocation must be <= 8192");
7687 
7688       align(CodeEntryAlignment);
7689       address entry = pc();
7690 
7691       enter();
7692 
7693       // Make room.
7694       cmpw(Rlen, 512);
7695       br(Assembler::HI, argh);
7696       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7697       andr(sp, Ra, -2 * wordSize);
7698 
7699       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7700 
7701       {
7702         // Copy input args, reversing as we go.  We use Ra as a
7703         // temporary variable.
7704         reverse(Ra, Pa_base, Rlen, t0, t1);
7705         reverse(Ra, Pn_base, Rlen, t0, t1);
7706       }
7707 
7708       // Push all callee-saved registers and also Pm_base, which we'll need
7709       // at the end.
7710       save_regs();
7711 
7712       mov(Pm_base, Ra);
7713 
7714       mov(t0, zr);
7715       mov(t1, zr);
7716       mov(t2, zr);
7717 
7718       block_comment("for (int i = 0; i < len; i++) {");
7719       mov(Ri, zr); {
7720         Label loop, end;
7721         bind(loop);
7722         cmp(Ri, Rlen);
7723         br(Assembler::GE, end);
7724 
7725         pre1(Ri);
7726 
7727         block_comment("for (j = (i+1)/2; j; j--) {"); {
7728           add(Rj, Ri, 1);
7729           lsr(Rj, Rj, 1);
7730           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7731         } block_comment("  } // j");
7732 
7733         last_squaring(Ri);
7734 
7735         block_comment("  for (j = i/2; j; j--) {"); {
7736           lsr(Rj, Ri, 1);
7737           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7738         } block_comment("  } // j");
7739 
7740         post1_squaring();
7741         add(Ri, Ri, 1);
7742         cmp(Ri, Rlen);
7743         br(Assembler::LT, loop);
7744 
7745         bind(end);
7746         block_comment("} // i");
7747       }
7748 
7749       block_comment("for (int i = len; i < 2*len; i++) {");
7750       mov(Ri, Rlen); {
7751         Label loop, end;
7752         bind(loop);
7753         cmp(Ri, Rlen, Assembler::LSL, 1);
7754         br(Assembler::GE, end);
7755 
7756         pre2(Ri, Rlen);
7757 
7758         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7759           lsl(Rj, Rlen, 1);
7760           sub(Rj, Rj, Ri);
7761           sub(Rj, Rj, 1);
7762           lsr(Rj, Rj, 1);
7763           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7764         } block_comment("  } // j");
7765 
7766         last_squaring(Ri);
7767 
7768         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7769           lsl(Rj, Rlen, 1);
7770           sub(Rj, Rj, Ri);
7771           lsr(Rj, Rj, 1);
7772           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7773         } block_comment("  } // j");
7774 
7775         post2(Ri, Rlen);
7776         add(Ri, Ri, 1);
7777         cmp(Ri, Rlen, Assembler::LSL, 1);
7778 
7779         br(Assembler::LT, loop);
7780         bind(end);
7781         block_comment("} // i");
7782       }
7783 
7784       normalize(Rlen);
7785 
7786       mov(Ra, Pm_base);  // Save Pm_base in Ra
7787       restore_regs();  // Restore caller's Pm_base
7788 
7789       // Copy our result into caller's Pm_base
7790       reverse(Pm_base, Ra, Rlen, t0, t1);
7791 
7792       leave();
7793       ret(lr);
7794 
7795       return entry;
7796     }
7797     // In C, approximately:
7798 
7799     // void
7800     // montgomery_square(julong Pa_base[], julong Pn_base[],
7801     //                   julong Pm_base[], julong inv, int len) {
7802     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7803     //   julong *Pa, *Pb, *Pn, *Pm;
7804     //   julong Ra, Rb, Rn, Rm;
7805 
7806     //   int i;
7807 
7808     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7809 
7810     //   for (i = 0; i < len; i++) {
7811     //     int j;
7812 
7813     //     Pa = Pa_base;
7814     //     Pb = Pa_base + i;
7815     //     Pm = Pm_base;
7816     //     Pn = Pn_base + i;
7817 
7818     //     Ra = *Pa;
7819     //     Rb = *Pb;
7820     //     Rm = *Pm;
7821     //     Rn = *Pn;
7822 
7823     //     int iters = (i+1)/2;
7824     //     for (j = 0; iters--; j++) {
7825     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7826     //       MACC2(Ra, Rb, t0, t1, t2);
7827     //       Ra = *++Pa;
7828     //       Rb = *--Pb;
7829     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7830     //       MACC(Rm, Rn, t0, t1, t2);
7831     //       Rm = *++Pm;
7832     //       Rn = *--Pn;
7833     //     }
7834     //     if ((i & 1) == 0) {
7835     //       assert(Ra == Pa_base[j], "must be");
7836     //       MACC(Ra, Ra, t0, t1, t2);
7837     //     }
7838     //     iters = i/2;
7839     //     assert(iters == i-j, "must be");
7840     //     for (; iters--; j++) {
7841     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7842     //       MACC(Rm, Rn, t0, t1, t2);
7843     //       Rm = *++Pm;
7844     //       Rn = *--Pn;
7845     //     }
7846 
7847     //     *Pm = Rm = t0 * inv;
7848     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7849     //     MACC(Rm, Rn, t0, t1, t2);
7850 
7851     //     assert(t0 == 0, "broken Montgomery multiply");
7852 
7853     //     t0 = t1; t1 = t2; t2 = 0;
7854     //   }
7855 
7856     //   for (i = len; i < 2*len; i++) {
7857     //     int start = i-len+1;
7858     //     int end = start + (len - start)/2;
7859     //     int j;
7860 
7861     //     Pa = Pa_base + i-len;
7862     //     Pb = Pa_base + len;
7863     //     Pm = Pm_base + i-len;
7864     //     Pn = Pn_base + len;
7865 
7866     //     Ra = *++Pa;
7867     //     Rb = *--Pb;
7868     //     Rm = *++Pm;
7869     //     Rn = *--Pn;
7870 
7871     //     int iters = (2*len-i-1)/2;
7872     //     assert(iters == end-start, "must be");
7873     //     for (j = start; iters--; j++) {
7874     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7875     //       MACC2(Ra, Rb, t0, t1, t2);
7876     //       Ra = *++Pa;
7877     //       Rb = *--Pb;
7878     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7879     //       MACC(Rm, Rn, t0, t1, t2);
7880     //       Rm = *++Pm;
7881     //       Rn = *--Pn;
7882     //     }
7883     //     if ((i & 1) == 0) {
7884     //       assert(Ra == Pa_base[j], "must be");
7885     //       MACC(Ra, Ra, t0, t1, t2);
7886     //     }
7887     //     iters =  (2*len-i)/2;
7888     //     assert(iters == len-j, "must be");
7889     //     for (; iters--; j++) {
7890     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7891     //       MACC(Rm, Rn, t0, t1, t2);
7892     //       Rm = *++Pm;
7893     //       Rn = *--Pn;
7894     //     }
7895     //     Pm_base[i-len] = t0;
7896     //     t0 = t1; t1 = t2; t2 = 0;
7897     //   }
7898 
7899     //   while (t0)
7900     //     t0 = sub(Pm_base, Pn_base, t0, len);
7901     // }
7902   };
7903 
7904 
7905   // Initialization
7906   void generate_initial() {
7907     // Generate initial stubs and initialize the entry points
7908 
7909     // Entry points that exist on all platforms.  Note: This is code
7910     // that could be shared among different platforms; however, the
7911     // benefit seems to be smaller than the disadvantage of having a
7912     // much more complicated generator structure.  See also the comment
7913     // in stubRoutines.hpp.
7914 
7915     StubRoutines::_forward_exception_entry = generate_forward_exception();
7916 
7917     StubRoutines::_call_stub_entry =
7918       generate_call_stub(StubRoutines::_call_stub_return_address);
7919 
7920     // is referenced by megamorphic calls
7921     StubRoutines::_catch_exception_entry = generate_catch_exception();
7922 
7923     // Build this early so it's available for the interpreter.
7924     StubRoutines::_throw_StackOverflowError_entry =
7925       generate_throw_exception("StackOverflowError throw_exception",
7926                                CAST_FROM_FN_PTR(address,
7927                                                 SharedRuntime::throw_StackOverflowError));
7928     StubRoutines::_throw_delayed_StackOverflowError_entry =
7929       generate_throw_exception("delayed StackOverflowError throw_exception",
7930                                CAST_FROM_FN_PTR(address,
7931                                                 SharedRuntime::throw_delayed_StackOverflowError));
7932     if (UseCRC32Intrinsics) {
7933       // Set the table address before generating the stubs that use it
7934       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7935       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7936     }
7937 
7938     if (UseCRC32CIntrinsics) {
7939       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7940     }
7941 
7942     // Disabled until JDK-8210858 is fixed
7943     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7944     //   StubRoutines::_dlog = generate_dlog();
7945     // }
7946 
7947     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7948       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7949     }
7950 
7951     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7952       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7953     }
7954   }
7955 
7956   void generate_phase1() {
7957     // Continuation stubs:
7958     StubRoutines::_cont_thaw          = generate_cont_thaw();
7959     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7960     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7961 
7962     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
7963     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
7964   }
7965 
7966   void generate_all() {
7967     // support for verify_oop (must happen after universe_init)
7968     if (VerifyOops) {
7969       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7970     }
7971     StubRoutines::_throw_AbstractMethodError_entry =
7972       generate_throw_exception("AbstractMethodError throw_exception",
7973                                CAST_FROM_FN_PTR(address,
7974                                                 SharedRuntime::
7975                                                 throw_AbstractMethodError));
7976 
7977     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7978       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7979                                CAST_FROM_FN_PTR(address,
7980                                                 SharedRuntime::
7981                                                 throw_IncompatibleClassChangeError));
7982 
7983     StubRoutines::_throw_NullPointerException_at_call_entry =
7984       generate_throw_exception("NullPointerException at call throw_exception",
7985                                CAST_FROM_FN_PTR(address,
7986                                                 SharedRuntime::
7987                                                 throw_NullPointerException_at_call));
7988 
7989     if (UseSVE == 0) {
7990       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
7991     }
7992 
7993     // arraycopy stubs used by compilers
7994     generate_arraycopy_stubs();
7995 
7996     // countPositives stub for large arrays.
7997     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7998 
7999     // array equals stub for large arrays.
8000     if (!UseSimpleArrayEquals) {
8001       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8002     }
8003 
8004     generate_compare_long_strings();
8005 
8006     generate_string_indexof_stubs();
8007 
8008     // byte_array_inflate stub for large arrays.
8009     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8010 
8011     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8012     if (bs_nm != NULL) {
8013       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
8014     }
8015     if (UseFastLocking) {
8016       StubRoutines::aarch64::_check_lock_stack = generate_check_lock_stack();
8017     }
8018 #ifdef COMPILER2
8019     if (UseMultiplyToLenIntrinsic) {
8020       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8021     }
8022 
8023     if (UseSquareToLenIntrinsic) {
8024       StubRoutines::_squareToLen = generate_squareToLen();
8025     }
8026 
8027     if (UseMulAddIntrinsic) {
8028       StubRoutines::_mulAdd = generate_mulAdd();
8029     }
8030 
8031     if (UseSIMDForBigIntegerShiftIntrinsics) {
8032       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8033       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8034     }
8035 
8036     if (UseMontgomeryMultiplyIntrinsic) {
8037       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8038       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8039       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8040     }
8041 
8042     if (UseMontgomerySquareIntrinsic) {
8043       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8044       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8045       // We use generate_multiply() rather than generate_square()
8046       // because it's faster for the sizes of modulus we care about.
8047       StubRoutines::_montgomerySquare = g.generate_multiply();
8048     }
8049 #endif // COMPILER2
8050 
8051     if (UseChaCha20Intrinsics) {
8052       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8053     }
8054 
8055     if (UseBASE64Intrinsics) {
8056         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8057         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8058     }
8059 
8060     // data cache line writeback
8061     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8062     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8063 
8064     if (UseAESIntrinsics) {
8065       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8066       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8067       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8068       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8069       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8070     }
8071     if (UseGHASHIntrinsics) {
8072       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8073       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8074     }
8075     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8076       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8077     }
8078 
8079     if (UseMD5Intrinsics) {
8080       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8081       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8082     }
8083     if (UseSHA1Intrinsics) {
8084       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8085       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8086     }
8087     if (UseSHA256Intrinsics) {
8088       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8089       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8090     }
8091     if (UseSHA512Intrinsics) {
8092       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8093       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8094     }
8095     if (UseSHA3Intrinsics) {
8096       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8097       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8098     }
8099 
8100     // generate Adler32 intrinsics code
8101     if (UseAdler32Intrinsics) {
8102       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8103     }
8104 
8105     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8106 
8107 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8108 
8109     generate_atomic_entry_points();
8110 
8111 #endif // LINUX
8112 
8113     StubRoutines::aarch64::set_completed();
8114   }
8115 
8116  public:
8117   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8118     if (phase == 0) {
8119       generate_initial();
8120     } else if (phase == 1) {
8121       generate_phase1(); // stubs that must be available for the interpreter
8122     } else {
8123       generate_all();
8124     }
8125   }
8126 }; // end class declaration
8127 
8128 #define UCM_TABLE_MAX_ENTRIES 8
8129 void StubGenerator_generate(CodeBuffer* code, int phase) {
8130   if (UnsafeCopyMemory::_table == NULL) {
8131     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8132   }
8133   StubGenerator g(code, phase);
8134 }
8135 
8136 
8137 #if defined (LINUX)
8138 
8139 // Define pointers to atomic stubs and initialize them to point to the
8140 // code in atomic_aarch64.S.
8141 
8142 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8143   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8144     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8145   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8146     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
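     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (modulo whitespace) to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;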
8147 
8148 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8149 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8150 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8151 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8152 DEFAULT_ATOMIC_OP(xchg, 4, )
8153 DEFAULT_ATOMIC_OP(xchg, 8, )
8154 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8155 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8156 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8157 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8158 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8159 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8160 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8161 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8162 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8163 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8164 
8165 #undef DEFAULT_ATOMIC_OP
8166 
8167 #endif // LINUX