1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ]
 146   // -26 [ saved v15            ] <--- sp_after_call
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
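       // A worked example with wordSize == 8: the thread slot sits at [rfp - 8],
       // the call wrapper pair starts at [rfp - 64], and sp_after_call is
       // rfp - 28 * 8, i.e. the FPCR save slot at the low end of the save area.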
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
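         // In architectural terms this clears RMode (bits 23:22), FZ (bit 24) and
         // DN (bit 25), plus the IOE/DZE/OFE/UFE/IXE trap-enable bits (8..12),
         // leaving round-to-nearest with all floating-point traps disabled.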
 266     __ set_fpcr(rscratch1);
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
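         // Each iteration loads the next parameter word (c_rarg5 is post-incremented)
         // and pushes it, so the first parameter ends up highest in memory and the
         // last parameter lowest.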
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     __ ldr(j_rarg2, result);
 332     Label is_long, is_float, is_double, exit;
 333     __ ldr(j_rarg1, result_type);
 334     __ cmp(j_rarg1, (u1)T_OBJECT);
 335     __ br(Assembler::EQ, is_long);
 336     __ cmp(j_rarg1, (u1)T_LONG);
 337     __ br(Assembler::EQ, is_long);
 338     __ cmp(j_rarg1, (u1)T_FLOAT);
 339     __ br(Assembler::EQ, is_float);
 340     __ cmp(j_rarg1, (u1)T_DOUBLE);
 341     __ br(Assembler::EQ, is_double);
 342 
 343     // handle T_INT case
 344     __ strw(r0, Address(j_rarg2));
 345 
 346     __ BIND(exit);
 347 
 348     // pop parameters
 349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 350 
 351 #ifdef ASSERT
 352     // verify that threads correspond
 353     {
 354       Label L, S;
 355       __ ldr(rscratch1, thread);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::NE, S);
 358       __ get_thread(rscratch1);
 359       __ cmp(rthread, rscratch1);
 360       __ br(Assembler::EQ, L);
 361       __ BIND(S);
 362       __ stop("StubRoutines::call_stub: threads must correspond");
 363       __ BIND(L);
 364     }
 365 #endif
 366 
 367     __ pop_cont_fastpath(rthread);
 368 
 369     // restore callee-save registers
 370     __ ldpd(v15, v14,  d15_save);
 371     __ ldpd(v13, v12,  d13_save);
 372     __ ldpd(v11, v10,  d11_save);
 373     __ ldpd(v9,  v8,   d9_save);
 374 
 375     __ ldp(r28, r27,   r28_save);
 376     __ ldp(r26, r25,   r26_save);
 377     __ ldp(r24, r23,   r24_save);
 378     __ ldp(r22, r21,   r22_save);
 379     __ ldp(r20, r19,   r20_save);
 380 
 381     // restore fpcr
 382     __ ldr(rscratch1,  fpcr_save);
 383     __ set_fpcr(rscratch1);
 384 
 385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 386     __ ldrw(c_rarg2, result_type);
 387     __ ldr(c_rarg3,  method);
 388     __ ldp(c_rarg4, c_rarg5,  entry_point);
 389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 390 
 391     // leave frame and return to caller
 392     __ leave();
 393     __ ret(lr);
 394 
 395     // handle return types different from T_INT
 396 
 397     __ BIND(is_long);
 398     __ str(r0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_float);
 402     __ strs(j_farg0, Address(j_rarg2, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     __ BIND(is_double);
 406     __ strd(j_farg0, Address(j_rarg2, 0));
 407     __ br(Assembler::AL, exit);
 408 
 409     return start;
 410   }
 411 
 412   // Return point for a Java call if there's an exception thrown in
 413   // Java code.  The exception is caught and transformed into a
 414   // pending exception stored in JavaThread that can be tested from
 415   // within the VM.
 416   //
 417   // Note: Usually the parameters are removed by the callee. In case
 418   // of an exception crossing an activation frame boundary, that is
 419   // not the case if the callee is compiled code => need to set up
 420   // the sp.
 421   //
 422   // r0: exception oop
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != nullptr,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
 477   // so it just needs to be generated code with no prolog
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // Reinitialize the ptrue predicate register, in case the external runtime
 519     // call clobbers ptrue reg, as we may return to SVE compiled code.
 520     __ reinitialize_ptrue();
 521 
 522     // we should not really care that lr is no longer the callee
 523     // address. we saved the value the handler needs in r19 so we can
 524     // just copy it to r3. however, the C2 handler will push its own
 525     // frame and then call into the VM, and the VM code asserts that
 526     // the PC for the frame above the handler belongs to a compiled
 527     // Java method. So, we restore lr here to satisfy that assert.
 528     __ mov(lr, r19);
 529     // setup r0 & r3 & clear pending exception
 530     __ mov(r3, r19);
 531     __ mov(r19, r0);
 532     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 533     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 534 
 535 #ifdef ASSERT
 536     // make sure exception is set
 537     {
 538       Label L;
 539       __ cbnz(r0, L);
 540       __ stop("StubRoutines::forward exception: no pending exception (2)");
 541       __ bind(L);
 542     }
 543 #endif
 544 
 545     // continue at exception handler
 546     // r0: exception
 547     // r3: throwing pc
 548     // r19: exception handler
 549     __ verify_oop(r0);
 550     __ br(r19);
 551 
 552     return start;
 553   }
 554 
 555   // Non-destructive plausibility checks for oops
 556   //
 557   // Arguments:
 558   //    r0: oop to verify
 559   //    rscratch1: error message
 560   //
 561   // Stack after saving c_rarg3:
 562   //    [tos + 0]: saved c_rarg3
 563   //    [tos + 1]: saved c_rarg2
 564   //    [tos + 2]: saved lr
 565   //    [tos + 3]: saved rscratch2
 566   //    [tos + 4]: saved r0
 567   //    [tos + 5]: saved rscratch1
 568   address generate_verify_oop() {
 569 
 570     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 571     address start = __ pc();
 572 
 573     Label exit, error;
 574 
 575     // save c_rarg2 and c_rarg3
 576     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 577 
 578     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 579     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 580     __ ldr(c_rarg3, Address(c_rarg2));
 581     __ add(c_rarg3, c_rarg3, 1);
 582     __ str(c_rarg3, Address(c_rarg2));
 583 
 584     // object is in r0
 585     // make sure object is 'reasonable'
 586     __ cbz(r0, exit); // if obj is null it is OK
 587 
 588     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 589     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 590 
 591     // return if everything seems ok
 592     __ bind(exit);
 593 
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595     __ ret(lr);
 596 
 597     // handle errors
 598     __ bind(error);
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600 
 601     __ push(RegSet::range(r0, r29), sp);
 602     // debug(char* msg, int64_t pc, int64_t regs[])
 603     __ mov(c_rarg0, rscratch1);      // pass address of error message
 604     __ mov(c_rarg1, lr);             // pass return address
 605     __ mov(c_rarg2, sp);             // pass address of regs on stack
 606 #ifndef PRODUCT
 607     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 608 #endif
 609     BLOCK_COMMENT("call MacroAssembler::debug");
 610     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 611     __ blr(rscratch1);
 612     __ hlt(0);
 613 
 614     return start;
 615   }
 616 
 617   // Generate indices for iota vector.
 618   address generate_iota_indices(const char *stub_name) {
 619     __ align(CodeEntryAlignment);
 620     StubCodeMark mark(this, "StubRoutines", stub_name);
 621     address start = __ pc();
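         // Each 16-byte row below holds the lane indices 0, 1, 2, ... for one element
         // size (B, H, S, D), stored little-endian, followed by the same indices as
         // floats and doubles for the two FP rows.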
 622     // B
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 625     // H
 626     __ emit_data64(0x0003000200010000, relocInfo::none);
 627     __ emit_data64(0x0007000600050004, relocInfo::none);
 628     // S
 629     __ emit_data64(0x0000000100000000, relocInfo::none);
 630     __ emit_data64(0x0000000300000002, relocInfo::none);
 631     // D
 632     __ emit_data64(0x0000000000000000, relocInfo::none);
 633     __ emit_data64(0x0000000000000001, relocInfo::none);
 634     // S - FP
 635     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 636     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 637     // D - FP
 638     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 639     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 640     return start;
 641   }
 642 
 643   // The inner part of zero_words().  This is the bulk operation,
 644   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 645   // caller is responsible for zeroing the last few words.
 646   //
 647   // Inputs:
 648   // r10: the HeapWord-aligned base address of an array to zero.
 649   // r11: the count in HeapWords, r11 > 0.
 650   //
 651   // Returns r10 and r11, adjusted for the caller to clear.
 652   // r10: the base address of the tail of words left to clear.
 653   // r11: the number of words in the tail.
 654   //      r11 < MacroAssembler::zero_words_block_size.
 655 
 656   address generate_zero_blocks() {
 657     Label done;
 658     Label base_aligned;
 659 
 660     Register base = r10, cnt = r11;
 661 
 662     __ align(CodeEntryAlignment);
 663     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 664     address start = __ pc();
 665 
 666     if (UseBlockZeroing) {
 667       int zva_length = VM_Version::zva_length();
 668 
 669       // Ensure the ZVA length is a multiple of 16. This is required by
 670       // the subsequent operations.
 671       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 672 
 673       __ tbz(base, 3, base_aligned);
 674       __ str(zr, Address(__ post(base, 8)));
 675       __ sub(cnt, cnt, 1);
 676       __ bind(base_aligned);
 677 
 678       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 679       // alignment.
 680       Label small;
 681       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
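           // low_limit is in bytes while cnt is in HeapWords, hence the >> 3 in the
           // comparison below.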
 682       __ subs(rscratch1, cnt, low_limit >> 3);
 683       __ br(Assembler::LT, small);
 684       __ zero_dcache_blocks(base, cnt);
 685       __ bind(small);
 686     }
 687 
 688     {
 689       // Number of stp instructions we'll unroll
 690       const int unroll =
 691         MacroAssembler::zero_words_block_size / 2;
 692       // Clear the remaining blocks.
 693       Label loop;
 694       __ subs(cnt, cnt, unroll * 2);
 695       __ br(Assembler::LT, done);
 696       __ bind(loop);
 697       for (int i = 0; i < unroll; i++)
 698         __ stp(zr, zr, __ post(base, 16));
 699       __ subs(cnt, cnt, unroll * 2);
 700       __ br(Assembler::GE, loop);
 701       __ bind(done);
 702       __ add(cnt, cnt, unroll * 2);
 703     }
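         // cnt now holds the number of words still to be zeroed, strictly less than
         // zero_words_block_size, which the caller clears as promised above.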
 704 
 705     __ ret(lr);
 706 
 707     return start;
 708   }
 709 
 710 
 711   typedef enum {
 712     copy_forwards = 1,
 713     copy_backwards = -1
 714   } copy_direction;
 715 
 716   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 717   // for arraycopy stubs.
 718   class ArrayCopyBarrierSetHelper : StackObj {
 719     BarrierSetAssembler* _bs_asm;
 720     MacroAssembler* _masm;
 721     DecoratorSet _decorators;
 722     BasicType _type;
 723     Register _gct1;
 724     Register _gct2;
 725     Register _gct3;
 726     FloatRegister _gcvt1;
 727     FloatRegister _gcvt2;
 728     FloatRegister _gcvt3;
 729 
 730   public:
 731     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 732                               DecoratorSet decorators,
 733                               BasicType type,
 734                               Register gct1,
 735                               Register gct2,
 736                               Register gct3,
 737                               FloatRegister gcvt1,
 738                               FloatRegister gcvt2,
 739                               FloatRegister gcvt3)
 740       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 741         _masm(masm),
 742         _decorators(decorators),
 743         _type(type),
 744         _gct1(gct1),
 745         _gct2(gct2),
 746         _gct3(gct3),
 747         _gcvt1(gcvt1),
 748         _gcvt2(gcvt2),
 749         _gcvt3(gcvt3) {
 750     }
 751 
 752     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 753       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 754                             dst1, dst2, src,
 755                             _gct1, _gct2, _gcvt1);
 756     }
 757 
 758     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 759       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 760                              dst, src1, src2,
 761                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 762     }
 763 
 764     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 765       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 766                             dst1, dst2, src,
 767                             _gct1);
 768     }
 769 
 770     void copy_store_at_16(Address dst, Register src1, Register src2) {
 771       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 772                              dst, src1, src2,
 773                              _gct1, _gct2, _gct3);
 774     }
 775 
 776     void copy_load_at_8(Register dst, Address src) {
 777       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 778                             dst, noreg, src,
 779                             _gct1);
 780     }
 781 
 782     void copy_store_at_8(Address dst, Register src) {
 783       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 784                              dst, src, noreg,
 785                              _gct1, _gct2, _gct3);
 786     }
 787   };
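       // For instance, bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)) simply
       // forwards to the barrier set assembler's copy_load_at, passing along the
       // decorators, type and GC temporary registers captured at construction time.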
 788 
 789   // Bulk copy of blocks of 8 words.
 790   //
 791   // count is a count of words.
 792   //
 793   // Precondition: count >= 8
 794   //
 795   // Postconditions:
 796   //
 797   // The least significant bit of count contains the remaining count
 798   // of words to copy.  The rest of count is trash.
 799   //
 800   // s and d are adjusted to point to the remaining words to copy
 801   //
 802   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 803                            copy_direction direction) {
 804     int unit = wordSize * direction;
 805     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 806 
 807     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 808       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 809     const Register stride = r14;
 810     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 811     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 812     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 813 
 814     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 815     assert_different_registers(s, d, count, rscratch1, rscratch2);
 816 
 817     Label again, drain;
 818     const char *stub_name;
 819     if (direction == copy_forwards)
 820       stub_name = "forward_copy_longs";
 821     else
 822       stub_name = "backward_copy_longs";
 823 
 824     __ align(CodeEntryAlignment);
 825 
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827 
 828     __ bind(start);
 829 
 830     Label unaligned_copy_long;
 831     if (AvoidUnalignedAccesses) {
 832       __ tbnz(d, 3, unaligned_copy_long);
 833     }
 834 
 835     if (direction == copy_forwards) {
 836       __ sub(s, s, bias);
 837       __ sub(d, d, bias);
 838     }
 839 
 840 #ifdef ASSERT
 841     // Make sure we are never given < 8 words
 842     {
 843       Label L;
 844       __ cmp(count, (u1)8);
 845       __ br(Assembler::GE, L);
 846       __ stop("genrate_copy_longs called with < 8 words");
 847       __ bind(L);
 848     }
 849 #endif
 850 
 851     // Fill 8 registers
 852     if (UseSIMDForMemoryOps) {
 853       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 854       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 855     } else {
 856       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 857       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 858       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 859       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 860     }
 861 
 862     __ subs(count, count, 16);
 863     __ br(Assembler::LO, drain);
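         // The main loop below is software pipelined: each iteration stores the eight
         // words loaded by the previous iteration while loading the next eight, and
         // the drain code after the loop flushes the final eight words.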
 864 
 865     int prefetch = PrefetchCopyIntervalInBytes;
 866     bool use_stride = false;
 867     if (direction == copy_backwards) {
 868        use_stride = prefetch > 256;
 869        prefetch = -prefetch;
 870        if (use_stride) __ mov(stride, prefetch);
 871     }
 872 
 873     __ bind(again);
 874 
 875     if (PrefetchCopyIntervalInBytes > 0)
 876       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 877 
 878     if (UseSIMDForMemoryOps) {
 879       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 880       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 881       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 882       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 883     } else {
 884       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 886       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 888       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 889       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 890       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 891       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 892     }
 893 
 894     __ subs(count, count, 8);
 895     __ br(Assembler::HS, again);
 896 
 897     // Drain
 898     __ bind(drain);
 899     if (UseSIMDForMemoryOps) {
 900       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 901       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 902     } else {
 903       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 904       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 905       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907     }
 908 
 909     {
 910       Label L1, L2;
 911       __ tbz(count, exact_log2(4), L1);
 912       if (UseSIMDForMemoryOps) {
 913         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 914         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 915       } else {
 916         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 917         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 918         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 919         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 920       }
 921       __ bind(L1);
 922 
 923       if (direction == copy_forwards) {
 924         __ add(s, s, bias);
 925         __ add(d, d, bias);
 926       }
 927 
 928       __ tbz(count, 1, L2);
 929       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 930       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 931       __ bind(L2);
 932     }
 933 
 934     __ ret(lr);
 935 
 936     if (AvoidUnalignedAccesses) {
 937       Label drain, again;
 938       // Register order for storing. Order is different for backward copy.
 939 
 940       __ bind(unaligned_copy_long);
 941 
 942       // source address is even aligned, target odd aligned
 943       //
 944       // when forward copying word pairs we read long pairs at offsets
 945       // {0, 2, 4, 6} (in long words). when backwards copying we read
 946       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 947       // address by -2 in the forwards case so we can compute the
 948       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 949       // or -1.
 950       //
 951       // when forward copying we need to store 1 word, 3 pairs and
 952       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 953       // zero offset. We adjust the destination by -1, which means we
 954       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 955       //
 956       // When backwards copying we need to store 1 word, 3 pairs and
 957       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 958       // offsets {1, 3, 5, 7, 8} * unit.
 959 
 960       if (direction == copy_forwards) {
 961         __ sub(s, s, 16);
 962         __ sub(d, d, 8);
 963       }
 964 
 965       // Fill 8 registers
 966       //
 967       // for forwards copy s was offset by -16 from the original input
 968       // value of s so the register contents are at these offsets
 969       // relative to the 64 bit block addressed by that original input
 970       // and so on for each successive 64 byte block when s is updated
 971       //
 972       // t0 at offset 0,  t1 at offset 8
 973       // t2 at offset 16, t3 at offset 24
 974       // t4 at offset 32, t5 at offset 40
 975       // t6 at offset 48, t7 at offset 56
 976 
 977       // for backwards copy s was not offset so the register contents
 978       // are at these offsets into the preceding 64 byte block
 979       // relative to that original input and so on for each successive
 980       // preceding 64 byte block when s is updated. this explains the
 981       // slightly counter-intuitive looking pattern of register usage
 982       // in the stp instructions for backwards copy.
 983       //
 984       // t0 at offset -16, t1 at offset -8
 985       // t2 at offset -32, t3 at offset -24
 986       // t4 at offset -48, t5 at offset -40
 987       // t6 at offset -64, t7 at offset -56
 988 
 989       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 990       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 991       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 992       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 993 
 994       __ subs(count, count, 16);
 995       __ br(Assembler::LO, drain);
 996 
 997       int prefetch = PrefetchCopyIntervalInBytes;
 998       bool use_stride = false;
 999       if (direction == copy_backwards) {
1000          use_stride = prefetch > 256;
1001          prefetch = -prefetch;
1002          if (use_stride) __ mov(stride, prefetch);
1003       }
1004 
1005       __ bind(again);
1006 
1007       if (PrefetchCopyIntervalInBytes > 0)
1008         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1009 
1010       if (direction == copy_forwards) {
1011        // allowing for the offset of -8 the store instructions place
1012        // registers into the target 64 bit block at the following
1013        // offsets
1014        //
1015        // t0 at offset 0
1016        // t1 at offset 8,  t2 at offset 16
1017        // t3 at offset 24, t4 at offset 32
1018        // t5 at offset 40, t6 at offset 48
1019        // t7 at offset 56
1020 
1021         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1022         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1023         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1024         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1025         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1026         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1027         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1028         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1029         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1030       } else {
1031        // d was not offset when we started so the registers are
1032        // written into the 64 bit block preceding d with the following
1033        // offsets
1034        //
1035        // t1 at offset -8
1036        // t3 at offset -24, t0 at offset -16
1037        // t5 at offset -40, t2 at offset -32
1038        // t7 at offset -56, t4 at offset -48
1039        //                   t6 at offset -64
1040        //
1041        // note that this matches the offsets previously noted for the
1042        // loads
1043 
1044         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1045         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1046         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1047         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1048         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1049         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1050         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1051         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1052         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1053       }
1054 
1055       __ subs(count, count, 8);
1056       __ br(Assembler::HS, again);
1057 
1058       // Drain
1059       //
1060       // this uses the same pattern of offsets and register arguments
1061       // as above
1062       __ bind(drain);
1063       if (direction == copy_forwards) {
1064         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1065         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1066         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1067         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1068         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1069       } else {
1070         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1071         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1072         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1073         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1074         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1075       }
1076       // now we need to copy any remaining part block which may
1077       // include a 4 word subblock and/or a 2 word subblock.
1078       // bits 2 and 1 in the count are the tell-tale for whether we
1079       // have each such subblock
1080       {
1081         Label L1, L2;
1082         __ tbz(count, exact_log2(4), L1);
1083        // this is the same as above but copying only 4 longs hence
1084        // with only one intervening stp between the str instructions
1085        // but note that the offsets and registers still follow the
1086        // same pattern
1087         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1088         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1089         if (direction == copy_forwards) {
1090           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1091           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1092           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1093         } else {
1094           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1095           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1096           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1097         }
1098         __ bind(L1);
1099 
1100         __ tbz(count, 1, L2);
1101        // this is the same as above but copying only 2 longs hence
1102        // there is no intervening stp between the str instructions
1103        // but note that the offset and register patterns are still
1104        // the same
1105         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1106         if (direction == copy_forwards) {
1107           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1108           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1112         }
1113         __ bind(L2);
1114 
1115        // for forwards copy we need to re-adjust the offsets we
1116        // applied so that s and d follow the last words written
1117 
1118        if (direction == copy_forwards) {
1119          __ add(s, s, 16);
1120          __ add(d, d, 8);
1121        }
1122 
1123       }
1124 
1125       __ ret(lr);
1126     }
1127   }
1128 
1129   // Small copy: less than 16 bytes.
1130   //
1131   // NB: Ignores all of the bits of count which represent more than 15
1132   // bytes, so a caller doesn't have to mask them.
1133 
1134   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1135     bool is_backwards = step < 0;
1136     size_t granularity = uabs(step);
1137     int direction = is_backwards ? -1 : 1;
1138 
1139     Label Lword, Lint, Lshort, Lbyte;
1140 
1141     assert(granularity
1142            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1143 
1144     const Register t0 = r3;
1145     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1146     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1147 
1148     // ??? I don't know if this bit-test-and-branch is the right thing
1149     // to do.  It does a lot of jumping, resulting in several
1150     // mispredicted branches.  It might make more sense to do this
1151     // with something like Duff's device with a single computed branch.
1152 
1153     __ tbz(count, 3 - exact_log2(granularity), Lword);
1154     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1155     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1156     __ bind(Lword);
1157 
1158     if (granularity <= sizeof (jint)) {
1159       __ tbz(count, 2 - exact_log2(granularity), Lint);
1160       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1161       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1162       __ bind(Lint);
1163     }
1164 
1165     if (granularity <= sizeof (jshort)) {
1166       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1167       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1168       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1169       __ bind(Lshort);
1170     }
1171 
1172     if (granularity <= sizeof (jbyte)) {
1173       __ tbz(count, 0, Lbyte);
1174       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1175       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1176       __ bind(Lbyte);
1177     }
1178   }
1179 
1180   Label copy_f, copy_b;
1181   Label copy_obj_f, copy_obj_b;
1182   Label copy_obj_uninit_f, copy_obj_uninit_b;
1183 
1184   // All-singing all-dancing memory copy.
1185   //
1186   // Copy count units of memory from s to d.  The size of a unit is
1187   // step, which can be positive or negative depending on the direction
1188   // of copy.  If is_aligned is false, we align the source address.
1189   //
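       // The fast path below handles copies of up to 80 bytes (96 with SIMD) entirely
       // inline; larger copies align the source and call one of the bulk-copy stubs
       // generated by generate_copy_longs (copy_f/copy_b and their object variants),
       // with copy_memory_small finishing any remaining tail.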
1190 
1191   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1192                    Register s, Register d, Register count, int step) {
1193     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1194     bool is_backwards = step < 0;
1195     unsigned int granularity = uabs(step);
1196     const Register t0 = r3, t1 = r4;
1197 
1198     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1199     // load all the data before writing anything
1200     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1201     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1202     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1203     const Register send = r17, dend = r16;
1204     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1205     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1206     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1207 
1208     if (PrefetchCopyIntervalInBytes > 0)
1209       __ prfm(Address(s, 0), PLDL1KEEP);
1210     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1211     __ br(Assembler::HI, copy_big);
1212 
1213     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1214     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
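         // send/dend point one past the end of the source and destination regions; the
         // small cases below copy a head portion from s/d and a tail portion backwards
         // from send/dend, letting the two overlap so the whole range is covered.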
1215 
1216     __ cmp(count, u1(16/granularity));
1217     __ br(Assembler::LS, copy16);
1218 
1219     __ cmp(count, u1(64/granularity));
1220     __ br(Assembler::HI, copy80);
1221 
1222     __ cmp(count, u1(32/granularity));
1223     __ br(Assembler::LS, copy32);
1224 
1225     // 33..64 bytes
1226     if (UseSIMDForMemoryOps) {
1227       bs.copy_load_at_32(v0, v1, Address(s, 0));
1228       bs.copy_load_at_32(v2, v3, Address(send, -32));
1229       bs.copy_store_at_32(Address(d, 0), v0, v1);
1230       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1231     } else {
1232       bs.copy_load_at_16(t0, t1, Address(s, 0));
1233       bs.copy_load_at_16(t2, t3, Address(s, 16));
1234       bs.copy_load_at_16(t4, t5, Address(send, -32));
1235       bs.copy_load_at_16(t6, t7, Address(send, -16));
1236 
1237       bs.copy_store_at_16(Address(d, 0), t0, t1);
1238       bs.copy_store_at_16(Address(d, 16), t2, t3);
1239       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1240       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1241     }
1242     __ b(finish);
1243 
1244     // 17..32 bytes
1245     __ bind(copy32);
1246     bs.copy_load_at_16(t0, t1, Address(s, 0));
1247     bs.copy_load_at_16(t6, t7, Address(send, -16));
1248 
1249     bs.copy_store_at_16(Address(d, 0), t0, t1);
1250     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1251     __ b(finish);
1252 
1253     // 65..80/96 bytes
1254     // (96 bytes if SIMD because we do 32 bytes per instruction)
1255     __ bind(copy80);
1256     if (UseSIMDForMemoryOps) {
1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
1258       bs.copy_load_at_32(v2, v3, Address(s, 32));
1259       // Unaligned pointers can be an issue for copying.
1260       // The issue is more likely to happen when the granularity of the data is
1261       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1262       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1263       // The biggest performance drop has been seen for the range 65-80 bytes.
1264       // For such cases using the pair of ldp/stp instead of the third pair of
1265       // ldpq/stpq fixes the performance issue.
1266       if (granularity < sizeof (jint)) {
1267         Label copy96;
1268         __ cmp(count, u1(80/granularity));
1269         __ br(Assembler::HI, copy96);
1270         bs.copy_load_at_16(t0, t1, Address(send, -16));
1271 
1272         bs.copy_store_at_32(Address(d, 0), v0, v1);
1273         bs.copy_store_at_32(Address(d, 32), v2, v3);
1274 
1275         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1276         __ b(finish);
1277 
1278         __ bind(copy96);
1279       }
1280       bs.copy_load_at_32(v4, v5, Address(send, -32));
1281 
1282       bs.copy_store_at_32(Address(d, 0), v0, v1);
1283       bs.copy_store_at_32(Address(d, 32), v2, v3);
1284 
1285       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1286     } else {
1287       bs.copy_load_at_16(t0, t1, Address(s, 0));
1288       bs.copy_load_at_16(t2, t3, Address(s, 16));
1289       bs.copy_load_at_16(t4, t5, Address(s, 32));
1290       bs.copy_load_at_16(t6, t7, Address(s, 48));
1291       bs.copy_load_at_16(t8, t9, Address(send, -16));
1292 
1293       bs.copy_store_at_16(Address(d, 0), t0, t1);
1294       bs.copy_store_at_16(Address(d, 16), t2, t3);
1295       bs.copy_store_at_16(Address(d, 32), t4, t5);
1296       bs.copy_store_at_16(Address(d, 48), t6, t7);
1297       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1298     }
1299     __ b(finish);
1300 
1301     // 0..16 bytes
1302     __ bind(copy16);
1303     __ cmp(count, u1(8/granularity));
1304     __ br(Assembler::LO, copy8);
1305 
1306     // 8..16 bytes
1307     bs.copy_load_at_8(t0, Address(s, 0));
1308     bs.copy_load_at_8(t1, Address(send, -8));
1309     bs.copy_store_at_8(Address(d, 0), t0);
1310     bs.copy_store_at_8(Address(dend, -8), t1);
1311     __ b(finish);
1312 
1313     if (granularity < 8) {
1314       // 4..7 bytes
1315       __ bind(copy8);
1316       __ tbz(count, 2 - exact_log2(granularity), copy4);
1317       __ ldrw(t0, Address(s, 0));
1318       __ ldrw(t1, Address(send, -4));
1319       __ strw(t0, Address(d, 0));
1320       __ strw(t1, Address(dend, -4));
1321       __ b(finish);
1322       if (granularity < 4) {
1323         // 0..3 bytes
1324         __ bind(copy4);
1325         __ cbz(count, finish); // get rid of 0 case
1326         if (granularity == 2) {
1327           __ ldrh(t0, Address(s, 0));
1328           __ strh(t0, Address(d, 0));
1329         } else { // granularity == 1
1330           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1331           // the first and last byte.
1332           // Handle the 3 byte case by loading and storing base + count/2
1333           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1334           // This does mean that in the 1 byte case we load/store the same
1335           // byte 3 times.
1336           __ lsr(count, count, 1);
1337           __ ldrb(t0, Address(s, 0));
1338           __ ldrb(t1, Address(send, -1));
1339           __ ldrb(t2, Address(s, count));
1340           __ strb(t0, Address(d, 0));
1341           __ strb(t1, Address(dend, -1));
1342           __ strb(t2, Address(d, count));
1343         }
1344         __ b(finish);
1345       }
1346     }
1347 
1348     __ bind(copy_big);
1349     if (is_backwards) {
1350       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1351       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1352     }
1353 
1354     // Now that we've got the small case out of the way, we can align the
1355     // source address on a 2-word boundary.
1356 
1357     // Here we will materialize a count in r15, which is used by copy_memory_small
1358     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1359     // Up until here, we have used t9, which aliases r15, but from here on, that register
1360     // cannot be used as a temp register, as it contains the count.
1361 
1362     Label aligned;
1363 
1364     if (is_aligned) {
1365       // We may have to adjust by 1 word to get s 2-word-aligned.
1366       __ tbz(s, exact_log2(wordSize), aligned);
1367       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1368       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1369       __ sub(count, count, wordSize/granularity);
1370     } else {
1371       if (is_backwards) {
1372         __ andr(r15, s, 2 * wordSize - 1);
1373       } else {
1374         __ neg(r15, s);
1375         __ andr(r15, r15, 2 * wordSize - 1);
1376       }
1377       // r15 is the byte adjustment needed to align s.
1378       __ cbz(r15, aligned);
1379       int shift = exact_log2(granularity);
1380       if (shift)  __ lsr(r15, r15, shift);
1381       __ sub(count, count, r15);
1382 
1383 #if 0
1384       // ?? This code is only correct for a disjoint copy.  It may or
1385       // may not make sense to use it in that case.
1386 
1387       // Copy the first pair; s and d may not be aligned.
1388       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1389       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1390 
1391       // Align s and d, adjust count
1392       if (is_backwards) {
1393         __ sub(s, s, r15);
1394         __ sub(d, d, r15);
1395       } else {
1396         __ add(s, s, r15);
1397         __ add(d, d, r15);
1398       }
1399 #else
1400       copy_memory_small(decorators, type, s, d, r15, step);
1401 #endif
1402     }
1403 
1404     __ bind(aligned);
1405 
1406     // s is now 2-word-aligned.
1407 
1408     // We have a count of units and some trailing bytes.  Adjust the
1409     // count and do a bulk copy of words.
1410     __ lsr(r15, count, exact_log2(wordSize/granularity));
1411     if (direction == copy_forwards) {
1412       if (type != T_OBJECT) {
1413         __ bl(copy_f);
1414       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1415         __ bl(copy_obj_uninit_f);
1416       } else {
1417         __ bl(copy_obj_f);
1418       }
1419     } else {
1420       if (type != T_OBJECT) {
1421         __ bl(copy_b);
1422       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1423         __ bl(copy_obj_uninit_b);
1424       } else {
1425         __ bl(copy_obj_b);
1426       }
1427     }
1428 
1429     // And the tail.
1430     copy_memory_small(decorators, type, s, d, count, step);
1431 
1432     if (granularity >= 8) __ bind(copy8);
1433     if (granularity >= 4) __ bind(copy4);
1434     __ bind(finish);
1435   }
1436 
1437 
1438   void clobber_registers() {
1439 #ifdef ASSERT
1440     RegSet clobbered
1441       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1442     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1443     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1444     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1445       __ mov(*it, rscratch1);
1446     }
#endif
  }
1450 
1451   // Scan over array at a for count oops, verifying each one.
1452   // Preserves a and count, clobbers rscratch1 and rscratch2.
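  // 'size' is the element size in bytes: wordSize (8) for full oops,
  // 4 for narrow (compressed) oops.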
1453   void verify_oop_array (int size, Register a, Register count, Register temp) {
1454     Label loop, end;
1455     __ mov(rscratch1, a);
1456     __ mov(rscratch2, zr);
1457     __ bind(loop);
1458     __ cmp(rscratch2, count);
1459     __ br(Assembler::HS, end);
1460     if (size == wordSize) {
1461       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1462       __ verify_oop(temp);
1463     } else {
1464       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1465       __ decode_heap_oop(temp); // calls verify_oop
1466     }
1467     __ add(rscratch2, rscratch2, 1);
1468     __ b(loop);
1469     __ bind(end);
1470   }
1471 
1472   // Arguments:
1473   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1474   //             ignored
1475   //   is_oop  - true => oop array, so generate store check code
1476   //   name    - stub name string
1477   //
1478   // Inputs:
1479   //   c_rarg0   - source array address
1480   //   c_rarg1   - destination array address
1481   //   c_rarg2   - element count, treated as ssize_t, can be zero
1482   //
1483   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1484   // the hardware handle it.  The two dwords within qwords that span
1485   // cache line boundaries will still be loaded and stored atomically.
1486   //
1487   // Side Effects:
1488   //   disjoint_int_copy_entry is set to the no-overlap entry point
1489   //   used by generate_conjoint_int_oop_copy().
1490   //
1491   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1492                                   const char *name, bool dest_uninitialized = false) {
1493     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1494     RegSet saved_reg = RegSet::of(s, d, count);
1495     __ align(CodeEntryAlignment);
1496     StubCodeMark mark(this, "StubRoutines", name);
1497     address start = __ pc();
1498     __ enter();
1499 
1500     if (entry != nullptr) {
1501       *entry = __ pc();
1502       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1503       BLOCK_COMMENT("Entry:");
1504     }
1505 
1506     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1507     if (dest_uninitialized) {
1508       decorators |= IS_DEST_UNINITIALIZED;
1509     }
1510     if (aligned) {
1511       decorators |= ARRAYCOPY_ALIGNED;
1512     }
1513 
1514     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1515     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1516 
1517     if (is_oop) {
1518       // save regs before copy_memory
1519       __ push(RegSet::of(d, count), sp);
1520     }
1521     {
1522       // UnsafeCopyMemory page error: continue after ucm
1523       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1524       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1525       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1526     }
1527 
1528     if (is_oop) {
1529       __ pop(RegSet::of(d, count), sp);
1530       if (VerifyOops)
1531         verify_oop_array(size, d, count, r16);
1532     }
1533 
1534     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1535 
1536     __ leave();
1537     __ mov(r0, zr); // return 0
1538     __ ret(lr);
1539     return start;
1540   }
1541 
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   is_oop  - true => oop array, so generate store check code
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554   // the hardware handle it.  The two dwords within qwords that span
1555   // cache line boundaries will still be loaded and stored atomically.
1556   //
1557   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1558                                  address *entry, const char *name,
1559                                  bool dest_uninitialized = false) {
1560     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1561     RegSet saved_regs = RegSet::of(s, d, count);
1562     StubCodeMark mark(this, "StubRoutines", name);
1563     address start = __ pc();
1564     __ enter();
1565 
1566     if (entry != nullptr) {
1567       *entry = __ pc();
1568       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1569       BLOCK_COMMENT("Entry:");
1570     }
1571 
1572     // use fwd copy when (d-s) above_equal (count*size)
1573     __ sub(rscratch1, d, s);
1574     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1575     __ br(Assembler::HS, nooverlap_target);
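    // (The comparison is unsigned, so if d is below s the subtraction wraps
    // and the branch is also taken: a forward copy is always safe when the
    // destination starts below the source.)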
1576 
1577     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1578     if (dest_uninitialized) {
1579       decorators |= IS_DEST_UNINITIALIZED;
1580     }
1581     if (aligned) {
1582       decorators |= ARRAYCOPY_ALIGNED;
1583     }
1584 
1585     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1586     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1587 
1588     if (is_oop) {
1589       // save regs before copy_memory
1590       __ push(RegSet::of(d, count), sp);
1591     }
1592     {
1593       // UnsafeCopyMemory page error: continue after ucm
1594       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1595       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1596       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1597     }
1598     if (is_oop) {
1599       __ pop(RegSet::of(d, count), sp);
1600       if (VerifyOops)
1601         verify_oop_array(size, d, count, r16);
1602     }
1603     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1604     __ leave();
1605     __ mov(r0, zr); // return 0
1606     __ ret(lr);
1607     return start;
1608 }
1609 
1610   // Arguments:
1611   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1612   //             ignored
1613   //   name    - stub name string
1614   //
1615   // Inputs:
1616   //   c_rarg0   - source array address
1617   //   c_rarg1   - destination array address
1618   //   c_rarg2   - element count, treated as ssize_t, can be zero
1619   //
1620   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1621   // we let the hardware handle it.  The one to eight bytes within words,
1622   // dwords or qwords that span cache line boundaries will still be loaded
1623   // and stored atomically.
1624   //
1632   // Side Effects:
1633   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1634   //   used by generate_conjoint_byte_copy().
1635   //
1636   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1637     const bool not_oop = false;
1638     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1639   }
1640 
1641   // Arguments:
1642   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1643   //             ignored
1644   //   name    - stub name string
1645   //
1646   // Inputs:
1647   //   c_rarg0   - source array address
1648   //   c_rarg1   - destination array address
1649   //   c_rarg2   - element count, treated as ssize_t, can be zero
1650   //
1651   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1652   // we let the hardware handle it.  The one to eight bytes within words,
1653   // dwords or qwords that span cache line boundaries will still be loaded
1654   // and stored atomically.
1655   //
1656   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1657                                       address* entry, const char *name) {
1658     const bool not_oop = false;
1659     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1660   }
1661 
1662   // Arguments:
1663   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1664   //             ignored
1665   //   name    - stub name string
1666   //
1667   // Inputs:
1668   //   c_rarg0   - source array address
1669   //   c_rarg1   - destination array address
1670   //   c_rarg2   - element count, treated as ssize_t, can be zero
1671   //
1672   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1673   // let the hardware handle it.  The two or four words within dwords
1674   // or qwords that span cache line boundaries will still be loaded
1675   // and stored atomically.
1676   //
1677   // Side Effects:
1678   //   disjoint_short_copy_entry is set to the no-overlap entry point
1679   //   used by generate_conjoint_short_copy().
1680   //
1681   address generate_disjoint_short_copy(bool aligned,
1682                                        address* entry, const char *name) {
1683     const bool not_oop = false;
1684     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1685   }
1686 
1687   // Arguments:
1688   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1689   //             ignored
1690   //   name    - stub name string
1691   //
1692   // Inputs:
1693   //   c_rarg0   - source array address
1694   //   c_rarg1   - destination array address
1695   //   c_rarg2   - element count, treated as ssize_t, can be zero
1696   //
1697   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1698   // let the hardware handle it.  The two or four words within dwords
1699   // or qwords that span cache line boundaries will still be loaded
1700   // and stored atomically.
1701   //
1702   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1703                                        address *entry, const char *name) {
1704     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1709   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1710   //             ignored
1711   //   name    - stub name string
1712   //
1713   // Inputs:
1714   //   c_rarg0   - source array address
1715   //   c_rarg1   - destination array address
1716   //   c_rarg2   - element count, treated as ssize_t, can be zero
1717   //
1718   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1719   // the hardware handle it.  The two dwords within qwords that span
1720   // cache line boundaries will still be loaded and stored atomically.
1721   //
1722   // Side Effects:
1723   //   disjoint_int_copy_entry is set to the no-overlap entry point
1724   //   used by generate_conjoint_int_oop_copy().
1725   //
1726   address generate_disjoint_int_copy(bool aligned, address *entry,
1727                                          const char *name, bool dest_uninitialized = false) {
1728     const bool not_oop = false;
1729     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1730   }
1731 
1732   // Arguments:
1733   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1734   //             ignored
1735   //   name    - stub name string
1736   //
1737   // Inputs:
1738   //   c_rarg0   - source array address
1739   //   c_rarg1   - destination array address
1740   //   c_rarg2   - element count, treated as ssize_t, can be zero
1741   //
1742   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1743   // the hardware handle it.  The two dwords within qwords that span
1744   // cache line boundaries will still be loaded and stored atomically.
1745   //
1746   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1747                                      address *entry, const char *name,
1748                                      bool dest_uninitialized = false) {
1749     const bool not_oop = false;
1750     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1751   }
1752 
1753 
1754   // Arguments:
1755   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1756   //             ignored
1757   //   name    - stub name string
1758   //
1759   // Inputs:
1760   //   c_rarg0   - source array address
1761   //   c_rarg1   - destination array address
1762   //   c_rarg2   - element count, treated as size_t, can be zero
1763   //
1764   // Side Effects:
1765   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1766   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1767   //
1768   address generate_disjoint_long_copy(bool aligned, address *entry,
1769                                           const char *name, bool dest_uninitialized = false) {
1770     const bool not_oop = false;
1771     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1772   }
1773 
1774   // Arguments:
1775   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1776   //             ignored
1777   //   name    - stub name string
1778   //
1779   // Inputs:
1780   //   c_rarg0   - source array address
1781   //   c_rarg1   - destination array address
1782   //   c_rarg2   - element count, treated as size_t, can be zero
1783   //
1784   address generate_conjoint_long_copy(bool aligned,
1785                                       address nooverlap_target, address *entry,
1786                                       const char *name, bool dest_uninitialized = false) {
1787     const bool not_oop = false;
1788     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1789   }
1790 
1791   // Arguments:
1792   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1793   //             ignored
1794   //   name    - stub name string
1795   //
1796   // Inputs:
1797   //   c_rarg0   - source array address
1798   //   c_rarg1   - destination array address
1799   //   c_rarg2   - element count, treated as size_t, can be zero
1800   //
1801   // Side Effects:
1802   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1803   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1804   //
1805   address generate_disjoint_oop_copy(bool aligned, address *entry,
1806                                      const char *name, bool dest_uninitialized) {
1807     const bool is_oop = true;
1808     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1809     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1810   }
1811 
1812   // Arguments:
1813   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1814   //             ignored
1815   //   name    - stub name string
1816   //
1817   // Inputs:
1818   //   c_rarg0   - source array address
1819   //   c_rarg1   - destination array address
1820   //   c_rarg2   - element count, treated as size_t, can be zero
1821   //
1822   address generate_conjoint_oop_copy(bool aligned,
1823                                      address nooverlap_target, address *entry,
1824                                      const char *name, bool dest_uninitialized) {
1825     const bool is_oop = true;
1826     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1827     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1828                                   name, dest_uninitialized);
1829   }
1830 
1831 
1832   // Helper for generating a dynamic type check.
1833   // Smashes rscratch1, rscratch2.
1834   void generate_type_check(Register sub_klass,
1835                            Register super_check_offset,
1836                            Register super_klass,
1837                            Label& L_success) {
1838     assert_different_registers(sub_klass, super_check_offset, super_klass);
1839 
1840     BLOCK_COMMENT("type_check:");
1841 
1842     Label L_miss;
1843 
1844     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1845                                      super_check_offset);
1846     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1847 
1848     // Fall through on failure!
1849     __ BIND(L_miss);
1850   }
1851 
1852   //
1853   //  Generate checkcasting array copy stub
1854   //
1855   //  Input:
1856   //    c_rarg0   - source array address
1857   //    c_rarg1   - destination array address
1858   //    c_rarg2   - element count, treated as ssize_t, can be zero
1859   //    c_rarg3   - size_t ckoff (super_check_offset)
1860   //    c_rarg4   - oop ckval (super_klass)
1861   //
1862   //  Output:
1863   //    r0 ==  0  -  success
1864   //    r0 == -1^K - failure, where K is partial transfer count
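  //  (-1^K is the bitwise complement of K, so r0 == -1 means no elements
  //  were copied before the failure, r0 == -2 means one was, and so on.)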
1865   //
1866   address generate_checkcast_copy(const char *name, address *entry,
1867                                   bool dest_uninitialized = false) {
1868 
1869     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1870 
1871     // Input registers (after setup_arg_regs)
1872     const Register from        = c_rarg0;   // source array address
1873     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1875     const Register ckoff       = c_rarg3;   // super_check_offset
1876     const Register ckval       = c_rarg4;   // super_klass
1877 
1878     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1879     RegSet wb_post_saved_regs = RegSet::of(count);
1880 
1881     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1882     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1884     const Register start_to    = r20;       // destination array start address
1885     const Register r19_klass   = r19;       // oop._klass
1886 
1887     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1888     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1889 
1890     //---------------------------------------------------------------
1891     // Assembler stub will be used for this call to arraycopy
1892     // if the two arrays are subtypes of Object[] but the
1893     // destination array type is not equal to or a supertype
1894     // of the source type.  Each element must be separately
1895     // checked.
1896 
1897     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1898                                copied_oop, r19_klass, count_save);
1899 
1900     __ align(CodeEntryAlignment);
1901     StubCodeMark mark(this, "StubRoutines", name);
1902     address start = __ pc();
1903 
1904     __ enter(); // required for proper stackwalking of RuntimeStub frame
1905 
1906 #ifdef ASSERT
1907     // caller guarantees that the arrays really are different
1908     // otherwise, we would have to make conjoint checks
1909     { Label L;
1910       __ b(L);                  // conjoint check not yet implemented
1911       __ stop("checkcast_copy within a single array");
1912       __ bind(L);
1913     }
1914 #endif //ASSERT
1915 
1916     // Caller of this entry point must set up the argument registers.
1917     if (entry != nullptr) {
1918       *entry = __ pc();
1919       BLOCK_COMMENT("Entry:");
1920     }
1921 
1922      // Empty array:  Nothing to do.
1923     __ cbz(count, L_done);
1924     __ push(RegSet::of(r19, r20, r21, r22), sp);
1925 
1926 #ifdef ASSERT
1927     BLOCK_COMMENT("assert consistent ckoff/ckval");
1928     // The ckoff and ckval must be mutually consistent,
1929     // even though caller generates both.
1930     { Label L;
1931       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1932       __ ldrw(start_to, Address(ckval, sco_offset));
1933       __ cmpw(ckoff, start_to);
1934       __ br(Assembler::EQ, L);
1935       __ stop("super_check_offset inconsistent");
1936       __ bind(L);
1937     }
1938 #endif //ASSERT
1939 
1940     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1941     bool is_oop = true;
1942     int element_size = UseCompressedOops ? 4 : 8;
1943     if (dest_uninitialized) {
1944       decorators |= IS_DEST_UNINITIALIZED;
1945     }
1946 
1947     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1948     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1949 
1950     // save the original count
1951     __ mov(count_save, count);
1952 
1953     // Copy from low to high addresses
1954     __ mov(start_to, to);              // Save destination array start address
1955     __ b(L_load_element);
1956 
1957     // ======== begin loop ========
1958     // (Loop is rotated; its entry is L_load_element.)
1959     // Loop control:
1960     //   for (; count != 0; count--) {
1961     //     copied_oop = load_heap_oop(from++);
1962     //     ... generate_type_check ...;
1963     //     store_heap_oop(to++, copied_oop);
1964     //   }
1965     __ align(OptoLoopAlignment);
1966 
1967     __ BIND(L_store_element);
1968     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1969                       __ post(to, element_size), copied_oop, noreg,
1970                       gct1, gct2, gct3);
1971     __ sub(count, count, 1);
1972     __ cbz(count, L_do_card_marks);
1973 
1974     // ======== loop entry is here ========
1975     __ BIND(L_load_element);
1976     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1977                      copied_oop, noreg, __ post(from, element_size),
1978                      gct1);
1979     __ cbz(copied_oop, L_store_element);
1980 
    __ load_klass(r19_klass, copied_oop); // query the object klass
1982     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1983     // ======== end loop ========
1984 
1985     // It was a real error; we must depend on the caller to finish the job.
1986     // Register count = remaining oops, count_orig = total oops.
1987     // Emit GC store barriers for the oops we have copied and report
1988     // their number to the caller.
1989 
1990     __ subs(count, count_save, count);     // K = partially copied oop count
1991     __ eon(count, count, zr);                   // report (-1^K) to caller
1992     __ br(Assembler::EQ, L_done_pop);
1993 
1994     __ BIND(L_do_card_marks);
1995     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1996 
1997     __ bind(L_done_pop);
1998     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1999     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2000 
2001     __ bind(L_done);
2002     __ mov(r0, count);
2003     __ leave();
2004     __ ret(lr);
2005 
2006     return start;
2007   }
2008 
2009   // Perform range checks on the proposed arraycopy.
2010   // Kills temp, but nothing else.
2011   // Also, clean the sign bits of src_pos and dst_pos.
2012   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2013                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2015                               Register dst_pos, // destination position (c_rarg3)
2016                               Register length,
2017                               Register temp,
2018                               Label& L_failed) {
2019     BLOCK_COMMENT("arraycopy_range_checks:");
2020 
2021     assert_different_registers(rscratch1, temp);
2022 
2023     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2024     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2025     __ addw(temp, length, src_pos);
2026     __ cmpw(temp, rscratch1);
2027     __ br(Assembler::HI, L_failed);
2028 
2029     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2030     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2031     __ addw(temp, length, dst_pos);
2032     __ cmpw(temp, rscratch1);
2033     __ br(Assembler::HI, L_failed);
2034 
2035     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
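    // (Writing a W register zero-extends it into the full X register.)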
2036     __ movw(src_pos, src_pos);
2037     __ movw(dst_pos, dst_pos);
2038 
2039     BLOCK_COMMENT("arraycopy_range_checks done");
2040   }
2041 
2042   // These stubs get called from some dumb test routine.
2043   // I'll write them properly when they're called from
2044   // something that's actually doing something.
2045   static void fake_arraycopy_stub(address src, address dst, int count) {
2046     assert(count == 0, "huh?");
2047   }
2048 
2049 
2050   //
2051   //  Generate 'unsafe' array copy stub
2052   //  Though just as safe as the other stubs, it takes an unscaled
2053   //  size_t argument instead of an element count.
2054   //
2055   //  Input:
2056   //    c_rarg0   - source array address
2057   //    c_rarg1   - destination array address
2058   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2059   //
2060   // Examines the alignment of the operands and dispatches
2061   // to a long, int, short, or byte copy loop.
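  //
  // In effect (a sketch), with bits = s | d | count:
  //   (bits & 7) == 0  -> long copy  (the byte count is scaled to elements)
  //   (bits & 3) == 0  -> int copy
  //   (bits & 1) == 0  -> short copy
  //   otherwise        -> byte copy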
2062   //
2063   address generate_unsafe_copy(const char *name,
2064                                address byte_copy_entry,
2065                                address short_copy_entry,
2066                                address int_copy_entry,
2067                                address long_copy_entry) {
2068     Label L_long_aligned, L_int_aligned, L_short_aligned;
2069     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2070 
2071     __ align(CodeEntryAlignment);
2072     StubCodeMark mark(this, "StubRoutines", name);
2073     address start = __ pc();
2074     __ enter(); // required for proper stackwalking of RuntimeStub frame
2075 
2076     // bump this on entry, not on exit:
2077     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2078 
2079     __ orr(rscratch1, s, d);
2080     __ orr(rscratch1, rscratch1, count);
2081 
2082     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2083     __ cbz(rscratch1, L_long_aligned);
2084     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2085     __ cbz(rscratch1, L_int_aligned);
2086     __ tbz(rscratch1, 0, L_short_aligned);
2087     __ b(RuntimeAddress(byte_copy_entry));
2088 
2089     __ BIND(L_short_aligned);
2090     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2091     __ b(RuntimeAddress(short_copy_entry));
2092     __ BIND(L_int_aligned);
2093     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2094     __ b(RuntimeAddress(int_copy_entry));
2095     __ BIND(L_long_aligned);
2096     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2097     __ b(RuntimeAddress(long_copy_entry));
2098 
2099     return start;
2100   }
2101 
2102   //
2103   //  Generate generic array copy stubs
2104   //
2105   //  Input:
2106   //    c_rarg0    -  src oop
2107   //    c_rarg1    -  src_pos (32-bits)
2108   //    c_rarg2    -  dst oop
2109   //    c_rarg3    -  dst_pos (32-bits)
2110   //    c_rarg4    -  element count (32-bits)
2111   //
2112   //  Output:
2113   //    r0 ==  0  -  success
2114   //    r0 == -1^K - failure, where K is partial transfer count
2115   //
2116   address generate_generic_copy(const char *name,
2117                                 address byte_copy_entry, address short_copy_entry,
2118                                 address int_copy_entry, address oop_copy_entry,
2119                                 address long_copy_entry, address checkcast_copy_entry) {
2120 
2121     Label L_failed, L_objArray;
2122     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2123 
2124     // Input registers
2125     const Register src        = c_rarg0;  // source array oop
2126     const Register src_pos    = c_rarg1;  // source position
2127     const Register dst        = c_rarg2;  // destination array oop
2128     const Register dst_pos    = c_rarg3;  // destination position
2129     const Register length     = c_rarg4;
2130 
2131 
2132     // Registers used as temps
2133     const Register dst_klass  = c_rarg5;
2134 
2135     __ align(CodeEntryAlignment);
2136 
2137     StubCodeMark mark(this, "StubRoutines", name);
2138 
2139     address start = __ pc();
2140 
2141     __ enter(); // required for proper stackwalking of RuntimeStub frame
2142 
2143     // bump this on entry, not on exit:
2144     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2145 
2146     //-----------------------------------------------------------------------
2147     // Assembler stub will be used for this call to arraycopy
2148     // if the following conditions are met:
2149     //
2150     // (1) src and dst must not be null.
2151     // (2) src_pos must not be negative.
2152     // (3) dst_pos must not be negative.
2153     // (4) length  must not be negative.
2154     // (5) src klass and dst klass should be the same and not null.
2155     // (6) src and dst should be arrays.
2156     // (7) src_pos + length must not exceed length of src.
2157     // (8) dst_pos + length must not exceed length of dst.
2158     //
2159 
2160     //  if (src == nullptr) return -1;
2161     __ cbz(src, L_failed);
2162 
2163     //  if (src_pos < 0) return -1;
2164     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2165 
2166     //  if (dst == nullptr) return -1;
2167     __ cbz(dst, L_failed);
2168 
2169     //  if (dst_pos < 0) return -1;
2170     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2171 
2172     // registers used as temp
2173     const Register scratch_length    = r16; // elements count to copy
2174     const Register scratch_src_klass = r17; // array klass
2175     const Register lh                = r15; // layout helper
2176 
2177     //  if (length < 0) return -1;
2178     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2179     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2180 
2181     __ load_klass(scratch_src_klass, src);
2182 #ifdef ASSERT
2183     //  assert(src->klass() != nullptr);
2184     {
2185       BLOCK_COMMENT("assert klasses not null {");
2186       Label L1, L2;
2187       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2188       __ bind(L1);
2189       __ stop("broken null klass");
2190       __ bind(L2);
2191       __ load_klass(rscratch1, dst);
2192       __ cbz(rscratch1, L1);     // this would be broken also
2193       BLOCK_COMMENT("} assert klasses not null done");
2194     }
2195 #endif
2196 
2197     // Load layout helper (32-bits)
2198     //
2199     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2200     // 32        30    24            16              8     2                 0
2201     //
2202     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
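    //   e.g. a jint typeArray has array_tag 0x3, element_type T_INT and
    //   log2_element_size 2; the log2 element size is what the primitive
    //   copy dispatch below keys on.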
2203     //
2204 
2205     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2206 
2207     // Handle objArrays completely differently...
2208     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2209     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2210     __ movw(rscratch1, objArray_lh);
2211     __ eorw(rscratch2, lh, rscratch1);
2212     __ cbzw(rscratch2, L_objArray);
2213 
2214     //  if (src->klass() != dst->klass()) return -1;
2215     __ load_klass(rscratch2, dst);
2216     __ eor(rscratch2, rscratch2, scratch_src_klass);
2217     __ cbnz(rscratch2, L_failed);
2218 
2219     //  if (!src->is_Array()) return -1;
2220     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2221 
2222     // At this point, it is known to be a typeArray (array_tag 0x3).
2223 #ifdef ASSERT
2224     {
2225       BLOCK_COMMENT("assert primitive array {");
2226       Label L;
2227       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2228       __ cmpw(lh, rscratch2);
2229       __ br(Assembler::GE, L);
2230       __ stop("must be a primitive array");
2231       __ bind(L);
2232       BLOCK_COMMENT("} assert primitive array done");
2233     }
2234 #endif
2235 
2236     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2237                            rscratch2, L_failed);
2238 
2239     // TypeArrayKlass
2240     //
2241     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2242     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2243     //
2244 
2245     const Register rscratch1_offset = rscratch1;    // array offset
2246     const Register r15_elsize = lh; // element size
2247 
2248     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2249            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2250     __ add(src, src, rscratch1_offset);           // src array offset
2251     __ add(dst, dst, rscratch1_offset);           // dst array offset
2252     BLOCK_COMMENT("choose copy loop based on element size");
2253 
2254     // next registers should be set before the jump to corresponding stub
2255     const Register from     = c_rarg0;  // source array address
2256     const Register to       = c_rarg1;  // destination array address
2257     const Register count    = c_rarg2;  // elements count
2258 
2259     // 'from', 'to', 'count' registers should be set in such order
2260     // since they are the same as 'src', 'src_pos', 'dst'.
2261 
2262     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2263 
2264     // The possible values of elsize are 0-3, i.e. exact_log2(element
2265     // size in bytes).  We do a simple bitwise binary search.
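    // (Bit 1 of elsize separates {byte, short} from {int, long}; bit 0 then
    // separates byte from short and int from long.)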
2266   __ BIND(L_copy_bytes);
2267     __ tbnz(r15_elsize, 1, L_copy_ints);
2268     __ tbnz(r15_elsize, 0, L_copy_shorts);
2269     __ lea(from, Address(src, src_pos));// src_addr
2270     __ lea(to,   Address(dst, dst_pos));// dst_addr
2271     __ movw(count, scratch_length); // length
2272     __ b(RuntimeAddress(byte_copy_entry));
2273 
2274   __ BIND(L_copy_shorts);
2275     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2276     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2277     __ movw(count, scratch_length); // length
2278     __ b(RuntimeAddress(short_copy_entry));
2279 
2280   __ BIND(L_copy_ints);
2281     __ tbnz(r15_elsize, 0, L_copy_longs);
2282     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2283     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2284     __ movw(count, scratch_length); // length
2285     __ b(RuntimeAddress(int_copy_entry));
2286 
2287   __ BIND(L_copy_longs);
2288 #ifdef ASSERT
2289     {
2290       BLOCK_COMMENT("assert long copy {");
2291       Label L;
2292       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2293       __ cmpw(r15_elsize, LogBytesPerLong);
2294       __ br(Assembler::EQ, L);
2295       __ stop("must be long copy, but elsize is wrong");
2296       __ bind(L);
2297       BLOCK_COMMENT("} assert long copy done");
2298     }
2299 #endif
2300     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2301     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2302     __ movw(count, scratch_length); // length
2303     __ b(RuntimeAddress(long_copy_entry));
2304 
2305     // ObjArrayKlass
2306   __ BIND(L_objArray);
2307     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2308 
2309     Label L_plain_copy, L_checkcast_copy;
2310     //  test array classes for subtyping
2311     __ load_klass(r15, dst);
2312     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2313     __ br(Assembler::NE, L_checkcast_copy);
2314 
2315     // Identically typed arrays can be copied without element-wise checks.
2316     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2317                            rscratch2, L_failed);
2318 
2319     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2320     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2321     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2322     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2323     __ movw(count, scratch_length); // length
2324   __ BIND(L_plain_copy);
2325     __ b(RuntimeAddress(oop_copy_entry));
2326 
2327   __ BIND(L_checkcast_copy);
2328     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2329     {
2330       // Before looking at dst.length, make sure dst is also an objArray.
2331       __ ldrw(rscratch1, Address(r15, lh_offset));
2332       __ movw(rscratch2, objArray_lh);
2333       __ eorw(rscratch1, rscratch1, rscratch2);
2334       __ cbnzw(rscratch1, L_failed);
2335 
2336       // It is safe to examine both src.length and dst.length.
2337       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2338                              r15, L_failed);
2339 
2340       __ load_klass(dst_klass, dst); // reload
2341 
2342       // Marshal the base address arguments now, freeing registers.
2343       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2344       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2346       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2347       __ movw(count, length);           // length (reloaded)
2348       Register sco_temp = c_rarg3;      // this register is free now
2349       assert_different_registers(from, to, count, sco_temp,
2350                                  dst_klass, scratch_src_klass);
2351       // assert_clean_int(count, sco_temp);
2352 
2353       // Generate the type check.
2354       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2355       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2356 
2357       // Smashes rscratch1, rscratch2
2358       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2359 
2360       // Fetch destination element klass from the ObjArrayKlass header.
2361       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2362       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2364 
2365       // the checkcast_copy loop needs two extra arguments:
2366       assert(c_rarg3 == sco_temp, "#3 already in place");
2367       // Set up arguments for checkcast_copy_entry.
2368       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2369       __ b(RuntimeAddress(checkcast_copy_entry));
2370     }
2371 
2372   __ BIND(L_failed);
2373     __ mov(r0, -1);
2374     __ leave();   // required for proper stackwalking of RuntimeStub frame
2375     __ ret(lr);
2376 
2377     return start;
2378   }
2379 
2380   //
2381   // Generate stub for array fill. If "aligned" is true, the
2382   // "to" address is assumed to be heapword aligned.
2383   //
2384   // Arguments for generated stub:
2385   //   to:    c_rarg0
2386   //   value: c_rarg1
2387   //   count: c_rarg2 treated as signed
2388   //
2389   address generate_fill(BasicType t, bool aligned, const char *name) {
2390     __ align(CodeEntryAlignment);
2391     StubCodeMark mark(this, "StubRoutines", name);
2392     address start = __ pc();
2393 
2394     BLOCK_COMMENT("Entry:");
2395 
    const Register to        = c_rarg0;  // destination array address
2397     const Register value     = c_rarg1;  // value
2398     const Register count     = c_rarg2;  // elements count
2399 
2400     const Register bz_base = r10;        // base for block_zero routine
2401     const Register cnt_words = r11;      // temp register
2402 
2403     __ enter();
2404 
2405     Label L_fill_elements, L_exit1;
2406 
2407     int shift = -1;
2408     switch (t) {
2409       case T_BYTE:
2410         shift = 0;
2411         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2412         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2413         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2414         __ br(Assembler::LO, L_fill_elements);
2415         break;
2416       case T_SHORT:
2417         shift = 1;
2418         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2419         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2420         __ br(Assembler::LO, L_fill_elements);
2421         break;
2422       case T_INT:
2423         shift = 2;
2424         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2425         __ br(Assembler::LO, L_fill_elements);
2426         break;
2427       default: ShouldNotReachHere();
2428     }
2429 
2430     // Align source address at 8 bytes address boundary.
2431     Label L_skip_align1, L_skip_align2, L_skip_align4;
2432     if (!aligned) {
2433       switch (t) {
2434         case T_BYTE:
2435           // One byte misalignment happens only for byte arrays.
2436           __ tbz(to, 0, L_skip_align1);
2437           __ strb(value, Address(__ post(to, 1)));
2438           __ subw(count, count, 1);
2439           __ bind(L_skip_align1);
2440           // Fallthrough
2441         case T_SHORT:
2442           // Two bytes misalignment happens only for byte and short (char) arrays.
2443           __ tbz(to, 1, L_skip_align2);
2444           __ strh(value, Address(__ post(to, 2)));
2445           __ subw(count, count, 2 >> shift);
2446           __ bind(L_skip_align2);
2447           // Fallthrough
2448         case T_INT:
2449           // Align to 8 bytes, we know we are 4 byte aligned to start.
2450           __ tbz(to, 2, L_skip_align4);
2451           __ strw(value, Address(__ post(to, 4)));
2452           __ subw(count, count, 4 >> shift);
2453           __ bind(L_skip_align4);
2454           break;
2455         default: ShouldNotReachHere();
2456       }
2457     }
2458 
2459     //
2460     //  Fill large chunks
2461     //
2462     __ lsrw(cnt_words, count, 3 - shift); // number of words
2463     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2464     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
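    // (e.g. for T_SHORT, shift == 1: each 8-byte word holds 4 elements, so
    // cnt_words == count / 4 and 'count' now holds the count % 4 trailing
    // elements.)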
2465     if (UseBlockZeroing) {
2466       Label non_block_zeroing, rest;
2467       // If the fill value is zero we can use the fast zero_words().
2468       __ cbnz(value, non_block_zeroing);
2469       __ mov(bz_base, to);
2470       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2471       address tpc = __ zero_words(bz_base, cnt_words);
2472       if (tpc == nullptr) {
2473         fatal("CodeCache is full at generate_fill");
2474       }
2475       __ b(rest);
2476       __ bind(non_block_zeroing);
2477       __ fill_words(to, cnt_words, value);
2478       __ bind(rest);
2479     } else {
2480       __ fill_words(to, cnt_words, value);
2481     }
2482 
2483     // Remaining count is less than 8 bytes. Fill it by a single store.
2484     // Note that the total length is no less than 8 bytes.
2485     if (t == T_BYTE || t == T_SHORT) {
2486       Label L_exit1;
2487       __ cbzw(count, L_exit1);
2488       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2489       __ str(value, Address(to, -8));    // overwrite some elements
2490       __ bind(L_exit1);
2491       __ leave();
2492       __ ret(lr);
2493     }
2494 
2495     // Handle copies less than 8 bytes.
2496     Label L_fill_2, L_fill_4, L_exit2;
2497     __ bind(L_fill_elements);
2498     switch (t) {
2499       case T_BYTE:
2500         __ tbz(count, 0, L_fill_2);
2501         __ strb(value, Address(__ post(to, 1)));
2502         __ bind(L_fill_2);
2503         __ tbz(count, 1, L_fill_4);
2504         __ strh(value, Address(__ post(to, 2)));
2505         __ bind(L_fill_4);
2506         __ tbz(count, 2, L_exit2);
2507         __ strw(value, Address(to));
2508         break;
2509       case T_SHORT:
2510         __ tbz(count, 0, L_fill_4);
2511         __ strh(value, Address(__ post(to, 2)));
2512         __ bind(L_fill_4);
2513         __ tbz(count, 1, L_exit2);
2514         __ strw(value, Address(to));
2515         break;
2516       case T_INT:
2517         __ cbzw(count, L_exit2);
2518         __ strw(value, Address(to));
2519         break;
2520       default: ShouldNotReachHere();
2521     }
2522     __ bind(L_exit2);
2523     __ leave();
2524     __ ret(lr);
2525     return start;
2526   }
2527 
2528   address generate_data_cache_writeback() {
2529     const Register line        = c_rarg0;  // address of line to write back
2530 
2531     __ align(CodeEntryAlignment);
2532 
2533     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2534 
2535     address start = __ pc();
2536     __ enter();
2537     __ cache_wb(Address(line, 0));
2538     __ leave();
2539     __ ret(lr);
2540 
2541     return start;
2542   }
2543 
2544   address generate_data_cache_writeback_sync() {
2545     const Register is_pre     = c_rarg0;  // pre or post sync
2546 
2547     __ align(CodeEntryAlignment);
2548 
2549     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2550 
2551     // pre wbsync is a no-op
    // post wbsync translates to a store fence
2553 
2554     Label skip;
2555     address start = __ pc();
2556     __ enter();
2557     __ cbnz(is_pre, skip);
2558     __ cache_wbsync(false);
2559     __ bind(skip);
2560     __ leave();
2561     __ ret(lr);
2562 
2563     return start;
2564   }
2565 
2566   void generate_arraycopy_stubs() {
2567     address entry;
2568     address entry_jbyte_arraycopy;
2569     address entry_jshort_arraycopy;
2570     address entry_jint_arraycopy;
2571     address entry_oop_arraycopy;
2572     address entry_jlong_arraycopy;
2573     address entry_checkcast_arraycopy;
2574 
2575     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2576     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2577 
2578     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2579     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2580 
2581     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2582     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2583 
2584     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2585 
2586     //*** jbyte
2587     // Always need aligned and unaligned versions
2588     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2589                                                                                   "jbyte_disjoint_arraycopy");
2590     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2591                                                                                   &entry_jbyte_arraycopy,
2592                                                                                   "jbyte_arraycopy");
2593     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2594                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2595     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2596                                                                                   "arrayof_jbyte_arraycopy");
2597 
2598     //*** jshort
2599     // Always need aligned and unaligned versions
2600     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2601                                                                                     "jshort_disjoint_arraycopy");
2602     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2603                                                                                     &entry_jshort_arraycopy,
2604                                                                                     "jshort_arraycopy");
2605     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2606                                                                                     "arrayof_jshort_disjoint_arraycopy");
2607     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2608                                                                                     "arrayof_jshort_arraycopy");
2609 
2610     //*** jint
2611     // Aligned versions
2612     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2613                                                                                 "arrayof_jint_disjoint_arraycopy");
2614     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2615                                                                                 "arrayof_jint_arraycopy");
2616     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2617     // entry_jint_arraycopy always points to the unaligned version
2618     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2619                                                                                 "jint_disjoint_arraycopy");
2620     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2621                                                                                 &entry_jint_arraycopy,
2622                                                                                 "jint_arraycopy");
2623 
2624     //*** jlong
2625     // It is always aligned
2626     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2627                                                                                   "arrayof_jlong_disjoint_arraycopy");
2628     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2629                                                                                   "arrayof_jlong_arraycopy");
2630     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2631     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2632 
2633     //*** oops
2634     {
2635       // With compressed oops we need unaligned versions; notice that
2636       // we overwrite entry_oop_arraycopy.
2637       bool aligned = !UseCompressedOops;
2638 
2639       StubRoutines::_arrayof_oop_disjoint_arraycopy
2640         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2641                                      /*dest_uninitialized*/false);
2642       StubRoutines::_arrayof_oop_arraycopy
2643         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2644                                      /*dest_uninitialized*/false);
2645       // Aligned versions without pre-barriers
2646       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2647         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2648                                      /*dest_uninitialized*/true);
2649       StubRoutines::_arrayof_oop_arraycopy_uninit
2650         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2651                                      /*dest_uninitialized*/true);
2652     }
2653 
2654     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2655     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2656     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2657     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2658 
2659     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2660     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2661                                                                         /*dest_uninitialized*/true);
2662 
2663     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2664                                                               entry_jbyte_arraycopy,
2665                                                               entry_jshort_arraycopy,
2666                                                               entry_jint_arraycopy,
2667                                                               entry_jlong_arraycopy);
2668 
2669     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2670                                                                entry_jbyte_arraycopy,
2671                                                                entry_jshort_arraycopy,
2672                                                                entry_jint_arraycopy,
2673                                                                entry_oop_arraycopy,
2674                                                                entry_jlong_arraycopy,
2675                                                                entry_checkcast_arraycopy);
2676 
2677     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2678     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2679     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2680     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2681     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2682     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2683   }
2684 
2685   void generate_math_stubs() { Unimplemented(); }
2686 
2687   // Arguments:
2688   //
2689   // Inputs:
2690   //   c_rarg0   - source byte array address
2691   //   c_rarg1   - destination byte array address
2692   //   c_rarg2   - K (key) in little endian int array
2693   //
2694   address generate_aescrypt_encryptBlock() {
2695     __ align(CodeEntryAlignment);
2696     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2697 
2698     const Register from        = c_rarg0;  // source array address
2699     const Register to          = c_rarg1;  // destination array address
2700     const Register key         = c_rarg2;  // key array address
2701     const Register keylen      = rscratch1;
2702 
2703     address start = __ pc();
2704     __ enter();
2705 
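         // The key array length in ints selects the AES variant: an expanded key of
         // 44 words is AES-128 (10 rounds), 52 is AES-192 (12 rounds) and 60 is
         // AES-256 (14 rounds).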
2706     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2707 
2708     __ aesenc_loadkeys(key, keylen);
2709     __ aesecb_encrypt(from, to, keylen);
2710 
2711     __ mov(r0, 0);
2712 
2713     __ leave();
2714     __ ret(lr);
2715 
2716     return start;
2717   }
2718 
2719   // Arguments:
2720   //
2721   // Inputs:
2722   //   c_rarg0   - source byte array address
2723   //   c_rarg1   - destination byte array address
2724   //   c_rarg2   - K (key) in little endian int array
2725   //
2726   address generate_aescrypt_decryptBlock() {
2727     assert(UseAES, "need AES cryptographic extension support");
2728     __ align(CodeEntryAlignment);
2729     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2730     Label L_doLast;
2731 
2732     const Register from        = c_rarg0;  // source array address
2733     const Register to          = c_rarg1;  // destination array address
2734     const Register key         = c_rarg2;  // key array address
2735     const Register keylen      = rscratch1;
2736 
2737     address start = __ pc();
2738     __ enter(); // required for proper stackwalking of RuntimeStub frame
2739 
2740     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2741 
2742     __ aesecb_decrypt(from, to, key, keylen);
2743 
2744     __ mov(r0, 0);
2745 
2746     __ leave();
2747     __ ret(lr);
2748 
2749     return start;
2750   }
2751 
2752   // Arguments:
2753   //
2754   // Inputs:
2755   //   c_rarg0   - source byte array address
2756   //   c_rarg1   - destination byte array address
2757   //   c_rarg2   - K (key) in little endian int array
2758   //   c_rarg3   - r vector byte array address
2759   //   c_rarg4   - input length
2760   //
2761   // Output:
2762   //   x0        - input length
2763   //
2764   address generate_cipherBlockChaining_encryptAESCrypt() {
2765     assert(UseAES, "need AES cryptographic extension support");
2766     __ align(CodeEntryAlignment);
2767     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2768 
2769     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2770 
2771     const Register from        = c_rarg0;  // source array address
2772     const Register to          = c_rarg1;  // destination array address
2773     const Register key         = c_rarg2;  // key array address
2774     const Register rvec        = c_rarg3;  // address of the r vector byte array: initialized from the
2775                                            // init vector, left holding the last ciphertext block on return
2776     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2777     const Register keylen      = rscratch1;
2778 
2779     address start = __ pc();
2780 
2781       __ enter();
2782 
2783       __ movw(rscratch2, len_reg);
2784 
2785       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2786 
2787       __ ld1(v0, __ T16B, rvec);
2788 
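           // Dispatch on the expanded key length (in ints): unsigned lower than 52
           // means 44 (AES-128), equal means 52 (AES-192); otherwise fall through
           // and load the extra round keys for 60 (AES-256).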
2789       __ cmpw(keylen, 52);
2790       __ br(Assembler::CC, L_loadkeys_44);
2791       __ br(Assembler::EQ, L_loadkeys_52);
2792 
2793       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2794       __ rev32(v17, __ T16B, v17);
2795       __ rev32(v18, __ T16B, v18);
2796     __ BIND(L_loadkeys_52);
2797       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2798       __ rev32(v19, __ T16B, v19);
2799       __ rev32(v20, __ T16B, v20);
2800     __ BIND(L_loadkeys_44);
2801       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2802       __ rev32(v21, __ T16B, v21);
2803       __ rev32(v22, __ T16B, v22);
2804       __ rev32(v23, __ T16B, v23);
2805       __ rev32(v24, __ T16B, v24);
2806       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2807       __ rev32(v25, __ T16B, v25);
2808       __ rev32(v26, __ T16B, v26);
2809       __ rev32(v27, __ T16B, v27);
2810       __ rev32(v28, __ T16B, v28);
2811       __ ld1(v29, v30, v31, __ T16B, key);
2812       __ rev32(v29, __ T16B, v29);
2813       __ rev32(v30, __ T16B, v30);
2814       __ rev32(v31, __ T16B, v31);
2815 
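         // CBC encryption loop: v0 carries the chaining value (the IV, then the
         // previous ciphertext block); each plaintext block is XORed into v0,
         // encrypted in place, and written out as the next ciphertext block.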
2816     __ BIND(L_aes_loop);
2817       __ ld1(v1, __ T16B, __ post(from, 16));
2818       __ eor(v0, __ T16B, v0, v1);
2819 
2820       __ br(Assembler::CC, L_rounds_44);
2821       __ br(Assembler::EQ, L_rounds_52);
2822 
2823       __ aese(v0, v17); __ aesmc(v0, v0);
2824       __ aese(v0, v18); __ aesmc(v0, v0);
2825     __ BIND(L_rounds_52);
2826       __ aese(v0, v19); __ aesmc(v0, v0);
2827       __ aese(v0, v20); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_44);
2829       __ aese(v0, v21); __ aesmc(v0, v0);
2830       __ aese(v0, v22); __ aesmc(v0, v0);
2831       __ aese(v0, v23); __ aesmc(v0, v0);
2832       __ aese(v0, v24); __ aesmc(v0, v0);
2833       __ aese(v0, v25); __ aesmc(v0, v0);
2834       __ aese(v0, v26); __ aesmc(v0, v0);
2835       __ aese(v0, v27); __ aesmc(v0, v0);
2836       __ aese(v0, v28); __ aesmc(v0, v0);
2837       __ aese(v0, v29); __ aesmc(v0, v0);
2838       __ aese(v0, v30);
2839       __ eor(v0, __ T16B, v0, v31);
2840 
2841       __ st1(v0, __ T16B, __ post(to, 16));
2842 
2843       __ subw(len_reg, len_reg, 16);
2844       __ cbnzw(len_reg, L_aes_loop);
2845 
2846       __ st1(v0, __ T16B, rvec);
2847 
2848       __ mov(r0, rscratch2);
2849 
2850       __ leave();
2851       __ ret(lr);
2852 
2853     return start;
2854   }
2855 
2856   // Arguments:
2857   //
2858   // Inputs:
2859   //   c_rarg0   - source byte array address
2860   //   c_rarg1   - destination byte array address
2861   //   c_rarg2   - K (key) in little endian int array
2862   //   c_rarg3   - r vector byte array address
2863   //   c_rarg4   - input length
2864   //
2865   // Output:
2866   //   r0        - input length
2867   //
2868   address generate_cipherBlockChaining_decryptAESCrypt() {
2869     assert(UseAES, "need AES cryptographic extension support");
2870     __ align(CodeEntryAlignment);
2871     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2872 
2873     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2874 
2875     const Register from        = c_rarg0;  // source array address
2876     const Register to          = c_rarg1;  // destination array address
2877     const Register key         = c_rarg2;  // key array address
2878     const Register rvec        = c_rarg3;  // address of the r vector byte array: initialized from the
2879                                            // init vector, left holding the last input ciphertext block on return
2880     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2881     const Register keylen      = rscratch1;
2882 
2883     address start = __ pc();
2884 
2885       __ enter();
2886 
2887       __ movw(rscratch2, len_reg);
2888 
2889       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2890 
2891       __ ld1(v2, __ T16B, rvec);
2892 
2893       __ ld1(v31, __ T16B, __ post(key, 16));
2894       __ rev32(v31, __ T16B, v31);
2895 
2896       __ cmpw(keylen, 52);
2897       __ br(Assembler::CC, L_loadkeys_44);
2898       __ br(Assembler::EQ, L_loadkeys_52);
2899 
2900       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2901       __ rev32(v17, __ T16B, v17);
2902       __ rev32(v18, __ T16B, v18);
2903     __ BIND(L_loadkeys_52);
2904       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2905       __ rev32(v19, __ T16B, v19);
2906       __ rev32(v20, __ T16B, v20);
2907     __ BIND(L_loadkeys_44);
2908       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2909       __ rev32(v21, __ T16B, v21);
2910       __ rev32(v22, __ T16B, v22);
2911       __ rev32(v23, __ T16B, v23);
2912       __ rev32(v24, __ T16B, v24);
2913       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2914       __ rev32(v25, __ T16B, v25);
2915       __ rev32(v26, __ T16B, v26);
2916       __ rev32(v27, __ T16B, v27);
2917       __ rev32(v28, __ T16B, v28);
2918       __ ld1(v29, v30, __ T16B, key);
2919       __ rev32(v29, __ T16B, v29);
2920       __ rev32(v30, __ T16B, v30);
2921 
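         // CBC decryption loop: v2 carries the chaining value (the IV, then the
         // previous ciphertext block), while v1 keeps a copy of the current
         // ciphertext so it can become the next chaining value once the block has
         // been decrypted and XORed with v2.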
2922     __ BIND(L_aes_loop);
2923       __ ld1(v0, __ T16B, __ post(from, 16));
2924       __ orr(v1, __ T16B, v0, v0);
2925 
2926       __ br(Assembler::CC, L_rounds_44);
2927       __ br(Assembler::EQ, L_rounds_52);
2928 
2929       __ aesd(v0, v17); __ aesimc(v0, v0);
2930       __ aesd(v0, v18); __ aesimc(v0, v0);
2931     __ BIND(L_rounds_52);
2932       __ aesd(v0, v19); __ aesimc(v0, v0);
2933       __ aesd(v0, v20); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_44);
2935       __ aesd(v0, v21); __ aesimc(v0, v0);
2936       __ aesd(v0, v22); __ aesimc(v0, v0);
2937       __ aesd(v0, v23); __ aesimc(v0, v0);
2938       __ aesd(v0, v24); __ aesimc(v0, v0);
2939       __ aesd(v0, v25); __ aesimc(v0, v0);
2940       __ aesd(v0, v26); __ aesimc(v0, v0);
2941       __ aesd(v0, v27); __ aesimc(v0, v0);
2942       __ aesd(v0, v28); __ aesimc(v0, v0);
2943       __ aesd(v0, v29); __ aesimc(v0, v0);
2944       __ aesd(v0, v30);
2945       __ eor(v0, __ T16B, v0, v31);
2946       __ eor(v0, __ T16B, v0, v2);
2947 
2948       __ st1(v0, __ T16B, __ post(to, 16));
2949       __ orr(v2, __ T16B, v1, v1);
2950 
2951       __ subw(len_reg, len_reg, 16);
2952       __ cbnzw(len_reg, L_aes_loop);
2953 
2954       __ st1(v2, __ T16B, rvec);
2955 
2956       __ mov(r0, rscratch2);
2957 
2958       __ leave();
2959       __ ret(lr);
2960 
2961     return start;
2962   }
2963 
2964   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2965   // Inputs: in (the 128-bit addend) and inc (the 64-bit increment) are both
2966   // preserved; the lower dword of inc must be zero.
2967   // The least-significant 64-bit word is kept in the upper dword of each vector.
2968   // Output: result
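       // Worked example (dwords written MSD:LSD): adding inc == 1 to
       // 0x0000000000000000:0xFFFFFFFFFFFFFFFF wraps the LSD to zero; the unsigned
       // compare below then produces an all-ones mask in that lane, the mask is
       // swapped onto the MSD position, and subtracting -1 yields
       // 0x0000000000000001:0x0000000000000000.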
2969   void be_add_128_64(FloatRegister result, FloatRegister in,
2970                      FloatRegister inc, FloatRegister tmp) {
2971     assert_different_registers(result, tmp, inc);
2972 
2973     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2974                                            // input
2975     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for overflow: all-ones lane where inc > result
2976     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
2977                                            // MSD == 0 (must be!) to LSD
2978     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
2979   }
2980 
2981   // CTR AES crypt.
2982   // Arguments:
2983   //
2984   // Inputs:
2985   //   c_rarg0   - source byte array address
2986   //   c_rarg1   - destination byte array address
2987   //   c_rarg2   - K (key) in little endian int array
2988   //   c_rarg3   - counter vector byte array address
2989   //   c_rarg4   - input length
2990   //   c_rarg5   - saved encryptedCounter start
2991   //   c_rarg6   - saved used length
2992   //
2993   // Output:
2994   //   r0       - input length
2995   //
2996   address generate_counterMode_AESCrypt() {
2997     const Register in = c_rarg0;
2998     const Register out = c_rarg1;
2999     const Register key = c_rarg2;
3000     const Register counter = c_rarg3;
3001     const Register saved_len = c_rarg4, len = r10;
3002     const Register saved_encrypted_ctr = c_rarg5;
3003     const Register used_ptr = c_rarg6, used = r12;
3004 
3005     const Register offset = r7;
3006     const Register keylen = r11;
3007 
3008     const unsigned char block_size = 16;
3009     const int bulk_width = 4;
3010     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3011     // performance with larger data sizes, but it also means that the
3012     // fast path isn't used until there are at least 8 blocks, and up
3013     // to 127 bytes of data will be processed on the slow path. For
3014     // that reason, and also so as not to blow away too much icache, 4
3015     // blocks seems like a sensible compromise.
3016 
3017     // Algorithm:
3018     //
3019     //    if (len == 0) {
3020     //        goto DONE;
3021     //    }
3022     //    int result = len;
3023     //    do {
3024     //        if (used >= blockSize) {
3025     //            if (len >= bulk_width * blockSize) {
3026     //                CTR_large_block();
3027     //                if (len == 0)
3028     //                    goto DONE;
3029     //            }
3030     //            for (;;) {
3031     //                16ByteVector v0 = counter;
3032     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3033     //                used = 0;
3034     //                if (len < blockSize)
3035     //                    break;    /* goto NEXT */
3036     //                16ByteVector v1 = load16Bytes(in, offset);
3037     //                v1 = v1 ^ encryptedCounter;
3038     //                store16Bytes(v1, out, offset);
3039     //                used = blockSize;
3040     //                offset += blockSize;
3041     //                len -= blockSize;
3042     //                if (len == 0)
3043     //                    goto DONE;
3044     //            }
3045     //        }
3046     //      NEXT:
3047     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3048     //        len--;
3049     //    } while (len != 0);
3050     //  DONE:
3051     //    return result;
3052     //
3053     // CTR_large_block()
3054     //    Wide bulk encryption of whole blocks.
3055 
3056     __ align(CodeEntryAlignment);
3057     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3058     const address start = __ pc();
3059     __ enter();
3060 
3061     Label DONE, CTR_large_block, large_block_return;
3062     __ ldrw(used, Address(used_ptr));
3063     __ cbzw(saved_len, DONE);
3064 
3065     __ mov(len, saved_len);
3066     __ mov(offset, 0);
3067 
3068     // Compute #rounds for AES based on the length of the key array
3069     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3070 
3071     __ aesenc_loadkeys(key, keylen);
3072 
3073     {
3074       Label L_CTR_loop, NEXT;
3075 
3076       __ bind(L_CTR_loop);
3077 
3078       __ cmp(used, block_size);
3079       __ br(__ LO, NEXT);
3080 
3081       // Maybe we have a lot of data
3082       __ subsw(rscratch1, len, bulk_width * block_size);
3083       __ br(__ HS, CTR_large_block);
3084       __ BIND(large_block_return);
3085       __ cbzw(len, DONE);
3086 
3087       // Setup the counter
3088       __ movi(v4, __ T4S, 0);
3089       __ movi(v5, __ T4S, 1);
3090       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3091 
3092       // 128-bit big-endian increment
3093       __ ld1(v0, __ T16B, counter);
3094       __ rev64(v16, __ T16B, v0);
3095       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3096       __ rev64(v16, __ T16B, v16);
3097       __ st1(v16, __ T16B, counter);
3098       // Previous counter value is in v0
3099       // v4 contains { 0, 1 }
3100 
3101       {
3102         // We have fewer than bulk_width blocks of data left. Encrypt
3103         // them one by one until there is less than a full block
3104         // remaining, being careful to save both the encrypted counter
3105         // and the counter.
3106 
3107         Label inner_loop;
3108         __ bind(inner_loop);
3109         // Counter to encrypt is in v0
3110         __ aesecb_encrypt(noreg, noreg, keylen);
3111         __ st1(v0, __ T16B, saved_encrypted_ctr);
3112 
3113         // Do we have a remaining full block?
3114 
3115         __ mov(used, 0);
3116         __ cmp(len, block_size);
3117         __ br(__ LO, NEXT);
3118 
3119         // Yes, we have a full block
3120         __ ldrq(v1, Address(in, offset));
3121         __ eor(v1, __ T16B, v1, v0);
3122         __ strq(v1, Address(out, offset));
3123         __ mov(used, block_size);
3124         __ add(offset, offset, block_size);
3125 
3126         __ subw(len, len, block_size);
3127         __ cbzw(len, DONE);
3128 
3129         // Increment the counter, store it back
3130         __ orr(v0, __ T16B, v16, v16);
3131         __ rev64(v16, __ T16B, v16);
3132         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3133         __ rev64(v16, __ T16B, v16);
3134         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3135 
3136         __ b(inner_loop);
3137       }
3138 
3139       __ BIND(NEXT);
3140 
3141       // Encrypt a single byte, and loop.
3142       // We expect this to be a rare event.
3143       __ ldrb(rscratch1, Address(in, offset));
3144       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3145       __ eor(rscratch1, rscratch1, rscratch2);
3146       __ strb(rscratch1, Address(out, offset));
3147       __ add(offset, offset, 1);
3148       __ add(used, used, 1);
3149       __ subw(len, len, 1);
3150       __ cbnzw(len, L_CTR_loop);
3151     }
3152 
3153     __ bind(DONE);
3154     __ strw(used, Address(used_ptr));
3155     __ mov(r0, saved_len);
3156 
3157     __ leave(); // required for proper stackwalking of RuntimeStub frame
3158     __ ret(lr);
3159 
3160     // Bulk encryption
3161 
3162     __ BIND(CTR_large_block);
3163     assert(bulk_width == 4 || bulk_width == 8, "must be");
3164 
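         // v8..v15 are callee-saved under the AArch64 procedure call standard (only
         // their low 64 bits need preserving, but the full registers are saved
         // here), so spill them before the bulk loop clobbers them.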
3165     if (bulk_width == 8) {
3166       __ sub(sp, sp, 4 * 16);
3167       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3168     }
3169     __ sub(sp, sp, 4 * 16);
3170     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3171     RegSet saved_regs = (RegSet::of(in, out, offset)
3172                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3173     __ push(saved_regs, sp);
3174     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3175     __ add(in, in, offset);
3176     __ add(out, out, offset);
3177 
3178     // Keys should already be loaded into the correct registers
3179 
3180     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3181     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3182 
3183     // AES/CTR loop
3184     {
3185       Label L_CTR_loop;
3186       __ BIND(L_CTR_loop);
3187 
3188       // Setup the counters
3189       __ movi(v8, __ T4S, 0);
3190       __ movi(v9, __ T4S, 1);
3191       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3192 
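           // Materialize bulk_width consecutive big-endian counter blocks in
           // v0..v(bulk_width-1): each pass byte-reverses the working copy in v16
           // back to memory order and then increments v16 by one.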
3193       for (int i = 0; i < bulk_width; i++) {
3194         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3195         __ rev64(v0_ofs, __ T16B, v16);
3196         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3197       }
3198 
3199       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3200 
3201       // Encrypt the counters
3202       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3203 
3204       if (bulk_width == 8) {
3205         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3206       }
3207 
3208       // XOR the encrypted counters with the inputs
3209       for (int i = 0; i < bulk_width; i++) {
3210         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3211         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3212         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3213       }
3214 
3215       // Write the encrypted data
3216       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3217       if (bulk_width == 8) {
3218         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3219       }
3220 
3221       __ subw(len, len, 16 * bulk_width);
3222       __ cbnzw(len, L_CTR_loop);
3223     }
3224 
3225     // Save the counter back where it goes
3226     __ rev64(v16, __ T16B, v16);
3227     __ st1(v16, __ T16B, counter);
3228 
3229     __ pop(saved_regs, sp);
3230 
3231     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3232     if (bulk_width == 8) {
3233       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3234     }
3235 
3236     __ andr(rscratch1, len, -16 * bulk_width);
3237     __ sub(len, len, rscratch1);
3238     __ add(offset, offset, rscratch1);
3239     __ mov(used, 16);
3240     __ strw(used, Address(used_ptr));
3241     __ b(large_block_return);
3242 
3243     return start;
3244   }
3245 
3246   // Vector AES Galois Counter Mode implementation. Parameters:
3247   //
3248   // in = c_rarg0
3249   // len = c_rarg1
3250   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3251   // out = c_rarg3
3252   // key = c_rarg4
3253   // state = c_rarg5 - GHASH.state
3254   // subkeyHtbl = c_rarg6 - powers of H
3255   // counter = c_rarg7 - 16 bytes of CTR
3256   // return - number of processed bytes
3257   address generate_galoisCounterMode_AESCrypt() {
3258     address ghash_polynomial = __ pc();
3259     __ emit_int64(0x87);  // The low-order bits of the field
3260                           // polynomial (i.e. p = z^7+z^2+z+1)
3261                           // repeated in the low and high parts of a
3262                           // 128-bit vector
3263     __ emit_int64(0x87);
3264 
3265     __ align(CodeEntryAlignment);
3266     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3267     address start = __ pc();
3268     __ enter();
3269 
3270     const Register in = c_rarg0;
3271     const Register len = c_rarg1;
3272     const Register ct = c_rarg2;
3273     const Register out = c_rarg3;
3274     // and updated with the incremented counter in the end
3275 
3276     const Register key = c_rarg4;
3277     const Register state = c_rarg5;
3278 
3279     const Register subkeyHtbl = c_rarg6;
3280 
3281     const Register counter = c_rarg7;
3282 
3283     const Register keylen = r10;
3284     // Save state before entering routine
3285     __ sub(sp, sp, 4 * 16);
3286     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3287     __ sub(sp, sp, 4 * 16);
3288     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3289 
3291     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3292     __ str(len, __ pre(sp, -2 * wordSize));
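         // The byte count (already rounded down to a multiple of 128) is kept on the
         // stack; it is reloaded below to drive the GHASH loop and is returned to
         // the caller, which presumably handles any remaining tail bytes.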
3293 
3294     Label DONE;
3295     __ cbz(len, DONE);
3296 
3297     // Compute #rounds for AES based on the length of the key array
3298     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3299 
3300     __ aesenc_loadkeys(key, keylen);
3301     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3302     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3303 
3304     // AES/CTR loop
3305     {
3306       Label L_CTR_loop;
3307       __ BIND(L_CTR_loop);
3308 
3309       // Setup the counters
3310       __ movi(v8, __ T4S, 0);
3311       __ movi(v9, __ T4S, 1);
3312       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3313 
3314       assert(v0->encoding() < v8->encoding(), "counter registers must precede the data registers");
3315       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3316         FloatRegister f = as_FloatRegister(i);
3317         __ rev32(f, __ T16B, v16);
3318         __ addv(v16, __ T4S, v16, v8);
3319       }
3320 
3321       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3322 
3323       // Encrypt the counters
3324       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3325 
3326       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3327 
3328       // XOR the encrypted counters with the inputs
3329       for (int i = 0; i < 8; i++) {
3330         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3331         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3332         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3333       }
3334       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3335       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3336 
3337       __ subw(len, len, 16 * 8);
3338       __ cbnzw(len, L_CTR_loop);
3339     }
3340 
3341     __ rev32(v16, __ T16B, v16);
3342     __ st1(v16, __ T16B, counter);
3343 
3344     __ ldr(len, Address(sp));
3345     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3346 
3347     // GHASH/CTR loop
3348     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3349                                 len, /*unrolls*/4);
3350 
3351 #ifdef ASSERT
3352     { Label L;
3353       __ cmp(len, (unsigned char)0);
3354       __ br(Assembler::EQ, L);
3355       __ stop("stubGenerator: abort");
3356       __ bind(L);
3357     }
3358 #endif
3359 
3360     __ bind(DONE);
3361     // Return the number of bytes processed
3362     __ ldr(r0, __ post(sp, 2 * wordSize));
3363 
3364     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3365     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3366 
3367     __ leave(); // required for proper stackwalking of RuntimeStub frame
3368     __ ret(lr);
3369     return start;
3370   }
3371 
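       // Caches one 64-byte input block in eight 64-bit general-purpose registers,
       // two consecutive little-endian 32-bit words per register, so the MD5 round
       // helpers can fetch message words without further memory traffic.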
3372   class Cached64Bytes {
3373   private:
3374     MacroAssembler *_masm;
3375     Register _regs[8];
3376 
3377   public:
3378     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3379       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3380       auto it = rs.begin();
3381       for (auto &r: _regs) {
3382         r = *it;
3383         ++it;
3384       }
3385     }
3386 
3387     void gen_loads(Register base) {
3388       for (int i = 0; i < 8; i += 2) {
3389         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3390       }
3391     }
3392 
3393     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3394     void extract_u32(Register dest, int i) {
3395       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3396     }
3397   };
3398 
3399   // Utility routines for md5.
3400   // Clobbers r10 and r11.
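       //
       // Each helper performs one MD5 step:
       //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + X[k] + t, s)
       // with the round's nonlinear function f:
       //   FF: F(x, y, z) = (x & y) | (~x & z)   (computed as ((y ^ z) & x) ^ z)
       //   GG: G(x, y, z) = (x & z) | (y & ~z)
       //   HH: H(x, y, z) = x ^ y ^ z
       //   II: I(x, y, z) = y ^ (x | ~z)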
3401   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3402               int k, int s, int t) {
3403     Register rscratch3 = r10;
3404     Register rscratch4 = r11;
3405 
3406     __ eorw(rscratch3, r3, r4);
3407     __ movw(rscratch2, t);
3408     __ andw(rscratch3, rscratch3, r2);
3409     __ addw(rscratch4, r1, rscratch2);
3410     reg_cache.extract_u32(rscratch1, k);
3411     __ eorw(rscratch3, rscratch3, r4);
3412     __ addw(rscratch4, rscratch4, rscratch1);
3413     __ addw(rscratch3, rscratch3, rscratch4);
3414     __ rorw(rscratch2, rscratch3, 32 - s);
3415     __ addw(r1, rscratch2, r2);
3416   }
3417 
3418   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3419               int k, int s, int t) {
3420     Register rscratch3 = r10;
3421     Register rscratch4 = r11;
3422 
3423     __ andw(rscratch3, r2, r4);
3424     __ bicw(rscratch4, r3, r4);
3425     reg_cache.extract_u32(rscratch1, k);
3426     __ movw(rscratch2, t);
3427     __ orrw(rscratch3, rscratch3, rscratch4);
3428     __ addw(rscratch4, r1, rscratch2);
3429     __ addw(rscratch4, rscratch4, rscratch1);
3430     __ addw(rscratch3, rscratch3, rscratch4);
3431     __ rorw(rscratch2, rscratch3, 32 - s);
3432     __ addw(r1, rscratch2, r2);
3433   }
3434 
3435   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3436               int k, int s, int t) {
3437     Register rscratch3 = r10;
3438     Register rscratch4 = r11;
3439 
3440     __ eorw(rscratch3, r3, r4);
3441     __ movw(rscratch2, t);
3442     __ addw(rscratch4, r1, rscratch2);
3443     reg_cache.extract_u32(rscratch1, k);
3444     __ eorw(rscratch3, rscratch3, r2);
3445     __ addw(rscratch4, rscratch4, rscratch1);
3446     __ addw(rscratch3, rscratch3, rscratch4);
3447     __ rorw(rscratch2, rscratch3, 32 - s);
3448     __ addw(r1, rscratch2, r2);
3449   }
3450 
3451   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3452               int k, int s, int t) {
3453     Register rscratch3 = r10;
3454     Register rscratch4 = r11;
3455 
3456     __ movw(rscratch3, t);
3457     __ ornw(rscratch2, r2, r4);
3458     __ addw(rscratch4, r1, rscratch3);
3459     reg_cache.extract_u32(rscratch1, k);
3460     __ eorw(rscratch3, rscratch2, r3);
3461     __ addw(rscratch4, rscratch4, rscratch1);
3462     __ addw(rscratch3, rscratch3, rscratch4);
3463     __ rorw(rscratch2, rscratch3, 32 - s);
3464     __ addw(r1, rscratch2, r2);
3465   }
3466 
3467   // Arguments:
3468   //
3469   // Inputs:
3470   //   c_rarg0   - byte[]  source+offset
3471   //   c_rarg1   - int[]   MD5.state
3472   //   c_rarg2   - int     offset
3473   //   c_rarg3   - int     limit
3474   //
3475   address generate_md5_implCompress(bool multi_block, const char *name) {
3476     __ align(CodeEntryAlignment);
3477     StubCodeMark mark(this, "StubRoutines", name);
3478     address start = __ pc();
3479 
3480     Register buf       = c_rarg0;
3481     Register state     = c_rarg1;
3482     Register ofs       = c_rarg2;
3483     Register limit     = c_rarg3;
3484     Register a         = r4;
3485     Register b         = r5;
3486     Register c         = r6;
3487     Register d         = r7;
3488     Register rscratch3 = r10;
3489     Register rscratch4 = r11;
3490 
3491     Register state_regs[2] = { r12, r13 };
3492     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3493     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3494 
3495     __ push(saved_regs, sp);
3496 
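         // The four 32-bit state words are kept packed in two 64-bit registers:
         // state_regs[0] = a | (b << 32), state_regs[1] = c | (d << 32); the ubfx
         // instructions below unpack them.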
3497     __ ldp(state_regs[0], state_regs[1], Address(state));
3498     __ ubfx(a, state_regs[0],  0, 32);
3499     __ ubfx(b, state_regs[0], 32, 32);
3500     __ ubfx(c, state_regs[1],  0, 32);
3501     __ ubfx(d, state_regs[1], 32, 32);
3502 
3503     Label md5_loop;
3504     __ BIND(md5_loop);
3505 
3506     reg_cache.gen_loads(buf);
3507 
3508     // Round 1
3509     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3510     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3511     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3512     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3513     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3514     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3515     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3516     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3517     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3518     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3519     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3520     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3521     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3522     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3523     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3524     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3525 
3526     // Round 2
3527     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3528     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3529     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3530     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3531     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3532     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3533     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3534     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3535     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3536     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3537     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3538     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3539     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3540     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3541     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3542     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3543 
3544     // Round 3
3545     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3546     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3547     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3548     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3549     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3550     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3551     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3552     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3553     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3554     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3555     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3556     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3557     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3558     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3559     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3560     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3561 
3562     // Round 4
3563     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3564     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3565     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3566     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3567     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3568     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3569     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3570     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3571     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3572     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3573     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3574     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3575     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3576     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3577     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3578     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3579 
3580     __ addw(a, state_regs[0], a);
3581     __ ubfx(rscratch2, state_regs[0], 32, 32);
3582     __ addw(b, rscratch2, b);
3583     __ addw(c, state_regs[1], c);
3584     __ ubfx(rscratch4, state_regs[1], 32, 32);
3585     __ addw(d, rscratch4, d);
3586 
3587     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3588     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3589 
3590     if (multi_block) {
3591       __ add(buf, buf, 64);
3592       __ add(ofs, ofs, 64);
3593       __ cmp(ofs, limit);
3594       __ br(Assembler::LE, md5_loop);
3595       __ mov(c_rarg0, ofs); // return ofs
3596     }
3597 
3598     // write hash values back in the correct order
3599     __ stp(state_regs[0], state_regs[1], Address(state));
3600 
3601     __ pop(saved_regs, sp);
3602 
3603     __ ret(lr);
3604 
3605     return start;
3606   }
3607 
3608   // Arguments:
3609   //
3610   // Inputs:
3611   //   c_rarg0   - byte[]  source+offset
3612   //   c_rarg1   - int[]   SHA.state
3613   //   c_rarg2   - int     offset
3614   //   c_rarg3   - int     limit
3615   //
3616   address generate_sha1_implCompress(bool multi_block, const char *name) {
3617     __ align(CodeEntryAlignment);
3618     StubCodeMark mark(this, "StubRoutines", name);
3619     address start = __ pc();
3620 
3621     Register buf   = c_rarg0;
3622     Register state = c_rarg1;
3623     Register ofs   = c_rarg2;
3624     Register limit = c_rarg3;
3625 
3626     Label keys;
3627     Label sha1_loop;
3628 
3629     // load the four round constants (each replicated across a vector) into v0..v3
3630     __ adr(rscratch1, keys);
3631     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3632     // load the 5-word state: the first four words into v6, the fifth into v7
3633     __ ldrq(v6, Address(state, 0));
3634     __ ldrs(v7, Address(state, 16));
3635 
3636 
3637     __ BIND(sha1_loop);
3638     // load 64 bytes of data into v16..v19
3639     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3640     __ rev32(v16, __ T16B, v16);
3641     __ rev32(v17, __ T16B, v17);
3642     __ rev32(v18, __ T16B, v18);
3643     __ rev32(v19, __ T16B, v19);
3644 
3645     // do the sha1
3646     __ addv(v4, __ T4S, v16, v0);
3647     __ orr(v20, __ T16B, v6, v6);
3648 
3649     FloatRegister d0 = v16;
3650     FloatRegister d1 = v17;
3651     FloatRegister d2 = v18;
3652     FloatRegister d3 = v19;
3653 
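         // Each iteration handles four SHA-1 rounds (20 iterations = 80 rounds):
         // sha1su0/sha1su1 extend the message schedule, while the quad-round
         // instruction alternates between sha1c (Ch, rounds 0-19), sha1p (Parity,
         // rounds 20-39 and 60-79) and sha1m (Maj, rounds 40-59).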
3654     for (int round = 0; round < 20; round++) {
3655       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3656       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3657       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3658       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3659       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3660 
3661       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3662       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3663       __ sha1h(tmp2, __ T4S, v20);
3664       if (round < 5)
3665         __ sha1c(v20, __ T4S, tmp3, tmp4);
3666       else if (round < 10 || round >= 15)
3667         __ sha1p(v20, __ T4S, tmp3, tmp4);
3668       else
3669         __ sha1m(v20, __ T4S, tmp3, tmp4);
3670       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3671 
3672       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3673     }
3674 
3675     __ addv(v7, __ T2S, v7, v21);
3676     __ addv(v6, __ T4S, v6, v20);
3677 
3678     if (multi_block) {
3679       __ add(ofs, ofs, 64);
3680       __ cmp(ofs, limit);
3681       __ br(Assembler::LE, sha1_loop);
3682       __ mov(c_rarg0, ofs); // return ofs
3683     }
3684 
3685     __ strq(v6, Address(state, 0));
3686     __ strs(v7, Address(state, 16));
3687 
3688     __ ret(lr);
3689 
3690     __ bind(keys);
3691     __ emit_int32(0x5a827999);
3692     __ emit_int32(0x6ed9eba1);
3693     __ emit_int32(0x8f1bbcdc);
3694     __ emit_int32(0xca62c1d6);
3695 
3696     return start;
3697   }
3698 
3699 
3700   // Arguments:
3701   //
3702   // Inputs:
3703   //   c_rarg0   - byte[]  source+offset
3704   //   c_rarg1   - int[]   SHA.state
3705   //   c_rarg2   - int     offset
3706   //   c_rarg3   - int     limit
3707   //
3708   address generate_sha256_implCompress(bool multi_block, const char *name) {
3709     static const uint32_t round_consts[64] = {
3710       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3711       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3712       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3713       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3714       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3715       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3716       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3717       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3718       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3719       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3720       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3721       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3722       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3723       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3724       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3725       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3726     };
3727     __ align(CodeEntryAlignment);
3728     StubCodeMark mark(this, "StubRoutines", name);
3729     address start = __ pc();
3730 
3731     Register buf   = c_rarg0;
3732     Register state = c_rarg1;
3733     Register ofs   = c_rarg2;
3734     Register limit = c_rarg3;
3735 
3736     Label sha1_loop;
3737 
3738     __ stpd(v8, v9, __ pre(sp, -32));
3739     __ stpd(v10, v11, Address(sp, 16));
3740 
3741     // dga == v0
3742     // dgb == v1
3743     // dg0 == v2
3744     // dg1 == v3
3745     // dg2 == v4
3746     // t0 == v6
3747     // t1 == v7
3748 
3749     // load the 64 round constants into v16..v31 (four per register)
3750     __ lea(rscratch1, ExternalAddress((address)round_consts));
3751     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3752     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3753     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3754     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3755 
3756     // load 8 words (256 bits) state
3757     __ ldpq(v0, v1, state);
3758 
3759     __ BIND(sha1_loop);
3760     // load 64 bytes of data into v8..v11
3761     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3762     __ rev32(v8, __ T16B, v8);
3763     __ rev32(v9, __ T16B, v9);
3764     __ rev32(v10, __ T16B, v10);
3765     __ rev32(v11, __ T16B, v11);
3766 
3767     __ addv(v6, __ T4S, v8, v16);
3768     __ orr(v2, __ T16B, v0, v0);
3769     __ orr(v3, __ T16B, v1, v1);
3770 
3771     FloatRegister d0 = v8;
3772     FloatRegister d1 = v9;
3773     FloatRegister d2 = v10;
3774     FloatRegister d3 = v11;
3775 
3776 
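         // Each iteration performs four SHA-256 rounds via sha256h/sha256h2
         // (16 iterations = 64 rounds); sha256su0/sha256su1 extend the 16-word
         // message schedule, and the round constants live in v16..v31.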
3777     for (int round = 0; round < 16; round++) {
3778       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3779       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3780       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3781       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3782 
3783       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3784        __ orr(v4, __ T16B, v2, v2);
3785       if (round < 15)
3786         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3787       __ sha256h(v2, __ T4S, v3, tmp2);
3788       __ sha256h2(v3, __ T4S, v4, tmp2);
3789       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3790 
3791       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3792     }
3793 
3794     __ addv(v0, __ T4S, v0, v2);
3795     __ addv(v1, __ T4S, v1, v3);
3796 
3797     if (multi_block) {
3798       __ add(ofs, ofs, 64);
3799       __ cmp(ofs, limit);
3800       __ br(Assembler::LE, sha1_loop);
3801       __ mov(c_rarg0, ofs); // return ofs
3802     }
3803 
3804     __ ldpd(v10, v11, Address(sp, 16));
3805     __ ldpd(v8, v9, __ post(sp, 32));
3806 
3807     __ stpq(v0, v1, state);
3808 
3809     __ ret(lr);
3810 
3811     return start;
3812   }
3813 
3814   // Double rounds for sha512.
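       // Each call performs two SHA-512 rounds with sha512h/sha512h2: vrc0 holds
       // the current pair of round constants and vrc1 receives the next pair
       // (loaded while dr < 36), while sha512su0/sha512su1 keep extending the
       // message schedule as long as more schedule words are needed (dr < 32).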
3815   void sha512_dround(int dr,
3816                      FloatRegister vi0, FloatRegister vi1,
3817                      FloatRegister vi2, FloatRegister vi3,
3818                      FloatRegister vi4, FloatRegister vrc0,
3819                      FloatRegister vrc1, FloatRegister vin0,
3820                      FloatRegister vin1, FloatRegister vin2,
3821                      FloatRegister vin3, FloatRegister vin4) {
3822       if (dr < 36) {
3823         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3824       }
3825       __ addv(v5, __ T2D, vrc0, vin0);
3826       __ ext(v6, __ T16B, vi2, vi3, 8);
3827       __ ext(v5, __ T16B, v5, v5, 8);
3828       __ ext(v7, __ T16B, vi1, vi2, 8);
3829       __ addv(vi3, __ T2D, vi3, v5);
3830       if (dr < 32) {
3831         __ ext(v5, __ T16B, vin3, vin4, 8);
3832         __ sha512su0(vin0, __ T2D, vin1);
3833       }
3834       __ sha512h(vi3, __ T2D, v6, v7);
3835       if (dr < 32) {
3836         __ sha512su1(vin0, __ T2D, vin2, v5);
3837       }
3838       __ addv(vi4, __ T2D, vi1, vi3);
3839       __ sha512h2(vi3, __ T2D, vi1, vi0);
3840   }
3841 
3842   // Arguments:
3843   //
3844   // Inputs:
3845   //   c_rarg0   - byte[]  source+offset
3846   //   c_rarg1   - int[]   SHA.state
3847   //   c_rarg2   - int     offset
3848   //   c_rarg3   - int     limit
3849   //
3850   address generate_sha512_implCompress(bool multi_block, const char *name) {
3851     static const uint64_t round_consts[80] = {
3852       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3853       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3854       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3855       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3856       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3857       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3858       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3859       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3860       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3861       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3862       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3863       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3864       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3865       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3866       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3867       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3868       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3869       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3870       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3871       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3872       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3873       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3874       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3875       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3876       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3877       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3878       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3879     };
3880 
3881     __ align(CodeEntryAlignment);
3882     StubCodeMark mark(this, "StubRoutines", name);
3883     address start = __ pc();
3884 
3885     Register buf   = c_rarg0;
3886     Register state = c_rarg1;
3887     Register ofs   = c_rarg2;
3888     Register limit = c_rarg3;
3889 
3890     __ stpd(v8, v9, __ pre(sp, -64));
3891     __ stpd(v10, v11, Address(sp, 16));
3892     __ stpd(v12, v13, Address(sp, 32));
3893     __ stpd(v14, v15, Address(sp, 48));
3894 
3895     Label sha512_loop;
3896 
3897     // load state
3898     __ ld1(v8, v9, v10, v11, __ T2D, state);
3899 
3900     // load the first eight round constants (two per vector register)
3901     __ lea(rscratch1, ExternalAddress((address)round_consts));
3902     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3903 
3904     __ BIND(sha512_loop);
3905     // load 128B of data into v12..v19
3906     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3907     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3908     __ rev64(v12, __ T16B, v12);
3909     __ rev64(v13, __ T16B, v13);
3910     __ rev64(v14, __ T16B, v14);
3911     __ rev64(v15, __ T16B, v15);
3912     __ rev64(v16, __ T16B, v16);
3913     __ rev64(v17, __ T16B, v17);
3914     __ rev64(v18, __ T16B, v18);
3915     __ rev64(v19, __ T16B, v19);
3916 
3917     __ mov(rscratch2, rscratch1);
3918 
3919     __ mov(v0, __ T16B, v8);
3920     __ mov(v1, __ T16B, v9);
3921     __ mov(v2, __ T16B, v10);
3922     __ mov(v3, __ T16B, v11);
3923 
3924     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3925     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3926     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3927     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3928     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3929     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3930     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3931     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3932     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3933     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3934     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3935     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3936     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3937     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3938     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3939     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3940     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3941     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3942     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3943     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3944     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3945     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3946     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3947     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3948     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3949     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3950     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3951     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3952     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3953     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3954     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3955     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3956     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3957     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3958     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3959     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3960     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3961     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3962     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3963     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3964 
3965     __ addv(v8, __ T2D, v8, v0);
3966     __ addv(v9, __ T2D, v9, v1);
3967     __ addv(v10, __ T2D, v10, v2);
3968     __ addv(v11, __ T2D, v11, v3);
3969 
3970     if (multi_block) {
3971       __ add(ofs, ofs, 128);
3972       __ cmp(ofs, limit);
3973       __ br(Assembler::LE, sha512_loop);
3974       __ mov(c_rarg0, ofs); // return ofs
3975     }
3976 
3977     __ st1(v8, v9, v10, v11, __ T2D, state);
3978 
3979     __ ldpd(v14, v15, Address(sp, 48));
3980     __ ldpd(v12, v13, Address(sp, 32));
3981     __ ldpd(v10, v11, Address(sp, 16));
3982     __ ldpd(v8, v9, __ post(sp, 64));
3983 
3984     __ ret(lr);
3985 
3986     return start;
3987   }
3988 
3989   // Arguments:
3990   //
3991   // Inputs:
3992   //   c_rarg0   - byte[]  source+offset
3993   //   c_rarg1   - byte[]  SHA.state
3994   //   c_rarg2   - int     block_size
3995   //   c_rarg3   - int     offset
3996   //   c_rarg4   - int     limit
3997   //
3998   address generate_sha3_implCompress(bool multi_block, const char *name) {
3999     static const uint64_t round_consts[24] = {
4000       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4001       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4002       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4003       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4004       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4005       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4006       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4007       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4008     };
4009 
4010     __ align(CodeEntryAlignment);
4011     StubCodeMark mark(this, "StubRoutines", name);
4012     address start = __ pc();
4013 
4014     Register buf           = c_rarg0;
4015     Register state         = c_rarg1;
4016     Register block_size    = c_rarg2;
4017     Register ofs           = c_rarg3;
4018     Register limit         = c_rarg4;
4019 
4020     Label sha3_loop, rounds24_loop;
4021     Label sha3_512_or_sha3_384, shake128;
4022 
4023     __ stpd(v8, v9, __ pre(sp, -64));
4024     __ stpd(v10, v11, Address(sp, 16));
4025     __ stpd(v12, v13, Address(sp, 32));
4026     __ stpd(v14, v15, Address(sp, 48));
4027 
4028     // load state
4029     __ add(rscratch1, state, 32);
4030     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4031     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4032     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4033     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4034     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4035     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4036     __ ld1(v24, __ T1D, rscratch1);
4037 
4038     __ BIND(sha3_loop);
4039 
4040     // 24 keccak rounds
4041     __ movw(rscratch2, 24);
4042 
4043     // load round_constants base
4044     __ lea(rscratch1, ExternalAddress((address) round_consts));
4045 
4046     // load input
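         // Absorb one block: the first seven lanes (56 bytes) are XORed in
         // unconditionally, since every supported rate is at least 72 bytes; the
         // remainder of the block is absorbed below according to block_size.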
4047     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4048     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4049     __ eor(v0, __ T8B, v0, v25);
4050     __ eor(v1, __ T8B, v1, v26);
4051     __ eor(v2, __ T8B, v2, v27);
4052     __ eor(v3, __ T8B, v3, v28);
4053     __ eor(v4, __ T8B, v4, v29);
4054     __ eor(v5, __ T8B, v5, v30);
4055     __ eor(v6, __ T8B, v6, v31);
4056 
4057     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4058     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4059 
4060     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4061     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4062     __ eor(v7, __ T8B, v7, v25);
4063     __ eor(v8, __ T8B, v8, v26);
4064     __ eor(v9, __ T8B, v9, v27);
4065     __ eor(v10, __ T8B, v10, v28);
4066     __ eor(v11, __ T8B, v11, v29);
4067     __ eor(v12, __ T8B, v12, v30);
4068     __ eor(v13, __ T8B, v13, v31);
4069 
4070     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4071     __ eor(v14, __ T8B, v14, v25);
4072     __ eor(v15, __ T8B, v15, v26);
4073     __ eor(v16, __ T8B, v16, v27);
4074 
4075     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4076     __ andw(c_rarg5, block_size, 48);
4077     __ cbzw(c_rarg5, rounds24_loop);
4078 
4079     __ tbnz(block_size, 5, shake128);
4080     // block_size == 144, bit5 == 0, SHA3-224
4081     __ ldrd(v28, __ post(buf, 8));
4082     __ eor(v17, __ T8B, v17, v28);
4083     __ b(rounds24_loop);
4084 
4085     __ BIND(shake128);
4086     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4087     __ eor(v17, __ T8B, v17, v28);
4088     __ eor(v18, __ T8B, v18, v29);
4089     __ eor(v19, __ T8B, v19, v30);
4090     __ eor(v20, __ T8B, v20, v31);
4091     __ b(rounds24_loop); // block_size == 168, SHAKE128
4092 
4093     __ BIND(sha3_512_or_sha3_384);
4094     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4095     __ eor(v7, __ T8B, v7, v25);
4096     __ eor(v8, __ T8B, v8, v26);
4097     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4098 
4099     // SHA3-384
4100     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4101     __ eor(v9,  __ T8B, v9,  v27);
4102     __ eor(v10, __ T8B, v10, v28);
4103     __ eor(v11, __ T8B, v11, v29);
4104     __ eor(v12, __ T8B, v12, v30);
4105 
4106     __ BIND(rounds24_loop);
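         // One Keccak-f[1600] round: theta (eor3 column parities plus rax1), rho
         // and pi (the xar rotate-and-xor sequence), chi (bcax) and iota (the round
         // constant, broadcast into v31 by ld1r, is XORed into lane (0, 0)).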
4107     __ subw(rscratch2, rscratch2, 1);
4108 
4109     __ eor3(v29, __ T16B, v4, v9, v14);
4110     __ eor3(v26, __ T16B, v1, v6, v11);
4111     __ eor3(v28, __ T16B, v3, v8, v13);
4112     __ eor3(v25, __ T16B, v0, v5, v10);
4113     __ eor3(v27, __ T16B, v2, v7, v12);
4114     __ eor3(v29, __ T16B, v29, v19, v24);
4115     __ eor3(v26, __ T16B, v26, v16, v21);
4116     __ eor3(v28, __ T16B, v28, v18, v23);
4117     __ eor3(v25, __ T16B, v25, v15, v20);
4118     __ eor3(v27, __ T16B, v27, v17, v22);
4119 
4120     __ rax1(v30, __ T2D, v29, v26);
4121     __ rax1(v26, __ T2D, v26, v28);
4122     __ rax1(v28, __ T2D, v28, v25);
4123     __ rax1(v25, __ T2D, v25, v27);
4124     __ rax1(v27, __ T2D, v27, v29);
4125 
4126     __ eor(v0, __ T16B, v0, v30);
4127     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4128     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4129     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4130     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4131     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4132     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4133     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4134     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4135     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4136     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4137     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4138     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4139     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4140     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4141     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4142     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4143     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4144     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4145     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4146     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4147     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4148     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4149     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4150     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4151 
4152     __ bcax(v20, __ T16B, v31, v22, v8);
4153     __ bcax(v21, __ T16B, v8,  v23, v22);
4154     __ bcax(v22, __ T16B, v22, v24, v23);
4155     __ bcax(v23, __ T16B, v23, v31, v24);
4156     __ bcax(v24, __ T16B, v24, v8,  v31);
4157 
4158     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4159 
4160     __ bcax(v17, __ T16B, v25, v19, v3);
4161     __ bcax(v18, __ T16B, v3,  v15, v19);
4162     __ bcax(v19, __ T16B, v19, v16, v15);
4163     __ bcax(v15, __ T16B, v15, v25, v16);
4164     __ bcax(v16, __ T16B, v16, v3,  v25);
4165 
4166     __ bcax(v10, __ T16B, v29, v12, v26);
4167     __ bcax(v11, __ T16B, v26, v13, v12);
4168     __ bcax(v12, __ T16B, v12, v14, v13);
4169     __ bcax(v13, __ T16B, v13, v29, v14);
4170     __ bcax(v14, __ T16B, v14, v26, v29);
4171 
4172     __ bcax(v7, __ T16B, v30, v9,  v4);
4173     __ bcax(v8, __ T16B, v4,  v5,  v9);
4174     __ bcax(v9, __ T16B, v9,  v6,  v5);
4175     __ bcax(v5, __ T16B, v5,  v30, v6);
4176     __ bcax(v6, __ T16B, v6,  v4,  v30);
4177 
4178     __ bcax(v3, __ T16B, v27, v0,  v28);
4179     __ bcax(v4, __ T16B, v28, v1,  v0);
4180     __ bcax(v0, __ T16B, v0,  v2,  v1);
4181     __ bcax(v1, __ T16B, v1,  v27, v2);
4182     __ bcax(v2, __ T16B, v2,  v28, v27);
4183 
4184     __ eor(v0, __ T16B, v0, v31);
4185 
4186     __ cbnzw(rscratch2, rounds24_loop);
4187 
4188     if (multi_block) {
4189       __ add(ofs, ofs, block_size);
4190       __ cmp(ofs, limit);
4191       __ br(Assembler::LE, sha3_loop);
4192       __ mov(c_rarg0, ofs); // return ofs
4193     }
4194 
4195     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4196     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4197     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4198     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4199     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4200     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4201     __ st1(v24, __ T1D, state);
4202 
4203     __ ldpd(v14, v15, Address(sp, 48));
4204     __ ldpd(v12, v13, Address(sp, 32));
4205     __ ldpd(v10, v11, Address(sp, 16));
4206     __ ldpd(v8, v9, __ post(sp, 64));
4207 
4208     __ ret(lr);
4209 
4210     return start;
4211   }
4212 
4213   /**
4214    *  Arguments:
4215    *
4216    * Inputs:
4217    *   c_rarg0   - int crc
4218    *   c_rarg1   - byte* buf
4219    *   c_rarg2   - int length
4220    *
4221    * Output:
4222    *       r0   - int crc result
4223    */
4224   address generate_updateBytesCRC32() {
4225     assert(UseCRC32Intrinsics, "what are we doing here?");
4226 
4227     __ align(CodeEntryAlignment);
4228     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4229 
4230     address start = __ pc();
4231 
4232     const Register crc   = c_rarg0;  // crc
4233     const Register buf   = c_rarg1;  // source java byte array address
4234     const Register len   = c_rarg2;  // length
4235     const Register table0 = c_rarg3; // crc_table address
4236     const Register table1 = c_rarg4;
4237     const Register table2 = c_rarg5;
4238     const Register table3 = c_rarg6;
4239     const Register tmp3 = c_rarg7;
4240 
4241     BLOCK_COMMENT("Entry:");
4242     __ enter(); // required for proper stackwalking of RuntimeStub frame
4243 
4244     __ kernel_crc32(crc, buf, len,
4245               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4246 
4247     __ leave(); // required for proper stackwalking of RuntimeStub frame
4248     __ ret(lr);
4249 
4250     return start;
4251   }
4252 
4253   // ChaCha20 block function.  This version parallelizes by loading
4254   // individual 32-bit state elements into vectors for four blocks
4255   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4256   //
4257   // state (int[16]) = c_rarg0
4258   // keystream (byte[1024]) = c_rarg1
4259   // return - number of bytes of keystream (always 256)
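       //   (256 bytes = 4 parallel blocks x 64 bytes per ChaCha20 block)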
4260   address generate_chacha20Block_blockpar() {
4261     Label L_twoRounds, L_cc20_const;
4262     // The constant data is broken into two 128-bit segments to be loaded
4263     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4264     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4265     // The second 128 bits are a table constant used for 8-bit left rotations.
4266     __ BIND(L_cc20_const);
4267     __ emit_int64(0x0000000100000000UL);
4268     __ emit_int64(0x0000000300000002UL);
4269     __ emit_int64(0x0605040702010003UL);
4270     __ emit_int64(0x0E0D0C0F0A09080BUL);
4271 
4272     __ align(CodeEntryAlignment);
4273     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4274     address start = __ pc();
4275     __ enter();
4276 
4277     int i, j;
4278     const Register state = c_rarg0;
4279     const Register keystream = c_rarg1;
4280     const Register loopCtr = r10;
4281     const Register tmpAddr = r11;
4282 
4283     const FloatRegister stateFirst = v0;
4284     const FloatRegister stateSecond = v1;
4285     const FloatRegister stateThird = v2;
4286     const FloatRegister stateFourth = v3;
4287     const FloatRegister origCtrState = v28;
4288     const FloatRegister scratch = v29;
4289     const FloatRegister lrot8Tbl = v30;
4290 
4291     // Organize SIMD registers in an array that facilitates
4292     // putting repetitive opcodes into loop structures.  It is
4293     // important that each grouping of 4 registers is consecutively
4294     // numbered (Vt, Vt+1, Vt+2, Vt+3), as required by the multi-register
4295     // instructions (e.g. ld4r, st4, etc.)
4296     const FloatRegister workSt[16] = {
4297          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4298         v20, v21, v22, v23, v24, v25, v26, v27
4299     };
4300 
4301     // Load from memory and interlace across 16 SIMD registers,
4302     // with each word from memory being broadcast to all lanes of
4303     // each successive SIMD register.
4304     //      Addr(0) -> All lanes in workSt[i]
4305     //      Addr(4) -> All lanes workSt[i + 1], etc.
4306     __ mov(tmpAddr, state);
4307     for (i = 0; i < 16; i += 4) {
4308       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4309           __ post(tmpAddr, 16));
4310     }
4311 
4312     // Pull in constant data.  The first 16 bytes are the add overlay
4313     // which is applied to the vector holding the counter (state[12]).
4314     // The second 16 bytes are the byte-index constant for the tbl
4315     // instruction that performs the 8-bit left rotation.
4316     __ adr(tmpAddr, L_cc20_const);
4317     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4318     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4319 
4320     // Set up the 10 iteration loop and perform all 8 quarter round ops
4321     __ mov(loopCtr, 10);
4322     __ BIND(L_twoRounds);
4323 
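         // Each iteration performs one ChaCha20 "double round": the first four
         // quarter rounds below operate on columns, the next four on diagonals,
         // so 10 iterations give the full 20 rounds.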
4324     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4325         scratch, lrot8Tbl);
4326     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4327         scratch, lrot8Tbl);
4328     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4329         scratch, lrot8Tbl);
4330     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4331         scratch, lrot8Tbl);
4332 
4333     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4334         scratch, lrot8Tbl);
4335     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4336         scratch, lrot8Tbl);
4337     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4338         scratch, lrot8Tbl);
4339     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4340         scratch, lrot8Tbl);
4341 
4342     // Decrement and iterate
4343     __ sub(loopCtr, loopCtr, 1);
4344     __ cbnz(loopCtr, L_twoRounds);
4345 
4346     __ mov(tmpAddr, state);
4347 
4348     // Add the starting state back to the post-loop keystream
4349     // state.  We read/interlace the state array from memory into
4350     // 4 registers similar to what we did in the beginning.  Then
4351     // add the counter overlay onto workSt[12] at the end.
4352     for (i = 0; i < 16; i += 4) {
4353       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4354           __ post(tmpAddr, 16));
4355       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4356       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4357       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4358       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4359     }
4360     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4361 
4362     // Write to key stream, storing the same element out of workSt[0..15]
4363     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4364     // for the next element position.
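         // The result is four consecutive 64-byte keystream blocks: lane i of
         // each vector contributes to block i.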
4365     for (i = 0; i < 4; i++) {
4366       for (j = 0; j < 16; j += 4) {
4367         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4368             __ post(keystream, 16));
4369       }
4370     }
4371 
4372     __ mov(r0, 256);             // Return length of output keystream
4373     __ leave();
4374     __ ret(lr);
4375 
4376     return start;
4377   }
4378 
4379   /**
4380    *  Arguments:
4381    *
4382    * Inputs:
4383    *   c_rarg0   - int crc
4384    *   c_rarg1   - byte* buf
4385    *   c_rarg2   - int length
4386    *   c_rarg3   - int* table
4387    *
4388    * Output:
4389    *       r0   - int crc result
4390    */
4391   address generate_updateBytesCRC32C() {
4392     assert(UseCRC32CIntrinsics, "what are we doing here?");
4393 
4394     __ align(CodeEntryAlignment);
4395     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4396 
4397     address start = __ pc();
4398 
4399     const Register crc   = c_rarg0;  // crc
4400     const Register buf   = c_rarg1;  // source java byte array address
4401     const Register len   = c_rarg2;  // length
4402     const Register table0 = c_rarg3; // crc_table address
4403     const Register table1 = c_rarg4;
4404     const Register table2 = c_rarg5;
4405     const Register table3 = c_rarg6;
4406     const Register tmp3 = c_rarg7;
4407 
4408     BLOCK_COMMENT("Entry:");
4409     __ enter(); // required for proper stackwalking of RuntimeStub frame
4410 
4411     __ kernel_crc32c(crc, buf, len,
4412               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4413 
4414     __ leave(); // required for proper stackwalking of RuntimeStub frame
4415     __ ret(lr);
4416 
4417     return start;
4418   }
4419 
4420   /***
4421    *  Arguments:
4422    *
4423    *  Inputs:
4424    *   c_rarg0   - int   adler
4425    *   c_rarg1   - byte* buff
4426    *   c_rarg2   - int   len
4427    *
4428    * Output:
4429    *   c_rarg0   - int adler result
4430    */
4431   address generate_updateBytesAdler32() {
4432     __ align(CodeEntryAlignment);
4433     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4434     address start = __ pc();
4435 
4436     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4437 
4438     // Aliases
4439     Register adler  = c_rarg0;
4440     Register s1     = c_rarg0;
4441     Register s2     = c_rarg3;
4442     Register buff   = c_rarg1;
4443     Register len    = c_rarg2;
4444     Register nmax  = r4;
4445     Register base  = r5;
4446     Register count = r6;
4447     Register temp0 = rscratch1;
4448     Register temp1 = rscratch2;
4449     FloatRegister vbytes = v0;
4450     FloatRegister vs1acc = v1;
4451     FloatRegister vs2acc = v2;
4452     FloatRegister vtable = v3;
4453 
4454     // Max number of bytes we can process before having to take the mod
4455     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4456     uint64_t BASE = 0xfff1;
4457     uint64_t NMAX = 0x15B0;
4458 
4459     __ mov(base, BASE);
4460     __ mov(nmax, NMAX);
4461 
4462     // Load accumulation coefficients for the upper 16 bits
4463     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4464     __ ld1(vtable, __ T16B, Address(temp0));
4465 
4466     // s1 is initialized to the lower 16 bits of adler
4467     // s2 is initialized to the upper 16 bits of adler
4468     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4469     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4470 
4471     // The pipelined loop needs at least 16 elements for one iteration.
4472     // It checks this itself, but it is cheaper to skip straight to the cleanup loop here.
4473     __ cmp(len, (u1)16);
4474     __ br(Assembler::HS, L_nmax);
4475     __ cbz(len, L_combine);
4476 
4477     __ bind(L_simple_by1_loop);
4478     __ ldrb(temp0, Address(__ post(buff, 1)));
4479     __ add(s1, s1, temp0);
4480     __ add(s2, s2, s1);
4481     __ subs(len, len, 1);
4482     __ br(Assembler::HI, L_simple_by1_loop);
4483 
4484     // s1 = s1 % BASE
4485     __ subs(temp0, s1, base);
4486     __ csel(s1, temp0, s1, Assembler::HS);
4487 
4488     // s2 = s2 % BASE
4489     __ lsr(temp0, s2, 16);
4490     __ lsl(temp1, temp0, 4);
4491     __ sub(temp1, temp1, temp0);
4492     __ add(s2, temp1, s2, ext::uxth);
4493 
4494     __ subs(temp0, s2, base);
4495     __ csel(s2, temp0, s2, Assembler::HS);
4496 
4497     __ b(L_combine);
4498 
4499     __ bind(L_nmax);
4500     __ subs(len, len, nmax);
4501     __ sub(count, nmax, 16);
4502     __ br(Assembler::LO, L_by16);
4503 
4504     __ bind(L_nmax_loop);
4505 
4506     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4507                                       vbytes, vs1acc, vs2acc, vtable);
4508 
4509     __ subs(count, count, 16);
4510     __ br(Assembler::HS, L_nmax_loop);
4511 
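         // The reductions below use 2^16 == 15 (mod BASE), i.e.
         //   s == (s & 0xffff) + 15 * (s >> 16)  (mod BASE).
         // Applying this twice brings the value below 2 * BASE, after which a
         // single conditional subtract completes the reduction.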
4512     // s1 = s1 % BASE
4513     __ lsr(temp0, s1, 16);
4514     __ lsl(temp1, temp0, 4);
4515     __ sub(temp1, temp1, temp0);
4516     __ add(temp1, temp1, s1, ext::uxth);
4517 
4518     __ lsr(temp0, temp1, 16);
4519     __ lsl(s1, temp0, 4);
4520     __ sub(s1, s1, temp0);
4521     __ add(s1, s1, temp1, ext::uxth);
4522 
4523     __ subs(temp0, s1, base);
4524     __ csel(s1, temp0, s1, Assembler::HS);
4525 
4526     // s2 = s2 % BASE
4527     __ lsr(temp0, s2, 16);
4528     __ lsl(temp1, temp0, 4);
4529     __ sub(temp1, temp1, temp0);
4530     __ add(temp1, temp1, s2, ext::uxth);
4531 
4532     __ lsr(temp0, temp1, 16);
4533     __ lsl(s2, temp0, 4);
4534     __ sub(s2, s2, temp0);
4535     __ add(s2, s2, temp1, ext::uxth);
4536 
4537     __ subs(temp0, s2, base);
4538     __ csel(s2, temp0, s2, Assembler::HS);
4539 
4540     __ subs(len, len, nmax);
4541     __ sub(count, nmax, 16);
4542     __ br(Assembler::HS, L_nmax_loop);
4543 
4544     __ bind(L_by16);
4545     __ adds(len, len, count);
4546     __ br(Assembler::LO, L_by1);
4547 
4548     __ bind(L_by16_loop);
4549 
4550     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4551                                       vbytes, vs1acc, vs2acc, vtable);
4552 
4553     __ subs(len, len, 16);
4554     __ br(Assembler::HS, L_by16_loop);
4555 
4556     __ bind(L_by1);
4557     __ adds(len, len, 15);
4558     __ br(Assembler::LO, L_do_mod);
4559 
4560     __ bind(L_by1_loop);
4561     __ ldrb(temp0, Address(__ post(buff, 1)));
4562     __ add(s1, temp0, s1);
4563     __ add(s2, s2, s1);
4564     __ subs(len, len, 1);
4565     __ br(Assembler::HS, L_by1_loop);
4566 
4567     __ bind(L_do_mod);
4568     // s1 = s1 % BASE
4569     __ lsr(temp0, s1, 16);
4570     __ lsl(temp1, temp0, 4);
4571     __ sub(temp1, temp1, temp0);
4572     __ add(temp1, temp1, s1, ext::uxth);
4573 
4574     __ lsr(temp0, temp1, 16);
4575     __ lsl(s1, temp0, 4);
4576     __ sub(s1, s1, temp0);
4577     __ add(s1, s1, temp1, ext::uxth);
4578 
4579     __ subs(temp0, s1, base);
4580     __ csel(s1, temp0, s1, Assembler::HS);
4581 
4582     // s2 = s2 % BASE
4583     __ lsr(temp0, s2, 16);
4584     __ lsl(temp1, temp0, 4);
4585     __ sub(temp1, temp1, temp0);
4586     __ add(temp1, temp1, s2, ext::uxth);
4587 
4588     __ lsr(temp0, temp1, 16);
4589     __ lsl(s2, temp0, 4);
4590     __ sub(s2, s2, temp0);
4591     __ add(s2, s2, temp1, ext::uxth);
4592 
4593     __ subs(temp0, s2, base);
4594     __ csel(s2, temp0, s2, Assembler::HS);
4595 
4596     // Combine lower bits and higher bits
4597     __ bind(L_combine);
4598     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4599 
4600     __ ret(lr);
4601 
4602     return start;
4603   }
4604 
4605   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4606           Register temp0, Register temp1, FloatRegister vbytes,
4607           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4608     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4609     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4610     // In non-vectorized code, we update s1 and s2 as:
4611     //   s1 <- s1 + b1
4612     //   s2 <- s2 + s1
4613     //   s1 <- s1 + b2
4614     //   s2 <- s2 + s1
4615     //   ...
4616     //   s1 <- s1 + b16
4617     //   s2 <- s2 + s1
4618     // Putting above assignments together, we have:
4619     //   s1_new = s1 + b1 + b2 + ... + b16
4620     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4621     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4622     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4623     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4624 
4625     // s2 = s2 + s1 * 16
4626     __ add(s2, s2, s1, Assembler::LSL, 4);
4627 
4628     // vs1acc = b1 + b2 + b3 + ... + b16
4629     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
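         // vtable holds the per-byte coefficients (16, 15, ..., 1): the two
         // widening multiplies cover the low and high 8 bytes, and the uaddlv
         // instructions then sum the byte and halfword lanes horizontally.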
4630     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4631     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4632     __ uaddlv(vs1acc, __ T16B, vbytes);
4633     __ uaddlv(vs2acc, __ T8H, vs2acc);
4634 
4635     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4636     __ fmovd(temp0, vs1acc);
4637     __ fmovd(temp1, vs2acc);
4638     __ add(s1, s1, temp0);
4639     __ add(s2, s2, temp1);
4640   }
4641 
4642   /**
4643    *  Arguments:
4644    *
4645    *  Input:
4646    *    c_rarg0   - x address
4647    *    c_rarg1   - x length
4648    *    c_rarg2   - y address
4649    *    c_rarg3   - y length
4650    *    c_rarg4   - z address
4651    *    c_rarg5   - z length
4652    */
4653   address generate_multiplyToLen() {
4654     __ align(CodeEntryAlignment);
4655     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4656 
4657     address start = __ pc();
4658     const Register x     = r0;
4659     const Register xlen  = r1;
4660     const Register y     = r2;
4661     const Register ylen  = r3;
4662     const Register z     = r4;
4663     const Register zlen  = r5;
4664 
4665     const Register tmp1  = r10;
4666     const Register tmp2  = r11;
4667     const Register tmp3  = r12;
4668     const Register tmp4  = r13;
4669     const Register tmp5  = r14;
4670     const Register tmp6  = r15;
4671     const Register tmp7  = r16;
4672 
4673     BLOCK_COMMENT("Entry:");
4674     __ enter(); // required for proper stackwalking of RuntimeStub frame
4675     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4676     __ leave(); // required for proper stackwalking of RuntimeStub frame
4677     __ ret(lr);
4678 
4679     return start;
4680   }
4681 
4682   address generate_squareToLen() {
4683     // The squareToLen algorithm for sizes 1..127 described in the Java code
4684     // works faster than multiply_to_len on some CPUs and slower on others,
4685     // but multiply_to_len shows slightly better results overall.
4686     __ align(CodeEntryAlignment);
4687     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4688     address start = __ pc();
4689 
4690     const Register x     = r0;
4691     const Register xlen  = r1;
4692     const Register z     = r2;
4693     const Register zlen  = r3;
4694     const Register y     = r4; // == x
4695     const Register ylen  = r5; // == xlen
4696 
4697     const Register tmp1  = r10;
4698     const Register tmp2  = r11;
4699     const Register tmp3  = r12;
4700     const Register tmp4  = r13;
4701     const Register tmp5  = r14;
4702     const Register tmp6  = r15;
4703     const Register tmp7  = r16;
4704 
4705     RegSet spilled_regs = RegSet::of(y, ylen);
4706     BLOCK_COMMENT("Entry:");
4707     __ enter();
4708     __ push(spilled_regs, sp);
4709     __ mov(y, x);
4710     __ mov(ylen, xlen);
4711     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4712     __ pop(spilled_regs, sp);
4713     __ leave();
4714     __ ret(lr);
4715     return start;
4716   }
4717 
4718   address generate_mulAdd() {
4719     __ align(CodeEntryAlignment);
4720     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4721 
4722     address start = __ pc();
4723 
4724     const Register out     = r0;
4725     const Register in      = r1;
4726     const Register offset  = r2;
4727     const Register len     = r3;
4728     const Register k       = r4;
4729 
4730     BLOCK_COMMENT("Entry:");
4731     __ enter();
4732     __ mul_add(out, in, offset, len, k);
4733     __ leave();
4734     __ ret(lr);
4735 
4736     return start;
4737   }
4738 
4739   // Arguments:
4740   //
4741   // Input:
4742   //   c_rarg0   - newArr address
4743   //   c_rarg1   - oldArr address
4744   //   c_rarg2   - newIdx
4745   //   c_rarg3   - shiftCount
4746   //   c_rarg4   - numIter
4747   //
4748   address generate_bigIntegerRightShift() {
4749     __ align(CodeEntryAlignment);
4750     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4751     address start = __ pc();
4752 
4753     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4754 
4755     Register newArr        = c_rarg0;
4756     Register oldArr        = c_rarg1;
4757     Register newIdx        = c_rarg2;
4758     Register shiftCount    = c_rarg3;
4759     Register numIter       = c_rarg4;
4760     Register idx           = numIter;
4761 
4762     Register newArrCur     = rscratch1;
4763     Register shiftRevCount = rscratch2;
4764     Register oldArrCur     = r13;
4765     Register oldArrNext    = r14;
4766 
4767     FloatRegister oldElem0        = v0;
4768     FloatRegister oldElem1        = v1;
4769     FloatRegister newElem         = v2;
4770     FloatRegister shiftVCount     = v3;
4771     FloatRegister shiftVRevCount  = v4;
4772 
4773     __ cbz(idx, Exit);
4774 
4775     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4776 
4777     // left shift count
4778     __ movw(shiftRevCount, 32);
4779     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4780 
4781     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code
4782     __ cmp(numIter, (u1)4);
4783     __ br(Assembler::LT, ShiftThree);
4784 
4785     __ dup(shiftVCount,    __ T4S, shiftCount);
4786     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4787     __ negr(shiftVCount,   __ T4S, shiftVCount);
4788 
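         // Each iteration computes, for four words at a time while walking idx down,
         //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
         // (logical shifts); ushl with the negated count performs the right shift.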
4789     __ BIND(ShiftSIMDLoop);
4790 
4791     // Calculate the load addresses
4792     __ sub(idx, idx, 4);
4793     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4794     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4795     __ add(oldArrCur,  oldArrNext, 4);
4796 
4797     // Load 4 words and process
4798     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4799     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4800     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4801     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4802     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4803     __ st1(newElem,   __ T4S,  Address(newArrCur));
4804 
4805     __ cmp(idx, (u1)4);
4806     __ br(Assembler::LT, ShiftTwoLoop);
4807     __ b(ShiftSIMDLoop);
4808 
4809     __ BIND(ShiftTwoLoop);
4810     __ cbz(idx, Exit);
4811     __ cmp(idx, (u1)1);
4812     __ br(Assembler::EQ, ShiftOne);
4813 
4814     // Calculate the load addresses
4815     __ sub(idx, idx, 2);
4816     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4817     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4818     __ add(oldArrCur,  oldArrNext, 4);
4819 
4820     // Load 2 words and process
4821     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4822     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4823     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4824     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4825     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4826     __ st1(newElem,   __ T2S, Address(newArrCur));
4827     __ b(ShiftTwoLoop);
4828 
4829     __ BIND(ShiftThree);
4830     __ tbz(idx, 1, ShiftOne);
4831     __ tbz(idx, 0, ShiftTwo);
4832     __ ldrw(r10,  Address(oldArr, 12));
4833     __ ldrw(r11,  Address(oldArr, 8));
4834     __ lsrvw(r10, r10, shiftCount);
4835     __ lslvw(r11, r11, shiftRevCount);
4836     __ orrw(r12,  r10, r11);
4837     __ strw(r12,  Address(newArr, 8));
4838 
4839     __ BIND(ShiftTwo);
4840     __ ldrw(r10,  Address(oldArr, 8));
4841     __ ldrw(r11,  Address(oldArr, 4));
4842     __ lsrvw(r10, r10, shiftCount);
4843     __ lslvw(r11, r11, shiftRevCount);
4844     __ orrw(r12,  r10, r11);
4845     __ strw(r12,  Address(newArr, 4));
4846 
4847     __ BIND(ShiftOne);
4848     __ ldrw(r10,  Address(oldArr, 4));
4849     __ ldrw(r11,  Address(oldArr));
4850     __ lsrvw(r10, r10, shiftCount);
4851     __ lslvw(r11, r11, shiftRevCount);
4852     __ orrw(r12,  r10, r11);
4853     __ strw(r12,  Address(newArr));
4854 
4855     __ BIND(Exit);
4856     __ ret(lr);
4857 
4858     return start;
4859   }
4860 
4861   // Arguments:
4862   //
4863   // Input:
4864   //   c_rarg0   - newArr address
4865   //   c_rarg1   - oldArr address
4866   //   c_rarg2   - newIdx
4867   //   c_rarg3   - shiftCount
4868   //   c_rarg4   - numIter
4869   //
4870   address generate_bigIntegerLeftShift() {
4871     __ align(CodeEntryAlignment);
4872     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4873     address start = __ pc();
4874 
4875     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4876 
4877     Register newArr        = c_rarg0;
4878     Register oldArr        = c_rarg1;
4879     Register newIdx        = c_rarg2;
4880     Register shiftCount    = c_rarg3;
4881     Register numIter       = c_rarg4;
4882 
4883     Register shiftRevCount = rscratch1;
4884     Register oldArrNext    = rscratch2;
4885 
4886     FloatRegister oldElem0        = v0;
4887     FloatRegister oldElem1        = v1;
4888     FloatRegister newElem         = v2;
4889     FloatRegister shiftVCount     = v3;
4890     FloatRegister shiftVRevCount  = v4;
4891 
4892     __ cbz(numIter, Exit);
4893 
4894     __ add(oldArrNext, oldArr, 4);
4895     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4896 
4897     // right shift count
4898     __ movw(shiftRevCount, 32);
4899     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4900 
4901     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code
4902     __ cmp(numIter, (u1)4);
4903     __ br(Assembler::LT, ShiftThree);
4904 
4905     __ dup(shiftVCount,     __ T4S, shiftCount);
4906     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4907     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4908 
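         // Each iteration computes, for four words at a time while walking upwards,
         //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
         // (logical shifts); here the reversed count is the negated one, so ushl
         // performs the right shift on oldElem1.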
4909     __ BIND(ShiftSIMDLoop);
4910 
4911     // load 4 words and process
4912     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4913     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4914     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4915     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4916     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4917     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4918     __ sub(numIter,   numIter, 4);
4919 
4920     __ cmp(numIter, (u1)4);
4921     __ br(Assembler::LT, ShiftTwoLoop);
4922     __ b(ShiftSIMDLoop);
4923 
4924     __ BIND(ShiftTwoLoop);
4925     __ cbz(numIter, Exit);
4926     __ cmp(numIter, (u1)1);
4927     __ br(Assembler::EQ, ShiftOne);
4928 
4929     // load 2 words and process
4930     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4931     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4932     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4933     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4934     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4935     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4936     __ sub(numIter,   numIter, 2);
4937     __ b(ShiftTwoLoop);
4938 
4939     __ BIND(ShiftThree);
4940     __ ldrw(r10,  __ post(oldArr, 4));
4941     __ ldrw(r11,  __ post(oldArrNext, 4));
4942     __ lslvw(r10, r10, shiftCount);
4943     __ lsrvw(r11, r11, shiftRevCount);
4944     __ orrw(r12,  r10, r11);
4945     __ strw(r12,  __ post(newArr, 4));
4946     __ tbz(numIter, 1, Exit);
4947     __ tbz(numIter, 0, ShiftOne);
4948 
4949     __ BIND(ShiftTwo);
4950     __ ldrw(r10,  __ post(oldArr, 4));
4951     __ ldrw(r11,  __ post(oldArrNext, 4));
4952     __ lslvw(r10, r10, shiftCount);
4953     __ lsrvw(r11, r11, shiftRevCount);
4954     __ orrw(r12,  r10, r11);
4955     __ strw(r12,  __ post(newArr, 4));
4956 
4957     __ BIND(ShiftOne);
4958     __ ldrw(r10,  Address(oldArr));
4959     __ ldrw(r11,  Address(oldArrNext));
4960     __ lslvw(r10, r10, shiftCount);
4961     __ lsrvw(r11, r11, shiftRevCount);
4962     __ orrw(r12,  r10, r11);
4963     __ strw(r12,  Address(newArr));
4964 
4965     __ BIND(Exit);
4966     __ ret(lr);
4967 
4968     return start;
4969   }
4970 
4971   address generate_count_positives(address &count_positives_long) {
4972     const u1 large_loop_size = 64;
4973     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
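         // Every byte of UPPER_BIT_MASK has only its sign bit (0x80) set, so a
         // single tst of an 8-byte load detects whether any of those bytes is negative.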
4974     int dcache_line = VM_Version::dcache_line_size();
4975 
4976     Register ary1 = r1, len = r2, result = r0;
4977 
4978     __ align(CodeEntryAlignment);
4979 
4980     StubCodeMark mark(this, "StubRoutines", "count_positives");
4981 
4982     address entry = __ pc();
4983 
4984     __ enter();
4985     // precondition: a copy of len is already in result
4986     // __ mov(result, len);
4987 
4988   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4989         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4990 
4991   __ cmp(len, (u1)15);
4992   __ br(Assembler::GT, LEN_OVER_15);
4993   // The only case in which execution falls through to this code is when the
4994   // pointer is near the end of a memory page and we must avoid reading the next page
4995   __ add(ary1, ary1, len);
4996   __ subs(len, len, 8);
4997   __ br(Assembler::GT, LEN_OVER_8);
4998   __ ldr(rscratch2, Address(ary1, -8));
4999   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5000   __ lsrv(rscratch2, rscratch2, rscratch1);
5001   __ tst(rscratch2, UPPER_BIT_MASK);
5002   __ csel(result, zr, result, Assembler::NE);
5003   __ leave();
5004   __ ret(lr);
5005   __ bind(LEN_OVER_8);
5006   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5007   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
5008   __ tst(rscratch2, UPPER_BIT_MASK);
5009   __ br(Assembler::NE, RET_NO_POP);
5010   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5011   __ lsrv(rscratch1, rscratch1, rscratch2);
5012   __ tst(rscratch1, UPPER_BIT_MASK);
5013   __ bind(RET_NO_POP);
5014   __ csel(result, zr, result, Assembler::NE);
5015   __ leave();
5016   __ ret(lr);
5017 
5018   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5019   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5020 
5021   count_positives_long = __ pc(); // 2nd entry point
5022 
5023   __ enter();
5024 
5025   __ bind(LEN_OVER_15);
5026     __ push(spilled_regs, sp);
5027     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5028     __ cbz(rscratch2, ALIGNED);
5029     __ ldp(tmp6, tmp1, Address(ary1));
5030     __ mov(tmp5, 16);
5031     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
5032     __ add(ary1, ary1, rscratch1);
5033     __ orr(tmp6, tmp6, tmp1);
5034     __ tst(tmp6, UPPER_BIT_MASK);
5035     __ br(Assembler::NE, RET_ADJUST);
5036     __ sub(len, len, rscratch1);
5037 
5038   __ bind(ALIGNED);
5039     __ cmp(len, large_loop_size);
5040     __ br(Assembler::LT, CHECK_16);
5041     // Perform a 16-byte load in the pre-loop as an early return, to handle the
5042     // situation where an initially aligned large array has negative values in
5043     // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in
5044     // the worst case, which is slower. Cases with negative bytes further ahead
5045     // are not affected much; in fact they get faster due to the early loads and
5046     // the fewer instructions and branches in LARGE_LOOP.
5047     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5048     __ sub(len, len, 16);
5049     __ orr(tmp6, tmp6, tmp1);
5050     __ tst(tmp6, UPPER_BIT_MASK);
5051     __ br(Assembler::NE, RET_ADJUST_16);
5052     __ cmp(len, large_loop_size);
5053     __ br(Assembler::LT, CHECK_16);
5054 
5055     if (SoftwarePrefetchHintDistance >= 0
5056         && SoftwarePrefetchHintDistance >= dcache_line) {
5057       // initial prefetch
5058       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5059     }
5060   __ bind(LARGE_LOOP);
5061     if (SoftwarePrefetchHintDistance >= 0) {
5062       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5063     }
5064     // Issue the load instructions first, since that can save a few CPU/memory
5065     // cycles. Also, instead of 4 "orr; tst; br" triples (one per ldp), it is
5066     // better to generate 7 * orr(...) plus a single tst(...) and br(...), which
5067     // saves 3 instructions and has fewer branches. The drawback is that early
5068     // return is disabled, so all 64 bytes are loaded and checked every time.
5069     __ ldp(tmp2, tmp3, Address(ary1));
5070     __ ldp(tmp4, tmp5, Address(ary1, 16));
5071     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5072     __ ldp(tmp6, tmp1, Address(ary1, 48));
5073     __ add(ary1, ary1, large_loop_size);
5074     __ sub(len, len, large_loop_size);
5075     __ orr(tmp2, tmp2, tmp3);
5076     __ orr(tmp4, tmp4, tmp5);
5077     __ orr(rscratch1, rscratch1, rscratch2);
5078     __ orr(tmp6, tmp6, tmp1);
5079     __ orr(tmp2, tmp2, tmp4);
5080     __ orr(rscratch1, rscratch1, tmp6);
5081     __ orr(tmp2, tmp2, rscratch1);
5082     __ tst(tmp2, UPPER_BIT_MASK);
5083     __ br(Assembler::NE, RET_ADJUST_LONG);
5084     __ cmp(len, large_loop_size);
5085     __ br(Assembler::GE, LARGE_LOOP);
5086 
5087   __ bind(CHECK_16); // small 16-byte load pre-loop
5088     __ cmp(len, (u1)16);
5089     __ br(Assembler::LT, POST_LOOP16);
5090 
5091   __ bind(LOOP16); // small 16-byte load loop
5092     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5093     __ sub(len, len, 16);
5094     __ orr(tmp2, tmp2, tmp3);
5095     __ tst(tmp2, UPPER_BIT_MASK);
5096     __ br(Assembler::NE, RET_ADJUST_16);
5097     __ cmp(len, (u1)16);
5098     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5099 
5100   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5101     __ cmp(len, (u1)8);
5102     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5103     __ ldr(tmp3, Address(__ post(ary1, 8)));
5104     __ tst(tmp3, UPPER_BIT_MASK);
5105     __ br(Assembler::NE, RET_ADJUST);
5106     __ sub(len, len, 8);
5107 
5108   __ bind(POST_LOOP16_LOAD_TAIL);
5109     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5110     __ ldr(tmp1, Address(ary1));
5111     __ mov(tmp2, 64);
5112     __ sub(tmp4, tmp2, len, __ LSL, 3);
5113     __ lslv(tmp1, tmp1, tmp4);
5114     __ tst(tmp1, UPPER_BIT_MASK);
5115     __ br(Assembler::NE, RET_ADJUST);
5116     // Fallthrough
5117 
5118   __ bind(RET_LEN);
5119     __ pop(spilled_regs, sp);
5120     __ leave();
5121     __ ret(lr);
5122 
5123     // The difference result - len is the count of bytes that are guaranteed
5124     // to be positive.
5125 
5126   __ bind(RET_ADJUST_LONG);
5127     __ add(len, len, (u1)(large_loop_size - 16));
5128   __ bind(RET_ADJUST_16);
5129     __ add(len, len, 16);
5130   __ bind(RET_ADJUST);
5131     __ pop(spilled_regs, sp);
5132     __ leave();
5133     __ sub(result, result, len);
5134     __ ret(lr);
5135 
5136     return entry;
5137   }
5138 
5139   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5140         bool usePrefetch, Label &NOT_EQUAL) {
5141     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5142         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5143         tmp7 = r12, tmp8 = r13;
5144     Label LOOP;
5145 
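         // The loop body is software-pipelined: the loads for the next chunk are
         // issued before the previously loaded values are xor-compared, hiding
         // load latency behind the eor/orr/cbnz chain.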
5146     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5147     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5148     __ bind(LOOP);
5149     if (usePrefetch) {
5150       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5151       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5152     }
5153     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5154     __ eor(tmp1, tmp1, tmp2);
5155     __ eor(tmp3, tmp3, tmp4);
5156     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5157     __ orr(tmp1, tmp1, tmp3);
5158     __ cbnz(tmp1, NOT_EQUAL);
5159     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5160     __ eor(tmp5, tmp5, tmp6);
5161     __ eor(tmp7, tmp7, tmp8);
5162     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5163     __ orr(tmp5, tmp5, tmp7);
5164     __ cbnz(tmp5, NOT_EQUAL);
5165     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5166     __ eor(tmp1, tmp1, tmp2);
5167     __ eor(tmp3, tmp3, tmp4);
5168     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5169     __ orr(tmp1, tmp1, tmp3);
5170     __ cbnz(tmp1, NOT_EQUAL);
5171     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5172     __ eor(tmp5, tmp5, tmp6);
5173     __ sub(cnt1, cnt1, 8 * wordSize);
5174     __ eor(tmp7, tmp7, tmp8);
5175     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5176     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5177     // cmp) because subs allows an unlimited range of immediate operands.
5178     __ subs(tmp6, cnt1, loopThreshold);
5179     __ orr(tmp5, tmp5, tmp7);
5180     __ cbnz(tmp5, NOT_EQUAL);
5181     __ br(__ GE, LOOP);
5182     // post-loop
5183     __ eor(tmp1, tmp1, tmp2);
5184     __ eor(tmp3, tmp3, tmp4);
5185     __ orr(tmp1, tmp1, tmp3);
5186     __ sub(cnt1, cnt1, 2 * wordSize);
5187     __ cbnz(tmp1, NOT_EQUAL);
5188   }
5189 
5190   void generate_large_array_equals_loop_simd(int loopThreshold,
5191         bool usePrefetch, Label &NOT_EQUAL) {
5192     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5193         tmp2 = rscratch2;
5194     Label LOOP;
5195 
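         // Compare 64 bytes per array per iteration: xor the corresponding quads,
         // or-reduce them into v0, then move both 64-bit halves to GPRs for a
         // single compare-and-branch.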
5196     __ bind(LOOP);
5197     if (usePrefetch) {
5198       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5199       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5200     }
5201     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5202     __ sub(cnt1, cnt1, 8 * wordSize);
5203     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5204     __ subs(tmp1, cnt1, loopThreshold);
5205     __ eor(v0, __ T16B, v0, v4);
5206     __ eor(v1, __ T16B, v1, v5);
5207     __ eor(v2, __ T16B, v2, v6);
5208     __ eor(v3, __ T16B, v3, v7);
5209     __ orr(v0, __ T16B, v0, v1);
5210     __ orr(v1, __ T16B, v2, v3);
5211     __ orr(v0, __ T16B, v0, v1);
5212     __ umov(tmp1, v0, __ D, 0);
5213     __ umov(tmp2, v0, __ D, 1);
5214     __ orr(tmp1, tmp1, tmp2);
5215     __ cbnz(tmp1, NOT_EQUAL);
5216     __ br(__ GE, LOOP);
5217   }
5218 
5219   // a1 = r1 - array1 address
5220   // a2 = r2 - array2 address
5221   // result = r0 - return value. Already contains "false"
5222   // cnt1 = r10 - number of elements left to check, reduced by wordSize
5223   // r3-r5 are reserved temporary registers
5224   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5225   address generate_large_array_equals() {
5226     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5227         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5228         tmp7 = r12, tmp8 = r13;
5229     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5230         SMALL_LOOP, POST_LOOP;
5231     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5232     // loop threshold ensuring that at least 32 of the prefetched bytes are used
5233     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5234     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5235     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5236     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5237         tmp5, tmp6, tmp7, tmp8);
5238 
5239     __ align(CodeEntryAlignment);
5240 
5241     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5242 
5243     address entry = __ pc();
5244     __ enter();
5245     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5246     // also advance pointers to use post-increment instead of pre-increment
5247     __ add(a1, a1, wordSize);
5248     __ add(a2, a2, wordSize);
5249     if (AvoidUnalignedAccesses) {
5250       // Both implementations (SIMD/non-SIMD) use relatively large load
5251       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5252       // time) on some CPUs when the address is not at least 16-byte aligned.
5253       // Arrays are currently 8-byte aligned, so if needed we do an additional
5254       // 8-byte load on the first address to make it 16-byte aligned.
5255       Label ALIGNED16;
5256       __ tbz(a1, 3, ALIGNED16);
5257       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5258       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5259       __ sub(cnt1, cnt1, wordSize);
5260       __ eor(tmp1, tmp1, tmp2);
5261       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5262       __ bind(ALIGNED16);
5263     }
5264     if (UseSIMDForArrayEquals) {
5265       if (SoftwarePrefetchHintDistance >= 0) {
5266         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5267         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5268         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5269             /* prfm = */ true, NOT_EQUAL);
5270         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5271         __ br(__ LT, TAIL);
5272       }
5273       __ bind(NO_PREFETCH_LARGE_LOOP);
5274       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5275           /* prfm = */ false, NOT_EQUAL);
5276     } else {
5277       __ push(spilled_regs, sp);
5278       if (SoftwarePrefetchHintDistance >= 0) {
5279         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5280         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5281         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5282             /* prfm = */ true, NOT_EQUAL);
5283         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5284         __ br(__ LT, TAIL);
5285       }
5286       __ bind(NO_PREFETCH_LARGE_LOOP);
5287       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5288           /* prfm = */ false, NOT_EQUAL);
5289     }
5290     __ bind(TAIL);
5291       __ cbz(cnt1, EQUAL);
5292       __ subs(cnt1, cnt1, wordSize);
5293       __ br(__ LE, POST_LOOP);
5294     __ bind(SMALL_LOOP);
5295       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5296       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5297       __ subs(cnt1, cnt1, wordSize);
5298       __ eor(tmp1, tmp1, tmp2);
5299       __ cbnz(tmp1, NOT_EQUAL);
5300       __ br(__ GT, SMALL_LOOP);
5301     __ bind(POST_LOOP);
5302       __ ldr(tmp1, Address(a1, cnt1));
5303       __ ldr(tmp2, Address(a2, cnt1));
5304       __ eor(tmp1, tmp1, tmp2);
5305       __ cbnz(tmp1, NOT_EQUAL);
5306     __ bind(EQUAL);
5307       __ mov(result, true);
5308     __ bind(NOT_EQUAL);
5309       if (!UseSIMDForArrayEquals) {
5310         __ pop(spilled_regs, sp);
5311       }
5312     __ bind(NOT_EQUAL_NO_POP);
5313     __ leave();
5314     __ ret(lr);
5315     return entry;
5316   }
5317 
5318   address generate_dsin_dcos(bool isCos) {
5319     __ align(CodeEntryAlignment);
5320     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5321     address start = __ pc();
5322     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5323         (address)StubRoutines::aarch64::_two_over_pi,
5324         (address)StubRoutines::aarch64::_pio2,
5325         (address)StubRoutines::aarch64::_dsin_coef,
5326         (address)StubRoutines::aarch64::_dcos_coef);
5327     return start;
5328   }
5329 
5330   address generate_dlog() {
5331     __ align(CodeEntryAlignment);
5332     StubCodeMark mark(this, "StubRoutines", "dlog");
5333     address entry = __ pc();
5334     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5335         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5336     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5337     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5338         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5339     return entry;
5340   }
5341 
5342 
5343   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
5344   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5345       Label &DIFF2) {
5346     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5347     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5348 
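         // tmp2 walks the Latin1 string and cnt1 the UTF-16 string (set up by the
         // caller). zip1/zip2 interleave the 16 Latin1 bytes with the zero vector
         // vtmpZ, producing their UTF-16 expansion, which is then compared against
         // the UTF-16 data 8 bytes at a time.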
5349     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5350     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5351     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5352     // the 16 Latin1 bytes expand to 32 bytes of UTF-16 data: vtmp3 holds the
5353     // low half now, vtmp supplies the high half via the zip2 below
5353 
5354     __ fmovd(tmpL, vtmp3);
5355     __ eor(rscratch2, tmp3, tmpL);
5356     __ cbnz(rscratch2, DIFF2);
5357 
5358     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5359     __ umov(tmpL, vtmp3, __ D, 1);
5360     __ eor(rscratch2, tmpU, tmpL);
5361     __ cbnz(rscratch2, DIFF1);
5362 
5363     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5364     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5365     __ fmovd(tmpL, vtmp);
5366     __ eor(rscratch2, tmp3, tmpL);
5367     __ cbnz(rscratch2, DIFF2);
5368 
5369     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5370     __ umov(tmpL, vtmp, __ D, 1);
5371     __ eor(rscratch2, tmpU, tmpL);
5372     __ cbnz(rscratch2, DIFF1);
5373   }
5374 
5375   // r0  = result
5376   // r1  = str1
5377   // r2  = cnt1
5378   // r3  = str2
5379   // r4  = cnt2
5380   // r10 = tmp1
5381   // r11 = tmp2
5382   address generate_compare_long_string_different_encoding(bool isLU) {
5383     __ align(CodeEntryAlignment);
5384     StubCodeMark mark(this, "StubRoutines", isLU
5385         ? "compare_long_string_different_encoding LU"
5386         : "compare_long_string_different_encoding UL");
5387     address entry = __ pc();
5388     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5389         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5390         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5391     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5392         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5393     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5394     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5395 
5396     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5397 
5398     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5399     // cnt2 == number of characters left to compare
5400     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5401     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5402     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5403     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5404     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5405     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5406     __ eor(rscratch2, tmp1, tmp2);
5407     __ mov(rscratch1, tmp2);
5408     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5409     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5410              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5411     __ push(spilled_regs, sp);
5412     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5413     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5414 
5415     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5416 
5417     if (SoftwarePrefetchHintDistance >= 0) {
5418       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5419       __ br(__ LT, NO_PREFETCH);
5420       __ bind(LARGE_LOOP_PREFETCH);
5421         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5422         __ mov(tmp4, 2);
5423         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5424         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5425           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5426           __ subs(tmp4, tmp4, 1);
5427           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5428           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5429           __ mov(tmp4, 2);
5430         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5431           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5432           __ subs(tmp4, tmp4, 1);
5433           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5434           __ sub(cnt2, cnt2, 64);
5435           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5436           __ br(__ GE, LARGE_LOOP_PREFETCH);
5437     }
5438     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5439     __ bind(NO_PREFETCH);
5440     __ subs(cnt2, cnt2, 16);
5441     __ br(__ LT, TAIL);
5442     __ align(OptoLoopAlignment);
5443     __ bind(SMALL_LOOP); // smaller loop
5444       __ subs(cnt2, cnt2, 16);
5445       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5446       __ br(__ GE, SMALL_LOOP);
5447       __ cmn(cnt2, (u1)16);
5448       __ br(__ EQ, LOAD_LAST);
5449     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5450       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5451       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5452       __ ldr(tmp3, Address(cnt1, -8));
5453       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5454       __ b(LOAD_LAST);
5455     __ bind(DIFF2);
5456       __ mov(tmpU, tmp3);
5457     __ bind(DIFF1);
5458       __ pop(spilled_regs, sp);
5459       __ b(CALCULATE_DIFFERENCE);
5460     __ bind(LOAD_LAST);
5461       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU;
5462       // no need to load them again.
5463       __ mov(tmpU, tmp3);
5464       __ pop(spilled_regs, sp);
5465 
5466       // tmp2 points to the address of the last 4 Latin1 characters right now
5467       __ ldrs(vtmp, Address(tmp2));
5468       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5469       __ fmovd(tmpL, vtmp);
5470 
5471       __ eor(rscratch2, tmpU, tmpL);
5472       __ cbz(rscratch2, DONE);
5473 
5474     // Find the first different characters in the longwords and
5475     // compute their difference.
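         // The loads are little-endian, so the first (lowest-address) characters end
         // up in the least-significant bits; rev + clz locate the first differing bit
         // from that end, andr rounds it down to a 16-bit character boundary, and
         // lsrv + uxthw extract the two differing characters.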
5476     __ bind(CALCULATE_DIFFERENCE);
5477       __ rev(rscratch2, rscratch2);
5478       __ clz(rscratch2, rscratch2);
5479       __ andr(rscratch2, rscratch2, -16);
5480       __ lsrv(tmp1, tmp1, rscratch2);
5481       __ uxthw(tmp1, tmp1);
5482       __ lsrv(rscratch1, rscratch1, rscratch2);
5483       __ uxthw(rscratch1, rscratch1);
5484       __ subw(result, tmp1, rscratch1);
5485     __ bind(DONE);
5486       __ ret(lr);
5487     return entry;
5488   }
5489 
5490   address generate_method_entry_barrier() {
5491     __ align(CodeEntryAlignment);
5492     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5493 
5494     Label deoptimize_label;
5495 
5496     address start = __ pc();
5497 
5498     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5499 
5500     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5501       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5502       // We can get here despite the nmethod being good, if we have not
5503       // yet applied our cross modification fence (or data fence).
5504       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5505       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5506       __ ldrw(rscratch2, rscratch2);
5507       __ strw(rscratch2, thread_epoch_addr);
5508       __ isb();
5509       __ membar(__ LoadLoad);
5510     }
5511 
5512     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5513 
5514     __ enter();
5515     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5516 
5517     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5518 
5519     __ push_call_clobbered_registers();
5520 
5521     __ mov(c_rarg0, rscratch2);
5522     __ call_VM_leaf
5523          (CAST_FROM_FN_PTR
5524           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5525 
5526     __ reset_last_Java_frame(true);
5527 
5528     __ mov(rscratch1, r0);
5529 
5530     __ pop_call_clobbered_registers();
5531 
5532     __ cbnz(rscratch1, deoptimize_label);
5533 
5534     __ leave();
5535     __ ret(lr);
5536 
5537     __ BIND(deoptimize_label);
5538 
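         // The barrier runtime returned a continuation context in the four words
         // reserved at entry (see the sub(sp, sp, 4 * wordSize) above): restore
         // {sp, fp, lr} from it and branch to the new pc.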
5539     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5540     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5541 
5542     __ mov(sp, rscratch1);
5543     __ br(rscratch2);
5544 
5545     return start;
5546   }
5547 
5548   // r0  = result
5549   // r1  = str1
5550   // r2  = cnt1
5551   // r3  = str2
5552   // r4  = cnt2
5553   // r10 = tmp1
5554   // r11 = tmp2
5555   address generate_compare_long_string_same_encoding(bool isLL) {
5556     __ align(CodeEntryAlignment);
5557     StubCodeMark mark(this, "StubRoutines", isLL
5558         ? "compare_long_string_same_encoding LL"
5559         : "compare_long_string_same_encoding UU");
5560     address entry = __ pc();
5561     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5562         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5563 
5564     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5565 
5566     // exit from the large loop when fewer than 64 bytes are left to read or we are
5567     // about to prefetch memory beyond the array boundary
5568     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5569 
5570     // the caller pre-loads 8 bytes before jumping to the stub, so compare them directly
5571     __ eor(rscratch2, tmp1, tmp2);
5572     __ cbnz(rscratch2, CAL_DIFFERENCE);
5573 
5574     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5575     // update pointers, because of previous read
5576     __ add(str1, str1, wordSize);
5577     __ add(str2, str2, wordSize);
5578     if (SoftwarePrefetchHintDistance >= 0) {
5579       __ align(OptoLoopAlignment);
5580       __ bind(LARGE_LOOP_PREFETCH);
5581         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5582         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5583 
5584         for (int i = 0; i < 4; i++) {
5585           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5586           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5587           __ cmp(tmp1, tmp2);
5588           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5589           __ br(Assembler::NE, DIFF);
5590         }
5591         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5592         __ add(str1, str1, 64);
5593         __ add(str2, str2, 64);
5594         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5595         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5596         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5597     }
5598 
5599     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5600     __ br(Assembler::LE, LESS16);
5601     __ align(OptoLoopAlignment);
5602     __ bind(LOOP_COMPARE16);
5603       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5604       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5605       __ cmp(tmp1, tmp2);
5606       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5607       __ br(Assembler::NE, DIFF);
5608       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5609       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5610       __ br(Assembler::LT, LESS16);
5611 
5612       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5613       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5614       __ cmp(tmp1, tmp2);
5615       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5616       __ br(Assembler::NE, DIFF);
5617       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5618       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5619       __ br(Assembler::GE, LOOP_COMPARE16);
5620       __ cbz(cnt2, LENGTH_DIFF);
5621 
5622     __ bind(LESS16);
      // compare the next 8 bytes (8 Latin-1 or 4 UTF-16 characters)
5624       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5625       __ br(Assembler::LE, LESS8);
5626       __ ldr(tmp1, Address(__ post(str1, 8)));
5627       __ ldr(tmp2, Address(__ post(str2, 8)));
5628       __ eor(rscratch2, tmp1, tmp2);
5629       __ cbnz(rscratch2, CAL_DIFFERENCE);
5630       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5631 
5632     __ bind(LESS8); // directly load last 8 bytes
5633       if (!isLL) {
5634         __ add(cnt2, cnt2, cnt2);
5635       }
5636       __ ldr(tmp1, Address(str1, cnt2));
5637       __ ldr(tmp2, Address(str2, cnt2));
5638       __ eor(rscratch2, tmp1, tmp2);
5639       __ cbz(rscratch2, LENGTH_DIFF);
5640       __ b(CAL_DIFFERENCE);
5641 
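    // DIFF re-compares the low 8-byte halves and selects the mismatching pair
    // into tmp1/tmp2. CAL_DIFFERENCE then locates the first differing
    // character: the strings are loaded little-endian, so the first difference
    // is the lowest non-zero byte of the XOR; rev + clz give its bit offset,
    // which is rounded down to a character boundary, and both words are
    // shifted right by that amount before the final zero-extended subtraction.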
5642     __ bind(DIFF);
5643       __ cmp(tmp1, tmp2);
5644       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5645       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5646       // reuse rscratch2 register for the result of eor instruction
5647       __ eor(rscratch2, tmp1, tmp2);
5648 
5649     __ bind(CAL_DIFFERENCE);
5650       __ rev(rscratch2, rscratch2);
5651       __ clz(rscratch2, rscratch2);
5652       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5653       __ lsrv(tmp1, tmp1, rscratch2);
5654       __ lsrv(tmp2, tmp2, rscratch2);
5655       if (isLL) {
5656         __ uxtbw(tmp1, tmp1);
5657         __ uxtbw(tmp2, tmp2);
5658       } else {
5659         __ uxthw(tmp1, tmp1);
5660         __ uxthw(tmp2, tmp2);
5661       }
5662       __ subw(result, tmp1, tmp2);
5663 
5664     __ bind(LENGTH_DIFF);
5665       __ ret(lr);
5666     return entry;
5667   }
5668 
5669   enum string_compare_mode {
5670     LL,
5671     LU,
5672     UL,
5673     UU,
5674   };
5675 
5676   // The following registers are declared in aarch64.ad
5677   // r0  = result
5678   // r1  = str1
5679   // r2  = cnt1
5680   // r3  = str2
5681   // r4  = cnt2
5682   // r10 = tmp1
5683   // r11 = tmp2
5684   // z0  = ztmp1
5685   // z1  = ztmp2
5686   // p0  = pgtmp1
5687   // p1  = pgtmp2
5688   address generate_compare_long_string_sve(string_compare_mode mode) {
5689     __ align(CodeEntryAlignment);
5690     address entry = __ pc();
5691     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5692              tmp1 = r10, tmp2 = r11;
5693 
5694     Label LOOP, DONE, MISMATCH;
5695     Register vec_len = tmp1;
5696     Register idx = tmp2;
5697     // The minimum of the string lengths has been stored in cnt2.
5698     Register cnt = cnt2;
5699     FloatRegister ztmp1 = z0, ztmp2 = z1;
5700     PRegister pgtmp1 = p0, pgtmp2 = p1;
5701 
5702 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5703     switch (mode) {                                                            \
5704       case LL:                                                                 \
5705         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5706         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5707         break;                                                                 \
5708       case LU:                                                                 \
5709         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5710         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5711         break;                                                                 \
5712       case UL:                                                                 \
5713         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5714         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5715         break;                                                                 \
5716       case UU:                                                                 \
5717         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5718         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5719         break;                                                                 \
5720       default:                                                                 \
5721         ShouldNotReachHere();                                                  \
5722     }
5723 
5724     const char* stubname;
5725     switch (mode) {
5726       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5727       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5728       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5729       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5730       default: ShouldNotReachHere();
5731     }
5732 
5733     StubCodeMark mark(this, "StubRoutines", stubname);
5734 
5735     __ mov(idx, 0);
5736     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5737 
5738     if (mode == LL) {
5739       __ sve_cntb(vec_len);
5740     } else {
5741       __ sve_cnth(vec_len);
5742     }
5743 
5744     __ sub(rscratch1, cnt, vec_len);
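    // rscratch1 = cnt - vec_len bounds the main loop: it only runs while a
    // full vector of characters remains, and the last (possibly partial)
    // vector is handled after the loop under a fresh whilelt predicate.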
5745 
5746     __ bind(LOOP);
5747 
5748       // main loop
5749       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5750       __ add(idx, idx, vec_len);
5751       // Compare strings.
5752       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5753       __ br(__ NE, MISMATCH);
5754       __ cmp(idx, rscratch1);
5755       __ br(__ LT, LOOP);
5756 
5757     // post loop, last iteration
5758     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5759 
5760     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5761     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5762     __ br(__ EQ, DONE);
5763 
5764     __ bind(MISMATCH);
5765 
    // Crop the predicate to the elements before the first mismatch, so that
    // sve_lasta below extracts the first differing characters.
5767     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5768     // Extract the first different characters of each string.
5769     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5770     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5771 
5772     // Compute the difference of the first different characters.
5773     __ sub(result, rscratch1, rscratch2);
5774 
5775     __ bind(DONE);
5776     __ ret(lr);
5777 #undef LOAD_PAIR
5778     return entry;
5779   }
5780 
5781   void generate_compare_long_strings() {
5782     if (UseSVE == 0) {
5783       StubRoutines::aarch64::_compare_long_string_LL
5784           = generate_compare_long_string_same_encoding(true);
5785       StubRoutines::aarch64::_compare_long_string_UU
5786           = generate_compare_long_string_same_encoding(false);
5787       StubRoutines::aarch64::_compare_long_string_LU
5788           = generate_compare_long_string_different_encoding(true);
5789       StubRoutines::aarch64::_compare_long_string_UL
5790           = generate_compare_long_string_different_encoding(false);
5791     } else {
5792       StubRoutines::aarch64::_compare_long_string_LL
5793           = generate_compare_long_string_sve(LL);
5794       StubRoutines::aarch64::_compare_long_string_UU
5795           = generate_compare_long_string_sve(UU);
5796       StubRoutines::aarch64::_compare_long_string_LU
5797           = generate_compare_long_string_sve(LU);
5798       StubRoutines::aarch64::_compare_long_string_UL
5799           = generate_compare_long_string_sve(UL);
5800     }
5801   }
5802 
5803   // R0 = result
5804   // R1 = str2
5805   // R2 = cnt1
5806   // R3 = str1
5807   // R4 = cnt2
5808   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5809   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register of the pattern (since
  // length >= 8) in order to skip the initial load (this helps on systems
  // with a single load pipeline)
  // 2) we can use the "fast" single-character search algorithm to find the
  // first symbol with fewer branches (one branch per loaded register instead
  // of one branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (see the illustrative sketch below)
  // 3) after loading and analyzing the first register of the source string,
  // it can be used to search for every occurrence of the first character,
  // saving a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
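  //
  // As an illustrative sketch only (this is not code used by the stub), the
  // single-character search in 2) corresponds to the classic SWAR zero-byte
  // test, shown here for the Latin-1 case:
  //
  //   uint64_t splat = 0x0101010101010101ULL * first_char;
  //   uint64_t x     = chunk ^ splat;   // zero byte wherever chunk[i] == first_char
  //   uint64_t hit   = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //
  // 'hit' is non-zero iff some byte of 'chunk' equals first_char; the lowest
  // set lane marks the first match exactly, while higher lanes may be false
  // positives, which is why candidates are verified with exact comparisons
  // below. The UTF-16 variant applies the 0x0001...0001 and 0x7fff...7fff
  // constants to 16-bit lanes in the same way.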
5824   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5825     const char* stubName = str1_isL
5826         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5827         : "indexof_linear_uu";
5828     __ align(CodeEntryAlignment);
5829     StubCodeMark mark(this, "StubRoutines", stubName);
5830     address entry = __ pc();
5831 
5832     int str1_chr_size = str1_isL ? 1 : 2;
5833     int str2_chr_size = str2_isL ? 1 : 2;
5834     int str1_chr_shift = str1_isL ? 0 : 1;
5835     int str2_chr_shift = str2_isL ? 0 : 1;
5836     bool isL = str1_isL && str2_isL;
    // parameters
5838     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5839     // temporary registers
5840     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5841     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5842     // redefinitions
5843     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5844 
5845     __ push(spilled_regs, sp);
5846     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5847         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5848         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5849         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5850         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5851         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5852     // Read whole register from str1. It is safe, because length >=8 here
5853     __ ldr(ch1, Address(str1));
5854     // Read whole register from str2. It is safe, because length >=8 here
5855     __ ldr(ch2, Address(str2));
5856     __ sub(cnt2, cnt2, cnt1);
5857     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5858     if (str1_isL != str2_isL) {
5859       __ eor(v0, __ T16B, v0, v0);
5860     }
5861     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5862     __ mul(first, first, tmp1);
    // check whether fewer than one full register of characters remains
5864     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5865     if (str1_isL != str2_isL) {
5866       __ fmovd(v1, ch1);
5867     }
5868     __ br(__ LE, L_SMALL);
5869     __ eor(ch2, first, ch2);
5870     if (str1_isL != str2_isL) {
5871       __ zip1(v1, __ T16B, v1, v0);
5872     }
5873     __ sub(tmp2, ch2, tmp1);
5874     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5875     __ bics(tmp2, tmp2, ch2);
5876     if (str1_isL != str2_isL) {
5877       __ fmovd(ch1, v1);
5878     }
5879     __ br(__ NE, L_HAS_ZERO);
5880     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5881     __ add(result, result, wordSize/str2_chr_size);
5882     __ add(str2, str2, wordSize);
5883     __ br(__ LT, L_POST_LOOP);
5884     __ BIND(L_LOOP);
5885       __ ldr(ch2, Address(str2));
5886       __ eor(ch2, first, ch2);
5887       __ sub(tmp2, ch2, tmp1);
5888       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5889       __ bics(tmp2, tmp2, ch2);
5890       __ br(__ NE, L_HAS_ZERO);
5891     __ BIND(L_LOOP_PROCEED);
5892       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5893       __ add(str2, str2, wordSize);
5894       __ add(result, result, wordSize/str2_chr_size);
5895       __ br(__ GE, L_LOOP);
5896     __ BIND(L_POST_LOOP);
5897       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5898       __ br(__ LE, NOMATCH);
5899       __ ldr(ch2, Address(str2));
5900       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5901       __ eor(ch2, first, ch2);
5902       __ sub(tmp2, ch2, tmp1);
5903       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5904       __ mov(tmp4, -1); // all bits set
5905       __ b(L_SMALL_PROCEED);
5906     __ align(OptoLoopAlignment);
5907     __ BIND(L_SMALL);
5908       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5909       __ eor(ch2, first, ch2);
5910       if (str1_isL != str2_isL) {
5911         __ zip1(v1, __ T16B, v1, v0);
5912       }
5913       __ sub(tmp2, ch2, tmp1);
5914       __ mov(tmp4, -1); // all bits set
5915       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5916       if (str1_isL != str2_isL) {
5917         __ fmovd(ch1, v1); // move converted 4 symbols
5918       }
5919     __ BIND(L_SMALL_PROCEED);
5920       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5921       __ bic(tmp2, tmp2, ch2);
5922       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5923       __ rbit(tmp2, tmp2);
5924       __ br(__ EQ, NOMATCH);
5925     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5927       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5928       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5929       if (str2_isL) { // LL
5930         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5931         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5932         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5933         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5934         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5935       } else {
5936         __ mov(ch2, 0xE); // all bits in byte set except last one
5937         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5938         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5939         __ lslv(tmp2, tmp2, tmp4);
5940         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5941         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5942         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5943         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5944       }
5945       __ cmp(ch1, ch2);
5946       __ mov(tmp4, wordSize/str2_chr_size);
5947       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5948     __ BIND(L_SMALL_CMP_LOOP);
5949       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5950                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5951       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5952                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5953       __ add(tmp4, tmp4, 1);
5954       __ cmp(tmp4, cnt1);
5955       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5956       __ cmp(first, ch2);
5957       __ br(__ EQ, L_SMALL_CMP_LOOP);
5958     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5959       __ cbz(tmp2, NOMATCH); // no more matches. exit
5960       __ clz(tmp4, tmp2);
5961       __ add(result, result, 1); // advance index
5962       __ add(str2, str2, str2_chr_size); // advance pointer
5963       __ b(L_SMALL_HAS_ZERO_LOOP);
5964     __ align(OptoLoopAlignment);
5965     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5966       __ cmp(first, ch2);
5967       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5968       __ b(DONE);
5969     __ align(OptoLoopAlignment);
5970     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5971       if (str2_isL) { // LL
5972         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5973         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5974         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5975         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5976         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5977       } else {
5978         __ mov(ch2, 0xE); // all bits in byte set except last one
5979         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5980         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5981         __ lslv(tmp2, tmp2, tmp4);
5982         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5983         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5984         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5985         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5986       }
5987       __ cmp(ch1, ch2);
5988       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5989       __ b(DONE);
5990     __ align(OptoLoopAlignment);
5991     __ BIND(L_HAS_ZERO);
5992       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in this loop.
5997       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5998       __ sub(result, result, 1);
5999     __ BIND(L_HAS_ZERO_LOOP);
6000       __ mov(cnt1, wordSize/str2_chr_size);
6001       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6002       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6003       if (str2_isL) {
6004         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6005         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6006         __ lslv(tmp2, tmp2, tmp4);
6007         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6008         __ add(tmp4, tmp4, 1);
6009         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6010         __ lsl(tmp2, tmp2, 1);
6011         __ mov(tmp4, wordSize/str2_chr_size);
6012       } else {
6013         __ mov(ch2, 0xE);
6014         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6015         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6016         __ lslv(tmp2, tmp2, tmp4);
6017         __ add(tmp4, tmp4, 1);
6018         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6019         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6020         __ lsl(tmp2, tmp2, 1);
6021         __ mov(tmp4, wordSize/str2_chr_size);
6022         __ sub(str2, str2, str2_chr_size);
6023       }
6024       __ cmp(ch1, ch2);
6025       __ mov(tmp4, wordSize/str2_chr_size);
6026       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6027     __ BIND(L_CMP_LOOP);
6028       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6029                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6030       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6031                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6032       __ add(tmp4, tmp4, 1);
6033       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6034       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6035       __ cmp(cnt1, ch2);
6036       __ br(__ EQ, L_CMP_LOOP);
6037     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this candidate position
6039       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6040       __ clz(tmp4, tmp2);
6041       __ add(str2, str2, str2_chr_size); // advance pointer
6042       __ b(L_HAS_ZERO_LOOP);
6043     __ align(OptoLoopAlignment);
6044     __ BIND(L_CMP_LOOP_LAST_CMP);
6045       __ cmp(cnt1, ch2);
6046       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6047       __ b(DONE);
6048     __ align(OptoLoopAlignment);
6049     __ BIND(L_CMP_LOOP_LAST_CMP2);
6050       if (str2_isL) {
6051         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6052         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6053         __ lslv(tmp2, tmp2, tmp4);
6054         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6055         __ add(tmp4, tmp4, 1);
6056         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6057         __ lsl(tmp2, tmp2, 1);
6058       } else {
6059         __ mov(ch2, 0xE);
6060         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6061         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6062         __ lslv(tmp2, tmp2, tmp4);
6063         __ add(tmp4, tmp4, 1);
6064         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6065         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6066         __ lsl(tmp2, tmp2, 1);
6067         __ sub(str2, str2, str2_chr_size);
6068       }
6069       __ cmp(ch1, ch2);
6070       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6071       __ b(DONE);
6072     __ align(OptoLoopAlignment);
6073     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective higher bits were left
      // unchanged. L_LOOP_PROCEED will increase result by the number of
      // analyzed characters, so we can simply reset the lower bits of result
      // here: clear the 2 lower bits for UU/UL and 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring within the current octet, so str2
      // is at the respective start address and needs to be advanced to the
      // next octet.
6084       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6085       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6086       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6087       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6088       __ movw(cnt2, cnt2);
6089       __ b(L_LOOP_PROCEED);
6090     __ align(OptoLoopAlignment);
6091     __ BIND(NOMATCH);
6092       __ mov(result, -1);
6093     __ BIND(DONE);
6094       __ pop(spilled_regs, sp);
6095       __ ret(lr);
6096     return entry;
6097   }
6098 
6099   void generate_string_indexof_stubs() {
6100     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6101     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6102     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6103   }
6104 
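  // Inflate 2 x 16 Latin-1 bytes into 32 UTF-16 characters: zip1/zip2
  // interleave each source byte with a zero byte taken from v0 (which the
  // caller keeps zeroed), and the widened data is written back with a single
  // four-register st1.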
6105   void inflate_and_store_2_fp_registers(bool generatePrfm,
6106       FloatRegister src1, FloatRegister src2) {
6107     Register dst = r1;
6108     __ zip1(v1, __ T16B, src1, v0);
6109     __ zip2(v2, __ T16B, src1, v0);
6110     if (generatePrfm) {
6111       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6112     }
6113     __ zip1(v3, __ T16B, src2, v0);
6114     __ zip2(v4, __ T16B, src2, v0);
6115     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6116   }
6117 
6118   // R0 = src
6119   // R1 = dst
6120   // R2 = len
6121   // R3 = len >> 3
6122   // V0 = 0
6123   // v1 = loaded 8 bytes
6124   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6125   address generate_large_byte_array_inflate() {
6126     __ align(CodeEntryAlignment);
6127     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6128     address entry = __ pc();
6129     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6130     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6131     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6132 
    // Do one more 8-byte read (v1 already holds the first 8 bytes) so that the
    // address becomes 16-byte aligned in most cases, and so that a single
    // store instruction can be used for both inflated registers.
6135     __ ldrd(v2, __ post(src, 8));
6136     __ sub(octetCounter, octetCounter, 2);
6137     __ zip1(v1, __ T16B, v1, v0);
6138     __ zip1(v2, __ T16B, v2, v0);
6139     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6140     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6141     __ subs(rscratch1, octetCounter, large_loop_threshold);
6142     __ br(__ LE, LOOP_START);
6143     __ b(LOOP_PRFM_START);
6144     __ bind(LOOP_PRFM);
6145       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6146     __ bind(LOOP_PRFM_START);
6147       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6148       __ sub(octetCounter, octetCounter, 8);
6149       __ subs(rscratch1, octetCounter, large_loop_threshold);
6150       inflate_and_store_2_fp_registers(true, v3, v4);
6151       inflate_and_store_2_fp_registers(true, v5, v6);
6152       __ br(__ GT, LOOP_PRFM);
6153       __ cmp(octetCounter, (u1)8);
6154       __ br(__ LT, DONE);
6155     __ bind(LOOP);
6156       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6157       __ bind(LOOP_START);
6158       __ sub(octetCounter, octetCounter, 8);
6159       __ cmp(octetCounter, (u1)8);
6160       inflate_and_store_2_fp_registers(false, v3, v4);
6161       inflate_and_store_2_fp_registers(false, v5, v6);
6162       __ br(__ GE, LOOP);
6163     __ bind(DONE);
6164       __ ret(lr);
6165     return entry;
6166   }
6167 
6168   /**
6169    *  Arguments:
6170    *
6171    *  Input:
6172    *  c_rarg0   - current state address
6173    *  c_rarg1   - H key address
6174    *  c_rarg2   - data address
6175    *  c_rarg3   - number of blocks
6176    *
6177    *  Output:
6178    *  Updated state at c_rarg0
6179    */
6180   address generate_ghash_processBlocks() {
6181     // Bafflingly, GCM uses little-endian for the byte order, but
6182     // big-endian for the bit order.  For example, the polynomial 1 is
6183     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6184     //
6185     // So, we must either reverse the bytes in each word and do
6186     // everything big-endian or reverse the bits in each byte and do
6187     // it little-endian.  On AArch64 it's more idiomatic to reverse
6188     // the bits in each byte (we have an instruction, RBIT, to do
6189     // that) and keep the data in little-endian bit order through the
6190     // calculation, bit-reversing the inputs and outputs.
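    // For example, after RBIT the polynomial 1 above (the byte string
    // 80 00 .. 00) becomes 01 00 .. 00, i.e. the 128-bit integer 1 in
    // little-endian bit order.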
6191 
6192     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6193     __ align(wordSize * 2);
6194     address p = __ pc();
6195     __ emit_int64(0x87);  // The low-order bits of the field
6196                           // polynomial (i.e. p = z^7+z^2+z+1)
6197                           // repeated in the low and high parts of a
6198                           // 128-bit vector
6199     __ emit_int64(0x87);
6200 
6201     __ align(CodeEntryAlignment);
6202     address start = __ pc();
6203 
6204     Register state   = c_rarg0;
6205     Register subkeyH = c_rarg1;
6206     Register data    = c_rarg2;
6207     Register blocks  = c_rarg3;
6208 
6209     FloatRegister vzr = v30;
6210     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6211 
6212     __ ldrq(v24, p);    // The field polynomial
6213 
6214     __ ldrq(v0, Address(state));
6215     __ ldrq(v1, Address(subkeyH));
6216 
6217     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6218     __ rbit(v0, __ T16B, v0);
6219     __ rev64(v1, __ T16B, v1);
6220     __ rbit(v1, __ T16B, v1);
6221 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
6223     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6224 
6225     {
6226       Label L_ghash_loop;
6227       __ bind(L_ghash_loop);
6228 
6229       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6230                                                  // reversing each byte
6231       __ rbit(v2, __ T16B, v2);
6232       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6233 
6234       // Multiply state in v2 by subkey in v1
6235       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6236                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6237                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6238       // Reduce v7:v5 by the field polynomial
6239       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6240 
6241       __ sub(blocks, blocks, 1);
6242       __ cbnz(blocks, L_ghash_loop);
6243     }
6244 
6245     // The bit-reversed result is at this point in v0
6246     __ rev64(v0, __ T16B, v0);
6247     __ rbit(v0, __ T16B, v0);
6248 
6249     __ st1(v0, __ T16B, state);
6250     __ ret(lr);
6251 
6252     return start;
6253   }
6254 
6255   address generate_ghash_processBlocks_wide() {
6256     address small = generate_ghash_processBlocks();
6257 
6258     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6259     __ align(wordSize * 2);
6260     address p = __ pc();
6261     __ emit_int64(0x87);  // The low-order bits of the field
6262                           // polynomial (i.e. p = z^7+z^2+z+1)
6263                           // repeated in the low and high parts of a
6264                           // 128-bit vector
6265     __ emit_int64(0x87);
6266 
6267     __ align(CodeEntryAlignment);
6268     address start = __ pc();
6269 
6270     Register state   = c_rarg0;
6271     Register subkeyH = c_rarg1;
6272     Register data    = c_rarg2;
6273     Register blocks  = c_rarg3;
6274 
6275     const int unroll = 4;
6276 
6277     __ cmp(blocks, (unsigned char)(unroll * 2));
6278     __ br(__ LT, small);
6279 
    if (unroll > 1) {
      // Save the callee-saved vector registers (v8..v15) before they are
      // clobbered below
6282       __ sub(sp, sp, 4 * 16);
6283       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6284       __ sub(sp, sp, 4 * 16);
6285       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6286     }
6287 
6288     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6289 
6290     if (unroll > 1) {
6291       // And restore state
6292       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6293       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6294     }
6295 
6296     __ cmp(blocks, (unsigned char)0);
6297     __ br(__ GT, small);
6298 
6299     __ ret(lr);
6300 
6301     return start;
6302   }
6303 
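  // One SIMD encode round: ld3 de-interleaves 3*size input bytes into
  // in0..in2 (lane i of in0/in1/in2 holds source bytes 3i, 3i+1, 3i+2), the
  // shift/orr sequence splits each 24-bit group into four 6-bit indices, tbl
  // maps the indices through the 64-byte codec table held in four vector
  // registers, and st4 interleaves and stores the 4*size encoded characters.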
6304   void generate_base64_encode_simdround(Register src, Register dst,
6305         FloatRegister codec, u8 size) {
6306 
6307     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6308     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6309     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6310 
6311     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6312 
6313     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6314 
6315     __ ushr(ind0, arrangement, in0,  2);
6316 
6317     __ ushr(ind1, arrangement, in1,  2);
6318     __ shl(in0,   arrangement, in0,  6);
6319     __ orr(ind1,  arrangement, ind1, in0);
6320     __ ushr(ind1, arrangement, ind1, 2);
6321 
6322     __ ushr(ind2, arrangement, in2,  4);
6323     __ shl(in1,   arrangement, in1,  4);
6324     __ orr(ind2,  arrangement, in1,  ind2);
6325     __ ushr(ind2, arrangement, ind2, 2);
6326 
6327     __ shl(ind3,  arrangement, in2,  2);
6328     __ ushr(ind3, arrangement, ind3, 2);
6329 
6330     __ tbl(out0,  arrangement, codec,  4, ind0);
6331     __ tbl(out1,  arrangement, codec,  4, ind1);
6332     __ tbl(out2,  arrangement, codec,  4, ind2);
6333     __ tbl(out3,  arrangement, codec,  4, ind3);
6334 
6335     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6336   }
6337 
6338    /**
6339    *  Arguments:
6340    *
6341    *  Input:
6342    *  c_rarg0   - src_start
6343    *  c_rarg1   - src_offset
6344    *  c_rarg2   - src_length
6345    *  c_rarg3   - dest_start
6346    *  c_rarg4   - dest_offset
6347    *  c_rarg5   - isURL
6348    *
6349    */
6350   address generate_base64_encodeBlock() {
6351 
6352     static const char toBase64[64] = {
6353       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6354       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6355       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6356       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6357       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6358     };
6359 
6360     static const char toBase64URL[64] = {
6361       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6362       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6363       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6364       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6365       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6366     };
6367 
6368     __ align(CodeEntryAlignment);
6369     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6370     address start = __ pc();
6371 
6372     Register src   = c_rarg0;  // source array
6373     Register soff  = c_rarg1;  // source start offset
6374     Register send  = c_rarg2;  // source end offset
6375     Register dst   = c_rarg3;  // dest array
6376     Register doff  = c_rarg4;  // position for writing to dest array
6377     Register isURL = c_rarg5;  // Base64 or URL character set
6378 
6379     // c_rarg6 and c_rarg7 are free to use as temps
6380     Register codec  = c_rarg6;
6381     Register length = c_rarg7;
6382 
6383     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6384 
6385     __ add(src, src, soff);
6386     __ add(dst, dst, doff);
6387     __ sub(length, send, soff);
6388 
6389     // load the codec base address
6390     __ lea(codec, ExternalAddress((address) toBase64));
6391     __ cbz(isURL, ProcessData);
6392     __ lea(codec, ExternalAddress((address) toBase64URL));
6393 
6394     __ BIND(ProcessData);
6395 
    // too short to form a SIMD loop; fall through to the scalar 3-byte loop
6397     __ cmp(length, (u1)24);
6398     __ br(Assembler::LT, Process3B);
6399 
6400     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6401 
6402     __ BIND(Process48B);
6403     __ cmp(length, (u1)48);
6404     __ br(Assembler::LT, Process24B);
6405     generate_base64_encode_simdround(src, dst, v0, 16);
6406     __ sub(length, length, 48);
6407     __ b(Process48B);
6408 
6409     __ BIND(Process24B);
6410     __ cmp(length, (u1)24);
6411     __ br(Assembler::LT, SIMDExit);
6412     generate_base64_encode_simdround(src, dst, v0, 8);
6413     __ sub(length, length, 24);
6414 
6415     __ BIND(SIMDExit);
6416     __ cbz(length, Exit);
6417 
6418     __ BIND(Process3B);
    // Scalar tail: combine 3 source bytes into 24 bits, then emit 4 encoded bytes
6420     __ ldrb(r10, __ post(src, 1));
6421     __ ldrb(r11, __ post(src, 1));
6422     __ ldrb(r12, __ post(src, 1));
6423     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6424     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6425     // codec index
6426     __ ubfmw(r15, r12, 18, 23);
6427     __ ubfmw(r14, r12, 12, 17);
6428     __ ubfmw(r13, r12, 6,  11);
6429     __ andw(r12,  r12, 63);
6430     // get the code based on the codec
6431     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6432     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6433     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6434     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6435     __ strb(r15, __ post(dst, 1));
6436     __ strb(r14, __ post(dst, 1));
6437     __ strb(r13, __ post(dst, 1));
6438     __ strb(r12, __ post(dst, 1));
6439     __ sub(length, length, 3);
6440     __ cbnz(length, Process3B);
6441 
6442     __ BIND(Exit);
6443     __ ret(lr);
6444 
6445     return start;
6446   }
6447 
6448   void generate_base64_decode_simdround(Register src, Register dst,
6449         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6450 
6451     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6452     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6453 
6454     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6455     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6456 
6457     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6458 
6459     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6460 
6461     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6462 
    // we need an unsigned saturating subtract so that every input value in the
    // range [0, 63] yields index 0 for the higher-half lookup
6465     __ uqsubv(decH0, __ T16B, in0, v27);
6466     __ uqsubv(decH1, __ T16B, in1, v27);
6467     __ uqsubv(decH2, __ T16B, in2, v27);
6468     __ uqsubv(decH3, __ T16B, in3, v27);
6469 
6470     // lower half lookup
6471     __ tbl(decL0, arrangement, codecL, 4, in0);
6472     __ tbl(decL1, arrangement, codecL, 4, in1);
6473     __ tbl(decL2, arrangement, codecL, 4, in2);
6474     __ tbl(decL3, arrangement, codecL, 4, in3);
6475 
6476     // higher half lookup
6477     __ tbx(decH0, arrangement, codecH, 4, decH0);
6478     __ tbx(decH1, arrangement, codecH, 4, decH1);
6479     __ tbx(decH2, arrangement, codecH, 4, decH2);
6480     __ tbx(decH3, arrangement, codecH, 4, decH3);
6481 
6482     // combine lower and higher
6483     __ orr(decL0, arrangement, decL0, decH0);
6484     __ orr(decL1, arrangement, decL1, decH1);
6485     __ orr(decL2, arrangement, decL2, decH2);
6486     __ orr(decL3, arrangement, decL3, decH3);
6487 
    // check for illegal inputs: any value larger than 63 (the maximum 6-bit value)
6489     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6490     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6491     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6492     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6493     __ orr(in0, arrangement, decH0, decH1);
6494     __ orr(in1, arrangement, decH2, decH3);
6495     __ orr(in2, arrangement, in0,   in1);
6496     __ umaxv(in3, arrangement, in2);
6497     __ umov(rscratch2, in3, __ B, 0);
6498 
6499     // get the data to output
6500     __ shl(out0,  arrangement, decL0, 2);
6501     __ ushr(out1, arrangement, decL1, 4);
6502     __ orr(out0,  arrangement, out0,  out1);
6503     __ shl(out1,  arrangement, decL1, 4);
6504     __ ushr(out2, arrangement, decL2, 2);
6505     __ orr(out1,  arrangement, out1,  out2);
6506     __ shl(out2,  arrangement, decL2, 6);
6507     __ orr(out2,  arrangement, out2,  decL3);
6508 
6509     __ cbz(rscratch2, NoIllegalData);
6510 
6511     // handle illegal input
6512     __ umov(r10, in2, __ D, 0);
6513     if (size == 16) {
6514       __ cbnz(r10, ErrorInLowerHalf);
6515 
6516       // illegal input is in higher half, store the lower half now.
6517       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6518 
6519       __ umov(r10, in2,  __ D, 1);
6520       __ umov(r11, out0, __ D, 1);
6521       __ umov(r12, out1, __ D, 1);
6522       __ umov(r13, out2, __ D, 1);
6523       __ b(StoreLegalData);
6524 
6525       __ BIND(ErrorInLowerHalf);
6526     }
6527     __ umov(r11, out0, __ D, 0);
6528     __ umov(r12, out1, __ D, 0);
6529     __ umov(r13, out2, __ D, 0);
6530 
6531     __ BIND(StoreLegalData);
6532     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6533     __ strb(r11, __ post(dst, 1));
6534     __ strb(r12, __ post(dst, 1));
6535     __ strb(r13, __ post(dst, 1));
6536     __ lsr(r10, r10, 8);
6537     __ lsr(r11, r11, 8);
6538     __ lsr(r12, r12, 8);
6539     __ lsr(r13, r13, 8);
6540     __ b(StoreLegalData);
6541 
6542     __ BIND(NoIllegalData);
6543     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6544   }
6545 
6546 
6547    /**
6548    *  Arguments:
6549    *
6550    *  Input:
6551    *  c_rarg0   - src_start
6552    *  c_rarg1   - src_offset
6553    *  c_rarg2   - src_length
6554    *  c_rarg3   - dest_start
6555    *  c_rarg4   - dest_offset
6556    *  c_rarg5   - isURL
6557    *  c_rarg6   - isMIME
6558    *
6559    */
6560   address generate_base64_decodeBlock() {
6561 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".
6565 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array
    // used in java.util.Base64, except that the trailing character '=' is also
    // treated as an illegal value in this intrinsic. That is,
    // java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6569     static const uint8_t fromBase64ForNoSIMD[256] = {
6570       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6571       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6572       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6573        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6574       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6575        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6576       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6577        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6578       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6579       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6580       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6581       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6582       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6583       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6584       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6585       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6586     };
6587 
6588     static const uint8_t fromBase64URLForNoSIMD[256] = {
6589       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6590       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6591       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6592        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6593       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6594        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6595       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6596        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6597       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6598       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6599       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6600       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6601       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6605     };
6606 
    // A legal base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices
    // leave the destination unchanged. Input [64..126] is mapped to index
    // [65, 127] in the second lookup. The value at index 64 is set to 0, so
    // that we know we already got the decoded data with the 1st lookup.
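    // Worked example (illustrative): for the input byte 'A' (65) the first
    // lookup is out of range and yields 0, while the second lookup maps it to
    // table index 66, whose value is 0 -- so 'A' decodes to 0. For '0' (48)
    // the first lookup already yields 52 and the second lookup saturates to
    // table index 64, whose value is 0, leaving 52.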
6614     static const uint8_t fromBase64ForSIMD[128] = {
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6618        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6619         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6620        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6621       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6622        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6623     };
6624 
6625     static const uint8_t fromBase64URLForSIMD[128] = {
6626       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6627       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6629        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6630         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6631        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6632        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6633        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6634     };
6635 
6636     __ align(CodeEntryAlignment);
6637     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6638     address start = __ pc();
6639 
6640     Register src    = c_rarg0;  // source array
6641     Register soff   = c_rarg1;  // source start offset
6642     Register send   = c_rarg2;  // source end offset
6643     Register dst    = c_rarg3;  // dest array
6644     Register doff   = c_rarg4;  // position for writing to dest array
6645     Register isURL  = c_rarg5;  // Base64 or URL character set
6646     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6647 
6648     Register length = send;    // reuse send as length of source data to process
6649 
6650     Register simd_codec   = c_rarg6;
6651     Register nosimd_codec = c_rarg7;
6652 
6653     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6654 
6655     __ enter();
6656 
6657     __ add(src, src, soff);
6658     __ add(dst, dst, doff);
6659 
6660     __ mov(doff, dst);
6661 
6662     __ sub(length, send, soff);
6663     __ bfm(length, zr, 0, 1);
6664 
6665     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6666     __ cbz(isURL, ProcessData);
6667     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6668 
6669     __ BIND(ProcessData);
6670     __ mov(rscratch1, length);
6671     __ cmp(length, (u1)144); // 144 = 80 + 64
6672     __ br(Assembler::LT, Process4B);
6673 
6674     // In the MIME case, the line length cannot be more than 76
6675     // bytes (see RFC 2045). This is too short a block for SIMD
6676     // to be worthwhile, so we use non-SIMD here.
6677     __ movw(rscratch1, 79);
6678 
6679     __ BIND(Process4B);
6680     __ ldrw(r14, __ post(src, 4));
6681     __ ubfxw(r10, r14, 0,  8);
6682     __ ubfxw(r11, r14, 8,  8);
6683     __ ubfxw(r12, r14, 16, 8);
6684     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6686     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6687     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6688     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6689     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6690     // error detection, 255u indicates an illegal input
6691     __ orrw(r14, r10, r11);
6692     __ orrw(r15, r12, r13);
6693     __ orrw(r14, r14, r15);
6694     __ tbnz(r14, 7, Exit);
6695     // recover the data
6696     __ lslw(r14, r10, 10);
6697     __ bfiw(r14, r11, 4, 6);
6698     __ bfmw(r14, r12, 2, 5);
6699     __ rev16w(r14, r14);
6700     __ bfiw(r13, r12, 6, 2);
6701     __ strh(r14, __ post(dst, 2));
6702     __ strb(r13, __ post(dst, 1));
6703     // non-simd loop
6704     __ subsw(rscratch1, rscratch1, 4);
6705     __ br(Assembler::GT, Process4B);
6706 
    // If we exited from the 80-byte MIME pre-processing pass above, rscratch1
    // is -1 here; otherwise rscratch1 is 0.
6709     __ cbzw(rscratch1, Exit);
6710     __ sub(length, length, 80);
6711 
6712     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6713     __ cbz(isURL, SIMDEnter);
6714     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6715 
6716     __ BIND(SIMDEnter);
6717     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6718     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6719     __ mov(rscratch1, 63);
6720     __ dup(v27, __ T16B, rscratch1);
6721 
6722     __ BIND(Process64B);
6723     __ cmp(length, (u1)64);
6724     __ br(Assembler::LT, Process32B);
6725     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6726     __ sub(length, length, 64);
6727     __ b(Process64B);
6728 
6729     __ BIND(Process32B);
6730     __ cmp(length, (u1)32);
6731     __ br(Assembler::LT, SIMDExit);
6732     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6733     __ sub(length, length, 32);
6734     __ b(Process32B);
6735 
6736     __ BIND(SIMDExit);
6737     __ cbz(length, Exit);
6738     __ movw(rscratch1, length);
6739     __ b(Process4B);
6740 
6741     __ BIND(Exit);
6742     __ sub(c_rarg0, dst, doff);
6743 
6744     __ leave();
6745     __ ret(lr);
6746 
6747     return start;
6748   }
6749 
6750   // Support for spin waits.
6751   address generate_spin_wait() {
6752     __ align(CodeEntryAlignment);
6753     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6754     address start = __ pc();
6755 
6756     __ spin_wait();
6757     __ ret(lr);
6758 
6759     return start;
6760   }
6761 
6762 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6763 
6764   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6765   //
6766   // If LSE is in use, generate LSE versions of all the stubs. The
6767   // non-LSE versions are in atomic_aarch64.S.
6768 
6769   // class AtomicStubMark records the entry point of a stub and the
6770   // stub pointer which will point to it. The stub pointer is set to
6771   // the entry point when ~AtomicStubMark() is called, which must be
6772   // after ICache::invalidate_range. This ensures safe publication of
6773   // the generated code.
6774   class AtomicStubMark {
6775     address _entry_point;
6776     aarch64_atomic_stub_t *_stub;
6777     MacroAssembler *_masm;
6778   public:
6779     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6780       _masm = masm;
6781       __ align(32);
6782       _entry_point = __ pc();
6783       _stub = stub;
6784     }
6785     ~AtomicStubMark() {
6786       *_stub = (aarch64_atomic_stub_t)_entry_point;
6787     }
6788   };
6789 
6790   // NB: For memory_order_conservative we need a trailing membar after
6791   // LSE atomic operations but not a leading membar.
6792   //
6793   // We don't need a leading membar because a clause in the Arm ARM
6794   // says:
6795   //
6796   //   Barrier-ordered-before
6797   //
6798   //   Barrier instructions order prior Memory effects before subsequent
6799   //   Memory effects generated by the same Observer. A read or a write
6800   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
6801   //   Observer if and only if RW1 appears in program order before RW 2
6802   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
6803   //   instruction with both Acquire and Release semantics.
6804   //
6805   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6806   // and Release semantics, therefore we don't need a leading
6807   // barrier. However, there is no corresponding Barrier-ordered-after
6808   // relationship, therefore we need a trailing membar to prevent a
6809   // later store or load from being reordered with the store in an
6810   // atomic instruction.
6811   //
6812   // This was checked by using the herd7 consistency model simulator
6813   // (http://diy.inria.fr/) with this test case:
6814   //
6815   // AArch64 LseCas
6816   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6817   // P0 | P1;
6818   // LDR W4, [X2] | MOV W3, #0;
6819   // DMB LD       | MOV W4, #1;
6820   // LDR W3, [X1] | CASAL W3, W4, [X1];
6821   //              | DMB ISH;
6822   //              | STR W4, [X2];
6823   // exists
6824   // (0:X3=0 /\ 0:X4=1)
6825   //
6826   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6827   // with the store to x in P1. Without the DMB in P1 this may happen.
6828   //
6829   // At the time of writing we don't know of any AArch64 hardware that
6830   // reorders stores in this way, but the Reference Manual permits it.
6831 
6832   void gen_cas_entry(Assembler::operand_size size,
6833                      atomic_memory_order order) {
6834     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6835       exchange_val = c_rarg2;
6836     bool acquire, release;
6837     switch (order) {
6838       case memory_order_relaxed:
6839         acquire = false;
6840         release = false;
6841         break;
6842       case memory_order_release:
6843         acquire = false;
6844         release = true;
6845         break;
6846       default:
6847         acquire = true;
6848         release = true;
6849         break;
6850     }
6851     __ mov(prev, compare_val);
6852     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6853     if (order == memory_order_conservative) {
6854       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6855     }
6856     if (size == Assembler::xword) {
6857       __ mov(r0, prev);
6858     } else {
6859       __ movw(r0, prev);
6860     }
6861     __ ret(lr);
6862   }
6863 
6864   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6865     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6866     // If not relaxed, then default to conservative.  Relaxed is the only
6867     // case we use enough to be worth specializing.
6868     if (order == memory_order_relaxed) {
6869       __ ldadd(size, incr, prev, addr);
6870     } else {
6871       __ ldaddal(size, incr, prev, addr);
6872       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6873     }
6874     if (size == Assembler::xword) {
6875       __ mov(r0, prev);
6876     } else {
6877       __ movw(r0, prev);
6878     }
6879     __ ret(lr);
6880   }
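
       // In C, approximately -- what the two ldadd flavours above
       // correspond to, again using the GCC/Clang __atomic builtins as a
       // stand-in for ldadd / ldaddal plus the trailing barrier (a sketch
       // with illustrative names, not the generated code):
       //
       //   uint64_t fetch_add_relaxed(uint64_t *p, uint64_t incr) {
       //     return __atomic_fetch_add(p, incr, __ATOMIC_RELAXED);
       //   }
       //   uint64_t fetch_add_conservative(uint64_t *p, uint64_t incr) {
       //     uint64_t prev = __atomic_fetch_add(p, incr, __ATOMIC_SEQ_CST);
       //     __atomic_thread_fence(__ATOMIC_SEQ_CST); // trailing barrier
       //     return prev;
       //   }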
6881 
6882   void gen_swpal_entry(Assembler::operand_size size) {
6883     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6884     __ swpal(size, incr, prev, addr);
6885     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6886     if (size == Assembler::xword) {
6887       __ mov(r0, prev);
6888     } else {
6889       __ movw(r0, prev);
6890     }
6891     __ ret(lr);
6892   }
6893 
6894   void generate_atomic_entry_points() {
6895     if (! UseLSE) {
6896       return;
6897     }
6898 
6899     __ align(CodeEntryAlignment);
6900     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6901     address first_entry = __ pc();
6902 
6903     // ADD, memory_order_conservative
6904     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6905     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6906     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6907     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6908 
6909     // ADD, memory_order_relaxed
6910     AtomicStubMark mark_fetch_add_4_relaxed
6911       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6912     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6913     AtomicStubMark mark_fetch_add_8_relaxed
6914       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6915     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6916 
6917     // XCHG, memory_order_conservative
6918     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6919     gen_swpal_entry(Assembler::word);
6920     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6921     gen_swpal_entry(Assembler::xword);
6922 
6923     // CAS, memory_order_conservative
6924     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6925     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6926     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6927     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6928     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6929     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6930 
6931     // CAS, memory_order_relaxed
6932     AtomicStubMark mark_cmpxchg_1_relaxed
6933       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6934     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6935     AtomicStubMark mark_cmpxchg_4_relaxed
6936       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6937     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6938     AtomicStubMark mark_cmpxchg_8_relaxed
6939       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6940     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6941 
6942     AtomicStubMark mark_cmpxchg_4_release
6943       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6944     gen_cas_entry(MacroAssembler::word, memory_order_release);
6945     AtomicStubMark mark_cmpxchg_8_release
6946       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6947     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6948 
6949     AtomicStubMark mark_cmpxchg_4_seq_cst
6950       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6951     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6952     AtomicStubMark mark_cmpxchg_8_seq_cst
6953       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6954     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6955 
6956     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6957   }
6958 #endif // LINUX
6959 
6960   address generate_cont_thaw(Continuation::thaw_kind kind) {
6961     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6962     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6963 
6964     address start = __ pc();
6965 
6966     if (return_barrier) {
6967       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6968       __ mov(sp, rscratch1);
6969     }
6970     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6971 
6972     if (return_barrier) {
6973       // preserve possible return value from a method returning to the return barrier
6974       __ fmovd(rscratch1, v0);
6975       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6976     }
6977 
6978     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6979     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6980     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6981 
6982     if (return_barrier) {
6983       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6984       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6985       __ fmovd(v0, rscratch1);
6986     }
6987     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6988 
6989 
6990     Label thaw_success;
6991     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6992     __ cbnz(rscratch2, thaw_success);
6993     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6994     __ br(rscratch1);
6995     __ bind(thaw_success);
6996 
6997     // make room for the thawed frames
6998     __ sub(rscratch1, sp, rscratch2);
6999     __ andr(rscratch1, rscratch1, -16); // align
7000     __ mov(sp, rscratch1);
7001 
7002     if (return_barrier) {
7003       // save original return value -- again
7004       __ fmovd(rscratch1, v0);
7005       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7006     }
7007 
7008     // If we want, we can templatize thaw by kind, and have three different entries
7009     __ movw(c_rarg1, (uint32_t)kind);
7010 
7011     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7012     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7013 
7014     if (return_barrier) {
7015       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7016       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7017       __ fmovd(v0, rscratch1);
7018     } else {
7019       __ mov(r0, zr); // return 0 (success) from doYield
7020     }
7021 
7022     // We're now on the yield frame (which is at an address above us because sp has been pushed down).
7023     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7024     __ mov(rfp, sp);
7025 
7026     if (return_barrier_exception) {
7027       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7028       __ authenticate_return_address(c_rarg1);
7029       __ verify_oop(r0);
7030       // save return value containing the exception oop in callee-saved R19
7031       __ mov(r19, r0);
7032 
7033       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7034 
7035       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7036       // __ reinitialize_ptrue();
7037 
7038       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7039 
7040       __ mov(r1, r0); // the exception handler
7041       __ mov(r0, r19); // restore return value containing the exception oop
7042       __ verify_oop(r0);
7043 
7044       __ leave();
7045       __ mov(r3, lr);
7046       __ br(r1); // the exception handler
7047     } else {
7048       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7049       __ leave();
7050       __ ret(lr);
7051     }
7052 
7053     return start;
7054   }
7055 
7056   address generate_cont_thaw() {
7057     if (!Continuations::enabled()) return nullptr;
7058 
7059     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7060     address start = __ pc();
7061     generate_cont_thaw(Continuation::thaw_top);
7062     return start;
7063   }
7064 
7065   address generate_cont_returnBarrier() {
7066     if (!Continuations::enabled()) return nullptr;
7067 
7068     // TODO: will probably need multiple return barriers depending on return type
7069     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7070     address start = __ pc();
7071 
7072     generate_cont_thaw(Continuation::thaw_return_barrier);
7073 
7074     return start;
7075   }
7076 
7077   address generate_cont_returnBarrier_exception() {
7078     if (!Continuations::enabled()) return nullptr;
7079 
7080     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7081     address start = __ pc();
7082 
7083     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7084 
7085     return start;
7086   }
7087 
7088   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7089   // are represented as long[5], with BITS_PER_LIMB = 26.
7090   // Pack five 26-bit limbs into three 64-bit registers.
7091   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7092     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7093     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7094     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7095     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7096 
7097     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7098     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7099     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7100     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7101 
7102     if (dest2->is_valid()) {
7103       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7104     } else {
7105 #ifdef ASSERT
7106       Label OK;
7107       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7108       __ br(__ EQ, OK);
7109       __ stop("high bits of Poly1305 integer should be zero");
7110       __ should_not_reach_here();
7111       __ bind(OK);
7112 #endif
7113     }
7114   }
7115 
7116   // As above, but return only a 128-bit integer, packed into two
7117   // 64-bit registers.
7118   void pack_26(Register dest0, Register dest1, Register src) {
7119     pack_26(dest0, dest1, noreg, src);
7120   }
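
       // In C, approximately -- the packing done by pack_26 above (a
       // sketch; limbs[] is the long[5] representation with
       // BITS_PER_LIMB = 26, and the 130-bit result is split across
       // dest0 = bits 0..63, dest1 = bits 64..127, dest2 = the top bits):
       //
       //   void pack_26(uint64_t *dest0, uint64_t *dest1, uint64_t *dest2,
       //                const uint64_t limbs[5]) {
       //     *dest0 = limbs[0] + (limbs[1] << 26) + (limbs[2] << 52);
       //     *dest1 = (limbs[2] >> 12) + (limbs[3] << 14) + (limbs[4] << 40);
       //     if (dest2 != NULL) *dest2 = limbs[4] >> 24;  // at most 2 bits
       //   }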
7121 
7122   // Multiply and multiply-accumulate unsigned 64-bit registers.
7123   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7124     __ mul(prod_lo, n, m);
7125     __ umulh(prod_hi, n, m);
7126   }
7127   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7128     wide_mul(rscratch1, rscratch2, n, m);
7129     __ adds(sum_lo, sum_lo, rscratch1);
7130     __ adc(sum_hi, sum_hi, rscratch2);
7131   }
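
       // In C, approximately -- wide_mul and wide_madd above, with
       // unsigned __int128 standing in for the prod_hi:prod_lo and
       // sum_hi:sum_lo register pairs (a sketch, not the generated code):
       //
       //   void wide_mul(uint64_t *lo, uint64_t *hi, uint64_t n, uint64_t m) {
       //     unsigned __int128 p = (unsigned __int128)n * m;
       //     *lo = (uint64_t)p;  *hi = (uint64_t)(p >> 64);
       //   }
       //   void wide_madd(uint64_t *lo, uint64_t *hi, uint64_t n, uint64_t m) {
       //     unsigned __int128 s = ((unsigned __int128)*hi << 64) | *lo;
       //     s += (unsigned __int128)n * m;
       //     *lo = (uint64_t)s;  *hi = (uint64_t)(s >> 64);
       //   }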
7132 
7133   // Poly1305, RFC 7539
7134 
7135   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7136   // description of the tricks used to simplify and accelerate this
7137   // computation.
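       //
       // Roughly, in pseudo-C, the per-block loop generated below looks
       // like this, where U is the 130-bit accumulator (limbs
       // U_2:U_1:U_0), R is the 128-bit key (R_1:R_0) and
       // RR_n = (R_n >> 2) * 5 folds high bits back in via
       // 2^130 == 5 (mod 2^130 - 5).  This is a description of the shape
       // of the code, not the code itself:
       //
       //   while (length >= BLOCK_LENGTH) {
       //     S = U + next_16_byte_block + (1 << 128);    // padding bit
       //     U = (S * R) partially reduced mod 2^130 - 5;
       //     length -= BLOCK_LENGTH;
       //   }
       //   // ... then a final reduction and an unpacking of U back into
       //   // five 26-bit limbs, written to the accumulator array.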
7138 
7139   address generate_poly1305_processBlocks() {
7140     __ align(CodeEntryAlignment);
7141     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7142     address start = __ pc();
7143     Label here;
7144     __ enter();
7145     RegSet callee_saved = RegSet::range(r19, r28);
7146     __ push(callee_saved, sp);
7147 
7148     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7149 
7150     // Arguments
7151     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7152 
7153     // R_n is the 128-bit randomly-generated key, packed into two
7154     // registers.  The caller passes this key to us as long[5], with
7155     // BITS_PER_LIMB = 26.
7156     const Register R_0 = *++regs, R_1 = *++regs;
7157     pack_26(R_0, R_1, r_start);
7158 
7159     // RR_n is (R_n >> 2) * 5
7160     const Register RR_0 = *++regs, RR_1 = *++regs;
7161     __ lsr(RR_0, R_0, 2);
7162     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7163     __ lsr(RR_1, R_1, 2);
7164     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7165 
7166     // U_n is the current checksum
7167     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7168     pack_26(U_0, U_1, U_2, acc_start);
7169 
7170     static constexpr int BLOCK_LENGTH = 16;
7171     Label DONE, LOOP;
7172 
7173     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7174     __ br(Assembler::LT, DONE); {
7175       __ bind(LOOP);
7176 
7177       // S_n is to be the sum of U_n and the next block of data
7178       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7179       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7180       __ adds(S_0, U_0, S_0);
7181       __ adcs(S_1, U_1, S_1);
7182       __ adc(S_2, U_2, zr);
7183       __ add(S_2, S_2, 1);
7184 
7185       const Register U_0HI = *++regs, U_1HI = *++regs;
7186 
7187       // NB: this logic depends on some of the special properties of
7188       // Poly1305 keys. In particular, because we know that the top
7189       // four bits of R_0 and R_1 are zero, we can add together
7190       // partial products without any risk of needing to propagate a
7191       // carry out.
7192       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7193       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7194       __ andr(U_2, R_0, 3);
7195       __ mul(U_2, S_2, U_2);
7196 
7197       // Recycle registers S_0, S_1, S_2
7198       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7199 
7200       // Partial reduction mod 2**130 - 5
7201       __ adds(U_1, U_0HI, U_1);
7202       __ adc(U_2, U_1HI, U_2);
7203       // Sum now in U_2:U_1:U_0.
7204       // Dead: U_0HI, U_1HI.
7205       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7206 
7207       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7208 
7209       // First, U_2:U_1:U_0 += (U_2 >> 2)
7210       __ lsr(rscratch1, U_2, 2);
7211       __ andr(U_2, U_2, (u8)3);
7212       __ adds(U_0, U_0, rscratch1);
7213       __ adcs(U_1, U_1, zr);
7214       __ adc(U_2, U_2, zr);
7215       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7216       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7217       __ adcs(U_1, U_1, zr);
7218       __ adc(U_2, U_2, zr);
7219 
7220       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7221       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7222       __ br(~ Assembler::LT, LOOP);
7223     }
7224 
7225     // Further reduce modulo 2^130 - 5
7226     __ lsr(rscratch1, U_2, 2);
7227     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7228     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7229     __ adcs(U_1, U_1, zr);
7230     __ andr(U_2, U_2, (u1)3);
7231     __ adc(U_2, U_2, zr);
7232 
7233     // Unpack the sum into five 26-bit limbs and write to memory.
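         // In C, approximately, the inverse of pack_26 (a sketch; acc is
         // the caller's long[5] accumulator):
         //   acc[0] = U_0 & ((1 << 26) - 1);
         //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
         //   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
         //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
         //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);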
7234     __ ubfiz(rscratch1, U_0, 0, 26);
7235     __ ubfx(rscratch2, U_0, 26, 26);
7236     __ stp(rscratch1, rscratch2, Address(acc_start));
7237     __ ubfx(rscratch1, U_0, 52, 12);
7238     __ bfi(rscratch1, U_1, 12, 14);
7239     __ ubfx(rscratch2, U_1, 14, 26);
7240     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7241     __ ubfx(rscratch1, U_1, 40, 24);
7242     __ bfi(rscratch1, U_2, 24, 3);
7243     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7244 
7245     __ bind(DONE);
7246     __ pop(callee_saved, sp);
7247     __ leave();
7248     __ ret(lr);
7249 
7250     return start;
7251   }
7252 
7253 #if INCLUDE_JFR
7254 
7255   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7256     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7257     __ mov(c_rarg0, thread);
7258   }
7259 
7260   // The handle is dereferenced through a load barrier.
7261   static void jfr_epilogue(MacroAssembler* _masm) {
7262     __ reset_last_Java_frame(true);
7263   }
7264 
7265   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7266   // It returns a jobject handle to the event writer.
7267   // The handle is dereferenced and the return value is the event writer oop.
7268   static RuntimeStub* generate_jfr_write_checkpoint() {
7269     enum layout {
7270       rbp_off,
7271       rbpH_off,
7272       return_off,
7273       return_off2,
7274       framesize // inclusive of return address
7275     };
7276 
7277     int insts_size = 1024;
7278     int locs_size = 64;
7279     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7280     OopMapSet* oop_maps = new OopMapSet();
7281     MacroAssembler* masm = new MacroAssembler(&code);
7282     MacroAssembler* _masm = masm;
7283 
7284     address start = __ pc();
7285     __ enter();
7286     int frame_complete = __ pc() - start;
7287     address the_pc = __ pc();
7288     jfr_prologue(the_pc, _masm, rthread);
7289     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7290     jfr_epilogue(_masm);
7291     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7292     __ leave();
7293     __ ret(lr);
7294 
7295     OopMap* map = new OopMap(framesize, 1); // rfp
7296     oop_maps->add_gc_map(the_pc - start, map);
7297 
7298     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7299       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7300                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7301                                     oop_maps, false);
7302     return stub;
7303   }
7304 
7305   // For c2: call to return a leased buffer.
7306   static RuntimeStub* generate_jfr_return_lease() {
7307     enum layout {
7308       rbp_off,
7309       rbpH_off,
7310       return_off,
7311       return_off2,
7312       framesize // inclusive of return address
7313     };
7314 
7315     int insts_size = 1024;
7316     int locs_size = 64;
7317     CodeBuffer code("jfr_return_lease", insts_size, locs_size);
7318     OopMapSet* oop_maps = new OopMapSet();
7319     MacroAssembler* masm = new MacroAssembler(&code);
7320     MacroAssembler* _masm = masm;
7321 
7322     address start = __ pc();
7323     __ enter();
7324     int frame_complete = __ pc() - start;
7325     address the_pc = __ pc();
7326     jfr_prologue(the_pc, _masm, rthread);
7327     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
7328     jfr_epilogue(_masm);
7329 
7330     __ leave();
7331     __ ret(lr);
7332 
7333     OopMap* map = new OopMap(framesize, 1); // rfp
7334     oop_maps->add_gc_map(the_pc - start, map);
7335 
7336     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7337       RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
7338                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7339                                     oop_maps, false);
7340     return stub;
7341   }
7342 
7343 #endif // INCLUDE_JFR
7344 
7345   // exception handler for upcall stubs
7346   address generate_upcall_stub_exception_handler() {
7347     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7348     address start = __ pc();
7349 
7350     // The native caller has no idea how to handle exceptions,
7351     // so we just crash here. It is up to the callee to catch exceptions.
7352     __ verify_oop(r0);
7353     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7354     __ blr(rscratch1);
7355     __ should_not_reach_here();
7356 
7357     return start;
7358   }
7359 
7360   // Continuation point for throwing of implicit exceptions that are
7361   // not handled in the current activation. Fabricates an exception
7362   // oop and initiates normal exception dispatching in this
7363   // frame. Since we need to preserve callee-saved values (currently
7364   // only for C2, but done for C1 as well) we need a callee-saved oop
7365   // map and therefore have to make these stubs into RuntimeStubs
7366   // rather than BufferBlobs.  If the compiler needs all registers to
7367   // be preserved between the fault point and the exception handler
7368   // then it must assume responsibility for that in
7369   // AbstractCompiler::continuation_for_implicit_null_exception or
7370   // continuation_for_implicit_division_by_zero_exception. All other
7371   // implicit exceptions (e.g., NullPointerException or
7372   // AbstractMethodError on entry) are either at call sites or
7373   // otherwise assume that stack unwinding will be initiated, so
7374   // caller saved registers were assumed volatile in the compiler.
7375 
7376 #undef __
7377 #define __ masm->
7378 
7379   address generate_throw_exception(const char* name,
7380                                    address runtime_entry,
7381                                    Register arg1 = noreg,
7382                                    Register arg2 = noreg) {
7383     // Information about frame layout at time of blocking runtime call.
7384     // Note that we only have to preserve callee-saved registers since
7385     // the compilers are responsible for supplying a continuation point
7386     // if they expect all registers to be preserved.
7387     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7388     enum layout {
7389       rfp_off = 0,
7390       rfp_off2,
7391       return_off,
7392       return_off2,
7393       framesize // inclusive of return address
7394     };
7395 
7396     int insts_size = 512;
7397     int locs_size  = 64;
7398 
7399     CodeBuffer code(name, insts_size, locs_size);
7400     OopMapSet* oop_maps  = new OopMapSet();
7401     MacroAssembler* masm = new MacroAssembler(&code);
7402 
7403     address start = __ pc();
7404 
7405     // This is an inlined and slightly modified version of call_VM
7406     // which has the ability to fetch the return PC out of
7407     // thread-local storage and also sets up last_Java_sp slightly
7408     // differently than the real call_VM
7409 
7410     __ enter(); // Save FP and LR before call
7411 
7412     assert(is_even(framesize/2), "sp not 16-byte aligned");
7413 
7414     // lr and fp are already in place
7415     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7416 
7417     int frame_complete = __ pc() - start;
7418 
7419     // Set up last_Java_sp and last_Java_fp
7420     address the_pc = __ pc();
7421     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7422 
7423     // Call runtime
7424     if (arg1 != noreg) {
7425       assert(arg2 != c_rarg1, "clobbered");
7426       __ mov(c_rarg1, arg1);
7427     }
7428     if (arg2 != noreg) {
7429       __ mov(c_rarg2, arg2);
7430     }
7431     __ mov(c_rarg0, rthread);
7432     BLOCK_COMMENT("call runtime_entry");
7433     __ mov(rscratch1, runtime_entry);
7434     __ blr(rscratch1);
7435 
7436     // Generate oop map
7437     OopMap* map = new OopMap(framesize, 0);
7438 
7439     oop_maps->add_gc_map(the_pc - start, map);
7440 
7441     __ reset_last_Java_frame(true);
7442 
7443     // Reinitialize the ptrue predicate register, in case the external runtime
7444     // call clobbers ptrue reg, as we may return to SVE compiled code.
7445     __ reinitialize_ptrue();
7446 
7447     __ leave();
7448 
7449     // check for pending exceptions
7450 #ifdef ASSERT
7451     Label L;
7452     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7453     __ cbnz(rscratch1, L);
7454     __ should_not_reach_here();
7455     __ bind(L);
7456 #endif // ASSERT
7457     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7458 
7459     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7460     RuntimeStub* stub =
7461       RuntimeStub::new_runtime_stub(name,
7462                                     &code,
7463                                     frame_complete,
7464                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7465                                     oop_maps, false);
7466     return stub->entry_point();
7467   }
7468 
7469   class MontgomeryMultiplyGenerator : public MacroAssembler {
7470 
7471     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7472       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7473 
7474     RegSet _toSave;
7475     bool _squaring;
7476 
7477   public:
7478     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7479       : MacroAssembler(as->code()), _squaring(squaring) {
7480 
7481       // Register allocation
7482 
7483       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7484       Pa_base = *regs;       // Argument registers
7485       if (squaring)
7486         Pb_base = Pa_base;
7487       else
7488         Pb_base = *++regs;
7489       Pn_base = *++regs;
7490       Rlen= *++regs;
7491       inv = *++regs;
7492       Pm_base = *++regs;
7493 
7494       // Working registers:
7495       Ra =  *++regs;        // The current digit of a, b, n, and m.
7496       Rb =  *++regs;
7497       Rm =  *++regs;
7498       Rn =  *++regs;
7499 
7500       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7501       Pb =  *++regs;
7502       Pm =  *++regs;
7503       Pn =  *++regs;
7504 
7505       t0 =  *++regs;        // Three registers which form a
7506       t1 =  *++regs;        // triple-precision accumulator.
7507       t2 =  *++regs;
7508 
7509       Ri =  *++regs;        // Inner and outer loop indexes.
7510       Rj =  *++regs;
7511 
7512       Rhi_ab = *++regs;     // Product registers: low and high parts
7513       Rlo_ab = *++regs;     // of a*b and m*n.
7514       Rhi_mn = *++regs;
7515       Rlo_mn = *++regs;
7516 
7517       // r19 and up are callee-saved.
7518       _toSave = RegSet::range(r19, *regs) + Pm_base;
7519     }
7520 
7521   private:
7522     void save_regs() {
7523       push(_toSave, sp);
7524     }
7525 
7526     void restore_regs() {
7527       pop(_toSave, sp);
7528     }
7529 
7530     template <typename T>
7531     void unroll_2(Register count, T block) {
7532       Label loop, end, odd;
7533       tbnz(count, 0, odd);
7534       cbz(count, end);
7535       align(16);
7536       bind(loop);
7537       (this->*block)();
7538       bind(odd);
7539       (this->*block)();
7540       subs(count, count, 2);
7541       br(Assembler::GT, loop);
7542       bind(end);
7543     }
7544 
7545     template <typename T>
7546     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7547       Label loop, end, odd;
7548       tbnz(count, 0, odd);
7549       cbz(count, end);
7550       align(16);
7551       bind(loop);
7552       (this->*block)(d, s, tmp);
7553       bind(odd);
7554       (this->*block)(d, s, tmp);
7555       subs(count, count, 2);
7556       br(Assembler::GT, loop);
7557       bind(end);
7558     }
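
         // In C, approximately -- the effect of the unroll_2 templates
         // above: run the block exactly count times, entering at the
         // second copy when count is odd (a sketch of the effect, not the
         // generated control flow):
         //
         //   void unroll_2(int count) {
         //     if (count & 1) { block(); count--; }
         //     while (count > 0) { block(); block(); count -= 2; }
         //   }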
7559 
7560     void pre1(RegisterOrConstant i) {
7561       block_comment("pre1");
7562       // Pa = Pa_base;
7563       // Pb = Pb_base + i;
7564       // Pm = Pm_base;
7565       // Pn = Pn_base + i;
7566       // Ra = *Pa;
7567       // Rb = *Pb;
7568       // Rm = *Pm;
7569       // Rn = *Pn;
7570       ldr(Ra, Address(Pa_base));
7571       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7572       ldr(Rm, Address(Pm_base));
7573       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7574       lea(Pa, Address(Pa_base));
7575       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7576       lea(Pm, Address(Pm_base));
7577       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7578 
7579       // Zero the m*n result.
7580       mov(Rhi_mn, zr);
7581       mov(Rlo_mn, zr);
7582     }
7583 
7584     // The core multiply-accumulate step of a Montgomery
7585     // multiplication.  The idea is to schedule operations as a
7586     // pipeline so that instructions with long latencies (loads and
7587     // multiplies) have time to complete before their results are
7588     // used.  This most benefits in-order implementations of the
7589     // architecture but out-of-order ones also benefit.
7590     void step() {
7591       block_comment("step");
7592       // MACC(Ra, Rb, t0, t1, t2);
7593       // Ra = *++Pa;
7594       // Rb = *--Pb;
7595       umulh(Rhi_ab, Ra, Rb);
7596       mul(Rlo_ab, Ra, Rb);
7597       ldr(Ra, pre(Pa, wordSize));
7598       ldr(Rb, pre(Pb, -wordSize));
7599       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7600                                        // previous iteration.
7601       // MACC(Rm, Rn, t0, t1, t2);
7602       // Rm = *++Pm;
7603       // Rn = *--Pn;
7604       umulh(Rhi_mn, Rm, Rn);
7605       mul(Rlo_mn, Rm, Rn);
7606       ldr(Rm, pre(Pm, wordSize));
7607       ldr(Rn, pre(Pn, -wordSize));
7608       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7609     }
7610 
7611     void post1() {
7612       block_comment("post1");
7613 
7614       // MACC(Ra, Rb, t0, t1, t2);
7615       // Ra = *++Pa;
7616       // Rb = *--Pb;
7617       umulh(Rhi_ab, Ra, Rb);
7618       mul(Rlo_ab, Ra, Rb);
7619       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7620       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7621 
7622       // *Pm = Rm = t0 * inv;
7623       mul(Rm, t0, inv);
7624       str(Rm, Address(Pm));
7625 
7626       // MACC(Rm, Rn, t0, t1, t2);
7627       // t0 = t1; t1 = t2; t2 = 0;
7628       umulh(Rhi_mn, Rm, Rn);
7629 
7630 #ifndef PRODUCT
7631       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7632       {
7633         mul(Rlo_mn, Rm, Rn);
7634         add(Rlo_mn, t0, Rlo_mn);
7635         Label ok;
7636         cbz(Rlo_mn, ok); {
7637           stop("broken Montgomery multiply");
7638         } bind(ok);
7639       }
7640 #endif
7641       // We have very carefully set things up so that
7642       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7643       // the lower half of Rm * Rn because we know the result already:
7644       // it must be -t0.  t0 + (-t0) must generate a carry iff
7645       // t0 != 0.  So, rather than do a mul and an adds we just set
7646       // the carry flag iff t0 is nonzero.
7647       //
7648       // mul(Rlo_mn, Rm, Rn);
7649       // adds(zr, t0, Rlo_mn);
7650       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7651       adcs(t0, t1, Rhi_mn);
7652       adc(t1, t2, zr);
7653       mov(t2, zr);
7654     }
7655 
7656     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7657       block_comment("pre2");
7658       // Pa = Pa_base + i-len;
7659       // Pb = Pb_base + len;
7660       // Pm = Pm_base + i-len;
7661       // Pn = Pn_base + len;
7662 
7663       if (i.is_register()) {
7664         sub(Rj, i.as_register(), len);
7665       } else {
7666         mov(Rj, i.as_constant());
7667         sub(Rj, Rj, len);
7668       }
7669       // Rj == i-len
7670 
7671       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7672       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7673       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7674       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7675 
7676       // Ra = *++Pa;
7677       // Rb = *--Pb;
7678       // Rm = *++Pm;
7679       // Rn = *--Pn;
7680       ldr(Ra, pre(Pa, wordSize));
7681       ldr(Rb, pre(Pb, -wordSize));
7682       ldr(Rm, pre(Pm, wordSize));
7683       ldr(Rn, pre(Pn, -wordSize));
7684 
7685       mov(Rhi_mn, zr);
7686       mov(Rlo_mn, zr);
7687     }
7688 
7689     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7690       block_comment("post2");
7691       if (i.is_constant()) {
7692         mov(Rj, i.as_constant()-len.as_constant());
7693       } else {
7694         sub(Rj, i.as_register(), len);
7695       }
7696 
7697       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7698 
7699       // As soon as we know the least significant digit of our result,
7700       // store it.
7701       // Pm_base[i-len] = t0;
7702       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7703 
7704       // t0 = t1; t1 = t2; t2 = 0;
7705       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7706       adc(t1, t2, zr);
7707       mov(t2, zr);
7708     }
7709 
7710     // A carry in t0 after Montgomery multiplication means that we
7711     // should subtract multiples of n from our result in m.  We'll
7712     // keep doing that until there is no carry.
7713     void normalize(RegisterOrConstant len) {
7714       block_comment("normalize");
7715       // while (t0)
7716       //   t0 = sub(Pm_base, Pn_base, t0, len);
7717       Label loop, post, again;
7718       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7719       cbz(t0, post); {
7720         bind(again); {
7721           mov(i, zr);
7722           mov(cnt, len);
7723           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7724           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7725           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7726           align(16);
7727           bind(loop); {
7728             sbcs(Rm, Rm, Rn);
7729             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7730             add(i, i, 1);
7731             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7732             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7733             sub(cnt, cnt, 1);
7734           } cbnz(cnt, loop);
7735           sbc(t0, t0, zr);
7736         } cbnz(t0, again);
7737       } bind(post);
7738     }
7739 
7740     // Move memory at s to d, reversing words.
7741     //    Increments d to end of copied memory
7742     //    Destroys tmp1, tmp2
7743     //    Preserves len
7744     //    Leaves s pointing to the address which was in d at start
7745     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7746       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7747       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7748 
7749       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7750       mov(tmp1, len);
7751       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7752       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7753     }
7754     // where
7755     void reverse1(Register d, Register s, Register tmp) {
7756       ldr(tmp, pre(s, -wordSize));
7757       ror(tmp, tmp, 32);
7758       str(tmp, post(d, wordSize));
7759     }
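
         // In C, approximately -- the data movement performed by reverse
         // above (a sketch that ignores the pointer bookkeeping): each
         // 64-bit word is taken from the far end of the source and its
         // 32-bit halves are swapped, which reverses the order of the
         // 2*len 32-bit ints overall:
         //
         //   void reverse(julong *d, const julong *s, int len) {
         //     for (int i = 0; i < len; i++) {
         //       julong x = s[len - 1 - i];
         //       d[i] = (x << 32) | (x >> 32);   // ror 32
         //     }
         //   }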
7760 
7761     void step_squaring() {
7762       // An extra ACC
7763       step();
7764       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7765     }
7766 
7767     void last_squaring(RegisterOrConstant i) {
7768       Label dont;
7769       // if ((i & 1) == 0) {
7770       tbnz(i.as_register(), 0, dont); {
7771         // MACC(Ra, Rb, t0, t1, t2);
7772         // Ra = *++Pa;
7773         // Rb = *--Pb;
7774         umulh(Rhi_ab, Ra, Rb);
7775         mul(Rlo_ab, Ra, Rb);
7776         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7777       } bind(dont);
7778     }
7779 
7780     void extra_step_squaring() {
7781       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7782 
7783       // MACC(Rm, Rn, t0, t1, t2);
7784       // Rm = *++Pm;
7785       // Rn = *--Pn;
7786       umulh(Rhi_mn, Rm, Rn);
7787       mul(Rlo_mn, Rm, Rn);
7788       ldr(Rm, pre(Pm, wordSize));
7789       ldr(Rn, pre(Pn, -wordSize));
7790     }
7791 
7792     void post1_squaring() {
7793       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7794 
7795       // *Pm = Rm = t0 * inv;
7796       mul(Rm, t0, inv);
7797       str(Rm, Address(Pm));
7798 
7799       // MACC(Rm, Rn, t0, t1, t2);
7800       // t0 = t1; t1 = t2; t2 = 0;
7801       umulh(Rhi_mn, Rm, Rn);
7802 
7803 #ifndef PRODUCT
7804       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7805       {
7806         mul(Rlo_mn, Rm, Rn);
7807         add(Rlo_mn, t0, Rlo_mn);
7808         Label ok;
7809         cbz(Rlo_mn, ok); {
7810           stop("broken Montgomery multiply");
7811         } bind(ok);
7812       }
7813 #endif
7814       // We have very carefully set things up so that
7815       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7816       // the lower half of Rm * Rn because we know the result already:
7817       // it must be -t0.  t0 + (-t0) must generate a carry iff
7818       // t0 != 0.  So, rather than do a mul and an adds we just set
7819       // the carry flag iff t0 is nonzero.
7820       //
7821       // mul(Rlo_mn, Rm, Rn);
7822       // adds(zr, t0, Rlo_mn);
7823       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7824       adcs(t0, t1, Rhi_mn);
7825       adc(t1, t2, zr);
7826       mov(t2, zr);
7827     }
7828 
7829     void acc(Register Rhi, Register Rlo,
7830              Register t0, Register t1, Register t2) {
7831       adds(t0, t0, Rlo);
7832       adcs(t1, t1, Rhi);
7833       adc(t2, t2, zr);
7834     }
7835 
7836   public:
7837     /**
7838      * Fast Montgomery multiplication.  The derivation of the
7839      * algorithm is in A Cryptographic Library for the Motorola
7840      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7841      *
7842      * Arguments:
7843      *
7844      * Inputs for multiplication:
7845      *   c_rarg0   - int array elements a
7846      *   c_rarg1   - int array elements b
7847      *   c_rarg2   - int array elements n (the modulus)
7848      *   c_rarg3   - int length
7849      *   c_rarg4   - int inv
7850      *   c_rarg5   - int array elements m (the result)
7851      *
7852      * Inputs for squaring:
7853      *   c_rarg0   - int array elements a
7854      *   c_rarg1   - int array elements n (the modulus)
7855      *   c_rarg2   - int length
7856      *   c_rarg3   - int inv
7857      *   c_rarg4   - int array elements m (the result)
7858      *
7859      */
7860     address generate_multiply() {
7861       Label argh, nothing;
7862       bind(argh);
7863       stop("MontgomeryMultiply total_allocation must be <= 8192");
7864 
7865       align(CodeEntryAlignment);
7866       address entry = pc();
7867 
7868       cbzw(Rlen, nothing);
7869 
7870       enter();
7871 
7872       // Make room.
7873       cmpw(Rlen, 512);
7874       br(Assembler::HI, argh);
7875       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7876       andr(sp, Ra, -2 * wordSize);
7877 
7878       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7879 
7880       {
7881         // Copy input args, reversing as we go.  We use Ra as a
7882         // temporary variable.
7883         reverse(Ra, Pa_base, Rlen, t0, t1);
7884         if (!_squaring)
7885           reverse(Ra, Pb_base, Rlen, t0, t1);
7886         reverse(Ra, Pn_base, Rlen, t0, t1);
7887       }
7888 
7889       // Push all call-saved registers and also Pm_base which we'll need
7890       // at the end.
7891       save_regs();
7892 
7893 #ifndef PRODUCT
7894       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7895       {
7896         ldr(Rn, Address(Pn_base, 0));
7897         mul(Rlo_mn, Rn, inv);
7898         subs(zr, Rlo_mn, -1);
7899         Label ok;
7900         br(EQ, ok); {
7901           stop("broken inverse in Montgomery multiply");
7902         } bind(ok);
7903       }
7904 #endif
7905 
7906       mov(Pm_base, Ra);
7907 
7908       mov(t0, zr);
7909       mov(t1, zr);
7910       mov(t2, zr);
7911 
7912       block_comment("for (int i = 0; i < len; i++) {");
7913       mov(Ri, zr); {
7914         Label loop, end;
7915         cmpw(Ri, Rlen);
7916         br(Assembler::GE, end);
7917 
7918         bind(loop);
7919         pre1(Ri);
7920 
7921         block_comment("  for (j = i; j; j--) {"); {
7922           movw(Rj, Ri);
7923           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7924         } block_comment("  } // j");
7925 
7926         post1();
7927         addw(Ri, Ri, 1);
7928         cmpw(Ri, Rlen);
7929         br(Assembler::LT, loop);
7930         bind(end);
7931         block_comment("} // i");
7932       }
7933 
7934       block_comment("for (int i = len; i < 2*len; i++) {");
7935       mov(Ri, Rlen); {
7936         Label loop, end;
7937         cmpw(Ri, Rlen, Assembler::LSL, 1);
7938         br(Assembler::GE, end);
7939 
7940         bind(loop);
7941         pre2(Ri, Rlen);
7942 
7943         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7944           lslw(Rj, Rlen, 1);
7945           subw(Rj, Rj, Ri);
7946           subw(Rj, Rj, 1);
7947           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7948         } block_comment("  } // j");
7949 
7950         post2(Ri, Rlen);
7951         addw(Ri, Ri, 1);
7952         cmpw(Ri, Rlen, Assembler::LSL, 1);
7953         br(Assembler::LT, loop);
7954         bind(end);
7955       }
7956       block_comment("} // i");
7957 
7958       normalize(Rlen);
7959 
7960       mov(Ra, Pm_base);  // Save Pm_base in Ra
7961       restore_regs();  // Restore caller's Pm_base
7962 
7963       // Copy our result into caller's Pm_base
7964       reverse(Pm_base, Ra, Rlen, t0, t1);
7965 
7966       leave();
7967       bind(nothing);
7968       ret(lr);
7969 
7970       return entry;
7971     }
7972     // In C, approximately:
7973 
7974     // void
7975     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7976     //                     julong Pn_base[], julong Pm_base[],
7977     //                     julong inv, int len) {
7978     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7979     //   julong *Pa, *Pb, *Pn, *Pm;
7980     //   julong Ra, Rb, Rn, Rm;
7981 
7982     //   int i;
7983 
7984     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7985 
7986     //   for (i = 0; i < len; i++) {
7987     //     int j;
7988 
7989     //     Pa = Pa_base;
7990     //     Pb = Pb_base + i;
7991     //     Pm = Pm_base;
7992     //     Pn = Pn_base + i;
7993 
7994     //     Ra = *Pa;
7995     //     Rb = *Pb;
7996     //     Rm = *Pm;
7997     //     Rn = *Pn;
7998 
7999     //     int iters = i;
8000     //     for (j = 0; iters--; j++) {
8001     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8002     //       MACC(Ra, Rb, t0, t1, t2);
8003     //       Ra = *++Pa;
8004     //       Rb = *--Pb;
8005     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8006     //       MACC(Rm, Rn, t0, t1, t2);
8007     //       Rm = *++Pm;
8008     //       Rn = *--Pn;
8009     //     }
8010 
8011     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8012     //     MACC(Ra, Rb, t0, t1, t2);
8013     //     *Pm = Rm = t0 * inv;
8014     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8015     //     MACC(Rm, Rn, t0, t1, t2);
8016 
8017     //     assert(t0 == 0, "broken Montgomery multiply");
8018 
8019     //     t0 = t1; t1 = t2; t2 = 0;
8020     //   }
8021 
8022     //   for (i = len; i < 2*len; i++) {
8023     //     int j;
8024 
8025     //     Pa = Pa_base + i-len;
8026     //     Pb = Pb_base + len;
8027     //     Pm = Pm_base + i-len;
8028     //     Pn = Pn_base + len;
8029 
8030     //     Ra = *++Pa;
8031     //     Rb = *--Pb;
8032     //     Rm = *++Pm;
8033     //     Rn = *--Pn;
8034 
8035     //     int iters = len*2-i-1;
8036     //     for (j = i-len+1; iters--; j++) {
8037     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8038     //       MACC(Ra, Rb, t0, t1, t2);
8039     //       Ra = *++Pa;
8040     //       Rb = *--Pb;
8041     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8042     //       MACC(Rm, Rn, t0, t1, t2);
8043     //       Rm = *++Pm;
8044     //       Rn = *--Pn;
8045     //     }
8046 
8047     //     Pm_base[i-len] = t0;
8048     //     t0 = t1; t1 = t2; t2 = 0;
8049     //   }
8050 
8051     //   while (t0)
8052     //     t0 = sub(Pm_base, Pn_base, t0, len);
8053     // }
8054 
8055     /**
8056      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8057      * multiplies than Montgomery multiplication so it should be up to
8058      * 25% faster.  However, its loop control is more complex and it
8059      * may actually run slower on some machines.
8060      *
8061      * Arguments:
8062      *
8063      * Inputs:
8064      *   c_rarg0   - int array elements a
8065      *   c_rarg1   - int array elements n (the modulus)
8066      *   c_rarg2   - int length
8067      *   c_rarg3   - int inv
8068      *   c_rarg4   - int array elements m (the result)
8069      *
8070      */
8071     address generate_square() {
8072       Label argh;
8073       bind(argh);
8074       stop("MontgomeryMultiply total_allocation must be <= 8192");
8075 
8076       align(CodeEntryAlignment);
8077       address entry = pc();
8078 
8079       enter();
8080 
8081       // Make room.
8082       cmpw(Rlen, 512);
8083       br(Assembler::HI, argh);
8084       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8085       andr(sp, Ra, -2 * wordSize);
8086 
8087       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8088 
8089       {
8090         // Copy input args, reversing as we go.  We use Ra as a
8091         // temporary variable.
8092         reverse(Ra, Pa_base, Rlen, t0, t1);
8093         reverse(Ra, Pn_base, Rlen, t0, t1);
8094       }
8095 
8096       // Push all call-saved registers and also Pm_base which we'll need
8097       // at the end.
8098       save_regs();
8099 
8100       mov(Pm_base, Ra);
8101 
8102       mov(t0, zr);
8103       mov(t1, zr);
8104       mov(t2, zr);
8105 
8106       block_comment("for (int i = 0; i < len; i++) {");
8107       mov(Ri, zr); {
8108         Label loop, end;
8109         bind(loop);
8110         cmp(Ri, Rlen);
8111         br(Assembler::GE, end);
8112 
8113         pre1(Ri);
8114 
8115         block_comment("for (j = (i+1)/2; j; j--) {"); {
8116           add(Rj, Ri, 1);
8117           lsr(Rj, Rj, 1);
8118           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8119         } block_comment("  } // j");
8120 
8121         last_squaring(Ri);
8122 
8123         block_comment("  for (j = i/2; j; j--) {"); {
8124           lsr(Rj, Ri, 1);
8125           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8126         } block_comment("  } // j");
8127 
8128         post1_squaring();
8129         add(Ri, Ri, 1);
8130         cmp(Ri, Rlen);
8131         br(Assembler::LT, loop);
8132 
8133         bind(end);
8134         block_comment("} // i");
8135       }
8136 
8137       block_comment("for (int i = len; i < 2*len; i++) {");
8138       mov(Ri, Rlen); {
8139         Label loop, end;
8140         bind(loop);
8141         cmp(Ri, Rlen, Assembler::LSL, 1);
8142         br(Assembler::GE, end);
8143 
8144         pre2(Ri, Rlen);
8145 
8146         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8147           lsl(Rj, Rlen, 1);
8148           sub(Rj, Rj, Ri);
8149           sub(Rj, Rj, 1);
8150           lsr(Rj, Rj, 1);
8151           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8152         } block_comment("  } // j");
8153 
8154         last_squaring(Ri);
8155 
8156         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8157           lsl(Rj, Rlen, 1);
8158           sub(Rj, Rj, Ri);
8159           lsr(Rj, Rj, 1);
8160           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8161         } block_comment("  } // j");
8162 
8163         post2(Ri, Rlen);
8164         add(Ri, Ri, 1);
8165         cmp(Ri, Rlen, Assembler::LSL, 1);
8166 
8167         br(Assembler::LT, loop);
8168         bind(end);
8169         block_comment("} // i");
8170       }
8171 
8172       normalize(Rlen);
8173 
8174       mov(Ra, Pm_base);  // Save Pm_base in Ra
8175       restore_regs();  // Restore caller's Pm_base
8176 
8177       // Copy our result into caller's Pm_base
8178       reverse(Pm_base, Ra, Rlen, t0, t1);
8179 
8180       leave();
8181       ret(lr);
8182 
8183       return entry;
8184     }
8185     // In C, approximately:
8186 
8187     // void
8188     // montgomery_square(julong Pa_base[], julong Pn_base[],
8189     //                   julong Pm_base[], julong inv, int len) {
8190     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8191     //   julong *Pa, *Pb, *Pn, *Pm;
8192     //   julong Ra, Rb, Rn, Rm;
8193 
8194     //   int i;
8195 
8196     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8197 
8198     //   for (i = 0; i < len; i++) {
8199     //     int j;
8200 
8201     //     Pa = Pa_base;
8202     //     Pb = Pa_base + i;
8203     //     Pm = Pm_base;
8204     //     Pn = Pn_base + i;
8205 
8206     //     Ra = *Pa;
8207     //     Rb = *Pb;
8208     //     Rm = *Pm;
8209     //     Rn = *Pn;
8210 
8211     //     int iters = (i+1)/2;
8212     //     for (j = 0; iters--; j++) {
8213     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8214     //       MACC2(Ra, Rb, t0, t1, t2);
8215     //       Ra = *++Pa;
8216     //       Rb = *--Pb;
8217     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8218     //       MACC(Rm, Rn, t0, t1, t2);
8219     //       Rm = *++Pm;
8220     //       Rn = *--Pn;
8221     //     }
8222     //     if ((i & 1) == 0) {
8223     //       assert(Ra == Pa_base[j], "must be");
8224     //       MACC(Ra, Ra, t0, t1, t2);
8225     //     }
8226     //     iters = i/2;
8227     //     assert(iters == i-j, "must be");
8228     //     for (; iters--; j++) {
8229     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8230     //       MACC(Rm, Rn, t0, t1, t2);
8231     //       Rm = *++Pm;
8232     //       Rn = *--Pn;
8233     //     }
8234 
8235     //     *Pm = Rm = t0 * inv;
8236     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8237     //     MACC(Rm, Rn, t0, t1, t2);
8238 
8239     //     assert(t0 == 0, "broken Montgomery multiply");
8240 
8241     //     t0 = t1; t1 = t2; t2 = 0;
8242     //   }
8243 
8244     //   for (i = len; i < 2*len; i++) {
8245     //     int start = i-len+1;
8246     //     int end = start + (len - start)/2;
8247     //     int j;
8248 
8249     //     Pa = Pa_base + i-len;
8250     //     Pb = Pa_base + len;
8251     //     Pm = Pm_base + i-len;
8252     //     Pn = Pn_base + len;
8253 
8254     //     Ra = *++Pa;
8255     //     Rb = *--Pb;
8256     //     Rm = *++Pm;
8257     //     Rn = *--Pn;
8258 
8259     //     int iters = (2*len-i-1)/2;
8260     //     assert(iters == end-start, "must be");
8261     //     for (j = start; iters--; j++) {
8262     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8263     //       MACC2(Ra, Rb, t0, t1, t2);
8264     //       Ra = *++Pa;
8265     //       Rb = *--Pb;
8266     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8267     //       MACC(Rm, Rn, t0, t1, t2);
8268     //       Rm = *++Pm;
8269     //       Rn = *--Pn;
8270     //     }
8271     //     if ((i & 1) == 0) {
8272     //       assert(Ra == Pa_base[j], "must be");
8273     //       MACC(Ra, Ra, t0, t1, t2);
8274     //     }
8275     //     iters =  (2*len-i)/2;
8276     //     assert(iters == len-j, "must be");
8277     //     for (; iters--; j++) {
8278     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8279     //       MACC(Rm, Rn, t0, t1, t2);
8280     //       Rm = *++Pm;
8281     //       Rn = *--Pn;
8282     //     }
8283     //     Pm_base[i-len] = t0;
8284     //     t0 = t1; t1 = t2; t2 = 0;
8285     //   }
8286 
8287     //   while (t0)
8288     //     t0 = sub(Pm_base, Pn_base, t0, len);
8289     // }
8290   };
8291 
8292 
8293   // Initialization
8294   void generate_initial_stubs() {
8295     // Generate initial stubs and initialize the entry points.
8296 
8297     // Entry points that exist on all platforms. Note: this is code
8298     // that could be shared among different platforms; however, the
8299     // benefit seems to be smaller than the disadvantage of having a
8300     // much more complicated generator structure. See also the comment in
8301     // stubRoutines.hpp.
8302 
8303     StubRoutines::_forward_exception_entry = generate_forward_exception();
8304 
8305     StubRoutines::_call_stub_entry =
8306       generate_call_stub(StubRoutines::_call_stub_return_address);
8307 
8308     // is referenced by megamorphic call
8309     StubRoutines::_catch_exception_entry = generate_catch_exception();
8310 
8311     // Build this early so it's available for the interpreter.
8312     StubRoutines::_throw_StackOverflowError_entry =
8313       generate_throw_exception("StackOverflowError throw_exception",
8314                                CAST_FROM_FN_PTR(address,
8315                                                 SharedRuntime::throw_StackOverflowError));
8316     StubRoutines::_throw_delayed_StackOverflowError_entry =
8317       generate_throw_exception("delayed StackOverflowError throw_exception",
8318                                CAST_FROM_FN_PTR(address,
8319                                                 SharedRuntime::throw_delayed_StackOverflowError));
8320 
8321     // Initialize table for copy memory (arraycopy) check.
8322     if (UnsafeCopyMemory::_table == nullptr) {
8323       UnsafeCopyMemory::create_table(8);
8324     }
8325 
8326     if (UseCRC32Intrinsics) {
8327       // Set the table address before generating the stubs that use it.
8328       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8329       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8330     }
8331 
8332     if (UseCRC32CIntrinsics) {
8333       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8334     }
8335 
8336     // Disabled until JDK-8210858 is fixed
8337     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8338     //   StubRoutines::_dlog = generate_dlog();
8339     // }
8340 
8341     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8342       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8343     }
8344 
8345     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8346       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8347     }
8348   }
8349 
8350   void generate_continuation_stubs() {
8351     // Continuation stubs:
8352     StubRoutines::_cont_thaw          = generate_cont_thaw();
8353     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8354     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8355 
8356     JFR_ONLY(generate_jfr_stubs();)
8357   }
8358 
8359 #if INCLUDE_JFR
8360   void generate_jfr_stubs() {
8361     StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
8362     StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
8363     StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
8364     StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
8365   }
8366 #endif // INCLUDE_JFR
8367 
8368   void generate_final_stubs() {
8369     // support for verify_oop (must happen after universe_init)
8370     if (VerifyOops) {
8371       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8372     }
8373     StubRoutines::_throw_AbstractMethodError_entry =
8374       generate_throw_exception("AbstractMethodError throw_exception",
8375                                CAST_FROM_FN_PTR(address,
8376                                                 SharedRuntime::
8377                                                 throw_AbstractMethodError));
8378 
8379     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8380       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8381                                CAST_FROM_FN_PTR(address,
8382                                                 SharedRuntime::
8383                                                 throw_IncompatibleClassChangeError));
8384 
8385     StubRoutines::_throw_NullPointerException_at_call_entry =
8386       generate_throw_exception("NullPointerException at call throw_exception",
8387                                CAST_FROM_FN_PTR(address,
8388                                                 SharedRuntime::
8389                                                 throw_NullPointerException_at_call));
8390 
8391     // arraycopy stubs used by compilers
8392     generate_arraycopy_stubs();
8393 
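         // The method entry barrier stub is only needed when the selected GC
         // provides a BarrierSetNMethod; it is used to arm and disarm compiled
         // method entry points (e.g. for concurrent class unloading and stack
         // scanning).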
8394     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8395     if (bs_nm != nullptr) {
8396       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8397     }
8398 
8399     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8400 
8401     if (UsePoly1305Intrinsics) {
8402       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8403     }
8404 
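         // On Linux builds that cannot assume LSE atomics at compile time
         // (!__ARM_FEATURE_ATOMICS), generate out-of-line atomic entry points; the
         // aarch64_atomic_*_impl pointers defined at the end of this file are then
         // repointed from the defaults in atomic_aarch64.S to the generated stubs.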
8405 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8406 
8407     generate_atomic_entry_points();
8408 
8409 #endif // LINUX && !__ARM_FEATURE_ATOMICS
8410 
8411     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8412 
8413     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8414   }
8415 
8416   void generate_compiler_stubs() {
8417 #if COMPILER2_OR_JVMCI
8418 
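         // The iota-indices constant is only needed for NEON; SVE can materialize
         // index vectors directly (with the INDEX instruction), so the table is
         // skipped when UseSVE > 0.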
8419     if (UseSVE == 0) {
8420       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8421     }
8422 
8423     // array equals stub for large arrays.
8424     if (!UseSimpleArrayEquals) {
8425       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8426     }
8427 
8428     // byte_array_inflate stub for large arrays.
8429     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8430 
8431     // countPositives stub for large arrays.
8432     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8433 
8434     generate_compare_long_strings();
8435 
8436     generate_string_indexof_stubs();
8437 
8438 #ifdef COMPILER2
8439     if (UseMultiplyToLenIntrinsic) {
8440       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8441     }
8442 
8443     if (UseSquareToLenIntrinsic) {
8444       StubRoutines::_squareToLen = generate_squareToLen();
8445     }
8446 
8447     if (UseMulAddIntrinsic) {
8448       StubRoutines::_mulAdd = generate_mulAdd();
8449     }
8450 
8451     if (UseSIMDForBigIntegerShiftIntrinsics) {
8452       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8453       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8454     }
8455 
8456     if (UseMontgomeryMultiplyIntrinsic) {
8457       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8458       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8459       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8460     }
8461 
8462     if (UseMontgomerySquareIntrinsic) {
8463       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8464       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8465       // We use generate_multiply() rather than generate_square()
8466       // because it's faster for the sizes of modulus we care about.
8467       StubRoutines::_montgomerySquare = g.generate_multiply();
8468     }
8469 #endif // COMPILER2
8470 
8471     if (UseChaCha20Intrinsics) {
8472       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8473     }
8474 
8475     if (UseBASE64Intrinsics) {
8476       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8477       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8478     }
8479 
8480     // data cache line writeback
8481     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8482     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8483 
8484     if (UseAESIntrinsics) {
8485       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8486       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8487       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8488       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8489       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8490     }
8491     if (UseGHASHIntrinsics) {
8492       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8493       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8494     }
8495     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8496       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8497     }
8498 
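         // For each digest family the boolean selects the variant: false generates
         // the single-block implCompress stub, true the multi-block (*MB) stub.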
8499     if (UseMD5Intrinsics) {
8500       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8501       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8502     }
8503     if (UseSHA1Intrinsics) {
8504       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8505       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8506     }
8507     if (UseSHA256Intrinsics) {
8508       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8509       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8510     }
8511     if (UseSHA512Intrinsics) {
8512       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8513       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8514     }
8515     if (UseSHA3Intrinsics) {
8516       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8517       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8518     }
8519 
8520     // generate Adler32 intrinsics code
8521     if (UseAdler32Intrinsics) {
8522       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8523     }
8524 #endif // COMPILER2_OR_JVMCI
8525   }
8526 
8527  public:
8528   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8529     switch(kind) {
8530     case Initial_stubs:
8531       generate_initial_stubs();
8532       break;
8533     case Continuation_stubs:
8534       generate_continuation_stubs();
8535       break;
8536     case Compiler_stubs:
8537       generate_compiler_stubs();
8538       break;
8539     case Final_stubs:
8540       generate_final_stubs();
8541       break;
8542     default:
8543       fatal("unexpected stubs kind: %d", kind);
8544       break;
8545     };
8546   }
8547 }; // end class declaration
8548 
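     // Called during VM initialization for each stub-generation phase;
     // constructing the StubGenerator emits the stubs for the requested phase
     // into the supplied CodeBuffer.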
8549 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8550   StubGenerator g(code, kind);
8551 }
8552 
8553 
8554 #if defined (LINUX)
8555 
8556 // Define pointers to atomic stubs and initialize them to point to the
8557 // code in atomic_aarch64.S.
8558 
8559 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8560   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8561     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8562   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8563     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8564 
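     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands to roughly:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
     //     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
     //
     // i.e. each function pointer initially targets the generic implementation in
     // atomic_aarch64.S and may later be repointed to a stub generated by
     // generate_atomic_entry_points() above.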
8565 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8566 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8567 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8568 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8569 DEFAULT_ATOMIC_OP(xchg, 4, )
8570 DEFAULT_ATOMIC_OP(xchg, 8, )
8571 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8572 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8573 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8574 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8575 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8576 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8577 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8578 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8579 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8580 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8581 
8582 #undef DEFAULT_ATOMIC_OP
8583 
8584 #endif // LINUX