1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // The stub itself does not return a value: any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer, then install sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8, which C uses as the indirect result
 126   // location register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-r18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
 146   // -26 [ saved v15            ]
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
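
       // For example, with wordSize == 8 the thread slot is at
       // Address(rfp, -8) and sp_after_call is Address(rfp, -224),
       // matching the frame diagram above.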
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 266     __ set_fpcr(rscratch1);
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
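         // sp is now 16-byte aligned, with room reserved below esp for
         // the incoming parameters (AArch64 requires sp to be 16-byte
         // aligned).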
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
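         // copy the parameters, one word at a time, from the parameter
         // array into the stack area just reserved for the Java callee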
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     __ ldr(j_rarg2, result);
 332     Label is_long, is_float, is_double, exit;
 333     __ ldr(j_rarg1, result_type);
 334     __ cmp(j_rarg1, (u1)T_OBJECT);
 335     __ br(Assembler::EQ, is_long);
 336     __ cmp(j_rarg1, (u1)T_LONG);
 337     __ br(Assembler::EQ, is_long);
 338     __ cmp(j_rarg1, (u1)T_FLOAT);
 339     __ br(Assembler::EQ, is_float);
 340     __ cmp(j_rarg1, (u1)T_DOUBLE);
 341     __ br(Assembler::EQ, is_double);
 342 
 343     // handle T_INT case
 344     __ strw(r0, Address(j_rarg2));
 345 
 346     __ BIND(exit);
 347 
 348     // pop parameters
 349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 350 
 351 #ifdef ASSERT
 352     // verify that threads correspond
 353     {
 354       Label L, S;
 355       __ ldr(rscratch1, thread);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::NE, S);
 358       __ get_thread(rscratch1);
 359       __ cmp(rthread, rscratch1);
 360       __ br(Assembler::EQ, L);
 361       __ BIND(S);
 362       __ stop("StubRoutines::call_stub: threads must correspond");
 363       __ BIND(L);
 364     }
 365 #endif
 366 
 367     __ pop_cont_fastpath(rthread);
 368 
 369     // restore callee-save registers
 370     __ ldpd(v15, v14,  d15_save);
 371     __ ldpd(v13, v12,  d13_save);
 372     __ ldpd(v11, v10,  d11_save);
 373     __ ldpd(v9,  v8,   d9_save);
 374 
 375     __ ldp(r28, r27,   r28_save);
 376     __ ldp(r26, r25,   r26_save);
 377     __ ldp(r24, r23,   r24_save);
 378     __ ldp(r22, r21,   r22_save);
 379     __ ldp(r20, r19,   r20_save);
 380 
 381     // restore fpcr
 382     __ ldr(rscratch1,  fpcr_save);
 383     __ set_fpcr(rscratch1);
 384 
 385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 386     __ ldrw(c_rarg2, result_type);
 387     __ ldr(c_rarg3,  method);
 388     __ ldp(c_rarg4, c_rarg5,  entry_point);
 389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 390 
 391     // leave frame and return to caller
 392     __ leave();
 393     __ ret(lr);
 394 
 395     // handle return types different from T_INT
 396 
 397     __ BIND(is_long);
 398     __ str(r0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_float);
 402     __ strs(j_farg0, Address(j_rarg2, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     __ BIND(is_double);
 406     __ strd(j_farg0, Address(j_rarg2, 0));
 407     __ br(Assembler::AL, exit);
 408 
 409     return start;
 410   }
 411 
 412   // Return point for a Java call if there's an exception thrown in
 413   // Java code.  The exception is caught and transformed into a
 414   // pending exception stored in JavaThread that can be tested from
 415   // within the VM.
 416   //
 417   // Note: Usually the parameters are removed by the callee. In case
 418   // of an exception crossing an activation frame boundary, that is
 419   // not the case if the callee is compiled code => need to set up
 420   // the sp.
 421   //
 422   // r0: exception oop
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != nullptr,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
 477   // so it just needs to be generated code with no prolog
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // Reinitialize the ptrue predicate register, in case the external runtime
 519     // call clobbers ptrue reg, as we may return to SVE compiled code.
 520     __ reinitialize_ptrue();
 521 
 522     // we should not really care that lr is no longer the callee
 523     // address. we saved the value the handler needs in r19 so we can
 524     // just copy it to r3. however, the C2 handler will push its own
 525     // frame and then call into the VM, and the VM code asserts that
 526     // the PC for the frame above the handler belongs to a compiled
 527     // Java method. So, we restore lr here to satisfy that assert.
 528     __ mov(lr, r19);
 529     // setup r0 & r3 & clear pending exception
 530     __ mov(r3, r19);
 531     __ mov(r19, r0);
 532     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 533     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 534 
 535 #ifdef ASSERT
 536     // make sure exception is set
 537     {
 538       Label L;
 539       __ cbnz(r0, L);
 540       __ stop("StubRoutines::forward exception: no pending exception (2)");
 541       __ bind(L);
 542     }
 543 #endif
 544 
 545     // continue at exception handler
 546     // r0: exception
 547     // r3: throwing pc
 548     // r19: exception handler
 549     __ verify_oop(r0);
 550     __ br(r19);
 551 
 552     return start;
 553   }
 554 
 555   // Non-destructive plausibility checks for oops
 556   //
 557   // Arguments:
 558   //    r0: oop to verify
 559   //    rscratch1: error message
 560   //
 561   // Stack after saving c_rarg3:
 562   //    [tos + 0]: saved c_rarg3
 563   //    [tos + 1]: saved c_rarg2
 564   //    [tos + 2]: saved lr
 565   //    [tos + 3]: saved rscratch2
 566   //    [tos + 4]: saved r0
 567   //    [tos + 5]: saved rscratch1
 568   address generate_verify_oop() {
 569 
 570     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 571     address start = __ pc();
 572 
 573     Label exit, error;
 574 
 575     // save c_rarg2 and c_rarg3
 576     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 577 
 578     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 579     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 580     __ ldr(c_rarg3, Address(c_rarg2));
 581     __ add(c_rarg3, c_rarg3, 1);
 582     __ str(c_rarg3, Address(c_rarg2));
 583 
 584     // object is in r0
 585     // make sure object is 'reasonable'
 586     __ cbz(r0, exit); // if obj is null it is OK
 587 
 588     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 589     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 590 
 591     // return if everything seems ok
 592     __ bind(exit);
 593 
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595     __ ret(lr);
 596 
 597     // handle errors
 598     __ bind(error);
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600 
 601     __ push(RegSet::range(r0, r29), sp);
 602     // debug(char* msg, int64_t pc, int64_t regs[])
 603     __ mov(c_rarg0, rscratch1);      // pass address of error message
 604     __ mov(c_rarg1, lr);             // pass return address
 605     __ mov(c_rarg2, sp);             // pass address of regs on stack
 606 #ifndef PRODUCT
 607     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 608 #endif
 609     BLOCK_COMMENT("call MacroAssembler::debug");
 610     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 611     __ blr(rscratch1);
 612     __ hlt(0);
 613 
 614     return start;
 615   }
 616 
 617   // Generate indices for iota vector.
 618   address generate_iota_indices(const char *stub_name) {
 619     __ align(CodeEntryAlignment);
 620     StubCodeMark mark(this, "StubRoutines", stub_name);
 621     address start = __ pc();
 622     // B
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 625     // H
 626     __ emit_data64(0x0003000200010000, relocInfo::none);
 627     __ emit_data64(0x0007000600050004, relocInfo::none);
 628     // S
 629     __ emit_data64(0x0000000100000000, relocInfo::none);
 630     __ emit_data64(0x0000000300000002, relocInfo::none);
 631     // D
 632     __ emit_data64(0x0000000000000000, relocInfo::none);
 633     __ emit_data64(0x0000000000000001, relocInfo::none);
 634     // S - FP
 635     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 636     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 637     // D - FP
 638     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 639     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 640     return start;
 641   }
 642 
 643   // The inner part of zero_words().  This is the bulk operation,
 644   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 645   // caller is responsible for zeroing the last few words.
 646   //
 647   // Inputs:
 648   // r10: the HeapWord-aligned base address of an array to zero.
 649   // r11: the count in HeapWords, r11 > 0.
 650   //
 651   // Returns r10 and r11, adjusted for the caller to clear.
 652   // r10: the base address of the tail of words left to clear.
 653   // r11: the number of words in the tail.
 654   //      r11 < MacroAssembler::zero_words_block_size.
 655 
 656   address generate_zero_blocks() {
 657     Label done;
 658     Label base_aligned;
 659 
 660     Register base = r10, cnt = r11;
 661 
 662     __ align(CodeEntryAlignment);
 663     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 664     address start = __ pc();
 665 
 666     if (UseBlockZeroing) {
 667       int zva_length = VM_Version::zva_length();
 668 
 669       // Ensure the ZVA length is divisible by 16. This is required by
 670       // the subsequent operations.
 671       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 672 
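           // If base is only 8-byte aligned (bit 3 set), zero one word
           // first so that the block-zeroing code below operates on a
           // 16-byte aligned base.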
 673       __ tbz(base, 3, base_aligned);
 674       __ str(zr, Address(__ post(base, 8)));
 675       __ sub(cnt, cnt, 1);
 676       __ bind(base_aligned);
 677 
 678       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 679       // alignment.
 680       Label small;
 681       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
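           // low_limit is in bytes but cnt is in words, hence the shift
           // right by 3 in the comparison below.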
 682       __ subs(rscratch1, cnt, low_limit >> 3);
 683       __ br(Assembler::LT, small);
 684       __ zero_dcache_blocks(base, cnt);
 685       __ bind(small);
 686     }
 687 
 688     {
 689       // Number of stp instructions we'll unroll
 690       const int unroll =
 691         MacroAssembler::zero_words_block_size / 2;
 692       // Clear the remaining blocks.
 693       Label loop;
 694       __ subs(cnt, cnt, unroll * 2);
 695       __ br(Assembler::LT, done);
 696       __ bind(loop);
 697       for (int i = 0; i < unroll; i++)
 698         __ stp(zr, zr, __ post(base, 16));
 699       __ subs(cnt, cnt, unroll * 2);
 700       __ br(Assembler::GE, loop);
 701       __ bind(done);
 702       __ add(cnt, cnt, unroll * 2);
 703     }
 704 
 705     __ ret(lr);
 706 
 707     return start;
 708   }
 709 
 710 
 711   typedef enum {
 712     copy_forwards = 1,
 713     copy_backwards = -1
 714   } copy_direction;
 715 
 716   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 717   // for arraycopy stubs.
 718   class ArrayCopyBarrierSetHelper : StackObj {
 719     BarrierSetAssembler* _bs_asm;
 720     MacroAssembler* _masm;
 721     DecoratorSet _decorators;
 722     BasicType _type;
 723     Register _gct1;
 724     Register _gct2;
 725     Register _gct3;
 726     FloatRegister _gcvt1;
 727     FloatRegister _gcvt2;
 728     FloatRegister _gcvt3;
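         // The gct* (general-purpose) and gcvt* (vector) registers are
         // temporaries that the GC barrier code may use while performing
         // the loads and stores.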
 729 
 730   public:
 731     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 732                               DecoratorSet decorators,
 733                               BasicType type,
 734                               Register gct1,
 735                               Register gct2,
 736                               Register gct3,
 737                               FloatRegister gcvt1,
 738                               FloatRegister gcvt2,
 739                               FloatRegister gcvt3)
 740       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 741         _masm(masm),
 742         _decorators(decorators),
 743         _type(type),
 744         _gct1(gct1),
 745         _gct2(gct2),
 746         _gct3(gct3),
 747         _gcvt1(gcvt1),
 748         _gcvt2(gcvt2),
 749         _gcvt3(gcvt3) {
 750     }
 751 
 752     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 753       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 754                             dst1, dst2, src,
 755                             _gct1, _gct2, _gcvt1);
 756     }
 757 
 758     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 759       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 760                              dst, src1, src2,
 761                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 762     }
 763 
 764     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 765       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 766                             dst1, dst2, src,
 767                             _gct1);
 768     }
 769 
 770     void copy_store_at_16(Address dst, Register src1, Register src2) {
 771       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 772                              dst, src1, src2,
 773                              _gct1, _gct2, _gct3);
 774     }
 775 
 776     void copy_load_at_8(Register dst, Address src) {
 777       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 778                             dst, noreg, src,
 779                             _gct1);
 780     }
 781 
 782     void copy_store_at_8(Address dst, Register src) {
 783       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 784                              dst, src, noreg,
 785                              _gct1, _gct2, _gct3);
 786     }
 787   };
 788 
 789   // Bulk copy of blocks of 8 words.
 790   //
 791   // count is a count of words.
 792   //
 793   // Precondition: count >= 8
 794   //
 795   // Postconditions:
 796   //
 797   // The least significant bit of count contains the remaining count
 798   // of words to copy.  The rest of count is trash.
 799   //
 800   // s and d are adjusted to point to the remaining words to copy
 801   //
 802   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 803                            copy_direction direction) {
 804     int unit = wordSize * direction;
 805     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
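         // For forward copies s and d are biased downwards (see below) so
         // that the same {2, 4, 6, 8} * unit offsets used by the backward
         // copy also cover the next block when copying forwards.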
 806 
 807     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 808       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 809     const Register stride = r14;
 810     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 811     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 812     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 813 
 814     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 815     assert_different_registers(s, d, count, rscratch1, rscratch2);
 816 
 817     Label again, drain;
 818     const char *stub_name;
 819     if (direction == copy_forwards)
 820       stub_name = "forward_copy_longs";
 821     else
 822       stub_name = "backward_copy_longs";
 823 
 824     __ align(CodeEntryAlignment);
 825 
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827 
 828     __ bind(start);
 829 
 830     Label unaligned_copy_long;
 831     if (AvoidUnalignedAccesses) {
 832       __ tbnz(d, 3, unaligned_copy_long);
 833     }
 834 
 835     if (direction == copy_forwards) {
 836       __ sub(s, s, bias);
 837       __ sub(d, d, bias);
 838     }
 839 
 840 #ifdef ASSERT
 841     // Make sure we are never given < 8 words
 842     {
 843       Label L;
 844       __ cmp(count, (u1)8);
 845       __ br(Assembler::GE, L);
 846       __ stop("generate_copy_longs called with < 8 words");
 847       __ bind(L);
 848     }
 849 #endif
 850 
 851     // Fill 8 registers
 852     if (UseSIMDForMemoryOps) {
 853       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 854       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 855     } else {
 856       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 857       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 858       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 859       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 860     }
 861 
 862     __ subs(count, count, 16);
 863     __ br(Assembler::LO, drain);
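         // Eight words are already loaded into registers; if fewer than 16
         // words remained in total, skip the main loop and just drain
         // those eight.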
 864 
 865     int prefetch = PrefetchCopyIntervalInBytes;
 866     bool use_stride = false;
 867     if (direction == copy_backwards) {
 868        use_stride = prefetch > 256;
 869        prefetch = -prefetch;
 870        if (use_stride) __ mov(stride, prefetch);
 871     }
 872 
 873     __ bind(again);
 874 
 875     if (PrefetchCopyIntervalInBytes > 0)
 876       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 877 
 878     if (UseSIMDForMemoryOps) {
 879       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 880       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 881       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 882       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 883     } else {
 884       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 886       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 888       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 889       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 890       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 891       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 892     }
 893 
 894     __ subs(count, count, 8);
 895     __ br(Assembler::HS, again);
 896 
 897     // Drain
 898     __ bind(drain);
 899     if (UseSIMDForMemoryOps) {
 900       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 901       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 902     } else {
 903       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 904       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 905       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907     }
 908 
 909     {
 910       Label L1, L2;
 911       __ tbz(count, exact_log2(4), L1);
 912       if (UseSIMDForMemoryOps) {
 913         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 914         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 915       } else {
 916         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 917         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 918         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 919         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 920       }
 921       __ bind(L1);
 922 
 923       if (direction == copy_forwards) {
 924         __ add(s, s, bias);
 925         __ add(d, d, bias);
 926       }
 927 
 928       __ tbz(count, 1, L2);
 929       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 930       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 931       __ bind(L2);
 932     }
 933 
 934     __ ret(lr);
 935 
 936     if (AvoidUnalignedAccesses) {
 937       Label drain, again;
 938       // Register order for storing. Order is different for backward copy.
 939 
 940       __ bind(unaligned_copy_long);
 941 
 942       // source address is even (16-byte) aligned, target odd (8-byte) aligned
 943       //
 944       // when forward copying word pairs we read long pairs at offsets
 945       // {0, 2, 4, 6} (in long words). when backwards copying we read
 946       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 947       // address by -2 in the forwards case so we can compute the
 948       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 949       // or -1.
 950       //
 951       // when forward copying we need to store 1 word, 3 pairs and
 952       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 953       // zero offset we adjust the destination by -1 which means we
 954       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 955       //
 956       // When backwards copying we need to store 1 word, 3 pairs and
 957       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 958       // offsets {1, 3, 5, 7, 8} * unit.
 959 
 960       if (direction == copy_forwards) {
 961         __ sub(s, s, 16);
 962         __ sub(d, d, 8);
 963       }
 964 
 965       // Fill 8 registers
 966       //
 967       // for forwards copy s was offset by -16 from the original input
 968       // value of s so the register contents are at these offsets
 969       // relative to the 64 byte block addressed by that original input
 970       // and so on for each successive 64 byte block when s is updated
 971       //
 972       // t0 at offset 0,  t1 at offset 8
 973       // t2 at offset 16, t3 at offset 24
 974       // t4 at offset 32, t5 at offset 40
 975       // t6 at offset 48, t7 at offset 56
 976 
 977       // for backwards copy s was not offset so the register contents
 978       // are at these offsets into the preceding 64 byte block
 979       // relative to that original input and so on for each successive
 980       // preceding 64 byte block when s is updated. this explains the
 981       // slightly counter-intuitive looking pattern of register usage
 982       // in the stp instructions for backwards copy.
 983       //
 984       // t0 at offset -16, t1 at offset -8
 985       // t2 at offset -32, t3 at offset -24
 986       // t4 at offset -48, t5 at offset -40
 987       // t6 at offset -64, t7 at offset -56
 988 
 989       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 990       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 991       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 992       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 993 
 994       __ subs(count, count, 16);
 995       __ br(Assembler::LO, drain);
 996 
 997       int prefetch = PrefetchCopyIntervalInBytes;
 998       bool use_stride = false;
 999       if (direction == copy_backwards) {
1000          use_stride = prefetch > 256;
1001          prefetch = -prefetch;
1002          if (use_stride) __ mov(stride, prefetch);
1003       }
1004 
1005       __ bind(again);
1006 
1007       if (PrefetchCopyIntervalInBytes > 0)
1008         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1009 
1010       if (direction == copy_forwards) {
1011        // allowing for the offset of -8 the store instructions place
1012        // registers into the target 64 byte block at the following
1013        // offsets
1014        //
1015        // t0 at offset 0
1016        // t1 at offset 8,  t2 at offset 16
1017        // t3 at offset 24, t4 at offset 32
1018        // t5 at offset 40, t6 at offset 48
1019        // t7 at offset 56
1020 
1021         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1022         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1023         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1024         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1025         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1026         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1027         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1028         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1029         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1030       } else {
1031        // d was not offset when we started so the registers are
1032        // written into the 64 byte block preceding d with the following
1033        // offsets
1034        //
1035        // t1 at offset -8
1036        // t3 at offset -24, t0 at offset -16
1037        // t5 at offset -40, t2 at offset -32
1038        // t7 at offset -56, t4 at offset -48
1039        //                   t6 at offset -64
1040        //
1041        // note that this matches the offsets previously noted for the
1042        // loads
1043 
1044         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1045         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1046         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1047         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1048         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1049         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1050         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1051         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1052         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1053       }
1054 
1055       __ subs(count, count, 8);
1056       __ br(Assembler::HS, again);
1057 
1058       // Drain
1059       //
1060       // this uses the same pattern of offsets and register arguments
1061       // as above
1062       __ bind(drain);
1063       if (direction == copy_forwards) {
1064         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1065         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1066         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1067         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1068         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1069       } else {
1070         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1071         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1072         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1073         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1074         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1075       }
1076       // now we need to copy any remaining part block which may
1077       // include a 4 word subblock and/or a 2 word subblock.
1078       // bits 2 and 1 in the count are the tell-tale for whether we
1079       // have each such subblock
1080       {
1081         Label L1, L2;
1082         __ tbz(count, exact_log2(4), L1);
1083        // this is the same as above but copying only 4 longs hence
1084        // with only one intervening stp between the str instructions
1085        // but note that the offsets and registers still follow the
1086        // same pattern
1087         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1088         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1089         if (direction == copy_forwards) {
1090           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1091           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1092           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1093         } else {
1094           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1095           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1096           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1097         }
1098         __ bind(L1);
1099 
1100         __ tbz(count, 1, L2);
1101        // this is the same as above but copying only 2 longs hence
1102        // there is no intervening stp between the str instructions
1103        // but note that the offset and register patterns are still
1104        // the same
1105         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1106         if (direction == copy_forwards) {
1107           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1108           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1112         }
1113         __ bind(L2);
1114 
1115        // for forwards copy we need to re-adjust the offsets we
1116        // applied so that s and d follow the last words written
1117 
1118        if (direction == copy_forwards) {
1119          __ add(s, s, 16);
1120          __ add(d, d, 8);
1121        }
1122 
1123       }
1124 
1125       __ ret(lr);
1126       }
1127   }
1128 
1129   // Small copy: less than 16 bytes.
1130   //
1131   // NB: Ignores all of the bits of count which represent more than 15
1132   // bytes, so a caller doesn't have to mask them.
1133 
1134   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1135     bool is_backwards = step < 0;
1136     size_t granularity = uabs(step);
1137     int direction = is_backwards ? -1 : 1;
1138 
1139     Label Lword, Lint, Lshort, Lbyte;
1140 
1141     assert(granularity
1142            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1143 
1144     const Register t0 = r3;
1145     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1146     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1147 
1148     // ??? I don't know if this bit-test-and-branch is the right thing
1149     // to do.  It does a lot of jumping, resulting in several
1150     // mispredicted branches.  It might make more sense to do this
1151     // with something like Duff's device with a single computed branch.
1152 
1153     __ tbz(count, 3 - exact_log2(granularity), Lword);
1154     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1155     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1156     __ bind(Lword);
1157 
1158     if (granularity <= sizeof (jint)) {
1159       __ tbz(count, 2 - exact_log2(granularity), Lint);
1160       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1161       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1162       __ bind(Lint);
1163     }
1164 
1165     if (granularity <= sizeof (jshort)) {
1166       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1167       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1168       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1169       __ bind(Lshort);
1170     }
1171 
1172     if (granularity <= sizeof (jbyte)) {
1173       __ tbz(count, 0, Lbyte);
1174       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1175       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1176       __ bind(Lbyte);
1177     }
1178   }
1179 
1180   Label copy_f, copy_b;
1181   Label copy_obj_f, copy_obj_b;
1182   Label copy_obj_uninit_f, copy_obj_uninit_b;
1183 
1184   // All-singing all-dancing memory copy.
1185   //
1186   // Copy count units of memory from s to d.  The size of a unit is
1187   // step, which can be positive or negative depending on the direction
1188   // of copy.  If is_aligned is false, we align the source address.
1189   //
1190 
1191   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1192                    Register s, Register d, Register count, int step) {
1193     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1194     bool is_backwards = step < 0;
1195     unsigned int granularity = uabs(step);
1196     const Register t0 = r3, t1 = r4;
1197 
1198     // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
1199     // load all the data before writing anything
1200     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1201     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1202     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1203     const Register send = r17, dend = r16;
1204     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1205     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1206     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1207 
1208     if (PrefetchCopyIntervalInBytes > 0)
1209       __ prfm(Address(s, 0), PLDL1KEEP);
1210     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1211     __ br(Assembler::HI, copy_big);
1212 
1213     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1214     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
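         // send and dend point just past the last source and destination
         // elements respectively.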
1215 
1216     __ cmp(count, u1(16/granularity));
1217     __ br(Assembler::LS, copy16);
1218 
1219     __ cmp(count, u1(64/granularity));
1220     __ br(Assembler::HI, copy80);
1221 
1222     __ cmp(count, u1(32/granularity));
1223     __ br(Assembler::LS, copy32);
1224 
1225     // 33..64 bytes
1226     if (UseSIMDForMemoryOps) {
1227       bs.copy_load_at_32(v0, v1, Address(s, 0));
1228       bs.copy_load_at_32(v2, v3, Address(send, -32));
1229       bs.copy_store_at_32(Address(d, 0), v0, v1);
1230       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1231     } else {
1232       bs.copy_load_at_16(t0, t1, Address(s, 0));
1233       bs.copy_load_at_16(t2, t3, Address(s, 16));
1234       bs.copy_load_at_16(t4, t5, Address(send, -32));
1235       bs.copy_load_at_16(t6, t7, Address(send, -16));
1236 
1237       bs.copy_store_at_16(Address(d, 0), t0, t1);
1238       bs.copy_store_at_16(Address(d, 16), t2, t3);
1239       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1240       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1241     }
1242     __ b(finish);
1243 
1244     // 17..32 bytes
1245     __ bind(copy32);
1246     bs.copy_load_at_16(t0, t1, Address(s, 0));
1247     bs.copy_load_at_16(t6, t7, Address(send, -16));
1248 
1249     bs.copy_store_at_16(Address(d, 0), t0, t1);
1250     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1251     __ b(finish);
1252 
1253     // 65..80/96 bytes
1254     // (96 bytes if SIMD because we do 32 bytes per instruction)
1255     __ bind(copy80);
1256     if (UseSIMDForMemoryOps) {
1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
1258       bs.copy_load_at_32(v2, v3, Address(s, 32));
1259       // Unaligned pointers can be an issue for copying.
1260       // The issue is more likely to occur when the granularity of the data is
1261       // less than 4 bytes (sizeof(jint)). Pointers for arrays of jint are at least
1262       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1263       // The largest performance drop has been seen for the range 65-80 bytes.
1264       // For such cases using the pair of ldp/stp instead of the third pair of
1265       // ldpq/stpq fixes the performance issue.
1266       if (granularity < sizeof (jint)) {
1267         Label copy96;
1268         __ cmp(count, u1(80/granularity));
1269         __ br(Assembler::HI, copy96);
1270         bs.copy_load_at_16(t0, t1, Address(send, -16));
1271 
1272         bs.copy_store_at_32(Address(d, 0), v0, v1);
1273         bs.copy_store_at_32(Address(d, 32), v2, v3);
1274 
1275         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1276         __ b(finish);
1277 
1278         __ bind(copy96);
1279       }
1280       bs.copy_load_at_32(v4, v5, Address(send, -32));
1281 
1282       bs.copy_store_at_32(Address(d, 0), v0, v1);
1283       bs.copy_store_at_32(Address(d, 32), v2, v3);
1284 
1285       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1286     } else {
1287       bs.copy_load_at_16(t0, t1, Address(s, 0));
1288       bs.copy_load_at_16(t2, t3, Address(s, 16));
1289       bs.copy_load_at_16(t4, t5, Address(s, 32));
1290       bs.copy_load_at_16(t6, t7, Address(s, 48));
1291       bs.copy_load_at_16(t8, t9, Address(send, -16));
1292 
1293       bs.copy_store_at_16(Address(d, 0), t0, t1);
1294       bs.copy_store_at_16(Address(d, 16), t2, t3);
1295       bs.copy_store_at_16(Address(d, 32), t4, t5);
1296       bs.copy_store_at_16(Address(d, 48), t6, t7);
1297       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1298     }
1299     __ b(finish);
1300 
1301     // 0..16 bytes
1302     __ bind(copy16);
1303     __ cmp(count, u1(8/granularity));
1304     __ br(Assembler::LO, copy8);
1305 
1306     // 8..16 bytes
1307     bs.copy_load_at_8(t0, Address(s, 0));
1308     bs.copy_load_at_8(t1, Address(send, -8));
1309     bs.copy_store_at_8(Address(d, 0), t0);
1310     bs.copy_store_at_8(Address(dend, -8), t1);
1311     __ b(finish);
1312 
1313     if (granularity < 8) {
1314       // 4..7 bytes
1315       __ bind(copy8);
1316       __ tbz(count, 2 - exact_log2(granularity), copy4);
1317       __ ldrw(t0, Address(s, 0));
1318       __ ldrw(t1, Address(send, -4));
1319       __ strw(t0, Address(d, 0));
1320       __ strw(t1, Address(dend, -4));
1321       __ b(finish);
1322       if (granularity < 4) {
1323         // 0..3 bytes
1324         __ bind(copy4);
1325         __ cbz(count, finish); // get rid of 0 case
1326         if (granularity == 2) {
1327           __ ldrh(t0, Address(s, 0));
1328           __ strh(t0, Address(d, 0));
1329         } else { // granularity == 1
1330           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1331           // the first and last byte.
1332           // Handle the 3 byte case by loading and storing base + count/2
1333           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1334           // This does mean that in the 1 byte case we load/store the same
1335           // byte 3 times.
1336           __ lsr(count, count, 1);
1337           __ ldrb(t0, Address(s, 0));
1338           __ ldrb(t1, Address(send, -1));
1339           __ ldrb(t2, Address(s, count));
1340           __ strb(t0, Address(d, 0));
1341           __ strb(t1, Address(dend, -1));
1342           __ strb(t2, Address(d, count));
1343         }
1344         __ b(finish);
1345       }
1346     }
1347 
1348     __ bind(copy_big);
1349     if (is_backwards) {
1350       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1351       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1352     }
1353 
1354     // Now that we've got the small case out of the way, we can align
1355     // the source address on a 2-word boundary.
1356 
1357     // Here we will materialize a count in r15, which is used by copy_memory_small
1358     // and the various generate_copy_longs stubs that we use for the 2-word-aligned
1359     // bulk copy. Up until here, we have used t9, which aliases r15, but from here
1360     // on that register cannot be used as a temp register, as it contains the count.
1361 
1362     Label aligned;
1363 
1364     if (is_aligned) {
1365       // We may have to adjust by 1 word to get s 2-word-aligned.
1366       __ tbz(s, exact_log2(wordSize), aligned);
1367       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1368       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1369       __ sub(count, count, wordSize/granularity);
1370     } else {
1371       if (is_backwards) {
1372         __ andr(r15, s, 2 * wordSize - 1);
1373       } else {
1374         __ neg(r15, s);
1375         __ andr(r15, r15, 2 * wordSize - 1);
1376       }
1377       // r15 is the byte adjustment needed to align s.
1378       __ cbz(r15, aligned);
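           // Convert the byte adjustment into an element count, deduct it
           // from count, and copy those leading elements via
           // copy_memory_small below.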
1379       int shift = exact_log2(granularity);
1380       if (shift)  __ lsr(r15, r15, shift);
1381       __ sub(count, count, r15);
1382 
1383 #if 0
1384       // ?? This code is only correct for a disjoint copy.  It may or
1385       // may not make sense to use it in that case.
1386 
1387       // Copy the first pair; s and d may not be aligned.
1388       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1389       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1390 
1391       // Align s and d, adjust count
1392       if (is_backwards) {
1393         __ sub(s, s, r15);
1394         __ sub(d, d, r15);
1395       } else {
1396         __ add(s, s, r15);
1397         __ add(d, d, r15);
1398       }
1399 #else
1400       copy_memory_small(decorators, type, s, d, r15, step);
1401 #endif
1402     }
1403 
1404     __ bind(aligned);
1405 
1406     // s is now 2-word-aligned.
1407 
1408     // We have a count of units and some trailing bytes.  Adjust the
1409     // count and do a bulk copy of words.
1410     __ lsr(r15, count, exact_log2(wordSize/granularity));
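    // r15 = number of whole 8-byte words to copy in bulk; the copy_f/copy_b
    // family of stubs called below expects this word count in r15.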
1411     if (direction == copy_forwards) {
1412       if (type != T_OBJECT) {
1413         __ bl(copy_f);
1414       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1415         __ bl(copy_obj_uninit_f);
1416       } else {
1417         __ bl(copy_obj_f);
1418       }
1419     } else {
1420       if (type != T_OBJECT) {
1421         __ bl(copy_b);
1422       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1423         __ bl(copy_obj_uninit_b);
1424       } else {
1425         __ bl(copy_obj_b);
1426       }
1427     }
1428 
1429     // And the tail.
1430     copy_memory_small(decorators, type, s, d, count, step);
1431 
1432     if (granularity >= 8) __ bind(copy8);
1433     if (granularity >= 4) __ bind(copy4);
1434     __ bind(finish);
1435   }
1436 
1437 
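  // In debug builds, fill the call-clobbered general-purpose registers (other
  // than rscratch1) with the poison value 0xdeadbeefdeadbeef so that stale use
  // of a clobbered register after a stub call is easier to spot.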
1438   void clobber_registers() {
1439 #ifdef ASSERT
1440     RegSet clobbered
1441       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1442     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1443     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1444     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1445       __ mov(*it, rscratch1);
1446     }
1447 #endif
1448 
1449   }
1450 
1451   // Scan over array at a for count oops, verifying each one.
1452   // Preserves a and count, clobbers rscratch1 and rscratch2.
1453   void verify_oop_array (int size, Register a, Register count, Register temp) {
1454     Label loop, end;
1455     __ mov(rscratch1, a);
1456     __ mov(rscratch2, zr);
1457     __ bind(loop);
1458     __ cmp(rscratch2, count);
1459     __ br(Assembler::HS, end);
1460     if (size == wordSize) {
1461       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1462       __ verify_oop(temp);
1463     } else {
1464       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1465       __ decode_heap_oop(temp); // calls verify_oop
1466     }
1467     __ add(rscratch2, rscratch2, 1);
1468     __ b(loop);
1469     __ bind(end);
1470   }
1471 
1472   // Arguments:
1473   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1474   //             ignored
1475   //   is_oop  - true => oop array, so generate store check code
1476   //   name    - stub name string
1477   //
1478   // Inputs:
1479   //   c_rarg0   - source array address
1480   //   c_rarg1   - destination array address
1481   //   c_rarg2   - element count, treated as ssize_t, can be zero
1482   //
1483   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1484   // the hardware handle it.  The two dwords within qwords that span
1485   // cache line boundaries will still be loaded and stored atomically.
1486   //
  // Side Effects:
  //   If 'entry' is non-null, *entry is set to the no-overlap entry point
  //   used by the corresponding conjoint copy stub.
1490   //
1491   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1492                                   const char *name, bool dest_uninitialized = false) {
1493     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1494     RegSet saved_reg = RegSet::of(s, d, count);
1495     __ align(CodeEntryAlignment);
1496     StubCodeMark mark(this, "StubRoutines", name);
1497     address start = __ pc();
1498     __ enter();
1499 
1500     if (entry != nullptr) {
1501       *entry = __ pc();
1502       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1503       BLOCK_COMMENT("Entry:");
1504     }
1505 
1506     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1507     if (dest_uninitialized) {
1508       decorators |= IS_DEST_UNINITIALIZED;
1509     }
1510     if (aligned) {
1511       decorators |= ARRAYCOPY_ALIGNED;
1512     }
1513 
1514     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1515     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1516 
1517     if (is_oop) {
1518       // save regs before copy_memory
1519       __ push(RegSet::of(d, count), sp);
1520     }
1521     {
1522       // UnsafeCopyMemory page error: continue after ucm
1523       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1524       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1525       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1526     }
1527 
1528     if (is_oop) {
1529       __ pop(RegSet::of(d, count), sp);
1530       if (VerifyOops)
1531         verify_oop_array(size, d, count, r16);
1532     }
1533 
1534     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1535 
1536     __ leave();
1537     __ mov(r0, zr); // return 0
1538     __ ret(lr);
1539     return start;
1540   }
1541 
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   is_oop  - true => oop array, so generate store check code
1546   //   name    - stub name string
1547   //
1548   // Inputs:
1549   //   c_rarg0   - source array address
1550   //   c_rarg1   - destination array address
1551   //   c_rarg2   - element count, treated as ssize_t, can be zero
1552   //
1553   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1554   // the hardware handle it.  The two dwords within qwords that span
1555   // cache line boundaries will still be loaded and stored atomically.
1556   //
1557   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1558                                  address *entry, const char *name,
1559                                  bool dest_uninitialized = false) {
1560     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1561     RegSet saved_regs = RegSet::of(s, d, count);
1562     StubCodeMark mark(this, "StubRoutines", name);
1563     address start = __ pc();
1564     __ enter();
1565 
1566     if (entry != nullptr) {
1567       *entry = __ pc();
1568       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1569       BLOCK_COMMENT("Entry:");
1570     }
1571 
1572     // use fwd copy when (d-s) above_equal (count*size)
1573     __ sub(rscratch1, d, s);
1574     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1575     __ br(Assembler::HS, nooverlap_target);
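    // The comparison is unsigned, so if d < s the subtraction wraps to a large
    // value and the forward (no-overlap) copy is taken in that case as well.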
1576 
1577     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1578     if (dest_uninitialized) {
1579       decorators |= IS_DEST_UNINITIALIZED;
1580     }
1581     if (aligned) {
1582       decorators |= ARRAYCOPY_ALIGNED;
1583     }
1584 
1585     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1586     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1587 
1588     if (is_oop) {
1589       // save regs before copy_memory
1590       __ push(RegSet::of(d, count), sp);
1591     }
1592     {
1593       // UnsafeCopyMemory page error: continue after ucm
1594       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1595       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1596       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1597     }
1598     if (is_oop) {
1599       __ pop(RegSet::of(d, count), sp);
1600       if (VerifyOops)
1601         verify_oop_array(size, d, count, r16);
1602     }
1603     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1604     __ leave();
1605     __ mov(r0, zr); // return 0
1606     __ ret(lr);
1607     return start;
  }
1609 
1610   // Arguments:
1611   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1612   //             ignored
1613   //   name    - stub name string
1614   //
1615   // Inputs:
1616   //   c_rarg0   - source array address
1617   //   c_rarg1   - destination array address
1618   //   c_rarg2   - element count, treated as ssize_t, can be zero
1619   //
1620   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1621   // we let the hardware handle it.  The one to eight bytes within words,
1622   // dwords or qwords that span cache line boundaries will still be loaded
1623   // and stored atomically.
1624   //
1632   // Side Effects:
1633   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1634   //   used by generate_conjoint_byte_copy().
1635   //
1636   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1637     const bool not_oop = false;
1638     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1639   }
1640 
1641   // Arguments:
1642   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1643   //             ignored
1644   //   name    - stub name string
1645   //
1646   // Inputs:
1647   //   c_rarg0   - source array address
1648   //   c_rarg1   - destination array address
1649   //   c_rarg2   - element count, treated as ssize_t, can be zero
1650   //
1651   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1652   // we let the hardware handle it.  The one to eight bytes within words,
1653   // dwords or qwords that span cache line boundaries will still be loaded
1654   // and stored atomically.
1655   //
1656   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1657                                       address* entry, const char *name) {
1658     const bool not_oop = false;
1659     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1660   }
1661 
1662   // Arguments:
1663   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1664   //             ignored
1665   //   name    - stub name string
1666   //
1667   // Inputs:
1668   //   c_rarg0   - source array address
1669   //   c_rarg1   - destination array address
1670   //   c_rarg2   - element count, treated as ssize_t, can be zero
1671   //
1672   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1673   // let the hardware handle it.  The two or four words within dwords
1674   // or qwords that span cache line boundaries will still be loaded
1675   // and stored atomically.
1676   //
1677   // Side Effects:
1678   //   disjoint_short_copy_entry is set to the no-overlap entry point
1679   //   used by generate_conjoint_short_copy().
1680   //
1681   address generate_disjoint_short_copy(bool aligned,
1682                                        address* entry, const char *name) {
1683     const bool not_oop = false;
1684     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1685   }
1686 
1687   // Arguments:
1688   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1689   //             ignored
1690   //   name    - stub name string
1691   //
1692   // Inputs:
1693   //   c_rarg0   - source array address
1694   //   c_rarg1   - destination array address
1695   //   c_rarg2   - element count, treated as ssize_t, can be zero
1696   //
1697   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1698   // let the hardware handle it.  The two or four words within dwords
1699   // or qwords that span cache line boundaries will still be loaded
1700   // and stored atomically.
1701   //
1702   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1703                                        address *entry, const char *name) {
1704     const bool not_oop = false;
1705     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1709   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1710   //             ignored
1711   //   name    - stub name string
1712   //
1713   // Inputs:
1714   //   c_rarg0   - source array address
1715   //   c_rarg1   - destination array address
1716   //   c_rarg2   - element count, treated as ssize_t, can be zero
1717   //
1718   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1719   // the hardware handle it.  The two dwords within qwords that span
1720   // cache line boundaries will still be loaded and stored atomically.
1721   //
1722   // Side Effects:
1723   //   disjoint_int_copy_entry is set to the no-overlap entry point
1724   //   used by generate_conjoint_int_oop_copy().
1725   //
1726   address generate_disjoint_int_copy(bool aligned, address *entry,
1727                                          const char *name, bool dest_uninitialized = false) {
1728     const bool not_oop = false;
1729     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1730   }
1731 
1732   // Arguments:
1733   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1734   //             ignored
1735   //   name    - stub name string
1736   //
1737   // Inputs:
1738   //   c_rarg0   - source array address
1739   //   c_rarg1   - destination array address
1740   //   c_rarg2   - element count, treated as ssize_t, can be zero
1741   //
1742   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1743   // the hardware handle it.  The two dwords within qwords that span
1744   // cache line boundaries will still be loaded and stored atomically.
1745   //
1746   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1747                                      address *entry, const char *name,
1748                                      bool dest_uninitialized = false) {
1749     const bool not_oop = false;
1750     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1751   }
1752 
1753 
1754   // Arguments:
1755   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1756   //             ignored
1757   //   name    - stub name string
1758   //
1759   // Inputs:
1760   //   c_rarg0   - source array address
1761   //   c_rarg1   - destination array address
1762   //   c_rarg2   - element count, treated as size_t, can be zero
1763   //
1764   // Side Effects:
1765   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1766   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1767   //
1768   address generate_disjoint_long_copy(bool aligned, address *entry,
1769                                           const char *name, bool dest_uninitialized = false) {
1770     const bool not_oop = false;
1771     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1772   }
1773 
1774   // Arguments:
1775   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1776   //             ignored
1777   //   name    - stub name string
1778   //
1779   // Inputs:
1780   //   c_rarg0   - source array address
1781   //   c_rarg1   - destination array address
1782   //   c_rarg2   - element count, treated as size_t, can be zero
1783   //
1784   address generate_conjoint_long_copy(bool aligned,
1785                                       address nooverlap_target, address *entry,
1786                                       const char *name, bool dest_uninitialized = false) {
1787     const bool not_oop = false;
1788     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1789   }
1790 
1791   // Arguments:
1792   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1793   //             ignored
1794   //   name    - stub name string
1795   //
1796   // Inputs:
1797   //   c_rarg0   - source array address
1798   //   c_rarg1   - destination array address
1799   //   c_rarg2   - element count, treated as size_t, can be zero
1800   //
1801   // Side Effects:
1802   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1803   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1804   //
1805   address generate_disjoint_oop_copy(bool aligned, address *entry,
1806                                      const char *name, bool dest_uninitialized) {
1807     const bool is_oop = true;
1808     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1809     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1810   }
1811 
1812   // Arguments:
1813   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1814   //             ignored
1815   //   name    - stub name string
1816   //
1817   // Inputs:
1818   //   c_rarg0   - source array address
1819   //   c_rarg1   - destination array address
1820   //   c_rarg2   - element count, treated as size_t, can be zero
1821   //
1822   address generate_conjoint_oop_copy(bool aligned,
1823                                      address nooverlap_target, address *entry,
1824                                      const char *name, bool dest_uninitialized) {
1825     const bool is_oop = true;
1826     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1827     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1828                                   name, dest_uninitialized);
1829   }
1830 
1831 
1832   // Helper for generating a dynamic type check.
1833   // Smashes rscratch1, rscratch2.
1834   void generate_type_check(Register sub_klass,
1835                            Register super_check_offset,
1836                            Register super_klass,
1837                            Label& L_success) {
1838     assert_different_registers(sub_klass, super_check_offset, super_klass);
1839 
1840     BLOCK_COMMENT("type_check:");
1841 
1842     Label L_miss;
1843 
1844     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1845                                      super_check_offset);
1846     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1847 
1848     // Fall through on failure!
1849     __ BIND(L_miss);
1850   }
1851 
1852   //
1853   //  Generate checkcasting array copy stub
1854   //
1855   //  Input:
1856   //    c_rarg0   - source array address
1857   //    c_rarg1   - destination array address
1858   //    c_rarg2   - element count, treated as ssize_t, can be zero
1859   //    c_rarg3   - size_t ckoff (super_check_offset)
1860   //    c_rarg4   - oop ckval (super_klass)
1861   //
1862   //  Output:
1863   //    r0 ==  0  -  success
1864   //    r0 == -1^K - failure, where K is partial transfer count
1865   //
1866   address generate_checkcast_copy(const char *name, address *entry,
1867                                   bool dest_uninitialized = false) {
1868 
1869     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1870 
1871     // Input registers (after setup_arg_regs)
1872     const Register from        = c_rarg0;   // source array address
1873     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1875     const Register ckoff       = c_rarg3;   // super_check_offset
1876     const Register ckval       = c_rarg4;   // super_klass
1877 
1878     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1879     RegSet wb_post_saved_regs = RegSet::of(count);
1880 
1881     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1882     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1884     const Register start_to    = r20;       // destination array start address
1885     const Register r19_klass   = r19;       // oop._klass
1886 
1887     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1888     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1889 
1890     //---------------------------------------------------------------
1891     // Assembler stub will be used for this call to arraycopy
1892     // if the two arrays are subtypes of Object[] but the
1893     // destination array type is not equal to or a supertype
1894     // of the source type.  Each element must be separately
1895     // checked.
1896 
1897     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1898                                copied_oop, r19_klass, count_save);
1899 
1900     __ align(CodeEntryAlignment);
1901     StubCodeMark mark(this, "StubRoutines", name);
1902     address start = __ pc();
1903 
1904     __ enter(); // required for proper stackwalking of RuntimeStub frame
1905 
1906 #ifdef ASSERT
1907     // caller guarantees that the arrays really are different
1908     // otherwise, we would have to make conjoint checks
1909     { Label L;
1910       __ b(L);                  // conjoint check not yet implemented
1911       __ stop("checkcast_copy within a single array");
1912       __ bind(L);
1913     }
1914 #endif //ASSERT
1915 
1916     // Caller of this entry point must set up the argument registers.
1917     if (entry != nullptr) {
1918       *entry = __ pc();
1919       BLOCK_COMMENT("Entry:");
1920     }
1921 
1922      // Empty array:  Nothing to do.
1923     __ cbz(count, L_done);
1924     __ push(RegSet::of(r19, r20, r21, r22), sp);
1925 
1926 #ifdef ASSERT
1927     BLOCK_COMMENT("assert consistent ckoff/ckval");
1928     // The ckoff and ckval must be mutually consistent,
1929     // even though caller generates both.
1930     { Label L;
1931       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1932       __ ldrw(start_to, Address(ckval, sco_offset));
1933       __ cmpw(ckoff, start_to);
1934       __ br(Assembler::EQ, L);
1935       __ stop("super_check_offset inconsistent");
1936       __ bind(L);
1937     }
1938 #endif //ASSERT
1939 
1940     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1941     bool is_oop = true;
1942     int element_size = UseCompressedOops ? 4 : 8;
1943     if (dest_uninitialized) {
1944       decorators |= IS_DEST_UNINITIALIZED;
1945     }
1946 
1947     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1948     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1949 
1950     // save the original count
1951     __ mov(count_save, count);
1952 
1953     // Copy from low to high addresses
1954     __ mov(start_to, to);              // Save destination array start address
1955     __ b(L_load_element);
1956 
1957     // ======== begin loop ========
1958     // (Loop is rotated; its entry is L_load_element.)
1959     // Loop control:
1960     //   for (; count != 0; count--) {
1961     //     copied_oop = load_heap_oop(from++);
1962     //     ... generate_type_check ...;
1963     //     store_heap_oop(to++, copied_oop);
1964     //   }
1965     __ align(OptoLoopAlignment);
1966 
1967     __ BIND(L_store_element);
1968     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1969                       __ post(to, element_size), copied_oop, noreg,
1970                       gct1, gct2, gct3);
1971     __ sub(count, count, 1);
1972     __ cbz(count, L_do_card_marks);
1973 
1974     // ======== loop entry is here ========
1975     __ BIND(L_load_element);
1976     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1977                      copied_oop, noreg, __ post(from, element_size),
1978                      gct1);
1979     __ cbz(copied_oop, L_store_element);
1980 
1981     __ load_klass(r19_klass, copied_oop);// query the object klass
1982     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1983     // ======== end loop ========
1984 
1985     // It was a real error; we must depend on the caller to finish the job.
1986     // Register count = remaining oops, count_orig = total oops.
1987     // Emit GC store barriers for the oops we have copied and report
1988     // their number to the caller.
1989 
1990     __ subs(count, count_save, count);     // K = partially copied oop count
1991     __ eon(count, count, zr);                   // report (-1^K) to caller
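    // eon with zr is a bitwise NOT, so count now holds ~K == -1 ^ K.  The EQ
    // branch below uses the flags set by the subs above: K == 0 means no oops
    // were copied, so the card-marking epilogue can be skipped.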
1992     __ br(Assembler::EQ, L_done_pop);
1993 
1994     __ BIND(L_do_card_marks);
1995     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1996 
1997     __ bind(L_done_pop);
1998     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1999     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2000 
2001     __ bind(L_done);
2002     __ mov(r0, count);
2003     __ leave();
2004     __ ret(lr);
2005 
2006     return start;
2007   }
2008 
2009   // Perform range checks on the proposed arraycopy.
2010   // Kills temp, but nothing else.
2011   // Also, clean the sign bits of src_pos and dst_pos.
2012   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2013                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2015                               Register dst_pos, // destination position (c_rarg3)
2016                               Register length,
2017                               Register temp,
2018                               Label& L_failed) {
2019     BLOCK_COMMENT("arraycopy_range_checks:");
2020 
2021     assert_different_registers(rscratch1, temp);
2022 
2023     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2024     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2025     __ addw(temp, length, src_pos);
2026     __ cmpw(temp, rscratch1);
2027     __ br(Assembler::HI, L_failed);
2028 
2029     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2030     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2031     __ addw(temp, length, dst_pos);
2032     __ cmpw(temp, rscratch1);
2033     __ br(Assembler::HI, L_failed);
2034 
2035     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2036     __ movw(src_pos, src_pos);
2037     __ movw(dst_pos, dst_pos);
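    // (a 32-bit mov of a w register onto itself zero-extends, clearing bits 63:32)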
2038 
2039     BLOCK_COMMENT("arraycopy_range_checks done");
2040   }
2041 
  // These stubs are currently only called from a simple test routine.  They
  // will be written properly when they are called from something that is
  // actually doing something.
2045   static void fake_arraycopy_stub(address src, address dst, int count) {
2046     assert(count == 0, "huh?");
2047   }
2048 
2049 
2050   //
2051   //  Generate 'unsafe' array copy stub
2052   //  Though just as safe as the other stubs, it takes an unscaled
2053   //  size_t argument instead of an element count.
2054   //
2055   //  Input:
2056   //    c_rarg0   - source array address
2057   //    c_rarg1   - destination array address
2058   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2059   //
2060   // Examines the alignment of the operands and dispatches
2061   // to a long, int, short, or byte copy loop.
2062   //
2063   address generate_unsafe_copy(const char *name,
2064                                address byte_copy_entry,
2065                                address short_copy_entry,
2066                                address int_copy_entry,
2067                                address long_copy_entry) {
2068     Label L_long_aligned, L_int_aligned, L_short_aligned;
2069     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2070 
2071     __ align(CodeEntryAlignment);
2072     StubCodeMark mark(this, "StubRoutines", name);
2073     address start = __ pc();
2074     __ enter(); // required for proper stackwalking of RuntimeStub frame
2075 
2076     // bump this on entry, not on exit:
2077     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2078 
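    // OR together the source address, destination address and byte count; the
    // low bits of the result give the strictest alignment shared by all three,
    // which selects the widest element size that can be copied safely.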
2079     __ orr(rscratch1, s, d);
2080     __ orr(rscratch1, rscratch1, count);
2081 
2082     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2083     __ cbz(rscratch1, L_long_aligned);
2084     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2085     __ cbz(rscratch1, L_int_aligned);
2086     __ tbz(rscratch1, 0, L_short_aligned);
2087     __ b(RuntimeAddress(byte_copy_entry));
2088 
2089     __ BIND(L_short_aligned);
2090     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2091     __ b(RuntimeAddress(short_copy_entry));
2092     __ BIND(L_int_aligned);
2093     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2094     __ b(RuntimeAddress(int_copy_entry));
2095     __ BIND(L_long_aligned);
2096     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2097     __ b(RuntimeAddress(long_copy_entry));
2098 
2099     return start;
2100   }
2101 
2102   //
2103   //  Generate generic array copy stubs
2104   //
2105   //  Input:
2106   //    c_rarg0    -  src oop
2107   //    c_rarg1    -  src_pos (32-bits)
2108   //    c_rarg2    -  dst oop
2109   //    c_rarg3    -  dst_pos (32-bits)
2110   //    c_rarg4    -  element count (32-bits)
2111   //
2112   //  Output:
2113   //    r0 ==  0  -  success
2114   //    r0 == -1^K - failure, where K is partial transfer count
2115   //
2116   address generate_generic_copy(const char *name,
2117                                 address byte_copy_entry, address short_copy_entry,
2118                                 address int_copy_entry, address oop_copy_entry,
2119                                 address long_copy_entry, address checkcast_copy_entry) {
2120 
2121     Label L_failed, L_objArray;
2122     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2123 
2124     // Input registers
2125     const Register src        = c_rarg0;  // source array oop
2126     const Register src_pos    = c_rarg1;  // source position
2127     const Register dst        = c_rarg2;  // destination array oop
2128     const Register dst_pos    = c_rarg3;  // destination position
2129     const Register length     = c_rarg4;
2130 
2131 
2132     // Registers used as temps
2133     const Register dst_klass  = c_rarg5;
2134 
2135     __ align(CodeEntryAlignment);
2136 
2137     StubCodeMark mark(this, "StubRoutines", name);
2138 
2139     address start = __ pc();
2140 
2141     __ enter(); // required for proper stackwalking of RuntimeStub frame
2142 
2143     // bump this on entry, not on exit:
2144     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2145 
2146     //-----------------------------------------------------------------------
2147     // Assembler stub will be used for this call to arraycopy
2148     // if the following conditions are met:
2149     //
2150     // (1) src and dst must not be null.
2151     // (2) src_pos must not be negative.
2152     // (3) dst_pos must not be negative.
2153     // (4) length  must not be negative.
2154     // (5) src klass and dst klass should be the same and not null.
2155     // (6) src and dst should be arrays.
2156     // (7) src_pos + length must not exceed length of src.
2157     // (8) dst_pos + length must not exceed length of dst.
2158     //
2159 
2160     //  if (src == nullptr) return -1;
2161     __ cbz(src, L_failed);
2162 
2163     //  if (src_pos < 0) return -1;
2164     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2165 
2166     //  if (dst == nullptr) return -1;
2167     __ cbz(dst, L_failed);
2168 
2169     //  if (dst_pos < 0) return -1;
2170     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2171 
2172     // registers used as temp
2173     const Register scratch_length    = r16; // elements count to copy
2174     const Register scratch_src_klass = r17; // array klass
2175     const Register lh                = r15; // layout helper
2176 
2177     //  if (length < 0) return -1;
2178     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2179     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2180 
2181     __ load_klass(scratch_src_klass, src);
2182 #ifdef ASSERT
2183     //  assert(src->klass() != nullptr);
2184     {
2185       BLOCK_COMMENT("assert klasses not null {");
2186       Label L1, L2;
2187       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2188       __ bind(L1);
2189       __ stop("broken null klass");
2190       __ bind(L2);
2191       __ load_klass(rscratch1, dst);
2192       __ cbz(rscratch1, L1);     // this would be broken also
2193       BLOCK_COMMENT("} assert klasses not null done");
2194     }
2195 #endif
2196 
2197     // Load layout helper (32-bits)
2198     //
2199     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2200     // 32        30    24            16              8     2                 0
2201     //
2202     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2203     //
2204 
2205     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2206 
2207     // Handle objArrays completely differently...
2208     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2209     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2210     __ movw(rscratch1, objArray_lh);
2211     __ eorw(rscratch2, lh, rscratch1);
2212     __ cbzw(rscratch2, L_objArray);
2213 
2214     //  if (src->klass() != dst->klass()) return -1;
2215     __ load_klass(rscratch2, dst);
2216     __ eor(rscratch2, rscratch2, scratch_src_klass);
2217     __ cbnz(rscratch2, L_failed);
2218 
2219     //  if (!src->is_Array()) return -1;
2220     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2221 
2222     // At this point, it is known to be a typeArray (array_tag 0x3).
2223 #ifdef ASSERT
2224     {
2225       BLOCK_COMMENT("assert primitive array {");
2226       Label L;
2227       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2228       __ cmpw(lh, rscratch2);
2229       __ br(Assembler::GE, L);
2230       __ stop("must be a primitive array");
2231       __ bind(L);
2232       BLOCK_COMMENT("} assert primitive array done");
2233     }
2234 #endif
2235 
2236     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2237                            rscratch2, L_failed);
2238 
2239     // TypeArrayKlass
2240     //
2241     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2242     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2243     //
2244 
2245     const Register rscratch1_offset = rscratch1;    // array offset
2246     const Register r15_elsize = lh; // element size
2247 
2248     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2249            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2250     __ add(src, src, rscratch1_offset);           // src array offset
2251     __ add(dst, dst, rscratch1_offset);           // dst array offset
2252     BLOCK_COMMENT("choose copy loop based on element size");
2253 
2254     // next registers should be set before the jump to corresponding stub
2255     const Register from     = c_rarg0;  // source array address
2256     const Register to       = c_rarg1;  // destination array address
2257     const Register count    = c_rarg2;  // elements count
2258 
    // 'from', 'to' and 'count' must be set in this order in the code below,
    // because they alias 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2): each
    // value has to be read before the register holding it is overwritten.
2261 
2262     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2263 
2264     // The possible values of elsize are 0-3, i.e. exact_log2(element
2265     // size in bytes).  We do a simple bitwise binary search.
2266   __ BIND(L_copy_bytes);
2267     __ tbnz(r15_elsize, 1, L_copy_ints);
2268     __ tbnz(r15_elsize, 0, L_copy_shorts);
2269     __ lea(from, Address(src, src_pos));// src_addr
2270     __ lea(to,   Address(dst, dst_pos));// dst_addr
2271     __ movw(count, scratch_length); // length
2272     __ b(RuntimeAddress(byte_copy_entry));
2273 
2274   __ BIND(L_copy_shorts);
2275     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2276     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2277     __ movw(count, scratch_length); // length
2278     __ b(RuntimeAddress(short_copy_entry));
2279 
2280   __ BIND(L_copy_ints);
2281     __ tbnz(r15_elsize, 0, L_copy_longs);
2282     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2283     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2284     __ movw(count, scratch_length); // length
2285     __ b(RuntimeAddress(int_copy_entry));
2286 
2287   __ BIND(L_copy_longs);
2288 #ifdef ASSERT
2289     {
2290       BLOCK_COMMENT("assert long copy {");
2291       Label L;
2292       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2293       __ cmpw(r15_elsize, LogBytesPerLong);
2294       __ br(Assembler::EQ, L);
2295       __ stop("must be long copy, but elsize is wrong");
2296       __ bind(L);
2297       BLOCK_COMMENT("} assert long copy done");
2298     }
2299 #endif
2300     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2301     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2302     __ movw(count, scratch_length); // length
2303     __ b(RuntimeAddress(long_copy_entry));
2304 
2305     // ObjArrayKlass
2306   __ BIND(L_objArray);
2307     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2308 
2309     Label L_plain_copy, L_checkcast_copy;
2310     //  test array classes for subtyping
2311     __ load_klass(r15, dst);
2312     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2313     __ br(Assembler::NE, L_checkcast_copy);
2314 
2315     // Identically typed arrays can be copied without element-wise checks.
2316     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2317                            rscratch2, L_failed);
2318 
2319     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2320     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2321     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2322     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2323     __ movw(count, scratch_length); // length
2324   __ BIND(L_plain_copy);
2325     __ b(RuntimeAddress(oop_copy_entry));
2326 
2327   __ BIND(L_checkcast_copy);
2328     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2329     {
2330       // Before looking at dst.length, make sure dst is also an objArray.
2331       __ ldrw(rscratch1, Address(r15, lh_offset));
2332       __ movw(rscratch2, objArray_lh);
2333       __ eorw(rscratch1, rscratch1, rscratch2);
2334       __ cbnzw(rscratch1, L_failed);
2335 
2336       // It is safe to examine both src.length and dst.length.
2337       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2338                              r15, L_failed);
2339 
2340       __ load_klass(dst_klass, dst); // reload
2341 
2342       // Marshal the base address arguments now, freeing registers.
2343       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2344       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2346       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2347       __ movw(count, length);           // length (reloaded)
2348       Register sco_temp = c_rarg3;      // this register is free now
2349       assert_different_registers(from, to, count, sco_temp,
2350                                  dst_klass, scratch_src_klass);
2351       // assert_clean_int(count, sco_temp);
2352 
2353       // Generate the type check.
2354       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2355       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2356 
2357       // Smashes rscratch1, rscratch2
2358       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2359 
2360       // Fetch destination element klass from the ObjArrayKlass header.
2361       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2362       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2364 
2365       // the checkcast_copy loop needs two extra arguments:
2366       assert(c_rarg3 == sco_temp, "#3 already in place");
2367       // Set up arguments for checkcast_copy_entry.
2368       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2369       __ b(RuntimeAddress(checkcast_copy_entry));
2370     }
2371 
2372   __ BIND(L_failed);
2373     __ mov(r0, -1);
2374     __ leave();   // required for proper stackwalking of RuntimeStub frame
2375     __ ret(lr);
2376 
2377     return start;
2378   }
2379 
2380   //
2381   // Generate stub for array fill. If "aligned" is true, the
2382   // "to" address is assumed to be heapword aligned.
2383   //
2384   // Arguments for generated stub:
2385   //   to:    c_rarg0
2386   //   value: c_rarg1
2387   //   count: c_rarg2 treated as signed
2388   //
2389   address generate_fill(BasicType t, bool aligned, const char *name) {
2390     __ align(CodeEntryAlignment);
2391     StubCodeMark mark(this, "StubRoutines", name);
2392     address start = __ pc();
2393 
2394     BLOCK_COMMENT("Entry:");
2395 
2396     const Register to        = c_rarg0;  // source array address
2397     const Register value     = c_rarg1;  // value
2398     const Register count     = c_rarg2;  // elements count
2399 
2400     const Register bz_base = r10;        // base for block_zero routine
2401     const Register cnt_words = r11;      // temp register
2402 
2403     __ enter();
2404 
    Label L_fill_elements;
2406 
2407     int shift = -1;
2408     switch (t) {
2409       case T_BYTE:
2410         shift = 0;
2411         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2412         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2413         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2414         __ br(Assembler::LO, L_fill_elements);
2415         break;
2416       case T_SHORT:
2417         shift = 1;
2418         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2419         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2420         __ br(Assembler::LO, L_fill_elements);
2421         break;
2422       case T_INT:
2423         shift = 2;
2424         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2425         __ br(Assembler::LO, L_fill_elements);
2426         break;
2427       default: ShouldNotReachHere();
2428     }
2429 
2430     // Align source address at 8 bytes address boundary.
2431     Label L_skip_align1, L_skip_align2, L_skip_align4;
2432     if (!aligned) {
2433       switch (t) {
2434         case T_BYTE:
2435           // One byte misalignment happens only for byte arrays.
2436           __ tbz(to, 0, L_skip_align1);
2437           __ strb(value, Address(__ post(to, 1)));
2438           __ subw(count, count, 1);
2439           __ bind(L_skip_align1);
2440           // Fallthrough
2441         case T_SHORT:
2442           // Two bytes misalignment happens only for byte and short (char) arrays.
2443           __ tbz(to, 1, L_skip_align2);
2444           __ strh(value, Address(__ post(to, 2)));
2445           __ subw(count, count, 2 >> shift);
2446           __ bind(L_skip_align2);
2447           // Fallthrough
2448         case T_INT:
2449           // Align to 8 bytes, we know we are 4 byte aligned to start.
2450           __ tbz(to, 2, L_skip_align4);
2451           __ strw(value, Address(__ post(to, 4)));
2452           __ subw(count, count, 4 >> shift);
2453           __ bind(L_skip_align4);
2454           break;
2455         default: ShouldNotReachHere();
2456       }
2457     }
2458 
2459     //
2460     //  Fill large chunks
2461     //
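    // cnt_words = number of whole 8-byte words to fill; 'value' is widened to
    // 64 bits and 'count' is reduced to the number of trailing elements that
    // do not fill a complete word.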
2462     __ lsrw(cnt_words, count, 3 - shift); // number of words
2463     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2464     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2465     if (UseBlockZeroing) {
2466       Label non_block_zeroing, rest;
2467       // If the fill value is zero we can use the fast zero_words().
2468       __ cbnz(value, non_block_zeroing);
2469       __ mov(bz_base, to);
2470       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2471       address tpc = __ zero_words(bz_base, cnt_words);
2472       if (tpc == nullptr) {
2473         fatal("CodeCache is full at generate_fill");
2474       }
2475       __ b(rest);
2476       __ bind(non_block_zeroing);
2477       __ fill_words(to, cnt_words, value);
2478       __ bind(rest);
2479     } else {
2480       __ fill_words(to, cnt_words, value);
2481     }
2482 
2483     // Remaining count is less than 8 bytes. Fill it by a single store.
2484     // Note that the total length is no less than 8 bytes.
2485     if (t == T_BYTE || t == T_SHORT) {
2486       Label L_exit1;
2487       __ cbzw(count, L_exit1);
2488       __ add(to, to, count, Assembler::LSL, shift); // points to the end
      __ str(value, Address(to, -8));    // may overwrite elements already filled; safe since total length >= 8
2490       __ bind(L_exit1);
2491       __ leave();
2492       __ ret(lr);
2493     }
2494 
2495     // Handle copies less than 8 bytes.
2496     Label L_fill_2, L_fill_4, L_exit2;
2497     __ bind(L_fill_elements);
2498     switch (t) {
2499       case T_BYTE:
2500         __ tbz(count, 0, L_fill_2);
2501         __ strb(value, Address(__ post(to, 1)));
2502         __ bind(L_fill_2);
2503         __ tbz(count, 1, L_fill_4);
2504         __ strh(value, Address(__ post(to, 2)));
2505         __ bind(L_fill_4);
2506         __ tbz(count, 2, L_exit2);
2507         __ strw(value, Address(to));
2508         break;
2509       case T_SHORT:
2510         __ tbz(count, 0, L_fill_4);
2511         __ strh(value, Address(__ post(to, 2)));
2512         __ bind(L_fill_4);
2513         __ tbz(count, 1, L_exit2);
2514         __ strw(value, Address(to));
2515         break;
2516       case T_INT:
2517         __ cbzw(count, L_exit2);
2518         __ strw(value, Address(to));
2519         break;
2520       default: ShouldNotReachHere();
2521     }
2522     __ bind(L_exit2);
2523     __ leave();
2524     __ ret(lr);
2525     return start;
2526   }
2527 
2528   address generate_data_cache_writeback() {
2529     const Register line        = c_rarg0;  // address of line to write back
2530 
2531     __ align(CodeEntryAlignment);
2532 
2533     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2534 
2535     address start = __ pc();
2536     __ enter();
2537     __ cache_wb(Address(line, 0));
2538     __ leave();
2539     __ ret(lr);
2540 
2541     return start;
2542   }
2543 
2544   address generate_data_cache_writeback_sync() {
2545     const Register is_pre     = c_rarg0;  // pre or post sync
2546 
2547     __ align(CodeEntryAlignment);
2548 
2549     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2550 
    // pre wbsync is a no-op
    // post wbsync emits a memory barrier
2553 
2554     Label skip;
2555     address start = __ pc();
2556     __ enter();
2557     __ cbnz(is_pre, skip);
2558     __ cache_wbsync(false);
2559     __ bind(skip);
2560     __ leave();
2561     __ ret(lr);
2562 
2563     return start;
2564   }
2565 
2566   void generate_arraycopy_stubs() {
2567     address entry;
2568     address entry_jbyte_arraycopy;
2569     address entry_jshort_arraycopy;
2570     address entry_jint_arraycopy;
2571     address entry_oop_arraycopy;
2572     address entry_jlong_arraycopy;
2573     address entry_checkcast_arraycopy;
2574 
2575     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2576     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2577 
2578     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2579     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2580 
2581     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2582     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2583 
2584     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2585 
2586     //*** jbyte
2587     // Always need aligned and unaligned versions
2588     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2589                                                                                   "jbyte_disjoint_arraycopy");
2590     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2591                                                                                   &entry_jbyte_arraycopy,
2592                                                                                   "jbyte_arraycopy");
2593     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2594                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2595     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2596                                                                                   "arrayof_jbyte_arraycopy");
2597 
2598     //*** jshort
2599     // Always need aligned and unaligned versions
2600     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2601                                                                                     "jshort_disjoint_arraycopy");
2602     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2603                                                                                     &entry_jshort_arraycopy,
2604                                                                                     "jshort_arraycopy");
2605     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2606                                                                                     "arrayof_jshort_disjoint_arraycopy");
2607     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2608                                                                                     "arrayof_jshort_arraycopy");
2609 
2610     //*** jint
2611     // Aligned versions
2612     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2613                                                                                 "arrayof_jint_disjoint_arraycopy");
2614     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2615                                                                                 "arrayof_jint_arraycopy");
2616     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2617     // entry_jint_arraycopy always points to the unaligned version
2618     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2619                                                                                 "jint_disjoint_arraycopy");
2620     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2621                                                                                 &entry_jint_arraycopy,
2622                                                                                 "jint_arraycopy");
2623 
2624     //*** jlong
2625     // It is always aligned
2626     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2627                                                                                   "arrayof_jlong_disjoint_arraycopy");
2628     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2629                                                                                   "arrayof_jlong_arraycopy");
2630     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2631     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2632 
2633     //*** oops
2634     {
2635       // With compressed oops we need unaligned versions; notice that
2636       // we overwrite entry_oop_arraycopy.
2637       bool aligned = !UseCompressedOops;
2638 
2639       StubRoutines::_arrayof_oop_disjoint_arraycopy
2640         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2641                                      /*dest_uninitialized*/false);
2642       StubRoutines::_arrayof_oop_arraycopy
2643         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2644                                      /*dest_uninitialized*/false);
2645       // Aligned versions without pre-barriers
2646       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2647         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2648                                      /*dest_uninitialized*/true);
2649       StubRoutines::_arrayof_oop_arraycopy_uninit
2650         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2651                                      /*dest_uninitialized*/true);
2652     }
2653 
2654     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2655     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2656     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2657     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2658 
2659     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2660     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2661                                                                         /*dest_uninitialized*/true);
2662 
2663     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2664                                                               entry_jbyte_arraycopy,
2665                                                               entry_jshort_arraycopy,
2666                                                               entry_jint_arraycopy,
2667                                                               entry_jlong_arraycopy);
2668 
2669     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2670                                                                entry_jbyte_arraycopy,
2671                                                                entry_jshort_arraycopy,
2672                                                                entry_jint_arraycopy,
2673                                                                entry_oop_arraycopy,
2674                                                                entry_jlong_arraycopy,
2675                                                                entry_checkcast_arraycopy);
2676 
2677     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2678     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2679     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2680     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2681     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2682     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2683   }
2684 
2685   void generate_math_stubs() { Unimplemented(); }
2686 
2687   // Arguments:
2688   //
2689   // Inputs:
2690   //   c_rarg0   - source byte array address
2691   //   c_rarg1   - destination byte array address
2692   //   c_rarg2   - K (key) in little endian int array
2693   //
2694   address generate_aescrypt_encryptBlock() {
2695     __ align(CodeEntryAlignment);
2696     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2697 
2698     const Register from        = c_rarg0;  // source array address
2699     const Register to          = c_rarg1;  // destination array address
2700     const Register key         = c_rarg2;  // key array address
2701     const Register keylen      = rscratch1;
2702 
2703     address start = __ pc();
2704     __ enter();
2705 
2706     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
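    // keylen is the length of the key int[] (the expanded key schedule):
    // 44, 52 or 60 ints for AES-128, -192 and -256 (10, 12 or 14 rounds).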
2707 
2708     __ aesenc_loadkeys(key, keylen);
2709     __ aesecb_encrypt(from, to, keylen);
2710 
2711     __ mov(r0, 0);
2712 
2713     __ leave();
2714     __ ret(lr);
2715 
2716     return start;
2717   }
2718 
2719   // Arguments:
2720   //
2721   // Inputs:
2722   //   c_rarg0   - source byte array address
2723   //   c_rarg1   - destination byte array address
2724   //   c_rarg2   - K (key) in little endian int array
2725   //
2726   address generate_aescrypt_decryptBlock() {
2727     assert(UseAES, "need AES cryptographic extension support");
2728     __ align(CodeEntryAlignment);
2729     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2730     Label L_doLast;
2731 
2732     const Register from        = c_rarg0;  // source array address
2733     const Register to          = c_rarg1;  // destination array address
2734     const Register key         = c_rarg2;  // key array address
2735     const Register keylen      = rscratch1;
2736 
2737     address start = __ pc();
2738     __ enter(); // required for proper stackwalking of RuntimeStub frame
2739 
2740     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2741 
2742     __ aesecb_decrypt(from, to, key, keylen);
2743 
2744     __ mov(r0, 0);
2745 
2746     __ leave();
2747     __ ret(lr);
2748 
2749     return start;
2750   }
2751 
2752   // Arguments:
2753   //
2754   // Inputs:
2755   //   c_rarg0   - source byte array address
2756   //   c_rarg1   - destination byte array address
2757   //   c_rarg2   - K (key) in little endian int array
2758   //   c_rarg3   - r vector byte array address
2759   //   c_rarg4   - input length
2760   //
2761   // Output:
  //   r0        - input length
2763   //
2764   address generate_cipherBlockChaining_encryptAESCrypt() {
2765     assert(UseAES, "need AES cryptographic extension support");
2766     __ align(CodeEntryAlignment);
2767     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2768 
2769     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2770 
2771     const Register from        = c_rarg0;  // source array address
2772     const Register to          = c_rarg1;  // destination array address
2773     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
                                           // and left holding the last ciphertext block on exit
2776     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2777     const Register keylen      = rscratch1;
2778 
2779     address start = __ pc();
2780 
2781       __ enter();
2782 
2783       __ movw(rscratch2, len_reg);
2784 
2785       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2786 
2787       __ ld1(v0, __ T16B, rvec);
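      // Load and byte-reverse the round keys into v17..v31. The branches below skip
      // the keys that only exist for the longer key sizes: keylen < 52 => AES-128
      // (round keys in v21..v31), == 52 => AES-192 (v19..v31), else AES-256 (v17..v31).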
2788 
2789       __ cmpw(keylen, 52);
2790       __ br(Assembler::CC, L_loadkeys_44);
2791       __ br(Assembler::EQ, L_loadkeys_52);
2792 
2793       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2794       __ rev32(v17, __ T16B, v17);
2795       __ rev32(v18, __ T16B, v18);
2796     __ BIND(L_loadkeys_52);
2797       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2798       __ rev32(v19, __ T16B, v19);
2799       __ rev32(v20, __ T16B, v20);
2800     __ BIND(L_loadkeys_44);
2801       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2802       __ rev32(v21, __ T16B, v21);
2803       __ rev32(v22, __ T16B, v22);
2804       __ rev32(v23, __ T16B, v23);
2805       __ rev32(v24, __ T16B, v24);
2806       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2807       __ rev32(v25, __ T16B, v25);
2808       __ rev32(v26, __ T16B, v26);
2809       __ rev32(v27, __ T16B, v27);
2810       __ rev32(v28, __ T16B, v28);
2811       __ ld1(v29, v30, v31, __ T16B, key);
2812       __ rev32(v29, __ T16B, v29);
2813       __ rev32(v30, __ T16B, v30);
2814       __ rev32(v31, __ T16B, v31);
2815 
2816     __ BIND(L_aes_loop);
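      // v0 carries the chaining value (initially the IV loaded from rvec): XOR in the
      // next plaintext block, run the AES rounds with the round keys loaded above, and
      // the resulting ciphertext block stays in v0 to chain into the next iteration.
      // The CC/EQ branches reuse the condition flags set by cmpw(keylen, 52) above;
      // nothing in the loop body clobbers them.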
2817       __ ld1(v1, __ T16B, __ post(from, 16));
2818       __ eor(v0, __ T16B, v0, v1);
2819 
2820       __ br(Assembler::CC, L_rounds_44);
2821       __ br(Assembler::EQ, L_rounds_52);
2822 
2823       __ aese(v0, v17); __ aesmc(v0, v0);
2824       __ aese(v0, v18); __ aesmc(v0, v0);
2825     __ BIND(L_rounds_52);
2826       __ aese(v0, v19); __ aesmc(v0, v0);
2827       __ aese(v0, v20); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_44);
2829       __ aese(v0, v21); __ aesmc(v0, v0);
2830       __ aese(v0, v22); __ aesmc(v0, v0);
2831       __ aese(v0, v23); __ aesmc(v0, v0);
2832       __ aese(v0, v24); __ aesmc(v0, v0);
2833       __ aese(v0, v25); __ aesmc(v0, v0);
2834       __ aese(v0, v26); __ aesmc(v0, v0);
2835       __ aese(v0, v27); __ aesmc(v0, v0);
2836       __ aese(v0, v28); __ aesmc(v0, v0);
2837       __ aese(v0, v29); __ aesmc(v0, v0);
2838       __ aese(v0, v30);
2839       __ eor(v0, __ T16B, v0, v31);
2840 
2841       __ st1(v0, __ T16B, __ post(to, 16));
2842 
2843       __ subw(len_reg, len_reg, 16);
2844       __ cbnzw(len_reg, L_aes_loop);
2845 
2846       __ st1(v0, __ T16B, rvec);
2847 
2848       __ mov(r0, rscratch2);
2849 
2850       __ leave();
2851       __ ret(lr);
2852 
2853       return start;
2854   }
2855 
2856   // Arguments:
2857   //
2858   // Inputs:
2859   //   c_rarg0   - source byte array address
2860   //   c_rarg1   - destination byte array address
2861   //   c_rarg2   - K (key) in little endian int array
2862   //   c_rarg3   - r vector byte array address
2863   //   c_rarg4   - input length
2864   //
2865   // Output:
2866   //   r0        - input length
2867   //
2868   address generate_cipherBlockChaining_decryptAESCrypt() {
2869     assert(UseAES, "need AES cryptographic extension support");
2870     __ align(CodeEntryAlignment);
2871     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2872 
2873     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2874 
2875     const Register from        = c_rarg0;  // source array address
2876     const Register to          = c_rarg1;  // destination array address
2877     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
                                           // and left holding the last input ciphertext block on exit
2880     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2881     const Register keylen      = rscratch1;
2882 
2883     address start = __ pc();
2884 
2885       __ enter();
2886 
2887       __ movw(rscratch2, len_reg);
2888 
2889       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2890 
2891       __ ld1(v2, __ T16B, rvec);
2892 
2893       __ ld1(v31, __ T16B, __ post(key, 16));
2894       __ rev32(v31, __ T16B, v31);
2895 
2896       __ cmpw(keylen, 52);
2897       __ br(Assembler::CC, L_loadkeys_44);
2898       __ br(Assembler::EQ, L_loadkeys_52);
2899 
2900       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2901       __ rev32(v17, __ T16B, v17);
2902       __ rev32(v18, __ T16B, v18);
2903     __ BIND(L_loadkeys_52);
2904       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2905       __ rev32(v19, __ T16B, v19);
2906       __ rev32(v20, __ T16B, v20);
2907     __ BIND(L_loadkeys_44);
2908       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2909       __ rev32(v21, __ T16B, v21);
2910       __ rev32(v22, __ T16B, v22);
2911       __ rev32(v23, __ T16B, v23);
2912       __ rev32(v24, __ T16B, v24);
2913       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2914       __ rev32(v25, __ T16B, v25);
2915       __ rev32(v26, __ T16B, v26);
2916       __ rev32(v27, __ T16B, v27);
2917       __ rev32(v28, __ T16B, v28);
2918       __ ld1(v29, v30, __ T16B, key);
2919       __ rev32(v29, __ T16B, v29);
2920       __ rev32(v30, __ T16B, v30);
2921 
2922     __ BIND(L_aes_loop);
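      // v2 carries the previous ciphertext block (initially the IV from rvec). Keep a
      // copy of the incoming ciphertext in v1, run the inverse AES rounds on v0, XOR
      // with the final round key (v31) and with v2, then promote v1 to v2 for the next
      // block. The CC/EQ branches reuse the flags set by cmpw(keylen, 52) above.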
2923       __ ld1(v0, __ T16B, __ post(from, 16));
2924       __ orr(v1, __ T16B, v0, v0);
2925 
2926       __ br(Assembler::CC, L_rounds_44);
2927       __ br(Assembler::EQ, L_rounds_52);
2928 
2929       __ aesd(v0, v17); __ aesimc(v0, v0);
2930       __ aesd(v0, v18); __ aesimc(v0, v0);
2931     __ BIND(L_rounds_52);
2932       __ aesd(v0, v19); __ aesimc(v0, v0);
2933       __ aesd(v0, v20); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_44);
2935       __ aesd(v0, v21); __ aesimc(v0, v0);
2936       __ aesd(v0, v22); __ aesimc(v0, v0);
2937       __ aesd(v0, v23); __ aesimc(v0, v0);
2938       __ aesd(v0, v24); __ aesimc(v0, v0);
2939       __ aesd(v0, v25); __ aesimc(v0, v0);
2940       __ aesd(v0, v26); __ aesimc(v0, v0);
2941       __ aesd(v0, v27); __ aesimc(v0, v0);
2942       __ aesd(v0, v28); __ aesimc(v0, v0);
2943       __ aesd(v0, v29); __ aesimc(v0, v0);
2944       __ aesd(v0, v30);
2945       __ eor(v0, __ T16B, v0, v31);
2946       __ eor(v0, __ T16B, v0, v2);
2947 
2948       __ st1(v0, __ T16B, __ post(to, 16));
2949       __ orr(v2, __ T16B, v1, v1);
2950 
2951       __ subw(len_reg, len_reg, 16);
2952       __ cbnzw(len_reg, L_aes_loop);
2953 
2954       __ st1(v2, __ T16B, rvec);
2955 
2956       __ mov(r0, rscratch2);
2957 
2958       __ leave();
2959       __ ret(lr);
2960 
2961     return start;
2962   }
2963 
2964   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2965   // Inputs: 128-bits. in is preserved.
2966   // The least-significant 64-bit word is in the upper dword of each vector.
2967   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
2968   // Output: result
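  //
  // A rough scalar sketch of the sequence below (lsw/msw denote the least/most
  // significant dwords, which live in the upper/lower vector lanes):
  //   result.lsw = in.lsw + inc;                   // addv
  //   carry      = (inc >u result.lsw) ? -1 : 0;   // cm(HI)
  //   result.msw = in.msw - carry;                 // ext moves the carry mask into the
  //                                                // msw lane; subv then adds 1 on carry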
2969   void be_add_128_64(FloatRegister result, FloatRegister in,
2970                      FloatRegister inc, FloatRegister tmp) {
2971     assert_different_registers(result, tmp, inc);
2972 
2973     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2974                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);   // Swap LSD of comparison result to MSD and
                                            // MSD == 0 (must be!) to LSD
2978     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
2979   }
2980 
2981   // CTR AES crypt.
2982   // Arguments:
2983   //
2984   // Inputs:
2985   //   c_rarg0   - source byte array address
2986   //   c_rarg1   - destination byte array address
2987   //   c_rarg2   - K (key) in little endian int array
2988   //   c_rarg3   - counter vector byte array address
2989   //   c_rarg4   - input length
2990   //   c_rarg5   - saved encryptedCounter start
2991   //   c_rarg6   - saved used length
2992   //
2993   // Output:
2994   //   r0       - input length
2995   //
2996   address generate_counterMode_AESCrypt() {
2997     const Register in = c_rarg0;
2998     const Register out = c_rarg1;
2999     const Register key = c_rarg2;
3000     const Register counter = c_rarg3;
3001     const Register saved_len = c_rarg4, len = r10;
3002     const Register saved_encrypted_ctr = c_rarg5;
3003     const Register used_ptr = c_rarg6, used = r12;
3004 
3005     const Register offset = r7;
3006     const Register keylen = r11;
3007 
3008     const unsigned char block_size = 16;
3009     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly better performance
    // with larger data sizes, but it also means that the fast path isn't
    // taken until there are at least bulk_width blocks, so up to
    // bulk_width * 16 - 1 bytes (127 for 8, 63 for 4) are processed on the
    // slow path. For that reason, and also so as not to blow away too much
    // icache, 4 blocks seems like a sensible compromise.
3016 
3017     // Algorithm:
3018     //
3019     //    if (len == 0) {
3020     //        goto DONE;
3021     //    }
3022     //    int result = len;
3023     //    do {
3024     //        if (used >= blockSize) {
3025     //            if (len >= bulk_width * blockSize) {
3026     //                CTR_large_block();
3027     //                if (len == 0)
3028     //                    goto DONE;
3029     //            }
3030     //            for (;;) {
3031     //                16ByteVector v0 = counter;
3032     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3033     //                used = 0;
3034     //                if (len < blockSize)
3035     //                    break;    /* goto NEXT */
3036     //                16ByteVector v1 = load16Bytes(in, offset);
3037     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
3039     //                used = blockSize;
3040     //                offset += blockSize;
3041     //                len -= blockSize;
3042     //                if (len == 0)
3043     //                    goto DONE;
3044     //            }
3045     //        }
3046     //      NEXT:
3047     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3048     //        len--;
3049     //    } while (len != 0);
3050     //  DONE:
3051     //    return result;
3052     //
3053     // CTR_large_block()
3054     //    Wide bulk encryption of whole blocks.
3055 
3056     __ align(CodeEntryAlignment);
3057     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3058     const address start = __ pc();
3059     __ enter();
3060 
3061     Label DONE, CTR_large_block, large_block_return;
3062     __ ldrw(used, Address(used_ptr));
3063     __ cbzw(saved_len, DONE);
3064 
3065     __ mov(len, saved_len);
3066     __ mov(offset, 0);
3067 
3068     // Compute #rounds for AES based on the length of the key array
3069     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3070 
3071     __ aesenc_loadkeys(key, keylen);
3072 
3073     {
3074       Label L_CTR_loop, NEXT;
3075 
3076       __ bind(L_CTR_loop);
3077 
3078       __ cmp(used, block_size);
3079       __ br(__ LO, NEXT);
3080 
3081       // Maybe we have a lot of data
3082       __ subsw(rscratch1, len, bulk_width * block_size);
3083       __ br(__ HS, CTR_large_block);
3084       __ BIND(large_block_return);
3085       __ cbzw(len, DONE);
3086 
3087       // Setup the counter
3088       __ movi(v4, __ T4S, 0);
3089       __ movi(v5, __ T4S, 1);
3090       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
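      // The increment (1) sits in the upper dword with the lower dword zero, which is
      // the layout be_add_128_64 expects for 'inc'.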
3091 
3092       // 128-bit big-endian increment
3093       __ ld1(v0, __ T16B, counter);
3094       __ rev64(v16, __ T16B, v0);
3095       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3096       __ rev64(v16, __ T16B, v16);
3097       __ st1(v16, __ T16B, counter);
3098       // Previous counter value is in v0
3099       // v4 contains { 0, 1 }
3100 
3101       {
3102         // We have fewer than bulk_width blocks of data left. Encrypt
3103         // them one by one until there is less than a full block
3104         // remaining, being careful to save both the encrypted counter
3105         // and the counter.
3106 
3107         Label inner_loop;
3108         __ bind(inner_loop);
3109         // Counter to encrypt is in v0
3110         __ aesecb_encrypt(noreg, noreg, keylen);
3111         __ st1(v0, __ T16B, saved_encrypted_ctr);
3112 
3113         // Do we have a remaining full block?
3114 
3115         __ mov(used, 0);
3116         __ cmp(len, block_size);
3117         __ br(__ LO, NEXT);
3118 
3119         // Yes, we have a full block
3120         __ ldrq(v1, Address(in, offset));
3121         __ eor(v1, __ T16B, v1, v0);
3122         __ strq(v1, Address(out, offset));
3123         __ mov(used, block_size);
3124         __ add(offset, offset, block_size);
3125 
3126         __ subw(len, len, block_size);
3127         __ cbzw(len, DONE);
3128 
3129         // Increment the counter, store it back
3130         __ orr(v0, __ T16B, v16, v16);
3131         __ rev64(v16, __ T16B, v16);
3132         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3133         __ rev64(v16, __ T16B, v16);
3134         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3135 
3136         __ b(inner_loop);
3137       }
3138 
3139       __ BIND(NEXT);
3140 
3141       // Encrypt a single byte, and loop.
3142       // We expect this to be a rare event.
3143       __ ldrb(rscratch1, Address(in, offset));
3144       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3145       __ eor(rscratch1, rscratch1, rscratch2);
3146       __ strb(rscratch1, Address(out, offset));
3147       __ add(offset, offset, 1);
3148       __ add(used, used, 1);
      __ subw(len, len, 1);
3150       __ cbnzw(len, L_CTR_loop);
3151     }
3152 
3153     __ bind(DONE);
3154     __ strw(used, Address(used_ptr));
3155     __ mov(r0, saved_len);
3156 
3157     __ leave(); // required for proper stackwalking of RuntimeStub frame
3158     __ ret(lr);
3159 
3160     // Bulk encryption
3161 
3162     __ BIND (CTR_large_block);
3163     assert(bulk_width == 4 || bulk_width == 8, "must be");
3164 
3165     if (bulk_width == 8) {
3166       __ sub(sp, sp, 4 * 16);
3167       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3168     }
3169     __ sub(sp, sp, 4 * 16);
3170     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3171     RegSet saved_regs = (RegSet::of(in, out, offset)
3172                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3173     __ push(saved_regs, sp);
3174     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3175     __ add(in, in, offset);
3176     __ add(out, out, offset);
3177 
3178     // Keys should already be loaded into the correct registers
3179 
3180     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3181     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3182 
3183     // AES/CTR loop
3184     {
3185       Label L_CTR_loop;
3186       __ BIND(L_CTR_loop);
3187 
3188       // Setup the counters
3189       __ movi(v8, __ T4S, 0);
3190       __ movi(v9, __ T4S, 1);
3191       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3192 
3193       for (int i = 0; i < bulk_width; i++) {
3194         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3195         __ rev64(v0_ofs, __ T16B, v16);
3196         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3197       }
3198 
3199       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3200 
3201       // Encrypt the counters
3202       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3203 
3204       if (bulk_width == 8) {
3205         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3206       }
3207 
3208       // XOR the encrypted counters with the inputs
3209       for (int i = 0; i < bulk_width; i++) {
3210         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3211         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3212         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3213       }
3214 
3215       // Write the encrypted data
3216       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3217       if (bulk_width == 8) {
3218         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3219       }
3220 
3221       __ subw(len, len, 16 * bulk_width);
3222       __ cbnzw(len, L_CTR_loop);
3223     }
3224 
3225     // Save the counter back where it goes
3226     __ rev64(v16, __ T16B, v16);
3227     __ st1(v16, __ T16B, counter);
3228 
3229     __ pop(saved_regs, sp);
3230 
3231     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3232     if (bulk_width == 8) {
3233       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3234     }
3235 
3236     __ andr(rscratch1, len, -16 * bulk_width);
3237     __ sub(len, len, rscratch1);
3238     __ add(offset, offset, rscratch1);
3239     __ mov(used, 16);
3240     __ strw(used, Address(used_ptr));
3241     __ b(large_block_return);
3242 
3243     return start;
3244   }
3245 
3246   // Vector AES Galois Counter Mode implementation. Parameters:
3247   //
3248   // in = c_rarg0
3249   // len = c_rarg1
3250   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3251   // out = c_rarg3
3252   // key = c_rarg4
3253   // state = c_rarg5 - GHASH.state
3254   // subkeyHtbl = c_rarg6 - powers of H
3255   // counter = c_rarg7 - 16 bytes of CTR
3256   // return - number of processed bytes
3257   address generate_galoisCounterMode_AESCrypt() {
3258     address ghash_polynomial = __ pc();
3259     __ emit_int64(0x87);  // The low-order bits of the field
3260                           // polynomial (i.e. p = z^7+z^2+z+1)
3261                           // repeated in the low and high parts of a
3262                           // 128-bit vector
3263     __ emit_int64(0x87);
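    // (0x87 == z^7 + z^2 + z + 1; together with the implicit z^128 term this is the
    // GCM reduction polynomial, consumed by ghash_processBlocks_wide below.)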
3264 
3265     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3267     address start = __ pc();
3268     __ enter();
3269 
3270     const Register in = c_rarg0;
3271     const Register len = c_rarg1;
3272     const Register ct = c_rarg2;
3273     const Register out = c_rarg3;
3274     // and updated with the incremented counter in the end
3275 
3276     const Register key = c_rarg4;
3277     const Register state = c_rarg5;
3278 
3279     const Register subkeyHtbl = c_rarg6;
3280 
3281     const Register counter = c_rarg7;
3282 
3283     const Register keylen = r10;
3284     // Save state before entering routine
3285     __ sub(sp, sp, 4 * 16);
3286     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3287     __ sub(sp, sp, 4 * 16);
3288     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3289 
3290     // __ andr(len, len, -512);
3291     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3292     __ str(len, __ pre(sp, -2 * wordSize));
3293 
3294     Label DONE;
3295     __ cbz(len, DONE);
3296 
3297     // Compute #rounds for AES based on the length of the key array
3298     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3299 
3300     __ aesenc_loadkeys(key, keylen);
3301     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3302     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3303 
3304     // AES/CTR loop
3305     {
3306       Label L_CTR_loop;
3307       __ BIND(L_CTR_loop);
3308 
3309       // Setup the counters
3310       __ movi(v8, __ T4S, 0);
3311       __ movi(v9, __ T4S, 1);
3312       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3313 
3314       assert(v0->encoding() < v8->encoding(), "");
3315       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3316         FloatRegister f = as_FloatRegister(i);
3317         __ rev32(f, __ T16B, v16);
3318         __ addv(v16, __ T4S, v16, v8);
3319       }
3320 
3321       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3322 
3323       // Encrypt the counters
3324       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3325 
3326       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3327 
3328       // XOR the encrypted counters with the inputs
3329       for (int i = 0; i < 8; i++) {
3330         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3331         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3332         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3333       }
3334       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3335       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3336 
3337       __ subw(len, len, 16 * 8);
3338       __ cbnzw(len, L_CTR_loop);
3339     }
3340 
3341     __ rev32(v16, __ T16B, v16);
3342     __ st1(v16, __ T16B, counter);
3343 
3344     __ ldr(len, Address(sp));
3345     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3346 
3347     // GHASH/CTR loop
3348     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3349                                 len, /*unrolls*/4);
3350 
3351 #ifdef ASSERT
    {
      Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
3358 #endif
3359 
    __ bind(DONE);
3361     // Return the number of bytes processed
3362     __ ldr(r0, __ post(sp, 2 * wordSize));
3363 
3364     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3365     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3366 
3367     __ leave(); // required for proper stackwalking of RuntimeStub frame
3368     __ ret(lr);
    return start;
3370   }
3371 
3372   class Cached64Bytes {
3373   private:
3374     MacroAssembler *_masm;
3375     Register _regs[8];
3376 
3377   public:
3378     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3380       auto it = rs.begin();
3381       for (auto &r: _regs) {
3382         r = *it;
3383         ++it;
3384       }
3385     }
3386 
3387     void gen_loads(Register base) {
3388       for (int i = 0; i < 8; i += 2) {
3389         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3390       }
3391     }
3392 
3393     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
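    // Word i lives in bits [32*(i%2), 32*(i%2)+31] of _regs[i/2] (ldp loads are
    // little-endian), so e.g. extract_u32(dest, 5) becomes ubfx(dest, _regs[2], 32, 32).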
3394     void extract_u32(Register dest, int i) {
3395       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3396     }
3397   };
3398 
3399   // Utility routines for md5.
3400   // Clobbers r10 and r11.
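  //
  // Each helper computes one MD5 step of the form (reference formulas):
  //   a = b + rotl32(a + f(b,c,d) + x[k] + t, s)
  // where f is, per round,
  //   FF: F(b,c,d) = (b & c) | (~b & d)   -- computed here as ((c ^ d) & b) ^ d
  //   GG: G(b,c,d) = (b & d) | (c & ~d)
  //   HH: H(b,c,d) = b ^ c ^ d
  //   II: I(b,c,d) = c ^ (b | ~d)
  // The rotate-left by s is implemented as rorw(x, 32 - s).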
3401   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3402               int k, int s, int t) {
3403     Register rscratch3 = r10;
3404     Register rscratch4 = r11;
3405 
3406     __ eorw(rscratch3, r3, r4);
3407     __ movw(rscratch2, t);
3408     __ andw(rscratch3, rscratch3, r2);
3409     __ addw(rscratch4, r1, rscratch2);
3410     reg_cache.extract_u32(rscratch1, k);
3411     __ eorw(rscratch3, rscratch3, r4);
3412     __ addw(rscratch4, rscratch4, rscratch1);
3413     __ addw(rscratch3, rscratch3, rscratch4);
3414     __ rorw(rscratch2, rscratch3, 32 - s);
3415     __ addw(r1, rscratch2, r2);
3416   }
3417 
3418   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3419               int k, int s, int t) {
3420     Register rscratch3 = r10;
3421     Register rscratch4 = r11;
3422 
3423     __ andw(rscratch3, r2, r4);
3424     __ bicw(rscratch4, r3, r4);
3425     reg_cache.extract_u32(rscratch1, k);
3426     __ movw(rscratch2, t);
3427     __ orrw(rscratch3, rscratch3, rscratch4);
3428     __ addw(rscratch4, r1, rscratch2);
3429     __ addw(rscratch4, rscratch4, rscratch1);
3430     __ addw(rscratch3, rscratch3, rscratch4);
3431     __ rorw(rscratch2, rscratch3, 32 - s);
3432     __ addw(r1, rscratch2, r2);
3433   }
3434 
3435   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3436               int k, int s, int t) {
3437     Register rscratch3 = r10;
3438     Register rscratch4 = r11;
3439 
3440     __ eorw(rscratch3, r3, r4);
3441     __ movw(rscratch2, t);
3442     __ addw(rscratch4, r1, rscratch2);
3443     reg_cache.extract_u32(rscratch1, k);
3444     __ eorw(rscratch3, rscratch3, r2);
3445     __ addw(rscratch4, rscratch4, rscratch1);
3446     __ addw(rscratch3, rscratch3, rscratch4);
3447     __ rorw(rscratch2, rscratch3, 32 - s);
3448     __ addw(r1, rscratch2, r2);
3449   }
3450 
3451   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3452               int k, int s, int t) {
3453     Register rscratch3 = r10;
3454     Register rscratch4 = r11;
3455 
3456     __ movw(rscratch3, t);
3457     __ ornw(rscratch2, r2, r4);
3458     __ addw(rscratch4, r1, rscratch3);
3459     reg_cache.extract_u32(rscratch1, k);
3460     __ eorw(rscratch3, rscratch2, r3);
3461     __ addw(rscratch4, rscratch4, rscratch1);
3462     __ addw(rscratch3, rscratch3, rscratch4);
3463     __ rorw(rscratch2, rscratch3, 32 - s);
3464     __ addw(r1, rscratch2, r2);
3465   }
3466 
3467   // Arguments:
3468   //
3469   // Inputs:
3470   //   c_rarg0   - byte[]  source+offset
3471   //   c_rarg1   - int[]   SHA.state
3472   //   c_rarg2   - int     offset
3473   //   c_rarg3   - int     limit
3474   //
3475   address generate_md5_implCompress(bool multi_block, const char *name) {
3476     __ align(CodeEntryAlignment);
3477     StubCodeMark mark(this, "StubRoutines", name);
3478     address start = __ pc();
3479 
3480     Register buf       = c_rarg0;
3481     Register state     = c_rarg1;
3482     Register ofs       = c_rarg2;
3483     Register limit     = c_rarg3;
3484     Register a         = r4;
3485     Register b         = r5;
3486     Register c         = r6;
3487     Register d         = r7;
3488     Register rscratch3 = r10;
3489     Register rscratch4 = r11;
3490 
3491     Register state_regs[2] = { r12, r13 };
3492     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3493     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3494 
3495     __ push(saved_regs, sp);
3496 
3497     __ ldp(state_regs[0], state_regs[1], Address(state));
3498     __ ubfx(a, state_regs[0],  0, 32);
3499     __ ubfx(b, state_regs[0], 32, 32);
3500     __ ubfx(c, state_regs[1],  0, 32);
3501     __ ubfx(d, state_regs[1], 32, 32);
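    // state_regs[0] packs (b << 32) | a and state_regs[1] packs (d << 32) | c;
    // the same packed form is written back to 'state' after the rounds.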
3502 
3503     Label md5_loop;
3504     __ BIND(md5_loop);
3505 
3506     reg_cache.gen_loads(buf);
3507 
3508     // Round 1
3509     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3510     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3511     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3512     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3513     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3514     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3515     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3516     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3517     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3518     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3519     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3520     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3521     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3522     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3523     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3524     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3525 
3526     // Round 2
3527     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3528     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3529     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3530     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3531     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3532     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3533     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3534     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3535     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3536     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3537     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3538     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3539     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3540     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3541     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3542     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3543 
3544     // Round 3
3545     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3546     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3547     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3548     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3549     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3550     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3551     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3552     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3553     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3554     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3555     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3556     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3557     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3558     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3559     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3560     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3561 
3562     // Round 4
3563     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3564     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3565     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3566     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3567     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3568     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3569     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3570     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3571     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3572     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3573     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3574     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3575     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3576     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3577     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3578     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3579 
3580     __ addw(a, state_regs[0], a);
3581     __ ubfx(rscratch2, state_regs[0], 32, 32);
3582     __ addw(b, rscratch2, b);
3583     __ addw(c, state_regs[1], c);
3584     __ ubfx(rscratch4, state_regs[1], 32, 32);
3585     __ addw(d, rscratch4, d);
3586 
3587     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3588     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3589 
3590     if (multi_block) {
3591       __ add(buf, buf, 64);
3592       __ add(ofs, ofs, 64);
3593       __ cmp(ofs, limit);
3594       __ br(Assembler::LE, md5_loop);
3595       __ mov(c_rarg0, ofs); // return ofs
3596     }
3597 
3598     // write hash values back in the correct order
3599     __ stp(state_regs[0], state_regs[1], Address(state));
3600 
3601     __ pop(saved_regs, sp);
3602 
3603     __ ret(lr);
3604 
3605     return start;
3606   }
3607 
3608   // Arguments:
3609   //
3610   // Inputs:
3611   //   c_rarg0   - byte[]  source+offset
3612   //   c_rarg1   - int[]   SHA.state
3613   //   c_rarg2   - int     offset
3614   //   c_rarg3   - int     limit
3615   //
3616   address generate_sha1_implCompress(bool multi_block, const char *name) {
3617     __ align(CodeEntryAlignment);
3618     StubCodeMark mark(this, "StubRoutines", name);
3619     address start = __ pc();
3620 
3621     Register buf   = c_rarg0;
3622     Register state = c_rarg1;
3623     Register ofs   = c_rarg2;
3624     Register limit = c_rarg3;
3625 
3626     Label keys;
3627     Label sha1_loop;
3628 
    // load the four round constants into v0..v3, each broadcast across all lanes
3630     __ adr(rscratch1, keys);
3631     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (160-bit) state: a..d into v6, e into v7
3633     __ ldrq(v6, Address(state, 0));
3634     __ ldrs(v7, Address(state, 16));
3635 
3636 
3637     __ BIND(sha1_loop);
3638     // load 64 bytes of data into v16..v19
3639     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3640     __ rev32(v16, __ T16B, v16);
3641     __ rev32(v17, __ T16B, v17);
3642     __ rev32(v18, __ T16B, v18);
3643     __ rev32(v19, __ T16B, v19);
3644 
3645     // do the sha1
3646     __ addv(v4, __ T4S, v16, v0);
3647     __ orr(v20, __ T16B, v6, v6);
3648 
3649     FloatRegister d0 = v16;
3650     FloatRegister d1 = v17;
3651     FloatRegister d2 = v18;
3652     FloatRegister d3 = v19;
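
    // Each iteration of this loop performs four of the 80 SHA-1 rounds:
    // sha1c (Ch, rounds 0-19), sha1p (Parity, rounds 20-39 and 60-79) and
    // sha1m (Maj, rounds 40-59), interleaved with the message-schedule updates
    // (sha1su0/sha1su1) and the round-constant additions (v0..v3).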
3653 
3654     for (int round = 0; round < 20; round++) {
3655       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3656       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3657       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3658       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3659       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3660 
3661       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3662       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3663       __ sha1h(tmp2, __ T4S, v20);
3664       if (round < 5)
3665         __ sha1c(v20, __ T4S, tmp3, tmp4);
3666       else if (round < 10 || round >= 15)
3667         __ sha1p(v20, __ T4S, tmp3, tmp4);
3668       else
3669         __ sha1m(v20, __ T4S, tmp3, tmp4);
3670       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3671 
3672       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3673     }
3674 
3675     __ addv(v7, __ T2S, v7, v21);
3676     __ addv(v6, __ T4S, v6, v20);
3677 
3678     if (multi_block) {
3679       __ add(ofs, ofs, 64);
3680       __ cmp(ofs, limit);
3681       __ br(Assembler::LE, sha1_loop);
3682       __ mov(c_rarg0, ofs); // return ofs
3683     }
3684 
3685     __ strq(v6, Address(state, 0));
3686     __ strs(v7, Address(state, 16));
3687 
3688     __ ret(lr);
3689 
3690     __ bind(keys);
3691     __ emit_int32(0x5a827999);
3692     __ emit_int32(0x6ed9eba1);
3693     __ emit_int32(0x8f1bbcdc);
3694     __ emit_int32(0xca62c1d6);
3695 
3696     return start;
3697   }
3698 
3699 
3700   // Arguments:
3701   //
3702   // Inputs:
3703   //   c_rarg0   - byte[]  source+offset
3704   //   c_rarg1   - int[]   SHA.state
3705   //   c_rarg2   - int     offset
3706   //   c_rarg3   - int     limit
3707   //
3708   address generate_sha256_implCompress(bool multi_block, const char *name) {
3709     static const uint32_t round_consts[64] = {
3710       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3711       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3712       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3713       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3714       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3715       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3716       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3717       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3718       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3719       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3720       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3721       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3722       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3723       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3724       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3725       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3726     };
3727     __ align(CodeEntryAlignment);
3728     StubCodeMark mark(this, "StubRoutines", name);
3729     address start = __ pc();
3730 
3731     Register buf   = c_rarg0;
3732     Register state = c_rarg1;
3733     Register ofs   = c_rarg2;
3734     Register limit = c_rarg3;
3735 
    Label sha256_loop;
3737 
3738     __ stpd(v8, v9, __ pre(sp, -32));
3739     __ stpd(v10, v11, Address(sp, 16));
3740 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3748 
    // load the 64 round constants into v16..v31 (4 per register)
3750     __ lea(rscratch1, ExternalAddress((address)round_consts));
3751     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3752     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3753     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3754     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3755 
3756     // load 8 words (256 bits) state
3757     __ ldpq(v0, v1, state);
3758 
    __ BIND(sha256_loop);
3760     // load 64 bytes of data into v8..v11
3761     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3762     __ rev32(v8, __ T16B, v8);
3763     __ rev32(v9, __ T16B, v9);
3764     __ rev32(v10, __ T16B, v10);
3765     __ rev32(v11, __ T16B, v11);
3766 
3767     __ addv(v6, __ T4S, v8, v16);
3768     __ orr(v2, __ T16B, v0, v0);
3769     __ orr(v3, __ T16B, v1, v1);
3770 
3771     FloatRegister d0 = v8;
3772     FloatRegister d1 = v9;
3773     FloatRegister d2 = v10;
3774     FloatRegister d3 = v11;
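
    // Each iteration performs four of the 64 SHA-256 rounds via sha256h/sha256h2.
    // sha256su0/sha256su1 extend the message schedule while new words are still
    // needed (first 12 iterations), and the next group's round constant
    // (v17..v31) is added in alongside.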
3775 
3776 
3777     for (int round = 0; round < 16; round++) {
3778       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3779       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3780       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3781       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3782 
3783       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3784        __ orr(v4, __ T16B, v2, v2);
3785       if (round < 15)
3786         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3787       __ sha256h(v2, __ T4S, v3, tmp2);
3788       __ sha256h2(v3, __ T4S, v4, tmp2);
3789       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3790 
3791       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3792     }
3793 
3794     __ addv(v0, __ T4S, v0, v2);
3795     __ addv(v1, __ T4S, v1, v3);
3796 
3797     if (multi_block) {
3798       __ add(ofs, ofs, 64);
3799       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3801       __ mov(c_rarg0, ofs); // return ofs
3802     }
3803 
3804     __ ldpd(v10, v11, Address(sp, 16));
3805     __ ldpd(v8, v9, __ post(sp, 32));
3806 
3807     __ stpq(v0, v1, state);
3808 
3809     __ ret(lr);
3810 
3811     return start;
3812   }
3813 
3814   // Double rounds for sha512.
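  // Each call performs two of the 80 SHA-512 rounds (40 calls per 128-byte block).
  // For dr < 36 it also loads the next pair of round constants into vrc1 (8 are
  // preloaded into v20..v23, the remaining 72 stream in two at a time); for
  // dr < 32 it extends the message schedule with sha512su0/sha512su1.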
3815   void sha512_dround(int dr,
3816                      FloatRegister vi0, FloatRegister vi1,
3817                      FloatRegister vi2, FloatRegister vi3,
3818                      FloatRegister vi4, FloatRegister vrc0,
3819                      FloatRegister vrc1, FloatRegister vin0,
3820                      FloatRegister vin1, FloatRegister vin2,
3821                      FloatRegister vin3, FloatRegister vin4) {
3822       if (dr < 36) {
3823         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3824       }
3825       __ addv(v5, __ T2D, vrc0, vin0);
3826       __ ext(v6, __ T16B, vi2, vi3, 8);
3827       __ ext(v5, __ T16B, v5, v5, 8);
3828       __ ext(v7, __ T16B, vi1, vi2, 8);
3829       __ addv(vi3, __ T2D, vi3, v5);
3830       if (dr < 32) {
3831         __ ext(v5, __ T16B, vin3, vin4, 8);
3832         __ sha512su0(vin0, __ T2D, vin1);
3833       }
3834       __ sha512h(vi3, __ T2D, v6, v7);
3835       if (dr < 32) {
3836         __ sha512su1(vin0, __ T2D, vin2, v5);
3837       }
3838       __ addv(vi4, __ T2D, vi1, vi3);
3839       __ sha512h2(vi3, __ T2D, vi1, vi0);
3840   }
3841 
3842   // Arguments:
3843   //
3844   // Inputs:
3845   //   c_rarg0   - byte[]  source+offset
3846   //   c_rarg1   - int[]   SHA.state
3847   //   c_rarg2   - int     offset
3848   //   c_rarg3   - int     limit
3849   //
3850   address generate_sha512_implCompress(bool multi_block, const char *name) {
3851     static const uint64_t round_consts[80] = {
3852       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3853       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3854       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3855       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3856       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3857       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3858       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3859       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3860       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3861       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3862       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3863       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3864       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3865       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3866       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3867       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3868       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3869       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3870       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3871       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3872       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3873       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3874       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3875       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3876       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3877       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3878       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3879     };
3880 
3881     __ align(CodeEntryAlignment);
3882     StubCodeMark mark(this, "StubRoutines", name);
3883     address start = __ pc();
3884 
3885     Register buf   = c_rarg0;
3886     Register state = c_rarg1;
3887     Register ofs   = c_rarg2;
3888     Register limit = c_rarg3;
3889 
3890     __ stpd(v8, v9, __ pre(sp, -64));
3891     __ stpd(v10, v11, Address(sp, 16));
3892     __ stpd(v12, v13, Address(sp, 32));
3893     __ stpd(v14, v15, Address(sp, 48));
3894 
3895     Label sha512_loop;
3896 
3897     // load state
3898     __ ld1(v8, v9, v10, v11, __ T2D, state);
3899 
    // preload the first 8 round constants (2 per register) into v20..v23
3901     __ lea(rscratch1, ExternalAddress((address)round_consts));
3902     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3903 
3904     __ BIND(sha512_loop);
3905     // load 128B of data into v12..v19
3906     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3907     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3908     __ rev64(v12, __ T16B, v12);
3909     __ rev64(v13, __ T16B, v13);
3910     __ rev64(v14, __ T16B, v14);
3911     __ rev64(v15, __ T16B, v15);
3912     __ rev64(v16, __ T16B, v16);
3913     __ rev64(v17, __ T16B, v17);
3914     __ rev64(v18, __ T16B, v18);
3915     __ rev64(v19, __ T16B, v19);
3916 
3917     __ mov(rscratch2, rscratch1);
3918 
3919     __ mov(v0, __ T16B, v8);
3920     __ mov(v1, __ T16B, v9);
3921     __ mov(v2, __ T16B, v10);
3922     __ mov(v3, __ T16B, v11);
3923 
3924     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3925     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3926     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3927     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3928     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3929     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3930     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3931     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3932     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3933     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3934     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3935     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3936     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3937     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3938     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3939     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3940     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3941     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3942     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3943     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3944     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3945     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3946     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3947     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3948     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3949     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3950     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3951     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3952     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3953     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3954     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3955     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3956     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3957     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3958     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3959     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3960     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3961     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3962     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3963     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3964 
3965     __ addv(v8, __ T2D, v8, v0);
3966     __ addv(v9, __ T2D, v9, v1);
3967     __ addv(v10, __ T2D, v10, v2);
3968     __ addv(v11, __ T2D, v11, v3);
3969 
3970     if (multi_block) {
3971       __ add(ofs, ofs, 128);
3972       __ cmp(ofs, limit);
3973       __ br(Assembler::LE, sha512_loop);
3974       __ mov(c_rarg0, ofs); // return ofs
3975     }
3976 
3977     __ st1(v8, v9, v10, v11, __ T2D, state);
3978 
3979     __ ldpd(v14, v15, Address(sp, 48));
3980     __ ldpd(v12, v13, Address(sp, 32));
3981     __ ldpd(v10, v11, Address(sp, 16));
3982     __ ldpd(v8, v9, __ post(sp, 64));
3983 
3984     __ ret(lr);
3985 
3986     return start;
3987   }
3988 
3989   // Arguments:
3990   //
3991   // Inputs:
3992   //   c_rarg0   - byte[]  source+offset
3993   //   c_rarg1   - byte[]  SHA.state
3994   //   c_rarg2   - int     block_size
3995   //   c_rarg3   - int     offset
3996   //   c_rarg4   - int     limit
3997   //
3998   address generate_sha3_implCompress(bool multi_block, const char *name) {
3999     static const uint64_t round_consts[24] = {
4000       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4001       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4002       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4003       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4004       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4005       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4006       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4007       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4008     };
4009 
4010     __ align(CodeEntryAlignment);
4011     StubCodeMark mark(this, "StubRoutines", name);
4012     address start = __ pc();
4013 
4014     Register buf           = c_rarg0;
4015     Register state         = c_rarg1;
4016     Register block_size    = c_rarg2;
4017     Register ofs           = c_rarg3;
4018     Register limit         = c_rarg4;
4019 
4020     Label sha3_loop, rounds24_loop;
4021     Label sha3_512_or_sha3_384, shake128;
4022 
4023     __ stpd(v8, v9, __ pre(sp, -64));
4024     __ stpd(v10, v11, Address(sp, 16));
4025     __ stpd(v12, v13, Address(sp, 32));
4026     __ stpd(v14, v15, Address(sp, 48));
4027 
4028     // load state
4029     __ add(rscratch1, state, 32);
4030     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4031     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4032     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4033     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4034     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4035     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4036     __ ld1(v24, __ T1D, rscratch1);
4037 
4038     __ BIND(sha3_loop);
4039 
4040     // 24 keccak rounds
4041     __ movw(rscratch2, 24);
4042 
4043     // load round_constants base
4044     __ lea(rscratch1, ExternalAddress((address) round_consts));
4045 
4046     // load input
4047     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4048     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4049     __ eor(v0, __ T8B, v0, v25);
4050     __ eor(v1, __ T8B, v1, v26);
4051     __ eor(v2, __ T8B, v2, v27);
4052     __ eor(v3, __ T8B, v3, v28);
4053     __ eor(v4, __ T8B, v4, v29);
4054     __ eor(v5, __ T8B, v5, v30);
4055     __ eor(v6, __ T8B, v6, v31);
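
    // The rate (block_size, in bytes) selects the variant:
    //   72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256/SHAKE256,
    //   144 = SHA3-224, 168 = SHAKE128.
    // The bit tests below distinguish these without extra compares.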
4056 
4057     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4058     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4059 
4060     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4061     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4062     __ eor(v7, __ T8B, v7, v25);
4063     __ eor(v8, __ T8B, v8, v26);
4064     __ eor(v9, __ T8B, v9, v27);
4065     __ eor(v10, __ T8B, v10, v28);
4066     __ eor(v11, __ T8B, v11, v29);
4067     __ eor(v12, __ T8B, v12, v30);
4068     __ eor(v13, __ T8B, v13, v31);
4069 
4070     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4071     __ eor(v14, __ T8B, v14, v25);
4072     __ eor(v15, __ T8B, v15, v26);
4073     __ eor(v16, __ T8B, v16, v27);
4074 
4075     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4076     __ andw(c_rarg5, block_size, 48);
4077     __ cbzw(c_rarg5, rounds24_loop);
4078 
4079     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
4081     __ ldrd(v28, __ post(buf, 8));
4082     __ eor(v17, __ T8B, v17, v28);
4083     __ b(rounds24_loop);
4084 
4085     __ BIND(shake128);
4086     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4087     __ eor(v17, __ T8B, v17, v28);
4088     __ eor(v18, __ T8B, v18, v29);
4089     __ eor(v19, __ T8B, v19, v30);
4090     __ eor(v20, __ T8B, v20, v31);
4091     __ b(rounds24_loop); // block_size == 168, SHAKE128
4092 
4093     __ BIND(sha3_512_or_sha3_384);
4094     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4095     __ eor(v7, __ T8B, v7, v25);
4096     __ eor(v8, __ T8B, v8, v26);
4097     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4098 
4099     // SHA3-384
4100     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4101     __ eor(v9,  __ T8B, v9,  v27);
4102     __ eor(v10, __ T8B, v10, v28);
4103     __ eor(v11, __ T8B, v11, v29);
4104     __ eor(v12, __ T8B, v12, v30);
4105 
4106     __ BIND(rounds24_loop);
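    // One Keccak-f[1600] round per iteration: theta (eor3/rax1), rho+pi (xar),
    // chi (bcax) and iota (eor with the round constant loaded into v31).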
4107     __ subw(rscratch2, rscratch2, 1);
4108 
4109     __ eor3(v29, __ T16B, v4, v9, v14);
4110     __ eor3(v26, __ T16B, v1, v6, v11);
4111     __ eor3(v28, __ T16B, v3, v8, v13);
4112     __ eor3(v25, __ T16B, v0, v5, v10);
4113     __ eor3(v27, __ T16B, v2, v7, v12);
4114     __ eor3(v29, __ T16B, v29, v19, v24);
4115     __ eor3(v26, __ T16B, v26, v16, v21);
4116     __ eor3(v28, __ T16B, v28, v18, v23);
4117     __ eor3(v25, __ T16B, v25, v15, v20);
4118     __ eor3(v27, __ T16B, v27, v17, v22);
4119 
4120     __ rax1(v30, __ T2D, v29, v26);
4121     __ rax1(v26, __ T2D, v26, v28);
4122     __ rax1(v28, __ T2D, v28, v25);
4123     __ rax1(v25, __ T2D, v25, v27);
4124     __ rax1(v27, __ T2D, v27, v29);
4125 
4126     __ eor(v0, __ T16B, v0, v30);
4127     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4128     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4129     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4130     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4131     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4132     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4133     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4134     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4135     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4136     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4137     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4138     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4139     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4140     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4141     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4142     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4143     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4144     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4145     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4146     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4147     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4148     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4149     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4150     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4151 
4152     __ bcax(v20, __ T16B, v31, v22, v8);
4153     __ bcax(v21, __ T16B, v8,  v23, v22);
4154     __ bcax(v22, __ T16B, v22, v24, v23);
4155     __ bcax(v23, __ T16B, v23, v31, v24);
4156     __ bcax(v24, __ T16B, v24, v8,  v31);
4157 
4158     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4159 
4160     __ bcax(v17, __ T16B, v25, v19, v3);
4161     __ bcax(v18, __ T16B, v3,  v15, v19);
4162     __ bcax(v19, __ T16B, v19, v16, v15);
4163     __ bcax(v15, __ T16B, v15, v25, v16);
4164     __ bcax(v16, __ T16B, v16, v3,  v25);
4165 
4166     __ bcax(v10, __ T16B, v29, v12, v26);
4167     __ bcax(v11, __ T16B, v26, v13, v12);
4168     __ bcax(v12, __ T16B, v12, v14, v13);
4169     __ bcax(v13, __ T16B, v13, v29, v14);
4170     __ bcax(v14, __ T16B, v14, v26, v29);
4171 
4172     __ bcax(v7, __ T16B, v30, v9,  v4);
4173     __ bcax(v8, __ T16B, v4,  v5,  v9);
4174     __ bcax(v9, __ T16B, v9,  v6,  v5);
4175     __ bcax(v5, __ T16B, v5,  v30, v6);
4176     __ bcax(v6, __ T16B, v6,  v4,  v30);
4177 
4178     __ bcax(v3, __ T16B, v27, v0,  v28);
4179     __ bcax(v4, __ T16B, v28, v1,  v0);
4180     __ bcax(v0, __ T16B, v0,  v2,  v1);
4181     __ bcax(v1, __ T16B, v1,  v27, v2);
4182     __ bcax(v2, __ T16B, v2,  v28, v27);
4183 
4184     __ eor(v0, __ T16B, v0, v31);
4185 
4186     __ cbnzw(rscratch2, rounds24_loop);
4187 
4188     if (multi_block) {
4189       __ add(ofs, ofs, block_size);
4190       __ cmp(ofs, limit);
4191       __ br(Assembler::LE, sha3_loop);
4192       __ mov(c_rarg0, ofs); // return ofs
4193     }
4194 
4195     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4196     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4197     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4198     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4199     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4200     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4201     __ st1(v24, __ T1D, state);
4202 
4203     __ ldpd(v14, v15, Address(sp, 48));
4204     __ ldpd(v12, v13, Address(sp, 32));
4205     __ ldpd(v10, v11, Address(sp, 16));
4206     __ ldpd(v8, v9, __ post(sp, 64));
4207 
4208     __ ret(lr);
4209 
4210     return start;
4211   }
4212 
4213   /**
4214    *  Arguments:
4215    *
4216    * Inputs:
4217    *   c_rarg0   - int crc
4218    *   c_rarg1   - byte* buf
4219    *   c_rarg2   - int length
4220    *
4221    * Output:
4222    *       r0   - int crc result
4223    */
4224   address generate_updateBytesCRC32() {
4225     assert(UseCRC32Intrinsics, "what are we doing here?");
4226 
4227     __ align(CodeEntryAlignment);
4228     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4229 
4230     address start = __ pc();
4231 
4232     const Register crc   = c_rarg0;  // crc
4233     const Register buf   = c_rarg1;  // source java byte array address
4234     const Register len   = c_rarg2;  // length
4235     const Register table0 = c_rarg3; // crc_table address
4236     const Register table1 = c_rarg4;
4237     const Register table2 = c_rarg5;
4238     const Register table3 = c_rarg6;
4239     const Register tmp3 = c_rarg7;
4240 
4241     BLOCK_COMMENT("Entry:");
4242     __ enter(); // required for proper stackwalking of RuntimeStub frame
4243 
4244     __ kernel_crc32(crc, buf, len,
4245               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4246 
4247     __ leave(); // required for proper stackwalking of RuntimeStub frame
4248     __ ret(lr);
4249 
4250     return start;
4251   }
4252 
4253   // ChaCha20 block function.  This version parallelizes by loading
4254   // individual 32-bit state elements into vectors for four blocks
4255   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4256   //
4257   // state (int[16]) = c_rarg0
4258   // keystream (byte[1024]) = c_rarg1
4259   // return - number of bytes of keystream (always 256)
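       // Each of the 16 state words is replicated across the four 32-bit lanes of
       // its own vector register, so every SIMD instruction advances four keystream
       // blocks in parallel; 4 blocks x 64 bytes each is where the fixed 256-byte
       // output comes from.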
4260   address generate_chacha20Block_blockpar() {
4261     Label L_twoRounds, L_cc20_const;
4262     // The constant data is broken into two 128-bit segments to be loaded
4263     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4264     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4265     // The second 128 bits are a table constant used for 8-bit left rotations.
4266     __ BIND(L_cc20_const);
4267     __ emit_int64(0x0000000100000000UL);
4268     __ emit_int64(0x0000000300000002UL);
4269     __ emit_int64(0x0605040702010003UL);
4270     __ emit_int64(0x0E0D0C0F0A09080BUL);
4271 
4272     __ align(CodeEntryAlignment);
4273     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4274     address start = __ pc();
4275     __ enter();
4276 
4277     int i, j;
4278     const Register state = c_rarg0;
4279     const Register keystream = c_rarg1;
4280     const Register loopCtr = r10;
4281     const Register tmpAddr = r11;
4282 
4283     const FloatRegister stateFirst = v0;
4284     const FloatRegister stateSecond = v1;
4285     const FloatRegister stateThird = v2;
4286     const FloatRegister stateFourth = v3;
4287     const FloatRegister origCtrState = v28;
4288     const FloatRegister scratch = v29;
4289     const FloatRegister lrot8Tbl = v30;
4290 
4291     // Organize SIMD registers in an array that facilitates
4292     // putting repetitive opcodes into loop structures.  It is
4293     // important that each grouping of 4 registers is monotonically
4294     // increasing to support the requirements of multi-register
4295     // instructions (e.g. ld4r, st4, etc.)
4296     const FloatRegister workSt[16] = {
4297          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4298         v20, v21, v22, v23, v24, v25, v26, v27
4299     };
4300 
4301     // Load from memory and interlace across 16 SIMD registers,
4302     // with each word from memory broadcast to all lanes of
4303     // each successive SIMD register:
4304     //      Addr(0) -> All lanes in workSt[i]
4305     //      Addr(4) -> All lanes in workSt[i + 1], etc.
4306     __ mov(tmpAddr, state);
4307     for (i = 0; i < 16; i += 4) {
4308       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4309           __ post(tmpAddr, 16));
4310     }
4311 
4312     // Pull in constant data.  The first 16 bytes are the add overlay
4313     // which is applied to the vector holding the counter (state[12]).
4314     // The second 16 bytes is the index register for the 8-bit left
4315     // rotation tbl instruction.
4316     __ adr(tmpAddr, L_cc20_const);
4317     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4318     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4319 
4320     // Set up the 10 iteration loop and perform all 8 quarter round ops
4321     __ mov(loopCtr, 10);
4322     __ BIND(L_twoRounds);
4323 
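         // For reference, each cc20_quarter_round(a, b, c, d, ...) below applies
         // the standard ChaCha quarter round (RFC 8439) to all four blocks at once:
         //   a += b; d ^= a; d <<<= 16;
         //   c += d; b ^= c; b <<<= 12;
         //   a += b; d ^= a; d <<<=  8;   // the 8-bit rotate uses lrot8Tbl
         //   c += d; b ^= c; b <<<=  7;
         // The first four calls are the column rounds, the next four the diagonal
         // rounds; 10 iterations of the pair give ChaCha20's 20 rounds.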
4324     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4325         scratch, lrot8Tbl);
4326     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4327         scratch, lrot8Tbl);
4328     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4329         scratch, lrot8Tbl);
4330     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4331         scratch, lrot8Tbl);
4332 
4333     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4334         scratch, lrot8Tbl);
4335     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4336         scratch, lrot8Tbl);
4337     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4338         scratch, lrot8Tbl);
4339     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4340         scratch, lrot8Tbl);
4341 
4342     // Decrement and iterate
4343     __ sub(loopCtr, loopCtr, 1);
4344     __ cbnz(loopCtr, L_twoRounds);
4345 
4346     __ mov(tmpAddr, state);
4347 
4348     // Add the starting state back to the post-loop keystream
4349     // state.  We read/interlace the state array from memory into
4350     // 4 registers similar to what we did in the beginning.  Then
4351     // add the counter overlay onto workSt[12] at the end.
4352     for (i = 0; i < 16; i += 4) {
4353       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4354           __ post(tmpAddr, 16));
4355       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4356       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4357       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4358       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4359     }
4360     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4361 
4362     // Write to key stream, storing the same element out of workSt[0..15]
4363     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4364     // for the next element position.
4365     for (i = 0; i < 4; i++) {
4366       for (j = 0; j < 16; j += 4) {
4367         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4368             __ post(keystream, 16));
4369       }
4370     }
4371 
4372     __ mov(r0, 256);             // Return length of output keystream
4373     __ leave();
4374     __ ret(lr);
4375 
4376     return start;
4377   }
4378 
4379   /**
4380    *  Arguments:
4381    *
4382    * Inputs:
4383    *   c_rarg0   - int crc
4384    *   c_rarg1   - byte* buf
4385    *   c_rarg2   - int length
4386    *   c_rarg3   - int* table
4387    *
4388    * Output:
4389    *       r0   - int crc result
4390    */
4391   address generate_updateBytesCRC32C() {
4392     assert(UseCRC32CIntrinsics, "what are we doing here?");
4393 
4394     __ align(CodeEntryAlignment);
4395     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4396 
4397     address start = __ pc();
4398 
4399     const Register crc   = c_rarg0;  // crc
4400     const Register buf   = c_rarg1;  // source java byte array address
4401     const Register len   = c_rarg2;  // length
4402     const Register table0 = c_rarg3; // crc_table address
4403     const Register table1 = c_rarg4;
4404     const Register table2 = c_rarg5;
4405     const Register table3 = c_rarg6;
4406     const Register tmp3 = c_rarg7;
4407 
4408     BLOCK_COMMENT("Entry:");
4409     __ enter(); // required for proper stackwalking of RuntimeStub frame
4410 
4411     __ kernel_crc32c(crc, buf, len,
4412               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4413 
4414     __ leave(); // required for proper stackwalking of RuntimeStub frame
4415     __ ret(lr);
4416 
4417     return start;
4418   }
4419 
4420   /***
4421    *  Arguments:
4422    *
4423    *  Inputs:
4424    *   c_rarg0   - int   adler
4425    *   c_rarg1   - byte* buff
4426    *   c_rarg2   - int   len
4427    *
4428    * Output:
4429    *   c_rarg0   - int adler result
4430    */
4431   address generate_updateBytesAdler32() {
4432     __ align(CodeEntryAlignment);
4433     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4434     address start = __ pc();
4435 
4436     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4437 
4438     // Aliases
4439     Register adler  = c_rarg0;
4440     Register s1     = c_rarg0;
4441     Register s2     = c_rarg3;
4442     Register buff   = c_rarg1;
4443     Register len    = c_rarg2;
4444     Register nmax  = r4;
4445     Register base  = r5;
4446     Register count = r6;
4447     Register temp0 = rscratch1;
4448     Register temp1 = rscratch2;
4449     FloatRegister vbytes = v0;
4450     FloatRegister vs1acc = v1;
4451     FloatRegister vs2acc = v2;
4452     FloatRegister vtable = v3;
4453 
4454     // Max number of bytes we can process before having to take the mod
4455     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4456     uint64_t BASE = 0xfff1;
4457     uint64_t NMAX = 0x15B0;
4458 
4459     __ mov(base, BASE);
4460     __ mov(nmax, NMAX);
4461 
4462     // Load accumulation coefficients for the upper 16 bits
4463     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4464     __ ld1(vtable, __ T16B, Address(temp0));
4465 
4466     // s1 is initialized to the lower 16 bits of adler
4467     // s2 is initialized to the upper 16 bits of adler
4468     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4469     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4470 
4471     // The pipelined loop needs at least 16 elements for one iteration.
4472     // It checks this itself, but it is cheaper to skip straight to the cleanup loop here.
4473     __ cmp(len, (u1)16);
4474     __ br(Assembler::HS, L_nmax);
4475     __ cbz(len, L_combine);
4476 
4477     __ bind(L_simple_by1_loop);
4478     __ ldrb(temp0, Address(__ post(buff, 1)));
4479     __ add(s1, s1, temp0);
4480     __ add(s2, s2, s1);
4481     __ subs(len, len, 1);
4482     __ br(Assembler::HI, L_simple_by1_loop);
4483 
4484     // s1 = s1 % BASE
4485     __ subs(temp0, s1, base);
4486     __ csel(s1, temp0, s1, Assembler::HS);
4487 
4488     // s2 = s2 % BASE
4489     __ lsr(temp0, s2, 16);
4490     __ lsl(temp1, temp0, 4);
4491     __ sub(temp1, temp1, temp0);
4492     __ add(s2, temp1, s2, ext::uxth);
4493 
4494     __ subs(temp0, s2, base);
4495     __ csel(s2, temp0, s2, Assembler::HS);
4496 
4497     __ b(L_combine);
4498 
4499     __ bind(L_nmax);
4500     __ subs(len, len, nmax);
4501     __ sub(count, nmax, 16);
4502     __ br(Assembler::LO, L_by16);
4503 
4504     __ bind(L_nmax_loop);
4505 
4506     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4507                                       vbytes, vs1acc, vs2acc, vtable);
4508 
4509     __ subs(count, count, 16);
4510     __ br(Assembler::HS, L_nmax_loop);
4511 
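         // The reductions below rely on 2^16 mod BASE == 15 (BASE == 65521):
         // writing x = hi * 2^16 + lo, x mod BASE == (hi * 15 + lo) mod BASE.
         // Two such folds bring the accumulator low enough that the single
         // conditional subtract at the end completes the reduction.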
4512     // s1 = s1 % BASE
4513     __ lsr(temp0, s1, 16);
4514     __ lsl(temp1, temp0, 4);
4515     __ sub(temp1, temp1, temp0);
4516     __ add(temp1, temp1, s1, ext::uxth);
4517 
4518     __ lsr(temp0, temp1, 16);
4519     __ lsl(s1, temp0, 4);
4520     __ sub(s1, s1, temp0);
4521     __ add(s1, s1, temp1, ext::uxth);
4522 
4523     __ subs(temp0, s1, base);
4524     __ csel(s1, temp0, s1, Assembler::HS);
4525 
4526     // s2 = s2 % BASE
4527     __ lsr(temp0, s2, 16);
4528     __ lsl(temp1, temp0, 4);
4529     __ sub(temp1, temp1, temp0);
4530     __ add(temp1, temp1, s2, ext::uxth);
4531 
4532     __ lsr(temp0, temp1, 16);
4533     __ lsl(s2, temp0, 4);
4534     __ sub(s2, s2, temp0);
4535     __ add(s2, s2, temp1, ext::uxth);
4536 
4537     __ subs(temp0, s2, base);
4538     __ csel(s2, temp0, s2, Assembler::HS);
4539 
4540     __ subs(len, len, nmax);
4541     __ sub(count, nmax, 16);
4542     __ br(Assembler::HS, L_nmax_loop);
4543 
4544     __ bind(L_by16);
4545     __ adds(len, len, count);
4546     __ br(Assembler::LO, L_by1);
4547 
4548     __ bind(L_by16_loop);
4549 
4550     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4551                                       vbytes, vs1acc, vs2acc, vtable);
4552 
4553     __ subs(len, len, 16);
4554     __ br(Assembler::HS, L_by16_loop);
4555 
4556     __ bind(L_by1);
4557     __ adds(len, len, 15);
4558     __ br(Assembler::LO, L_do_mod);
4559 
4560     __ bind(L_by1_loop);
4561     __ ldrb(temp0, Address(__ post(buff, 1)));
4562     __ add(s1, temp0, s1);
4563     __ add(s2, s2, s1);
4564     __ subs(len, len, 1);
4565     __ br(Assembler::HS, L_by1_loop);
4566 
4567     __ bind(L_do_mod);
4568     // s1 = s1 % BASE
4569     __ lsr(temp0, s1, 16);
4570     __ lsl(temp1, temp0, 4);
4571     __ sub(temp1, temp1, temp0);
4572     __ add(temp1, temp1, s1, ext::uxth);
4573 
4574     __ lsr(temp0, temp1, 16);
4575     __ lsl(s1, temp0, 4);
4576     __ sub(s1, s1, temp0);
4577     __ add(s1, s1, temp1, ext::uxth);
4578 
4579     __ subs(temp0, s1, base);
4580     __ csel(s1, temp0, s1, Assembler::HS);
4581 
4582     // s2 = s2 % BASE
4583     __ lsr(temp0, s2, 16);
4584     __ lsl(temp1, temp0, 4);
4585     __ sub(temp1, temp1, temp0);
4586     __ add(temp1, temp1, s2, ext::uxth);
4587 
4588     __ lsr(temp0, temp1, 16);
4589     __ lsl(s2, temp0, 4);
4590     __ sub(s2, s2, temp0);
4591     __ add(s2, s2, temp1, ext::uxth);
4592 
4593     __ subs(temp0, s2, base);
4594     __ csel(s2, temp0, s2, Assembler::HS);
4595 
4596     // Combine lower bits and higher bits
4597     __ bind(L_combine);
4598     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4599 
4600     __ ret(lr);
4601 
4602     return start;
4603   }
4604 
4605   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4606           Register temp0, Register temp1, FloatRegister vbytes,
4607           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4608     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4609     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4610     // In non-vectorized code, we update s1 and s2 as:
4611     //   s1 <- s1 + b1
4612     //   s2 <- s2 + s1
4613     //   s1 <- s1 + b2
4614     //   s2 <- s2 + s1
4615     //   ...
4616     //   s1 <- s1 + b16
4617     //   s2 <- s2 + s1
4618     // Putting above assignments together, we have:
4619     //   s1_new = s1 + b1 + b2 + ... + b16
4620     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4621     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4622     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
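         // Sanity example: if all 16 bytes equal 1, this yields
         //   s1_new = s1 + 16
         //   s2_new = s2 + 16 * s1 + (16 + 15 + ... + 1) = s2 + 16 * s1 + 136
         // exactly as 16 scalar updates would.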
4623     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4624 
4625     // s2 = s2 + s1 * 16
4626     __ add(s2, s2, s1, Assembler::LSL, 4);
4627 
4628     // vs1acc = b1 + b2 + b3 + ... + b16
4629     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4630     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4631     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4632     __ uaddlv(vs1acc, __ T16B, vbytes);
4633     __ uaddlv(vs2acc, __ T8H, vs2acc);
4634 
4635     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4636     __ fmovd(temp0, vs1acc);
4637     __ fmovd(temp1, vs2acc);
4638     __ add(s1, s1, temp0);
4639     __ add(s2, s2, temp1);
4640   }
4641 
4642   /**
4643    *  Arguments:
4644    *
4645    *  Input:
4646    *    c_rarg0   - x address
4647    *    c_rarg1   - x length
4648    *    c_rarg2   - y address
4649    *    c_rarg3   - y length
4650    *    c_rarg4   - z address
4651    *    c_rarg5   - z length
4652    */
4653   address generate_multiplyToLen() {
4654     __ align(CodeEntryAlignment);
4655     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4656 
4657     address start = __ pc();
4658     const Register x     = r0;
4659     const Register xlen  = r1;
4660     const Register y     = r2;
4661     const Register ylen  = r3;
4662     const Register z     = r4;
4663     const Register zlen  = r5;
4664 
4665     const Register tmp1  = r10;
4666     const Register tmp2  = r11;
4667     const Register tmp3  = r12;
4668     const Register tmp4  = r13;
4669     const Register tmp5  = r14;
4670     const Register tmp6  = r15;
4671     const Register tmp7  = r16;
4672 
4673     BLOCK_COMMENT("Entry:");
4674     __ enter(); // required for proper stackwalking of RuntimeStub frame
4675     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4676     __ leave(); // required for proper stackwalking of RuntimeStub frame
4677     __ ret(lr);
4678 
4679     return start;
4680   }
4681 
4682   address generate_squareToLen() {
4683     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4684     // faster than multiply_to_len on some CPUs and slower on others, but
4685     // multiply_to_len shows slightly better results overall.
4686     __ align(CodeEntryAlignment);
4687     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4688     address start = __ pc();
4689 
4690     const Register x     = r0;
4691     const Register xlen  = r1;
4692     const Register z     = r2;
4693     const Register zlen  = r3;
4694     const Register y     = r4; // == x
4695     const Register ylen  = r5; // == xlen
4696 
4697     const Register tmp1  = r10;
4698     const Register tmp2  = r11;
4699     const Register tmp3  = r12;
4700     const Register tmp4  = r13;
4701     const Register tmp5  = r14;
4702     const Register tmp6  = r15;
4703     const Register tmp7  = r16;
4704 
4705     RegSet spilled_regs = RegSet::of(y, ylen);
4706     BLOCK_COMMENT("Entry:");
4707     __ enter();
4708     __ push(spilled_regs, sp);
4709     __ mov(y, x);
4710     __ mov(ylen, xlen);
4711     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4712     __ pop(spilled_regs, sp);
4713     __ leave();
4714     __ ret(lr);
4715     return start;
4716   }
4717 
4718   address generate_mulAdd() {
4719     __ align(CodeEntryAlignment);
4720     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4721 
4722     address start = __ pc();
4723 
4724     const Register out     = r0;
4725     const Register in      = r1;
4726     const Register offset  = r2;
4727     const Register len     = r3;
4728     const Register k       = r4;
4729 
4730     BLOCK_COMMENT("Entry:");
4731     __ enter();
4732     __ mul_add(out, in, offset, len, k);
4733     __ leave();
4734     __ ret(lr);
4735 
4736     return start;
4737   }
4738 
4739   // Arguments:
4740   //
4741   // Input:
4742   //   c_rarg0   - newArr address
4743   //   c_rarg1   - oldArr address
4744   //   c_rarg2   - newIdx
4745   //   c_rarg3   - shiftCount
4746   //   c_rarg4   - numIter
4747   //
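       // In effect, for each i in [0, numIter) this computes
       //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                      | (oldArr[i]     <<  (32 - shiftCount))
       // walking from the high index down, 4 (or 2) words at a time with NEON
       // where possible. (This formula is a reading of the code below, not a
       // quote from the BigInteger sources.)
       //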
4748   address generate_bigIntegerRightShift() {
4749     __ align(CodeEntryAlignment);
4750     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4751     address start = __ pc();
4752 
4753     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4754 
4755     Register newArr        = c_rarg0;
4756     Register oldArr        = c_rarg1;
4757     Register newIdx        = c_rarg2;
4758     Register shiftCount    = c_rarg3;
4759     Register numIter       = c_rarg4;
4760     Register idx           = numIter;
4761 
4762     Register newArrCur     = rscratch1;
4763     Register shiftRevCount = rscratch2;
4764     Register oldArrCur     = r13;
4765     Register oldArrNext    = r14;
4766 
4767     FloatRegister oldElem0        = v0;
4768     FloatRegister oldElem1        = v1;
4769     FloatRegister newElem         = v2;
4770     FloatRegister shiftVCount     = v3;
4771     FloatRegister shiftVRevCount  = v4;
4772 
4773     __ cbz(idx, Exit);
4774 
4775     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4776 
4777     // left shift count
4778     __ movw(shiftRevCount, 32);
4779     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4780 
4781     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4782     __ cmp(numIter, (u1)4);
4783     __ br(Assembler::LT, ShiftThree);
4784 
4785     __ dup(shiftVCount,    __ T4S, shiftCount);
4786     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4787     __ negr(shiftVCount,   __ T4S, shiftVCount);
4788 
4789     __ BIND(ShiftSIMDLoop);
4790 
4791     // Calculate the load addresses
4792     __ sub(idx, idx, 4);
4793     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4794     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4795     __ add(oldArrCur,  oldArrNext, 4);
4796 
4797     // Load 4 words and process
4798     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4799     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4800     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4801     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4802     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4803     __ st1(newElem,   __ T4S,  Address(newArrCur));
4804 
4805     __ cmp(idx, (u1)4);
4806     __ br(Assembler::LT, ShiftTwoLoop);
4807     __ b(ShiftSIMDLoop);
4808 
4809     __ BIND(ShiftTwoLoop);
4810     __ cbz(idx, Exit);
4811     __ cmp(idx, (u1)1);
4812     __ br(Assembler::EQ, ShiftOne);
4813 
4814     // Calculate the load addresses
4815     __ sub(idx, idx, 2);
4816     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4817     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4818     __ add(oldArrCur,  oldArrNext, 4);
4819 
4820     // Load 2 words and process
4821     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4822     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4823     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4824     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4825     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4826     __ st1(newElem,   __ T2S, Address(newArrCur));
4827     __ b(ShiftTwoLoop);
4828 
4829     __ BIND(ShiftThree);
4830     __ tbz(idx, 1, ShiftOne);
4831     __ tbz(idx, 0, ShiftTwo);
4832     __ ldrw(r10,  Address(oldArr, 12));
4833     __ ldrw(r11,  Address(oldArr, 8));
4834     __ lsrvw(r10, r10, shiftCount);
4835     __ lslvw(r11, r11, shiftRevCount);
4836     __ orrw(r12,  r10, r11);
4837     __ strw(r12,  Address(newArr, 8));
4838 
4839     __ BIND(ShiftTwo);
4840     __ ldrw(r10,  Address(oldArr, 8));
4841     __ ldrw(r11,  Address(oldArr, 4));
4842     __ lsrvw(r10, r10, shiftCount);
4843     __ lslvw(r11, r11, shiftRevCount);
4844     __ orrw(r12,  r10, r11);
4845     __ strw(r12,  Address(newArr, 4));
4846 
4847     __ BIND(ShiftOne);
4848     __ ldrw(r10,  Address(oldArr, 4));
4849     __ ldrw(r11,  Address(oldArr));
4850     __ lsrvw(r10, r10, shiftCount);
4851     __ lslvw(r11, r11, shiftRevCount);
4852     __ orrw(r12,  r10, r11);
4853     __ strw(r12,  Address(newArr));
4854 
4855     __ BIND(Exit);
4856     __ ret(lr);
4857 
4858     return start;
4859   }
4860 
4861   // Arguments:
4862   //
4863   // Input:
4864   //   c_rarg0   - newArr address
4865   //   c_rarg1   - oldArr address
4866   //   c_rarg2   - newIdx
4867   //   c_rarg3   - shiftCount
4868   //   c_rarg4   - numIter
4869   //
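       // In effect, for each i in [0, numIter) this computes
       //   newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
       //                      | (oldArr[i + 1] >>> (32 - shiftCount))
       // walking from the low index up, 4 (or 2) words at a time with NEON
       // where possible. (As above, this is a reading of the code, not a quote
       // from the BigInteger sources.)
       //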
4870   address generate_bigIntegerLeftShift() {
4871     __ align(CodeEntryAlignment);
4872     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4873     address start = __ pc();
4874 
4875     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4876 
4877     Register newArr        = c_rarg0;
4878     Register oldArr        = c_rarg1;
4879     Register newIdx        = c_rarg2;
4880     Register shiftCount    = c_rarg3;
4881     Register numIter       = c_rarg4;
4882 
4883     Register shiftRevCount = rscratch1;
4884     Register oldArrNext    = rscratch2;
4885 
4886     FloatRegister oldElem0        = v0;
4887     FloatRegister oldElem1        = v1;
4888     FloatRegister newElem         = v2;
4889     FloatRegister shiftVCount     = v3;
4890     FloatRegister shiftVRevCount  = v4;
4891 
4892     __ cbz(numIter, Exit);
4893 
4894     __ add(oldArrNext, oldArr, 4);
4895     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4896 
4897     // right shift count
4898     __ movw(shiftRevCount, 32);
4899     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4900 
4901     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4902     __ cmp(numIter, (u1)4);
4903     __ br(Assembler::LT, ShiftThree);
4904 
4905     __ dup(shiftVCount,     __ T4S, shiftCount);
4906     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4907     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4908 
4909     __ BIND(ShiftSIMDLoop);
4910 
4911     // load 4 words and process
4912     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4913     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4914     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4915     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4916     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4917     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4918     __ sub(numIter,   numIter, 4);
4919 
4920     __ cmp(numIter, (u1)4);
4921     __ br(Assembler::LT, ShiftTwoLoop);
4922     __ b(ShiftSIMDLoop);
4923 
4924     __ BIND(ShiftTwoLoop);
4925     __ cbz(numIter, Exit);
4926     __ cmp(numIter, (u1)1);
4927     __ br(Assembler::EQ, ShiftOne);
4928 
4929     // load 2 words and process
4930     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4931     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4932     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4933     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4934     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4935     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4936     __ sub(numIter,   numIter, 2);
4937     __ b(ShiftTwoLoop);
4938 
4939     __ BIND(ShiftThree);
4940     __ ldrw(r10,  __ post(oldArr, 4));
4941     __ ldrw(r11,  __ post(oldArrNext, 4));
4942     __ lslvw(r10, r10, shiftCount);
4943     __ lsrvw(r11, r11, shiftRevCount);
4944     __ orrw(r12,  r10, r11);
4945     __ strw(r12,  __ post(newArr, 4));
4946     __ tbz(numIter, 1, Exit);
4947     __ tbz(numIter, 0, ShiftOne);
4948 
4949     __ BIND(ShiftTwo);
4950     __ ldrw(r10,  __ post(oldArr, 4));
4951     __ ldrw(r11,  __ post(oldArrNext, 4));
4952     __ lslvw(r10, r10, shiftCount);
4953     __ lsrvw(r11, r11, shiftRevCount);
4954     __ orrw(r12,  r10, r11);
4955     __ strw(r12,  __ post(newArr, 4));
4956 
4957     __ BIND(ShiftOne);
4958     __ ldrw(r10,  Address(oldArr));
4959     __ ldrw(r11,  Address(oldArrNext));
4960     __ lslvw(r10, r10, shiftCount);
4961     __ lsrvw(r11, r11, shiftRevCount);
4962     __ orrw(r12,  r10, r11);
4963     __ strw(r12,  Address(newArr));
4964 
4965     __ BIND(Exit);
4966     __ ret(lr);
4967 
4968     return start;
4969   }
4970 
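       // Semantics (as relied on by StringCoding::countPositives): return len when
       // no byte in the range has its sign bit set; otherwise return some count
       // that does not exceed the index of the first negative byte. The code below
       // therefore only needs to identify the chunk containing the first negative
       // byte, not its exact position.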
4971   address generate_count_positives(address &count_positives_long) {
4972     const u1 large_loop_size = 64;
4973     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4974     int dcache_line = VM_Version::dcache_line_size();
4975 
4976     Register ary1 = r1, len = r2, result = r0;
4977 
4978     __ align(CodeEntryAlignment);
4979 
4980     StubCodeMark mark(this, "StubRoutines", "count_positives");
4981 
4982     address entry = __ pc();
4983 
4984     __ enter();
4985     // precondition: a copy of len is already in result
4986     // __ mov(result, len);
4987 
4988   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4989         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4990 
4991   __ cmp(len, (u1)15);
4992   __ br(Assembler::GT, LEN_OVER_15);
4993   // The only case where execution falls into this code is when the pointer is near
4994   // the end of a memory page and we must avoid reading the next page
4995   __ add(ary1, ary1, len);
4996   __ subs(len, len, 8);
4997   __ br(Assembler::GT, LEN_OVER_8);
4998   __ ldr(rscratch2, Address(ary1, -8));
4999   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5000   __ lsrv(rscratch2, rscratch2, rscratch1);
5001   __ tst(rscratch2, UPPER_BIT_MASK);
5002   __ csel(result, zr, result, Assembler::NE);
5003   __ leave();
5004   __ ret(lr);
5005   __ bind(LEN_OVER_8);
5006   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5007   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
5008   __ tst(rscratch2, UPPER_BIT_MASK);
5009   __ br(Assembler::NE, RET_NO_POP);
5010   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5011   __ lsrv(rscratch1, rscratch1, rscratch2);
5012   __ tst(rscratch1, UPPER_BIT_MASK);
5013   __ bind(RET_NO_POP);
5014   __ csel(result, zr, result, Assembler::NE);
5015   __ leave();
5016   __ ret(lr);
5017 
5018   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5019   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5020 
5021   count_positives_long = __ pc(); // 2nd entry point
5022 
5023   __ enter();
5024 
5025   __ bind(LEN_OVER_15);
5026     __ push(spilled_regs, sp);
5027     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5028     __ cbz(rscratch2, ALIGNED);
5029     __ ldp(tmp6, tmp1, Address(ary1));
5030     __ mov(tmp5, 16);
5031     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5032     __ add(ary1, ary1, rscratch1);
5033     __ orr(tmp6, tmp6, tmp1);
5034     __ tst(tmp6, UPPER_BIT_MASK);
5035     __ br(Assembler::NE, RET_ADJUST);
5036     __ sub(len, len, rscratch1);
5037 
5038   __ bind(ALIGNED);
5039     __ cmp(len, large_loop_size);
5040     __ br(Assembler::LT, CHECK_16);
5041     // Perform a 16-byte load as an early-return check in the pre-loop, to handle
5042     // the case where an initially aligned large array has negative values in its
5043     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the
5044     // worst case), which is slower. Cases with negative bytes further ahead are
5045     // barely affected; in fact they get faster thanks to the early loads and the
5046     // fewer instructions and branches in LARGE_LOOP.
5047     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5048     __ sub(len, len, 16);
5049     __ orr(tmp6, tmp6, tmp1);
5050     __ tst(tmp6, UPPER_BIT_MASK);
5051     __ br(Assembler::NE, RET_ADJUST_16);
5052     __ cmp(len, large_loop_size);
5053     __ br(Assembler::LT, CHECK_16);
5054 
5055     if (SoftwarePrefetchHintDistance >= 0
5056         && SoftwarePrefetchHintDistance >= dcache_line) {
5057       // initial prefetch
5058       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5059     }
5060   __ bind(LARGE_LOOP);
5061     if (SoftwarePrefetchHintDistance >= 0) {
5062       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5063     }
5064     // Issue the load instructions first, since that can save a few CPU/memory
5065     // cycles. Also, instead of four "orr; tst; br" triples (one per ldp), generate
5066     // 7 * orr(...) + 1 tst(...) + 1 br(...), which saves three instructions and
5067     // has fewer branches. The downside is that this disables early return, so all
5068     // 64 bytes are loaded and checked every time.
5069     __ ldp(tmp2, tmp3, Address(ary1));
5070     __ ldp(tmp4, tmp5, Address(ary1, 16));
5071     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5072     __ ldp(tmp6, tmp1, Address(ary1, 48));
5073     __ add(ary1, ary1, large_loop_size);
5074     __ sub(len, len, large_loop_size);
5075     __ orr(tmp2, tmp2, tmp3);
5076     __ orr(tmp4, tmp4, tmp5);
5077     __ orr(rscratch1, rscratch1, rscratch2);
5078     __ orr(tmp6, tmp6, tmp1);
5079     __ orr(tmp2, tmp2, tmp4);
5080     __ orr(rscratch1, rscratch1, tmp6);
5081     __ orr(tmp2, tmp2, rscratch1);
5082     __ tst(tmp2, UPPER_BIT_MASK);
5083     __ br(Assembler::NE, RET_ADJUST_LONG);
5084     __ cmp(len, large_loop_size);
5085     __ br(Assembler::GE, LARGE_LOOP);
5086 
5087   __ bind(CHECK_16); // small 16-byte load pre-loop
5088     __ cmp(len, (u1)16);
5089     __ br(Assembler::LT, POST_LOOP16);
5090 
5091   __ bind(LOOP16); // small 16-byte load loop
5092     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5093     __ sub(len, len, 16);
5094     __ orr(tmp2, tmp2, tmp3);
5095     __ tst(tmp2, UPPER_BIT_MASK);
5096     __ br(Assembler::NE, RET_ADJUST_16);
5097     __ cmp(len, (u1)16);
5098     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5099 
5100   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5101     __ cmp(len, (u1)8);
5102     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5103     __ ldr(tmp3, Address(__ post(ary1, 8)));
5104     __ tst(tmp3, UPPER_BIT_MASK);
5105     __ br(Assembler::NE, RET_ADJUST);
5106     __ sub(len, len, 8);
5107 
5108   __ bind(POST_LOOP16_LOAD_TAIL);
5109     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5110     __ ldr(tmp1, Address(ary1));
5111     __ mov(tmp2, 64);
5112     __ sub(tmp4, tmp2, len, __ LSL, 3);
5113     __ lslv(tmp1, tmp1, tmp4);
5114     __ tst(tmp1, UPPER_BIT_MASK);
5115     __ br(Assembler::NE, RET_ADJUST);
5116     // Fallthrough
5117 
5118   __ bind(RET_LEN);
5119     __ pop(spilled_regs, sp);
5120     __ leave();
5121     __ ret(lr);
5122 
5123     // the difference (result - len) is the count of bytes guaranteed
5124     // to be positive
5125 
5126   __ bind(RET_ADJUST_LONG);
5127     __ add(len, len, (u1)(large_loop_size - 16));
5128   __ bind(RET_ADJUST_16);
5129     __ add(len, len, 16);
5130   __ bind(RET_ADJUST);
5131     __ pop(spilled_regs, sp);
5132     __ leave();
5133     __ sub(result, result, len);
5134     __ ret(lr);
5135 
5136     return entry;
5137   }
5138 
5139   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5140         bool usePrefetch, Label &NOT_EQUAL) {
5141     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5142         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5143         tmp7 = r12, tmp8 = r13;
5144     Label LOOP;
5145 
5146     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5147     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5148     __ bind(LOOP);
5149     if (usePrefetch) {
5150       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5151       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5152     }
5153     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5154     __ eor(tmp1, tmp1, tmp2);
5155     __ eor(tmp3, tmp3, tmp4);
5156     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5157     __ orr(tmp1, tmp1, tmp3);
5158     __ cbnz(tmp1, NOT_EQUAL);
5159     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5160     __ eor(tmp5, tmp5, tmp6);
5161     __ eor(tmp7, tmp7, tmp8);
5162     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5163     __ orr(tmp5, tmp5, tmp7);
5164     __ cbnz(tmp5, NOT_EQUAL);
5165     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5166     __ eor(tmp1, tmp1, tmp2);
5167     __ eor(tmp3, tmp3, tmp4);
5168     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5169     __ orr(tmp1, tmp1, tmp3);
5170     __ cbnz(tmp1, NOT_EQUAL);
5171     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5172     __ eor(tmp5, tmp5, tmp6);
5173     __ sub(cnt1, cnt1, 8 * wordSize);
5174     __ eor(tmp7, tmp7, tmp8);
5175     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5176     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5177     // cmp) because subs allows an unlimited range of immediate operands.
5178     __ subs(tmp6, cnt1, loopThreshold);
5179     __ orr(tmp5, tmp5, tmp7);
5180     __ cbnz(tmp5, NOT_EQUAL);
5181     __ br(__ GE, LOOP);
5182     // post-loop
5183     __ eor(tmp1, tmp1, tmp2);
5184     __ eor(tmp3, tmp3, tmp4);
5185     __ orr(tmp1, tmp1, tmp3);
5186     __ sub(cnt1, cnt1, 2 * wordSize);
5187     __ cbnz(tmp1, NOT_EQUAL);
5188   }
5189 
5190   void generate_large_array_equals_loop_simd(int loopThreshold,
5191         bool usePrefetch, Label &NOT_EQUAL) {
5192     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5193         tmp2 = rscratch2;
5194     Label LOOP;
5195 
5196     __ bind(LOOP);
5197     if (usePrefetch) {
5198       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5199       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5200     }
5201     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5202     __ sub(cnt1, cnt1, 8 * wordSize);
5203     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5204     __ subs(tmp1, cnt1, loopThreshold);
5205     __ eor(v0, __ T16B, v0, v4);
5206     __ eor(v1, __ T16B, v1, v5);
5207     __ eor(v2, __ T16B, v2, v6);
5208     __ eor(v3, __ T16B, v3, v7);
5209     __ orr(v0, __ T16B, v0, v1);
5210     __ orr(v1, __ T16B, v2, v3);
5211     __ orr(v0, __ T16B, v0, v1);
5212     __ umov(tmp1, v0, __ D, 0);
5213     __ umov(tmp2, v0, __ D, 1);
5214     __ orr(tmp1, tmp1, tmp2);
5215     __ cbnz(tmp1, NOT_EQUAL);
5216     __ br(__ GE, LOOP);
5217   }
5218 
5219   // a1 = r1 - array1 address
5220   // a2 = r2 - array2 address
5221   // result = r0 - return value. Already contains "false"
5222   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5223   // r3-r5 are reserved temporary registers
5224   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5225   address generate_large_array_equals() {
5226     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5227         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5228         tmp7 = r12, tmp8 = r13;
5229     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5230         SMALL_LOOP, POST_LOOP;
5231     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5232     // threshold chosen so that at least 32 of the prefetched bytes are actually used
5233     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5234     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5235     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5236     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5237         tmp5, tmp6, tmp7, tmp8);
5238 
5239     __ align(CodeEntryAlignment);
5240 
5241     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5242 
5243     address entry = __ pc();
5244     __ enter();
5245     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5246     // also advance pointers to use post-increment instead of pre-increment
5247     __ add(a1, a1, wordSize);
5248     __ add(a2, a2, wordSize);
5249     if (AvoidUnalignedAccesses) {
5250       // Both implementations (SIMD and non-SIMD) use relatively wide load
5251       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
5252       // time) on some CPUs when the address is not at least 16-byte aligned.
5253       // Arrays are currently 8-byte aligned, so do an additional 8-byte load if
5254       // needed to make at least the first address 16-byte aligned.
5255       Label ALIGNED16;
5256       __ tbz(a1, 3, ALIGNED16);
5257       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5258       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5259       __ sub(cnt1, cnt1, wordSize);
5260       __ eor(tmp1, tmp1, tmp2);
5261       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5262       __ bind(ALIGNED16);
5263     }
5264     if (UseSIMDForArrayEquals) {
5265       if (SoftwarePrefetchHintDistance >= 0) {
5266         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5267         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5268         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5269             /* prfm = */ true, NOT_EQUAL);
5270         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5271         __ br(__ LT, TAIL);
5272       }
5273       __ bind(NO_PREFETCH_LARGE_LOOP);
5274       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5275           /* prfm = */ false, NOT_EQUAL);
5276     } else {
5277       __ push(spilled_regs, sp);
5278       if (SoftwarePrefetchHintDistance >= 0) {
5279         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5280         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5281         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5282             /* prfm = */ true, NOT_EQUAL);
5283         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5284         __ br(__ LT, TAIL);
5285       }
5286       __ bind(NO_PREFETCH_LARGE_LOOP);
5287       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5288           /* prfm = */ false, NOT_EQUAL);
5289     }
5290     __ bind(TAIL);
5291       __ cbz(cnt1, EQUAL);
5292       __ subs(cnt1, cnt1, wordSize);
5293       __ br(__ LE, POST_LOOP);
5294     __ bind(SMALL_LOOP);
5295       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5296       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5297       __ subs(cnt1, cnt1, wordSize);
5298       __ eor(tmp1, tmp1, tmp2);
5299       __ cbnz(tmp1, NOT_EQUAL);
5300       __ br(__ GT, SMALL_LOOP);
5301     __ bind(POST_LOOP);
5302       __ ldr(tmp1, Address(a1, cnt1));
5303       __ ldr(tmp2, Address(a2, cnt1));
5304       __ eor(tmp1, tmp1, tmp2);
5305       __ cbnz(tmp1, NOT_EQUAL);
5306     __ bind(EQUAL);
5307       __ mov(result, true);
5308     __ bind(NOT_EQUAL);
5309       if (!UseSIMDForArrayEquals) {
5310         __ pop(spilled_regs, sp);
5311       }
5312     __ bind(NOT_EQUAL_NO_POP);
5313     __ leave();
5314     __ ret(lr);
5315     return entry;
5316   }
5317 
5318   address generate_dsin_dcos(bool isCos) {
5319     __ align(CodeEntryAlignment);
5320     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5321     address start = __ pc();
5322     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5323         (address)StubRoutines::aarch64::_two_over_pi,
5324         (address)StubRoutines::aarch64::_pio2,
5325         (address)StubRoutines::aarch64::_dsin_coef,
5326         (address)StubRoutines::aarch64::_dcos_coef);
5327     return start;
5328   }
5329 
5330   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5331   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5332       Label &DIFF2) {
5333     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5334     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5335 
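         // Caller protocol (see the uses below in this file): tmp2 walks the
         // Latin1 string (16 bytes per call), cnt1 is re-purposed as the pointer
         // walking the UTF-16 string (32 bytes per call), and tmp3 must hold the
         // next UTF-16 doubleword, pre-loaded by the caller or by the previous
         // call.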
5336     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5337     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5338     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5339     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5340 
5341     __ fmovd(tmpL, vtmp3);
5342     __ eor(rscratch2, tmp3, tmpL);
5343     __ cbnz(rscratch2, DIFF2);
5344 
5345     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5346     __ umov(tmpL, vtmp3, __ D, 1);
5347     __ eor(rscratch2, tmpU, tmpL);
5348     __ cbnz(rscratch2, DIFF1);
5349 
5350     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5351     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5352     __ fmovd(tmpL, vtmp);
5353     __ eor(rscratch2, tmp3, tmpL);
5354     __ cbnz(rscratch2, DIFF2);
5355 
5356     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5357     __ umov(tmpL, vtmp, __ D, 1);
5358     __ eor(rscratch2, tmpU, tmpL);
5359     __ cbnz(rscratch2, DIFF1);
5360   }
5361 
5362   // r0  = result
5363   // r1  = str1
5364   // r2  = cnt1
5365   // r3  = str2
5366   // r4  = cnt2
5367   // r10 = tmp1
5368   // r11 = tmp2
5369   address generate_compare_long_string_different_encoding(bool isLU) {
5370     __ align(CodeEntryAlignment);
5371     StubCodeMark mark(this, "StubRoutines", isLU
5372         ? "compare_long_string_different_encoding LU"
5373         : "compare_long_string_different_encoding UL");
5374     address entry = __ pc();
5375     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5376         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5377         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5378     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5379         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5380     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5381     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5382 
5383     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5384 
5385     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5386     // cnt2 == number of characters left to compare
5387     // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5388     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5389     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5390     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5391     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5392     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5393     __ eor(rscratch2, tmp1, tmp2);
5394     __ mov(rscratch1, tmp2);
5395     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5396     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5397              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5398     __ push(spilled_regs, sp);
5399     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5400     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5401 
5402     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5403 
5404     if (SoftwarePrefetchHintDistance >= 0) {
5405       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5406       __ br(__ LT, NO_PREFETCH);
5407       __ bind(LARGE_LOOP_PREFETCH);
5408         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5409         __ mov(tmp4, 2);
5410         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5411         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5412           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5413           __ subs(tmp4, tmp4, 1);
5414           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5415           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5416           __ mov(tmp4, 2);
5417         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5418           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5419           __ subs(tmp4, tmp4, 1);
5420           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5421           __ sub(cnt2, cnt2, 64);
5422           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5423           __ br(__ GE, LARGE_LOOP_PREFETCH);
5424     }
5425     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5426     __ bind(NO_PREFETCH);
5427     __ subs(cnt2, cnt2, 16);
5428     __ br(__ LT, TAIL);
5429     __ align(OptoLoopAlignment);
5430     __ bind(SMALL_LOOP); // smaller loop
5431       __ subs(cnt2, cnt2, 16);
5432       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5433       __ br(__ GE, SMALL_LOOP);
5434       __ cmn(cnt2, (u1)16);
5435       __ br(__ EQ, LOAD_LAST);
5436     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5437       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5438       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5439       __ ldr(tmp3, Address(cnt1, -8));
5440       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5441       __ b(LOAD_LAST);
5442     __ bind(DIFF2);
5443       __ mov(tmpU, tmp3);
5444     __ bind(DIFF1);
5445       __ pop(spilled_regs, sp);
5446       __ b(CALCULATE_DIFFERENCE);
5447     __ bind(LOAD_LAST);
5448       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5449       // No need to load them again
5450       __ mov(tmpU, tmp3);
5451       __ pop(spilled_regs, sp);
5452 
5453       // tmp2 points to the address of the last 4 Latin1 characters right now
5454       __ ldrs(vtmp, Address(tmp2));
5455       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5456       __ fmovd(tmpL, vtmp);
5457 
5458       __ eor(rscratch2, tmpU, tmpL);
5459       __ cbz(rscratch2, DONE);
5460 
5461     // Find the first different characters in the longwords and
5462     // compute their difference.
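         // The doublewords were loaded little-endian, so the first (lowest
         // addressed) differing character occupies the least significant
         // non-zero bits of rscratch2. rev reverses the byte order so that clz
         // counts down to that character from the top; andr rounds the count
         // down to a 16-bit character boundary, and the lsrv/uxthw pairs then
         // extract the two characters whose difference is returned.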
5463     __ bind(CALCULATE_DIFFERENCE);
5464       __ rev(rscratch2, rscratch2);
5465       __ clz(rscratch2, rscratch2);
5466       __ andr(rscratch2, rscratch2, -16);
5467       __ lsrv(tmp1, tmp1, rscratch2);
5468       __ uxthw(tmp1, tmp1);
5469       __ lsrv(rscratch1, rscratch1, rscratch2);
5470       __ uxthw(rscratch1, rscratch1);
5471       __ subw(result, tmp1, rscratch1);
5472     __ bind(DONE);
5473       __ ret(lr);
5474     return entry;
5475   }
5476 
5477   // r0 = input (float16)
5478   // v0 = result (float)
5479   // v1 = temporary float register
5480   address generate_float16ToFloat() {
5481     __ align(CodeEntryAlignment);
5482     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5483     address entry = __ pc();
5484     BLOCK_COMMENT("Entry:");
5485     __ flt16_to_flt(v0, r0, v1);
5486     __ ret(lr);
5487     return entry;
5488   }
5489 
5490   // v0 = input (float)
5491   // r0 = result (float16)
5492   // v1 = temporary float register
5493   address generate_floatToFloat16() {
5494     __ align(CodeEntryAlignment);
5495     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5496     address entry = __ pc();
5497     BLOCK_COMMENT("Entry:");
5498     __ flt_to_flt16(r0, v0, v1);
5499     __ ret(lr);
5500     return entry;
5501   }
5502 
5503   address generate_method_entry_barrier() {
5504     __ align(CodeEntryAlignment);
5505     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5506 
5507     Label deoptimize_label;
5508 
5509     address start = __ pc();
5510 
5511     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5512 
5513     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5514       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5515       // We can get here despite the nmethod being good, if we have not
5516       // yet applied our cross modification fence (or data fence).
5517       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5518       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5519       __ ldrw(rscratch2, rscratch2);
5520       __ strw(rscratch2, thread_epoch_addr);
5521       __ isb();
5522       __ membar(__ LoadLoad);
5523     }
5524 
5525     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5526 
5527     __ enter();
5528     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5529 
5530     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5531 
5532     __ push_call_clobbered_registers();
5533 
5534     __ mov(c_rarg0, rscratch2);
5535     __ call_VM_leaf
5536          (CAST_FROM_FN_PTR
5537           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5538 
5539     __ reset_last_Java_frame(true);
5540 
5541     __ mov(rscratch1, r0);
5542 
5543     __ pop_call_clobbered_registers();
5544 
5545     __ cbnz(rscratch1, deoptimize_label);
5546 
5547     __ leave();
5548     __ ret(lr);
5549 
5550     __ BIND(deoptimize_label);
5551 
5552     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5553     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5554 
5555     __ mov(sp, rscratch1);
5556     __ br(rscratch2);
5557 
5558     return start;
5559   }
5560 
5561   // r0  = result
5562   // r1  = str1
5563   // r2  = cnt1
5564   // r3  = str2
5565   // r4  = cnt2
5566   // r10 = tmp1
5567   // r11 = tmp2
5568   address generate_compare_long_string_same_encoding(bool isLL) {
5569     __ align(CodeEntryAlignment);
5570     StubCodeMark mark(this, "StubRoutines", isLL
5571         ? "compare_long_string_same_encoding LL"
5572         : "compare_long_string_same_encoding UU");
5573     address entry = __ pc();
5574     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5575         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5576 
5577     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5578 
5579     // exit from the large loop when fewer than 64 bytes are left to read or we're
5580     // about to prefetch memory past the end of the array
5581     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5582 
5583     // 8 bytes from each string were pre-loaded before jumping to the stub, so compare them directly
5584     __ eor(rscratch2, tmp1, tmp2);
5585     __ cbnz(rscratch2, CAL_DIFFERENCE);
5586 
5587     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5588     // update pointers, because of previous read
5589     __ add(str1, str1, wordSize);
5590     __ add(str2, str2, wordSize);
5591     if (SoftwarePrefetchHintDistance >= 0) {
5592       __ align(OptoLoopAlignment);
5593       __ bind(LARGE_LOOP_PREFETCH);
5594         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5595         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5596 
5597         for (int i = 0; i < 4; i++) {
5598           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5599           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5600           __ cmp(tmp1, tmp2);
5601           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5602           __ br(Assembler::NE, DIFF);
5603         }
5604         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5605         __ add(str1, str1, 64);
5606         __ add(str2, str2, 64);
5607         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5608         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5609         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5610     }
5611 
5612     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5613     __ br(Assembler::LE, LESS16);
5614     __ align(OptoLoopAlignment);
5615     __ bind(LOOP_COMPARE16);
5616       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5617       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5618       __ cmp(tmp1, tmp2);
5619       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5620       __ br(Assembler::NE, DIFF);
5621       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5622       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5623       __ br(Assembler::LT, LESS16);
5624 
5625       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5626       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5627       __ cmp(tmp1, tmp2);
5628       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5629       __ br(Assembler::NE, DIFF);
5630       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5631       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5632       __ br(Assembler::GE, LOOP_COMPARE16);
5633       __ cbz(cnt2, LENGTH_DIFF);
5634 
5635     __ bind(LESS16);
      // compare 8 bytes (one register) at a time
5637       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5638       __ br(Assembler::LE, LESS8);
5639       __ ldr(tmp1, Address(__ post(str1, 8)));
5640       __ ldr(tmp2, Address(__ post(str2, 8)));
5641       __ eor(rscratch2, tmp1, tmp2);
5642       __ cbnz(rscratch2, CAL_DIFFERENCE);
5643       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5644 
5645     __ bind(LESS8); // directly load last 8 bytes
5646       if (!isLL) {
5647         __ add(cnt2, cnt2, cnt2);
5648       }
5649       __ ldr(tmp1, Address(str1, cnt2));
5650       __ ldr(tmp2, Address(str2, cnt2));
5651       __ eor(rscratch2, tmp1, tmp2);
5652       __ cbz(rscratch2, LENGTH_DIFF);
5653       __ b(CAL_DIFFERENCE);
5654 
5655     __ bind(DIFF);
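      // The mismatch is either in the first 8 bytes (tmp1/tmp2) or in the second
      // 8 bytes (tmp1h/tmp2h); the csel pair below keeps whichever half differs.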
5656       __ cmp(tmp1, tmp2);
5657       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5658       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5659       // reuse rscratch2 register for the result of eor instruction
5660       __ eor(rscratch2, tmp1, tmp2);
5661 
5662     __ bind(CAL_DIFFERENCE);
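      // rscratch2 holds tmp1 ^ tmp2 (non-zero here). Byte-reversing it and
      // counting leading zeros gives a bit index within the lowest-addressed
      // differing byte; the andr below rounds that down to a byte (LL) or
      // 16-bit char (UU) boundary so that lsrv can shift the first differing
      // characters into the low bits before they are extended and subtracted.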
5663       __ rev(rscratch2, rscratch2);
5664       __ clz(rscratch2, rscratch2);
5665       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5666       __ lsrv(tmp1, tmp1, rscratch2);
5667       __ lsrv(tmp2, tmp2, rscratch2);
5668       if (isLL) {
5669         __ uxtbw(tmp1, tmp1);
5670         __ uxtbw(tmp2, tmp2);
5671       } else {
5672         __ uxthw(tmp1, tmp1);
5673         __ uxthw(tmp2, tmp2);
5674       }
5675       __ subw(result, tmp1, tmp2);
5676 
5677     __ bind(LENGTH_DIFF);
5678       __ ret(lr);
5679     return entry;
5680   }
5681 
5682   enum string_compare_mode {
5683     LL,
5684     LU,
5685     UL,
5686     UU,
5687   };
5688 
5689   // The following registers are declared in aarch64.ad
5690   // r0  = result
5691   // r1  = str1
5692   // r2  = cnt1
5693   // r3  = str2
5694   // r4  = cnt2
5695   // r10 = tmp1
5696   // r11 = tmp2
5697   // z0  = ztmp1
5698   // z1  = ztmp2
5699   // p0  = pgtmp1
5700   // p1  = pgtmp2
5701   address generate_compare_long_string_sve(string_compare_mode mode) {
5702     __ align(CodeEntryAlignment);
5703     address entry = __ pc();
5704     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5705              tmp1 = r10, tmp2 = r11;
5706 
5707     Label LOOP, DONE, MISMATCH;
5708     Register vec_len = tmp1;
5709     Register idx = tmp2;
5710     // The minimum of the string lengths has been stored in cnt2.
5711     Register cnt = cnt2;
5712     FloatRegister ztmp1 = z0, ztmp2 = z1;
5713     PRegister pgtmp1 = p0, pgtmp2 = p1;
5714 
5715 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5716     switch (mode) {                                                            \
5717       case LL:                                                                 \
5718         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5719         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5720         break;                                                                 \
5721       case LU:                                                                 \
5722         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5723         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5724         break;                                                                 \
5725       case UL:                                                                 \
5726         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5727         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5728         break;                                                                 \
5729       case UU:                                                                 \
5730         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5731         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5732         break;                                                                 \
5733       default:                                                                 \
5734         ShouldNotReachHere();                                                  \
5735     }
5736 
5737     const char* stubname;
5738     switch (mode) {
5739       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5740       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5741       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5742       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5743       default: ShouldNotReachHere();
5744     }
5745 
5746     StubCodeMark mark(this, "StubRoutines", stubname);
5747 
5748     __ mov(idx, 0);
5749     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5750 
5751     if (mode == LL) {
5752       __ sve_cntb(vec_len);
5753     } else {
5754       __ sve_cnth(vec_len);
5755     }
5756 
5757     __ sub(rscratch1, cnt, vec_len);
5758 
5759     __ bind(LOOP);
5760 
5761       // main loop
5762       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5763       __ add(idx, idx, vec_len);
5764       // Compare strings.
5765       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5766       __ br(__ NE, MISMATCH);
5767       __ cmp(idx, rscratch1);
5768       __ br(__ LT, LOOP);
5769 
5770     // post loop, last iteration
5771     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5772 
5773     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5774     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5775     __ br(__ EQ, DONE);
5776 
5777     __ bind(MISMATCH);
5778 
5779     // Crop the vector to find its location.
5780     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5781     // Extract the first different characters of each string.
5782     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5783     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5784 
5785     // Compute the difference of the first different characters.
5786     __ sub(result, rscratch1, rscratch2);
5787 
5788     __ bind(DONE);
5789     __ ret(lr);
5790 #undef LOAD_PAIR
5791     return entry;
5792   }
5793 
5794   void generate_compare_long_strings() {
5795     if (UseSVE == 0) {
5796       StubRoutines::aarch64::_compare_long_string_LL
5797           = generate_compare_long_string_same_encoding(true);
5798       StubRoutines::aarch64::_compare_long_string_UU
5799           = generate_compare_long_string_same_encoding(false);
5800       StubRoutines::aarch64::_compare_long_string_LU
5801           = generate_compare_long_string_different_encoding(true);
5802       StubRoutines::aarch64::_compare_long_string_UL
5803           = generate_compare_long_string_different_encoding(false);
5804     } else {
5805       StubRoutines::aarch64::_compare_long_string_LL
5806           = generate_compare_long_string_sve(LL);
5807       StubRoutines::aarch64::_compare_long_string_UU
5808           = generate_compare_long_string_sve(UU);
5809       StubRoutines::aarch64::_compare_long_string_LU
5810           = generate_compare_long_string_sve(LU);
5811       StubRoutines::aarch64::_compare_long_string_UL
5812           = generate_compare_long_string_sve(UL);
5813     }
5814   }
5815 
5816   // R0 = result
5817   // R1 = str2
5818   // R2 = cnt1
5819   // R3 = str1
5820   // R4 = cnt2
5821   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5822   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) We can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (this helps on systems
  // with a single load pipeline).
  // 2) We can use a "fast" algorithm for finding a single character to locate
  // the first symbol with fewer branches (one branch per loaded register
  // instead of one branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (a scalar sketch follows this list).
  // 3) After loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st character, saving
  // a few loads compared with a simpler-but-slower implementation.
  // 4) To avoid lots of push/pop operations, the code below heavily reuses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal.
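  //
  // A scalar sketch of idea 2), the per-register zero-byte test (illustrative
  // only; the generated code applies it to first ^ loaded_chunk, and its
  // "bics tmp2, x - ones, x | 0x7f7f..." form is equivalent to the
  // "& ~x & 0x80..." form used here):
  //
  //   bool has_zero_byte(uint64_t x) {
  //     return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL) != 0;
  //   }
  //
  // For UTF-16 data the same test is done per 16-bit unit with
  // 0x0001000100010001 and 0x7fff7fff7fff7fff.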
5837   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5838     const char* stubName = str1_isL
5839         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5840         : "indexof_linear_uu";
5841     __ align(CodeEntryAlignment);
5842     StubCodeMark mark(this, "StubRoutines", stubName);
5843     address entry = __ pc();
5844 
5845     int str1_chr_size = str1_isL ? 1 : 2;
5846     int str2_chr_size = str2_isL ? 1 : 2;
5847     int str1_chr_shift = str1_isL ? 0 : 1;
5848     int str2_chr_shift = str2_isL ? 0 : 1;
5849     bool isL = str1_isL && str2_isL;
    // parameters
5851     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5852     // temporary registers
5853     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5854     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5855     // redefinitions
5856     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5857 
5858     __ push(spilled_regs, sp);
5859     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5860         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5861         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5862         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5863         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5864         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5865     // Read whole register from str1. It is safe, because length >=8 here
5866     __ ldr(ch1, Address(str1));
5867     // Read whole register from str2. It is safe, because length >=8 here
5868     __ ldr(ch2, Address(str2));
5869     __ sub(cnt2, cnt2, cnt1);
5870     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5871     if (str1_isL != str2_isL) {
5872       __ eor(v0, __ T16B, v0, v0);
5873     }
5874     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5875     __ mul(first, first, tmp1);
    // check whether less than one register's worth of characters remains
5877     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5878     if (str1_isL != str2_isL) {
5879       __ fmovd(v1, ch1);
5880     }
5881     __ br(__ LE, L_SMALL);
5882     __ eor(ch2, first, ch2);
5883     if (str1_isL != str2_isL) {
5884       __ zip1(v1, __ T16B, v1, v0);
5885     }
5886     __ sub(tmp2, ch2, tmp1);
5887     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5888     __ bics(tmp2, tmp2, ch2);
5889     if (str1_isL != str2_isL) {
5890       __ fmovd(ch1, v1);
5891     }
5892     __ br(__ NE, L_HAS_ZERO);
5893     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5894     __ add(result, result, wordSize/str2_chr_size);
5895     __ add(str2, str2, wordSize);
5896     __ br(__ LT, L_POST_LOOP);
5897     __ BIND(L_LOOP);
5898       __ ldr(ch2, Address(str2));
5899       __ eor(ch2, first, ch2);
5900       __ sub(tmp2, ch2, tmp1);
5901       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5902       __ bics(tmp2, tmp2, ch2);
5903       __ br(__ NE, L_HAS_ZERO);
5904     __ BIND(L_LOOP_PROCEED);
5905       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5906       __ add(str2, str2, wordSize);
5907       __ add(result, result, wordSize/str2_chr_size);
5908       __ br(__ GE, L_LOOP);
5909     __ BIND(L_POST_LOOP);
5910       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5911       __ br(__ LE, NOMATCH);
5912       __ ldr(ch2, Address(str2));
5913       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5914       __ eor(ch2, first, ch2);
5915       __ sub(tmp2, ch2, tmp1);
5916       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5917       __ mov(tmp4, -1); // all bits set
5918       __ b(L_SMALL_PROCEED);
5919     __ align(OptoLoopAlignment);
5920     __ BIND(L_SMALL);
5921       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5922       __ eor(ch2, first, ch2);
5923       if (str1_isL != str2_isL) {
5924         __ zip1(v1, __ T16B, v1, v0);
5925       }
5926       __ sub(tmp2, ch2, tmp1);
5927       __ mov(tmp4, -1); // all bits set
5928       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5929       if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move the 4 converted (inflated) characters
5931       }
5932     __ BIND(L_SMALL_PROCEED);
5933       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5934       __ bic(tmp2, tmp2, ch2);
5935       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5936       __ rbit(tmp2, tmp2);
5937       __ br(__ EQ, NOMATCH);
5938     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
5940       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5941       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5942       if (str2_isL) { // LL
5943         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5944         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5945         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5946         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5947         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5948       } else {
        __ mov(ch2, 0xE); // mask 0b1110: rounds the byte offset down to an even (char-aligned) value
5950         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5951         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5952         __ lslv(tmp2, tmp2, tmp4);
5953         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5954         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5955         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5956         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5957       }
5958       __ cmp(ch1, ch2);
5959       __ mov(tmp4, wordSize/str2_chr_size);
5960       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5961     __ BIND(L_SMALL_CMP_LOOP);
5962       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5963                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5964       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5965                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5966       __ add(tmp4, tmp4, 1);
5967       __ cmp(tmp4, cnt1);
5968       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5969       __ cmp(first, ch2);
5970       __ br(__ EQ, L_SMALL_CMP_LOOP);
5971     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5972       __ cbz(tmp2, NOMATCH); // no more matches. exit
5973       __ clz(tmp4, tmp2);
5974       __ add(result, result, 1); // advance index
5975       __ add(str2, str2, str2_chr_size); // advance pointer
5976       __ b(L_SMALL_HAS_ZERO_LOOP);
5977     __ align(OptoLoopAlignment);
5978     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5979       __ cmp(first, ch2);
5980       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5981       __ b(DONE);
5982     __ align(OptoLoopAlignment);
5983     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5984       if (str2_isL) { // LL
5985         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5986         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5987         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5988         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5989         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5990       } else {
        __ mov(ch2, 0xE); // mask 0b1110: rounds the byte offset down to an even (char-aligned) value
5992         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5993         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5994         __ lslv(tmp2, tmp2, tmp4);
5995         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5996         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5997         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5998         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5999       }
6000       __ cmp(ch1, ch2);
6001       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6002       __ b(DONE);
6003     __ align(OptoLoopAlignment);
6004     __ BIND(L_HAS_ZERO);
6005       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit. This frees cnt1 for reuse in the loop.
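      // Packed layout after the orr below: cnt2 == (cnt1 << 32) | original cnt2.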
6010       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6011       __ sub(result, result, 1);
6012     __ BIND(L_HAS_ZERO_LOOP);
6013       __ mov(cnt1, wordSize/str2_chr_size);
6014       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6015       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6016       if (str2_isL) {
6017         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6018         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6019         __ lslv(tmp2, tmp2, tmp4);
6020         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6021         __ add(tmp4, tmp4, 1);
6022         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6023         __ lsl(tmp2, tmp2, 1);
6024         __ mov(tmp4, wordSize/str2_chr_size);
6025       } else {
6026         __ mov(ch2, 0xE);
6027         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6028         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6029         __ lslv(tmp2, tmp2, tmp4);
6030         __ add(tmp4, tmp4, 1);
6031         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6032         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6033         __ lsl(tmp2, tmp2, 1);
6034         __ mov(tmp4, wordSize/str2_chr_size);
6035         __ sub(str2, str2, str2_chr_size);
6036       }
6037       __ cmp(ch1, ch2);
6038       __ mov(tmp4, wordSize/str2_chr_size);
6039       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6040     __ BIND(L_CMP_LOOP);
6041       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6042                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6043       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6044                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6045       __ add(tmp4, tmp4, 1);
6046       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6047       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6048       __ cmp(cnt1, ch2);
6049       __ br(__ EQ, L_CMP_LOOP);
6050     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this candidate position
6052       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6053       __ clz(tmp4, tmp2);
6054       __ add(str2, str2, str2_chr_size); // advance pointer
6055       __ b(L_HAS_ZERO_LOOP);
6056     __ align(OptoLoopAlignment);
6057     __ BIND(L_CMP_LOOP_LAST_CMP);
6058       __ cmp(cnt1, ch2);
6059       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6060       __ b(DONE);
6061     __ align(OptoLoopAlignment);
6062     __ BIND(L_CMP_LOOP_LAST_CMP2);
6063       if (str2_isL) {
6064         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6065         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6066         __ lslv(tmp2, tmp2, tmp4);
6067         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6068         __ add(tmp4, tmp4, 1);
6069         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6070         __ lsl(tmp2, tmp2, 1);
6071       } else {
6072         __ mov(ch2, 0xE);
6073         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6074         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6075         __ lslv(tmp2, tmp2, tmp4);
6076         __ add(tmp4, tmp4, 1);
6077         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6078         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6079         __ lsl(tmp2, tmp2, 1);
6080         __ sub(str2, str2, str2_chr_size);
6081       }
6082       __ cmp(ch1, ch2);
6083       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6084       __ b(DONE);
6085     __ align(OptoLoopAlignment);
6086     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6087       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
6088       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
6089       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
6090       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
6091       // result by analyzed characters value, so, we can just reset lower bits
6092       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
6093       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
6094       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
6095       // index of last analyzed substring inside current octet. So, str2 in at
6096       // respective start address. We need to advance it to next octet
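      // For example, for LL: if result == 13 then tmp2 == (13 & 7) == 5, the
      // bfm resets result to 8, and str2 is rewound by 5 bytes to the start of
      // the current octet.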
6097       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6098       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6099       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6100       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6101       __ movw(cnt2, cnt2);
6102       __ b(L_LOOP_PROCEED);
6103     __ align(OptoLoopAlignment);
6104     __ BIND(NOMATCH);
6105       __ mov(result, -1);
6106     __ BIND(DONE);
6107       __ pop(spilled_regs, sp);
6108       __ ret(lr);
6109     return entry;
6110   }
6111 
6112   void generate_string_indexof_stubs() {
6113     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6114     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6115     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6116   }
6117 
6118   void inflate_and_store_2_fp_registers(bool generatePrfm,
6119       FloatRegister src1, FloatRegister src2) {
6120     Register dst = r1;
6121     __ zip1(v1, __ T16B, src1, v0);
6122     __ zip2(v2, __ T16B, src1, v0);
6123     if (generatePrfm) {
6124       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6125     }
6126     __ zip1(v3, __ T16B, src2, v0);
6127     __ zip2(v4, __ T16B, src2, v0);
6128     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6129   }
6130 
6131   // R0 = src
6132   // R1 = dst
6133   // R2 = len
6134   // R3 = len >> 3
6135   // V0 = 0
6136   // v1 = loaded 8 bytes
6137   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6138   address generate_large_byte_array_inflate() {
6139     __ align(CodeEntryAlignment);
6140     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6141     address entry = __ pc();
6142     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6143     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6144     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6145 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
6148     __ ldrd(v2, __ post(src, 8));
6149     __ sub(octetCounter, octetCounter, 2);
6150     __ zip1(v1, __ T16B, v1, v0);
6151     __ zip1(v2, __ T16B, v2, v0);
6152     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6153     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6154     __ subs(rscratch1, octetCounter, large_loop_threshold);
6155     __ br(__ LE, LOOP_START);
6156     __ b(LOOP_PRFM_START);
6157     __ bind(LOOP_PRFM);
6158       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6159     __ bind(LOOP_PRFM_START);
6160       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6161       __ sub(octetCounter, octetCounter, 8);
6162       __ subs(rscratch1, octetCounter, large_loop_threshold);
6163       inflate_and_store_2_fp_registers(true, v3, v4);
6164       inflate_and_store_2_fp_registers(true, v5, v6);
6165       __ br(__ GT, LOOP_PRFM);
6166       __ cmp(octetCounter, (u1)8);
6167       __ br(__ LT, DONE);
6168     __ bind(LOOP);
6169       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6170       __ bind(LOOP_START);
6171       __ sub(octetCounter, octetCounter, 8);
6172       __ cmp(octetCounter, (u1)8);
6173       inflate_and_store_2_fp_registers(false, v3, v4);
6174       inflate_and_store_2_fp_registers(false, v5, v6);
6175       __ br(__ GE, LOOP);
6176     __ bind(DONE);
6177       __ ret(lr);
6178     return entry;
6179   }
6180 
6181   /**
6182    *  Arguments:
6183    *
6184    *  Input:
6185    *  c_rarg0   - current state address
6186    *  c_rarg1   - H key address
6187    *  c_rarg2   - data address
6188    *  c_rarg3   - number of blocks
6189    *
6190    *  Output:
6191    *  Updated state at c_rarg0
6192    */
6193   address generate_ghash_processBlocks() {
6194     // Bafflingly, GCM uses little-endian for the byte order, but
6195     // big-endian for the bit order.  For example, the polynomial 1 is
6196     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6197     //
6198     // So, we must either reverse the bytes in each word and do
6199     // everything big-endian or reverse the bits in each byte and do
6200     // it little-endian.  On AArch64 it's more idiomatic to reverse
6201     // the bits in each byte (we have an instruction, RBIT, to do
6202     // that) and keep the data in little-endian bit order through the
6203     // calculation, bit-reversing the inputs and outputs.
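    //
    // For a single 64-bit lane, the rev64 (byte reversal) followed by rbit
    // (bit reversal within each byte) used below amounts to reversing all 64
    // bits, i.e. roughly (illustrative C only, not generated code):
    //
    //   uint64_t bitrev64(uint64_t x) {
    //     uint64_t r = 0;
    //     for (int i = 0; i < 64; i++) {
    //       r = (r << 1) | ((x >> i) & 1);  // bit i of x becomes bit (63 - i) of r
    //     }
    //     return r;
    //   }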
6204 
6205     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6206     __ align(wordSize * 2);
6207     address p = __ pc();
6208     __ emit_int64(0x87);  // The low-order bits of the field
6209                           // polynomial (i.e. p = z^7+z^2+z+1)
6210                           // repeated in the low and high parts of a
6211                           // 128-bit vector
6212     __ emit_int64(0x87);
6213 
6214     __ align(CodeEntryAlignment);
6215     address start = __ pc();
6216 
6217     Register state   = c_rarg0;
6218     Register subkeyH = c_rarg1;
6219     Register data    = c_rarg2;
6220     Register blocks  = c_rarg3;
6221 
6222     FloatRegister vzr = v30;
6223     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6224 
6225     __ ldrq(v24, p);    // The field polynomial
6226 
6227     __ ldrq(v0, Address(state));
6228     __ ldrq(v1, Address(subkeyH));
6229 
6230     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6231     __ rbit(v0, __ T16B, v0);
6232     __ rev64(v1, __ T16B, v1);
6233     __ rbit(v1, __ T16B, v1);
6234 
6235     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6236     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6237 
6238     {
6239       Label L_ghash_loop;
6240       __ bind(L_ghash_loop);
6241 
6242       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6243                                                  // reversing each byte
6244       __ rbit(v2, __ T16B, v2);
6245       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6246 
6247       // Multiply state in v2 by subkey in v1
6248       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6249                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6250                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6251       // Reduce v7:v5 by the field polynomial
6252       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6253 
6254       __ sub(blocks, blocks, 1);
6255       __ cbnz(blocks, L_ghash_loop);
6256     }
6257 
6258     // The bit-reversed result is at this point in v0
6259     __ rev64(v0, __ T16B, v0);
6260     __ rbit(v0, __ T16B, v0);
6261 
6262     __ st1(v0, __ T16B, state);
6263     __ ret(lr);
6264 
6265     return start;
6266   }
6267 
6268   address generate_ghash_processBlocks_wide() {
6269     address small = generate_ghash_processBlocks();
6270 
6271     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6272     __ align(wordSize * 2);
6273     address p = __ pc();
6274     __ emit_int64(0x87);  // The low-order bits of the field
6275                           // polynomial (i.e. p = z^7+z^2+z+1)
6276                           // repeated in the low and high parts of a
6277                           // 128-bit vector
6278     __ emit_int64(0x87);
6279 
6280     __ align(CodeEntryAlignment);
6281     address start = __ pc();
6282 
6283     Register state   = c_rarg0;
6284     Register subkeyH = c_rarg1;
6285     Register data    = c_rarg2;
6286     Register blocks  = c_rarg3;
6287 
6288     const int unroll = 4;
6289 
6290     __ cmp(blocks, (unsigned char)(unroll * 2));
6291     __ br(__ LT, small);
6292 
6293     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8-v15) before entering the routine
6295       __ sub(sp, sp, 4 * 16);
6296       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6297       __ sub(sp, sp, 4 * 16);
6298       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6299     }
6300 
6301     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6302 
6303     if (unroll > 1) {
6304       // And restore state
6305       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6306       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6307     }
6308 
6309     __ cmp(blocks, (unsigned char)0);
6310     __ br(__ GT, small);
6311 
6312     __ ret(lr);
6313 
6314     return start;
6315   }
6316 
6317   void generate_base64_encode_simdround(Register src, Register dst,
6318         FloatRegister codec, u8 size) {
6319 
6320     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6321     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6322     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6323 
6324     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6325 
6326     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6327 
6328     __ ushr(ind0, arrangement, in0,  2);
6329 
6330     __ ushr(ind1, arrangement, in1,  2);
6331     __ shl(in0,   arrangement, in0,  6);
6332     __ orr(ind1,  arrangement, ind1, in0);
6333     __ ushr(ind1, arrangement, ind1, 2);
6334 
6335     __ ushr(ind2, arrangement, in2,  4);
6336     __ shl(in1,   arrangement, in1,  4);
6337     __ orr(ind2,  arrangement, in1,  ind2);
6338     __ ushr(ind2, arrangement, ind2, 2);
6339 
6340     __ shl(ind3,  arrangement, in2,  2);
6341     __ ushr(ind3, arrangement, ind3, 2);
6342 
6343     __ tbl(out0,  arrangement, codec,  4, ind0);
6344     __ tbl(out1,  arrangement, codec,  4, ind1);
6345     __ tbl(out2,  arrangement, codec,  4, ind2);
6346     __ tbl(out3,  arrangement, codec,  4, ind3);
6347 
6348     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6349   }
6350 
6351    /**
6352    *  Arguments:
6353    *
6354    *  Input:
6355    *  c_rarg0   - src_start
6356    *  c_rarg1   - src_offset
6357    *  c_rarg2   - src_length
6358    *  c_rarg3   - dest_start
6359    *  c_rarg4   - dest_offset
6360    *  c_rarg5   - isURL
6361    *
6362    */
6363   address generate_base64_encodeBlock() {
6364 
6365     static const char toBase64[64] = {
6366       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6367       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6368       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6369       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6370       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6371     };
6372 
6373     static const char toBase64URL[64] = {
6374       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6375       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6376       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6377       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6378       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6379     };
6380 
6381     __ align(CodeEntryAlignment);
6382     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6383     address start = __ pc();
6384 
6385     Register src   = c_rarg0;  // source array
6386     Register soff  = c_rarg1;  // source start offset
6387     Register send  = c_rarg2;  // source end offset
6388     Register dst   = c_rarg3;  // dest array
6389     Register doff  = c_rarg4;  // position for writing to dest array
6390     Register isURL = c_rarg5;  // Base64 or URL character set
6391 
6392     // c_rarg6 and c_rarg7 are free to use as temps
6393     Register codec  = c_rarg6;
6394     Register length = c_rarg7;
6395 
6396     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6397 
6398     __ add(src, src, soff);
6399     __ add(dst, dst, doff);
6400     __ sub(length, send, soff);
6401 
6402     // load the codec base address
6403     __ lea(codec, ExternalAddress((address) toBase64));
6404     __ cbz(isURL, ProcessData);
6405     __ lea(codec, ExternalAddress((address) toBase64URL));
6406 
6407     __ BIND(ProcessData);
6408 
    // too short to be worth a SIMD loop; fall back to the scalar 3-byte loop
6410     __ cmp(length, (u1)24);
6411     __ br(Assembler::LT, Process3B);
6412 
6413     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6414 
6415     __ BIND(Process48B);
6416     __ cmp(length, (u1)48);
6417     __ br(Assembler::LT, Process24B);
6418     generate_base64_encode_simdround(src, dst, v0, 16);
6419     __ sub(length, length, 48);
6420     __ b(Process48B);
6421 
6422     __ BIND(Process24B);
6423     __ cmp(length, (u1)24);
6424     __ br(Assembler::LT, SIMDExit);
6425     generate_base64_encode_simdround(src, dst, v0, 8);
6426     __ sub(length, length, 24);
6427 
6428     __ BIND(SIMDExit);
6429     __ cbz(length, Exit);
6430 
6431     __ BIND(Process3B);
6432     //  3 src bytes, 24 bits
6433     __ ldrb(r10, __ post(src, 1));
6434     __ ldrb(r11, __ post(src, 1));
6435     __ ldrb(r12, __ post(src, 1));
6436     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6437     __ orrw(r12, r12, r11, Assembler::LSL, 8);
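    // r12 now holds b0 << 16 | b1 << 8 | b2 (the 24 input bits); the ubfmw/andw
    // below split it into four 6-bit indices: bits [23:18], [17:12], [11:6], [5:0].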
6438     // codec index
6439     __ ubfmw(r15, r12, 18, 23);
6440     __ ubfmw(r14, r12, 12, 17);
6441     __ ubfmw(r13, r12, 6,  11);
6442     __ andw(r12,  r12, 63);
6443     // get the code based on the codec
6444     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6445     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6446     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6447     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6448     __ strb(r15, __ post(dst, 1));
6449     __ strb(r14, __ post(dst, 1));
6450     __ strb(r13, __ post(dst, 1));
6451     __ strb(r12, __ post(dst, 1));
6452     __ sub(length, length, 3);
6453     __ cbnz(length, Process3B);
6454 
6455     __ BIND(Exit);
6456     __ ret(lr);
6457 
6458     return start;
6459   }
6460 
6461   void generate_base64_decode_simdround(Register src, Register dst,
6462         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6463 
6464     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6465     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6466 
6467     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6468     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6469 
6470     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6471 
6472     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6473 
6474     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6475 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] maps to index 0 in the higher-half lookup.
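    // For example (v27 holds 63, set up by the caller):
    //   '+' == 0x2b: uqsub(0x2b, 63) == 0,    so '+' is resolved by the lower-half lookup;
    //   'a' == 0x61: uqsub(0x61, 63) == 0x22, so 'a' is resolved by the higher-half lookup.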
6478     __ uqsubv(decH0, __ T16B, in0, v27);
6479     __ uqsubv(decH1, __ T16B, in1, v27);
6480     __ uqsubv(decH2, __ T16B, in2, v27);
6481     __ uqsubv(decH3, __ T16B, in3, v27);
6482 
6483     // lower half lookup
6484     __ tbl(decL0, arrangement, codecL, 4, in0);
6485     __ tbl(decL1, arrangement, codecL, 4, in1);
6486     __ tbl(decL2, arrangement, codecL, 4, in2);
6487     __ tbl(decL3, arrangement, codecL, 4, in3);
6488 
6489     // higher half lookup
6490     __ tbx(decH0, arrangement, codecH, 4, decH0);
6491     __ tbx(decH1, arrangement, codecH, 4, decH1);
6492     __ tbx(decH2, arrangement, codecH, 4, decH2);
6493     __ tbx(decH3, arrangement, codecH, 4, decH3);
6494 
6495     // combine lower and higher
6496     __ orr(decL0, arrangement, decL0, decH0);
6497     __ orr(decL1, arrangement, decL1, decH1);
6498     __ orr(decL2, arrangement, decL2, decH2);
6499     __ orr(decL3, arrangement, decL3, decH3);
6500 
6501     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6502     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6503     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6504     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6505     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6506     __ orr(in0, arrangement, decH0, decH1);
6507     __ orr(in1, arrangement, decH2, decH3);
6508     __ orr(in2, arrangement, in0,   in1);
6509     __ umaxv(in3, arrangement, in2);
6510     __ umov(rscratch2, in3, __ B, 0);
6511 
6512     // get the data to output
6513     __ shl(out0,  arrangement, decL0, 2);
6514     __ ushr(out1, arrangement, decL1, 4);
6515     __ orr(out0,  arrangement, out0,  out1);
6516     __ shl(out1,  arrangement, decL1, 4);
6517     __ ushr(out2, arrangement, decL2, 2);
6518     __ orr(out1,  arrangement, out1,  out2);
6519     __ shl(out2,  arrangement, decL2, 6);
6520     __ orr(out2,  arrangement, out2,  decL3);
6521 
6522     __ cbz(rscratch2, NoIllegalData);
6523 
6524     // handle illegal input
6525     __ umov(r10, in2, __ D, 0);
6526     if (size == 16) {
6527       __ cbnz(r10, ErrorInLowerHalf);
6528 
6529       // illegal input is in higher half, store the lower half now.
6530       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6531 
6532       __ umov(r10, in2,  __ D, 1);
6533       __ umov(r11, out0, __ D, 1);
6534       __ umov(r12, out1, __ D, 1);
6535       __ umov(r13, out2, __ D, 1);
6536       __ b(StoreLegalData);
6537 
6538       __ BIND(ErrorInLowerHalf);
6539     }
6540     __ umov(r11, out0, __ D, 0);
6541     __ umov(r12, out1, __ D, 0);
6542     __ umov(r13, out2, __ D, 0);
6543 
6544     __ BIND(StoreLegalData);
6545     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6546     __ strb(r11, __ post(dst, 1));
6547     __ strb(r12, __ post(dst, 1));
6548     __ strb(r13, __ post(dst, 1));
6549     __ lsr(r10, r10, 8);
6550     __ lsr(r11, r11, 8);
6551     __ lsr(r12, r12, 8);
6552     __ lsr(r13, r13, 8);
6553     __ b(StoreLegalData);
6554 
6555     __ BIND(NoIllegalData);
6556     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6557   }
6558 
6559 
6560    /**
6561    *  Arguments:
6562    *
6563    *  Input:
6564    *  c_rarg0   - src_start
6565    *  c_rarg1   - src_offset
6566    *  c_rarg2   - src_length
6567    *  c_rarg3   - dest_start
6568    *  c_rarg4   - dest_offset
6569    *  c_rarg5   - isURL
6570    *  c_rarg6   - isMIME
6571    *
6572    */
6573   address generate_base64_decodeBlock() {
6574 
6575     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6576     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6577     // titled "Base64 decoding".
6578 
    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an illegal
    // value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while
    // fromBase(URL)64ForNoSIMD['='] == 255 here.
6582     static const uint8_t fromBase64ForNoSIMD[256] = {
6583       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6584       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6585       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6586        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6587       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6588        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6589       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6590        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6591       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6592       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6593       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6594       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6595       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6596       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6597       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6598       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6599     };
6600 
6601     static const uint8_t fromBase64URLForNoSIMD[256] = {
6602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6605        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6607        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6608       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6609        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6610       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6611       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6612       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6618     };
6619 
    // A legal Base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine the results to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the destination.
    // The 2nd table vector lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Inputs [64..126] are mapped to indices [65, 127] in
    // the second lookup. The value at index 64 is set to 0, so that we know the
    // decoded data was already obtained by the 1st lookup.
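    //
    // A scalar sketch of the combined lookup (illustrative only; "table" is the
    // 128-byte table below, split into a low half for tbl and a high half for tbx):
    //
    //   uint8_t decode(uint8_t c) {
    //     uint8_t lo = (c < 64) ? table[c] : 0;        // tbl: out of range -> 0
    //     uint8_t i  = (c <= 63) ? 0 : c - 63;         // uqsub(c, 63)
    //     uint8_t hi = (i < 64) ? table[64 + i] : i;   // tbx: out of range -> unchanged
    //     return lo | hi;                              // any value > 63 flags illegal input
    //   }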
6627     static const uint8_t fromBase64ForSIMD[128] = {
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6629       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6630       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6631        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6632         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6633        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6634       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6635        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6636     };
6637 
6638     static const uint8_t fromBase64URLForSIMD[128] = {
6639       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6640       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6641       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6642        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6643         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6644        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6645        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6646        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6647     };
6648 
6649     __ align(CodeEntryAlignment);
6650     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6651     address start = __ pc();
6652 
6653     Register src    = c_rarg0;  // source array
6654     Register soff   = c_rarg1;  // source start offset
6655     Register send   = c_rarg2;  // source end offset
6656     Register dst    = c_rarg3;  // dest array
6657     Register doff   = c_rarg4;  // position for writing to dest array
6658     Register isURL  = c_rarg5;  // Base64 or URL character set
6659     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6660 
6661     Register length = send;    // reuse send as length of source data to process
6662 
6663     Register simd_codec   = c_rarg6;
6664     Register nosimd_codec = c_rarg7;
6665 
6666     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6667 
6668     __ enter();
6669 
6670     __ add(src, src, soff);
6671     __ add(dst, dst, doff);
6672 
6673     __ mov(doff, dst);
6674 
6675     __ sub(length, send, soff);
6676     __ bfm(length, zr, 0, 1);
6677 
6678     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6679     __ cbz(isURL, ProcessData);
6680     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6681 
6682     __ BIND(ProcessData);
6683     __ mov(rscratch1, length);
6684     __ cmp(length, (u1)144); // 144 = 80 + 64
6685     __ br(Assembler::LT, Process4B);
6686 
6687     // In the MIME case, the line length cannot be more than 76
6688     // bytes (see RFC 2045). This is too short a block for SIMD
6689     // to be worthwhile, so we use non-SIMD here.
6690     __ movw(rscratch1, 79);
6691 
6692     __ BIND(Process4B);
6693     __ ldrw(r14, __ post(src, 4));
6694     __ ubfxw(r10, r14, 0,  8);
6695     __ ubfxw(r11, r14, 8,  8);
6696     __ ubfxw(r12, r14, 16, 8);
6697     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6699     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6700     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6701     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6702     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6703     // error detection, 255u indicates an illegal input
6704     __ orrw(r14, r10, r11);
6705     __ orrw(r15, r12, r13);
6706     __ orrw(r14, r14, r15);
6707     __ tbnz(r14, 7, Exit);
6708     // recover the data
6709     __ lslw(r14, r10, 10);
6710     __ bfiw(r14, r11, 4, 6);
6711     __ bfmw(r14, r12, 2, 5);
6712     __ rev16w(r14, r14);
6713     __ bfiw(r13, r12, 6, 2);
6714     __ strh(r14, __ post(dst, 2));
6715     __ strb(r13, __ post(dst, 1));
6716     // non-simd loop
6717     __ subsw(rscratch1, rscratch1, 4);
6718     __ br(Assembler::GT, Process4B);
6719 
    // If we only pre-processed the first 80 bytes before the SIMD path
    // (rscratch1 started at 79), rscratch1 is now -1; otherwise the whole
    // input was consumed above and rscratch1 == 0.
6722     __ cbzw(rscratch1, Exit);
6723     __ sub(length, length, 80);
6724 
6725     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6726     __ cbz(isURL, SIMDEnter);
6727     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6728 
6729     __ BIND(SIMDEnter);
6730     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6731     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6732     __ mov(rscratch1, 63);
6733     __ dup(v27, __ T16B, rscratch1);
6734 
6735     __ BIND(Process64B);
6736     __ cmp(length, (u1)64);
6737     __ br(Assembler::LT, Process32B);
6738     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6739     __ sub(length, length, 64);
6740     __ b(Process64B);
6741 
6742     __ BIND(Process32B);
6743     __ cmp(length, (u1)32);
6744     __ br(Assembler::LT, SIMDExit);
6745     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6746     __ sub(length, length, 32);
6747     __ b(Process32B);
6748 
6749     __ BIND(SIMDExit);
6750     __ cbz(length, Exit);
6751     __ movw(rscratch1, length);
6752     __ b(Process4B);
6753 
6754     __ BIND(Exit);
6755     __ sub(c_rarg0, dst, doff);
6756 
6757     __ leave();
6758     __ ret(lr);
6759 
6760     return start;
6761   }
6762 
6763   // Support for spin waits.
6764   address generate_spin_wait() {
6765     __ align(CodeEntryAlignment);
6766     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6767     address start = __ pc();
6768 
6769     __ spin_wait();
6770     __ ret(lr);
6771 
6772     return start;
6773   }
6774 
6775 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6776 
6777   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6778   //
6779   // If LSE is in use, generate LSE versions of all the stubs. The
6780   // non-LSE versions are in atomic_aarch64.S.
6781 
6782   // class AtomicStubMark records the entry point of a stub and the
6783   // stub pointer which will point to it. The stub pointer is set to
6784   // the entry point when ~AtomicStubMark() is called, which must be
6785   // after ICache::invalidate_range. This ensures safe publication of
6786   // the generated code.
6787   class AtomicStubMark {
6788     address _entry_point;
6789     aarch64_atomic_stub_t *_stub;
6790     MacroAssembler *_masm;
6791   public:
6792     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6793       _masm = masm;
6794       __ align(32);
6795       _entry_point = __ pc();
6796       _stub = stub;
6797     }
6798     ~AtomicStubMark() {
6799       *_stub = (aarch64_atomic_stub_t)_entry_point;
6800     }
6801   };
6802 
6803   // NB: For memory_order_conservative we need a trailing membar after
6804   // LSE atomic operations but not a leading membar.
6805   //
6806   // We don't need a leading membar because a clause in the Arm ARM
6807   // says:
6808   //
6809   //   Barrier-ordered-before
6810   //
6811   //   Barrier instructions order prior Memory effects before subsequent
6812   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6816   //   instruction with both Acquire and Release semantics.
6817   //
6818   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6819   // and Release semantics, therefore we don't need a leading
6820   // barrier. However, there is no corresponding Barrier-ordered-after
6821   // relationship, therefore we need a trailing membar to prevent a
6822   // later store or load from being reordered with the store in an
6823   // atomic instruction.
6824   //
6825   // This was checked by using the herd7 consistency model simulator
6826   // (http://diy.inria.fr/) with this test case:
6827   //
6828   // AArch64 LseCas
6829   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6830   // P0 | P1;
6831   // LDR W4, [X2] | MOV W3, #0;
6832   // DMB LD       | MOV W4, #1;
6833   // LDR W3, [X1] | CASAL W3, W4, [X1];
6834   //              | DMB ISH;
6835   //              | STR W4, [X2];
6836   // exists
6837   // (0:X3=0 /\ 0:X4=1)
6838   //
6839   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6840   // with the store to x in P1. Without the DMB in P1 this may happen.
6841   //
6842   // At the time of writing we don't know of any AArch64 hardware that
6843   // reorders stores in this way, but the Reference Manual permits it.
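  //
  // As a sketch (not the literal output), the conservative 64-bit CAS stub
  // generated by gen_cas_entry below amounts to:
  //
  //   mov   x3, x1          // prev = compare_val
  //   casal x3, x2, [x0]    // CAS with both Acquire and Release semantics
  //   dmb   ish             // trailing barrier for memory_order_conservative
  //   mov   x0, x3          // return the previous value
  //   ret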
6844 
6845   void gen_cas_entry(Assembler::operand_size size,
6846                      atomic_memory_order order) {
6847     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6848       exchange_val = c_rarg2;
6849     bool acquire, release;
6850     switch (order) {
6851       case memory_order_relaxed:
6852         acquire = false;
6853         release = false;
6854         break;
6855       case memory_order_release:
6856         acquire = false;
6857         release = true;
6858         break;
6859       default:
6860         acquire = true;
6861         release = true;
6862         break;
6863     }
6864     __ mov(prev, compare_val);
6865     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6866     if (order == memory_order_conservative) {
6867       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6868     }
6869     if (size == Assembler::xword) {
6870       __ mov(r0, prev);
6871     } else {
6872       __ movw(r0, prev);
6873     }
6874     __ ret(lr);
6875   }
6876 
6877   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6878     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6879     // If not relaxed, then default to conservative.  Relaxed is the only
6880     // case we use enough to be worth specializing.
6881     if (order == memory_order_relaxed) {
6882       __ ldadd(size, incr, prev, addr);
6883     } else {
6884       __ ldaddal(size, incr, prev, addr);
6885       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6886     }
6887     if (size == Assembler::xword) {
6888       __ mov(r0, prev);
6889     } else {
6890       __ movw(r0, prev);
6891     }
6892     __ ret(lr);
6893   }
6894 
6895   void gen_swpal_entry(Assembler::operand_size size) {
6896     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6897     __ swpal(size, incr, prev, addr);
6898     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6899     if (size == Assembler::xword) {
6900       __ mov(r0, prev);
6901     } else {
6902       __ movw(r0, prev);
6903     }
6904     __ ret(lr);
6905   }
6906 
6907   void generate_atomic_entry_points() {
6908     if (!UseLSE) {
6909       return;
6910     }
6911 
6912     __ align(CodeEntryAlignment);
6913     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6914     address first_entry = __ pc();
6915 
6916     // ADD, memory_order_conservative
6917     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6918     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6919     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6920     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6921 
6922     // ADD, memory_order_relaxed
6923     AtomicStubMark mark_fetch_add_4_relaxed
6924       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6925     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6926     AtomicStubMark mark_fetch_add_8_relaxed
6927       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6928     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6929 
6930     // XCHG, memory_order_conservative
6931     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6932     gen_swpal_entry(Assembler::word);
6933     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6934     gen_swpal_entry(Assembler::xword);
6935 
6936     // CAS, memory_order_conservative
6937     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6938     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6939     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6940     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6941     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6942     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6943 
6944     // CAS, memory_order_relaxed
6945     AtomicStubMark mark_cmpxchg_1_relaxed
6946       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6947     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6948     AtomicStubMark mark_cmpxchg_4_relaxed
6949       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6950     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6951     AtomicStubMark mark_cmpxchg_8_relaxed
6952       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6953     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6954 
6955     AtomicStubMark mark_cmpxchg_4_release
6956       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6957     gen_cas_entry(MacroAssembler::word, memory_order_release);
6958     AtomicStubMark mark_cmpxchg_8_release
6959       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6960     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6961 
6962     AtomicStubMark mark_cmpxchg_4_seq_cst
6963       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6964     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6965     AtomicStubMark mark_cmpxchg_8_seq_cst
6966       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6967     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6968 
6969     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6970   }
6971 #endif // LINUX
6972 
6973   address generate_cont_thaw(Continuation::thaw_kind kind) {
6974     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6975     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6976 
6977     address start = __ pc();
6978 
6979     if (return_barrier) {
6980       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6981       __ mov(sp, rscratch1);
6982     }
6983     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6984 
6985     if (return_barrier) {
6986       // preserve possible return value from a method returning to the return barrier
6987       __ fmovd(rscratch1, v0);
6988       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6989     }
6990 
6991     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6992     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6993     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6994 
6995     if (return_barrier) {
6996       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6997       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6998       __ fmovd(v0, rscratch1);
6999     }
7000     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7001 
7002 
7003     Label thaw_success;
7004     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7005     __ cbnz(rscratch2, thaw_success);
7006     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7007     __ br(rscratch1);
7008     __ bind(thaw_success);
7009 
7010     // make room for the thawed frames
7011     __ sub(rscratch1, sp, rscratch2);
7012     __ andr(rscratch1, rscratch1, -16); // align
7013     __ mov(sp, rscratch1);
7014 
7015     if (return_barrier) {
7016       // save original return value -- again
7017       __ fmovd(rscratch1, v0);
7018       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7019     }
7020 
7021     // If we want, we can templatize thaw by kind, and have three different entries
7022     __ movw(c_rarg1, (uint32_t)kind);
7023 
7024     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7025     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7026 
7027     if (return_barrier) {
7028       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7029       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7030       __ fmovd(v0, rscratch1);
7031     } else {
7032       __ mov(r0, zr); // return 0 (success) from doYield
7033     }
7034 
7035     // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
7036     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7037     __ mov(rfp, sp);
7038 
7039     if (return_barrier_exception) {
7040       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7041       __ authenticate_return_address(c_rarg1);
7042       __ verify_oop(r0);
7043       // save return value containing the exception oop in callee-saved R19
7044       __ mov(r19, r0);
7045 
7046       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7047 
7048       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7049       // __ reinitialize_ptrue();
7050 
7051       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7052 
7053       __ mov(r1, r0); // the exception handler
7054       __ mov(r0, r19); // restore return value containing the exception oop
7055       __ verify_oop(r0);
7056 
7057       __ leave();
7058       __ mov(r3, lr);
7059       __ br(r1); // the exception handler
7060     } else {
7061       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7062       __ leave();
7063       __ ret(lr);
7064     }
7065 
7066     return start;
7067   }
7068 
7069   address generate_cont_thaw() {
7070     if (!Continuations::enabled()) return nullptr;
7071 
7072     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7073     address start = __ pc();
7074     generate_cont_thaw(Continuation::thaw_top);
7075     return start;
7076   }
7077 
7078   address generate_cont_returnBarrier() {
7079     if (!Continuations::enabled()) return nullptr;
7080 
7081     // TODO: will probably need multiple return barriers depending on return type
7082     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7083     address start = __ pc();
7084 
7085     generate_cont_thaw(Continuation::thaw_return_barrier);
7086 
7087     return start;
7088   }
7089 
7090   address generate_cont_returnBarrier_exception() {
7091     if (!Continuations::enabled()) return nullptr;
7092 
7093     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7094     address start = __ pc();
7095 
7096     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7097 
7098     return start;
7099   }
7100 
7101   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7102   // are represented as long[5], with BITS_PER_LIMB = 26.
7103   // Pack five 26-bit limbs into three 64-bit registers.
7104   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7105     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7106     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7107     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7108     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7109 
7110     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7111     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7112     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7113     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7114 
7115     if (dest2->is_valid()) {
7116       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7117     } else {
7118 #ifdef ASSERT
7119       Label OK;
7120       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7121       __ br(__ EQ, OK);
7122       __ stop("high bits of Poly1305 integer should be zero");
7123       __ should_not_reach_here();
7124       __ bind(OK);
7125 #endif
7126     }
7127   }
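       // In C, approximately (a sketch of what pack_26 computes; limbs[]
       // is the long[5] input with 26 bits per limb):
       //
       //   dest0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);
       //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);
       //   dest2 = limbs[4] >> 24;   // at most 2 bits for a valid input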
7128 
7129   // As above, but return only a 128-bit integer, packed into two
7130   // 64-bit registers.
7131   void pack_26(Register dest0, Register dest1, Register src) {
7132     pack_26(dest0, dest1, noreg, src);
7133   }
7134 
7135   // Multiply and multiply-accumulate unsigned 64-bit registers.
7136   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7137     __ mul(prod_lo, n, m);
7138     __ umulh(prod_hi, n, m);
7139   }
7140   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7141     wide_mul(rscratch1, rscratch2, n, m);
7142     __ adds(sum_lo, sum_lo, rscratch1);
7143     __ adc(sum_hi, sum_hi, rscratch2);
7144   }
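       // In C, approximately (a sketch):
       //
       //   unsigned __int128 p = (unsigned __int128)n * m;
       //   wide_mul:  prod_lo = (julong)p;  prod_hi = (julong)(p >> 64);
       //   wide_madd: (sum_hi:sum_lo) += p;   // 128-bit accumulate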
7145 
7146   // Poly1305, RFC 7539
7147 
7148   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7149   // description of the tricks used to simplify and accelerate this
7150   // computation.
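       //
       // Roughly, per 16-byte block (a sketch; U is the 130-bit accumulator,
       // R the key, and RR == (R >> 2) * 5 as computed below):
       //
       //   S = U + block + 2^128       // the padding bit for a full block
       //   U = S * R mod (2^130 - 5)   // partial reduction: contributions
       //                               // from the high limbs are folded
       //                               // back in via RR, using 2^130 == 5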
7151 
7152   address generate_poly1305_processBlocks() {
7153     __ align(CodeEntryAlignment);
7154     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7155     address start = __ pc();
7156     Label here;
7157     __ enter();
7158     RegSet callee_saved = RegSet::range(r19, r28);
7159     __ push(callee_saved, sp);
7160 
7161     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7162 
7163     // Arguments
7164     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7165 
7166     // R_n is the 128-bit randomly-generated key, packed into two
7167     // registers.  The caller passes this key to us as long[5], with
7168     // BITS_PER_LIMB = 26.
7169     const Register R_0 = *++regs, R_1 = *++regs;
7170     pack_26(R_0, R_1, r_start);
7171 
7172     // RR_n is (R_n >> 2) * 5
7173     const Register RR_0 = *++regs, RR_1 = *++regs;
7174     __ lsr(RR_0, R_0, 2);
7175     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7176     __ lsr(RR_1, R_1, 2);
7177     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7178 
7179     // U_n is the current checksum
7180     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7181     pack_26(U_0, U_1, U_2, acc_start);
7182 
7183     static constexpr int BLOCK_LENGTH = 16;
7184     Label DONE, LOOP;
7185 
7186     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7187     __ br(Assembler::LT, DONE); {
7188       __ bind(LOOP);
7189 
7190       // S_n is to be the sum of U_n and the next block of data
7191       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7192       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7193       __ adds(S_0, U_0, S_0);
7194       __ adcs(S_1, U_1, S_1);
7195       __ adc(S_2, U_2, zr);
7196       __ add(S_2, S_2, 1);
7197 
7198       const Register U_0HI = *++regs, U_1HI = *++regs;
7199 
7200       // NB: this logic depends on some of the special properties of
7201       // Poly1305 keys. In particular, because we know that the top
7202       // four bits of R_0 and R_1 are zero, we can add together
7203       // partial products without any risk of needing to propagate a
7204       // carry out.
7205       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7206       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7207       __ andr(U_2, R_0, 3);
7208       __ mul(U_2, S_2, U_2);
7209 
7210       // Recycle registers S_0, S_1, S_2
7211       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7212 
7213       // Partial reduction mod 2**130 - 5
7214       __ adds(U_1, U_0HI, U_1);
7215       __ adc(U_2, U_1HI, U_2);
7216       // Sum now in U_2:U_1:U_0.
7217       // Dead: U_0HI, U_1HI.
7218       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7219 
7220       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7221 
7222       // First, U_2:U_1:U_0 += (U_2 >> 2)
7223       __ lsr(rscratch1, U_2, 2);
7224       __ andr(U_2, U_2, (u8)3);
7225       __ adds(U_0, U_0, rscratch1);
7226       __ adcs(U_1, U_1, zr);
7227       __ adc(U_2, U_2, zr);
7228       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7229       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7230       __ adcs(U_1, U_1, zr);
7231       __ adc(U_2, U_2, zr);
7232 
7233       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7234       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7235       __ br(~ Assembler::LT, LOOP);
7236     }
7237 
7238     // Further reduce modulo 2^130 - 5
7239     __ lsr(rscratch1, U_2, 2);
7240     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7241     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7242     __ adcs(U_1, U_1, zr);
7243     __ andr(U_2, U_2, (u1)3);
7244     __ adc(U_2, U_2, zr);
7245 
7246     // Unpack the sum into five 26-bit limbs and write to memory.
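         // In C, approximately (a sketch; acc is the long[5] at acc_start):
         //   acc[0] = U_0 & ((1 << 26) - 1);
         //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
         //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
         //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
         //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);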
7247     __ ubfiz(rscratch1, U_0, 0, 26);
7248     __ ubfx(rscratch2, U_0, 26, 26);
7249     __ stp(rscratch1, rscratch2, Address(acc_start));
7250     __ ubfx(rscratch1, U_0, 52, 12);
7251     __ bfi(rscratch1, U_1, 12, 14);
7252     __ ubfx(rscratch2, U_1, 14, 26);
7253     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7254     __ ubfx(rscratch1, U_1, 40, 24);
7255     __ bfi(rscratch1, U_2, 24, 3);
7256     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7257 
7258     __ bind(DONE);
7259     __ pop(callee_saved, sp);
7260     __ leave();
7261     __ ret(lr);
7262 
7263     return start;
7264   }
7265 
7266 #if INCLUDE_JFR
7267 
7268   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7269     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7270     __ mov(c_rarg0, thread);
7271   }
7272 
7273   // The handle is dereferenced through a load barrier.
7274   static void jfr_epilogue(MacroAssembler* _masm) {
7275     __ reset_last_Java_frame(true);
7276   }
7277 
7278   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7279   // It returns a jobject handle to the event writer.
7280   // The handle is dereferenced and the return value is the event writer oop.
7281   static RuntimeStub* generate_jfr_write_checkpoint() {
7282     enum layout {
7283       rbp_off,
7284       rbpH_off,
7285       return_off,
7286       return_off2,
7287       framesize // inclusive of return address
7288     };
7289 
7290     int insts_size = 1024;
7291     int locs_size = 64;
7292     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7293     OopMapSet* oop_maps = new OopMapSet();
7294     MacroAssembler* masm = new MacroAssembler(&code);
7295     MacroAssembler* _masm = masm;
7296 
7297     address start = __ pc();
7298     __ enter();
7299     int frame_complete = __ pc() - start;
7300     address the_pc = __ pc();
7301     jfr_prologue(the_pc, _masm, rthread);
7302     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7303     jfr_epilogue(_masm);
7304     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7305     __ leave();
7306     __ ret(lr);
7307 
7308     OopMap* map = new OopMap(framesize, 1); // rfp
7309     oop_maps->add_gc_map(the_pc - start, map);
7310 
7311     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7312       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7313                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7314                                     oop_maps, false);
7315     return stub;
7316   }
7317 
7318   // For c2: call to return a leased buffer.
7319   static RuntimeStub* generate_jfr_return_lease() {
7320     enum layout {
7321       rbp_off,
7322       rbpH_off,
7323       return_off,
7324       return_off2,
7325       framesize // inclusive of return address
7326     };
7327 
7328     int insts_size = 1024;
7329     int locs_size = 64;
7330     CodeBuffer code("jfr_return_lease", insts_size, locs_size);
7331     OopMapSet* oop_maps = new OopMapSet();
7332     MacroAssembler* masm = new MacroAssembler(&code);
7333     MacroAssembler* _masm = masm;
7334 
7335     address start = __ pc();
7336     __ enter();
7337     int frame_complete = __ pc() - start;
7338     address the_pc = __ pc();
7339     jfr_prologue(the_pc, _masm, rthread);
7340     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
7341     jfr_epilogue(_masm);
7342 
7343     __ leave();
7344     __ ret(lr);
7345 
7346     OopMap* map = new OopMap(framesize, 1); // rfp
7347     oop_maps->add_gc_map(the_pc - start, map);
7348 
7349     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7350       RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
7351                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7352                                     oop_maps, false);
7353     return stub;
7354   }
7355 
7356 #endif // INCLUDE_JFR
7357 
7358   // exception handler for upcall stubs
7359   address generate_upcall_stub_exception_handler() {
7360     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7361     address start = __ pc();
7362 
7363     // Native caller has no idea how to handle exceptions,
7364     // so we just crash here. Up to callee to catch exceptions.
7365     __ verify_oop(r0);
7366     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7367     __ blr(rscratch1);
7368     __ should_not_reach_here();
7369 
7370     return start;
7371   }
7372 
7373   // Continuation point for throwing of implicit exceptions that are
7374   // not handled in the current activation. Fabricates an exception
7375   // oop and initiates normal exception dispatching in this
7376   // frame. Since we need to preserve callee-saved values (currently
7377   // only for C2, but done for C1 as well) we need a callee-saved oop
7378   // map and therefore have to make these stubs into RuntimeStubs
7379   // rather than BufferBlobs.  If the compiler needs all registers to
7380   // be preserved between the fault point and the exception handler
7381   // then it must assume responsibility for that in
7382   // AbstractCompiler::continuation_for_implicit_null_exception or
7383   // continuation_for_implicit_division_by_zero_exception. All other
7384   // implicit exceptions (e.g., NullPointerException or
7385   // AbstractMethodError on entry) are either at call sites or
7386   // otherwise assume that stack unwinding will be initiated, so
7387   // caller saved registers were assumed volatile in the compiler.
7388 
7389 #undef __
7390 #define __ masm->
7391 
7392   address generate_throw_exception(const char* name,
7393                                    address runtime_entry,
7394                                    Register arg1 = noreg,
7395                                    Register arg2 = noreg) {
7396     // Information about frame layout at time of blocking runtime call.
7397     // Note that we only have to preserve callee-saved registers since
7398     // the compilers are responsible for supplying a continuation point
7399     // if they expect all registers to be preserved.
7400     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7401     enum layout {
7402       rfp_off = 0,
7403       rfp_off2,
7404       return_off,
7405       return_off2,
7406       framesize // inclusive of return address
7407     };
7408 
7409     int insts_size = 512;
7410     int locs_size  = 64;
7411 
7412     CodeBuffer code(name, insts_size, locs_size);
7413     OopMapSet* oop_maps  = new OopMapSet();
7414     MacroAssembler* masm = new MacroAssembler(&code);
7415 
7416     address start = __ pc();
7417 
7418     // This is an inlined and slightly modified version of call_VM
7419     // which has the ability to fetch the return PC out of
7420     // thread-local storage and also sets up last_Java_sp slightly
7421     // differently than the real call_VM
7422 
7423     __ enter(); // Save FP and LR before call
7424 
7425     assert(is_even(framesize/2), "sp not 16-byte aligned");
7426 
7427     // lr and fp are already in place
7428     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7429 
7430     int frame_complete = __ pc() - start;
7431 
7432     // Set up last_Java_sp and last_Java_fp
7433     address the_pc = __ pc();
7434     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7435 
7436     // Call runtime
7437     if (arg1 != noreg) {
7438       assert(arg2 != c_rarg1, "clobbered");
7439       __ mov(c_rarg1, arg1);
7440     }
7441     if (arg2 != noreg) {
7442       __ mov(c_rarg2, arg2);
7443     }
7444     __ mov(c_rarg0, rthread);
7445     BLOCK_COMMENT("call runtime_entry");
7446     __ mov(rscratch1, runtime_entry);
7447     __ blr(rscratch1);
7448 
7449     // Generate oop map
7450     OopMap* map = new OopMap(framesize, 0);
7451 
7452     oop_maps->add_gc_map(the_pc - start, map);
7453 
7454     __ reset_last_Java_frame(true);
7455 
7456     // Reinitialize the ptrue predicate register, in case the external runtime
7457     // call clobbers ptrue reg, as we may return to SVE compiled code.
7458     __ reinitialize_ptrue();
7459 
7460     __ leave();
7461 
7462     // check for pending exceptions
7463 #ifdef ASSERT
7464     Label L;
7465     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7466     __ cbnz(rscratch1, L);
7467     __ should_not_reach_here();
7468     __ bind(L);
7469 #endif // ASSERT
7470     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7471 
7472     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7473     RuntimeStub* stub =
7474       RuntimeStub::new_runtime_stub(name,
7475                                     &code,
7476                                     frame_complete,
7477                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7478                                     oop_maps, false);
7479     return stub->entry_point();
7480   }
7481 
7482   class MontgomeryMultiplyGenerator : public MacroAssembler {
7483 
7484     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7485       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7486 
7487     RegSet _toSave;
7488     bool _squaring;
7489 
7490   public:
7491     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7492       : MacroAssembler(as->code()), _squaring(squaring) {
7493 
7494       // Register allocation
7495 
7496       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7497       Pa_base = *regs;       // Argument registers
7498       if (squaring)
7499         Pb_base = Pa_base;
7500       else
7501         Pb_base = *++regs;
7502       Pn_base = *++regs;
7503       Rlen = *++regs;
7504       inv = *++regs;
7505       Pm_base = *++regs;
7506 
7507                           // Working registers:
7508       Ra =  *++regs;        // The current digit of a, b, n, and m.
7509       Rb =  *++regs;
7510       Rm =  *++regs;
7511       Rn =  *++regs;
7512 
7513       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7514       Pb =  *++regs;
7515       Pm =  *++regs;
7516       Pn =  *++regs;
7517 
7518       t0 =  *++regs;        // Three registers which form a
7519       t1 =  *++regs;        // triple-precision accumulator.
7520       t2 =  *++regs;
7521 
7522       Ri =  *++regs;        // Inner and outer loop indexes.
7523       Rj =  *++regs;
7524 
7525       Rhi_ab = *++regs;     // Product registers: low and high parts
7526       Rlo_ab = *++regs;     // of a*b and m*n.
7527       Rhi_mn = *++regs;
7528       Rlo_mn = *++regs;
7529 
7530       // r19 and up are callee-saved.
7531       _toSave = RegSet::range(r19, *regs) + Pm_base;
7532     }
7533 
7534   private:
7535     void save_regs() {
7536       push(_toSave, sp);
7537     }
7538 
7539     void restore_regs() {
7540       pop(_toSave, sp);
7541     }
7542 
7543     template <typename T>
7544     void unroll_2(Register count, T block) {
7545       Label loop, end, odd;
7546       tbnz(count, 0, odd);
7547       cbz(count, end);
7548       align(16);
7549       bind(loop);
7550       (this->*block)();
7551       bind(odd);
7552       (this->*block)();
7553       subs(count, count, 2);
7554       br(Assembler::GT, loop);
7555       bind(end);
7556     }
7557 
7558     template <typename T>
7559     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7560       Label loop, end, odd;
7561       tbnz(count, 0, odd);
7562       cbz(count, end);
7563       align(16);
7564       bind(loop);
7565       (this->*block)(d, s, tmp);
7566       bind(odd);
7567       (this->*block)(d, s, tmp);
7568       subs(count, count, 2);
7569       br(Assembler::GT, loop);
7570       bind(end);
7571     }
7572 
7573     void pre1(RegisterOrConstant i) {
7574       block_comment("pre1");
7575       // Pa = Pa_base;
7576       // Pb = Pb_base + i;
7577       // Pm = Pm_base;
7578       // Pn = Pn_base + i;
7579       // Ra = *Pa;
7580       // Rb = *Pb;
7581       // Rm = *Pm;
7582       // Rn = *Pn;
7583       ldr(Ra, Address(Pa_base));
7584       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7585       ldr(Rm, Address(Pm_base));
7586       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7587       lea(Pa, Address(Pa_base));
7588       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7589       lea(Pm, Address(Pm_base));
7590       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7591 
7592       // Zero the m*n result.
7593       mov(Rhi_mn, zr);
7594       mov(Rlo_mn, zr);
7595     }
7596 
7597     // The core multiply-accumulate step of a Montgomery
7598     // multiplication.  The idea is to schedule operations as a
7599     // pipeline so that instructions with long latencies (loads and
7600     // multiplies) have time to complete before their results are
7601     // used.  This most benefits in-order implementations of the
7602     // architecture but out-of-order ones also benefit.
7603     void step() {
7604       block_comment("step");
7605       // MACC(Ra, Rb, t0, t1, t2);
7606       // Ra = *++Pa;
7607       // Rb = *--Pb;
7608       umulh(Rhi_ab, Ra, Rb);
7609       mul(Rlo_ab, Ra, Rb);
7610       ldr(Ra, pre(Pa, wordSize));
7611       ldr(Rb, pre(Pb, -wordSize));
7612       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7613                                        // previous iteration.
7614       // MACC(Rm, Rn, t0, t1, t2);
7615       // Rm = *++Pm;
7616       // Rn = *--Pn;
7617       umulh(Rhi_mn, Rm, Rn);
7618       mul(Rlo_mn, Rm, Rn);
7619       ldr(Rm, pre(Pm, wordSize));
7620       ldr(Rn, pre(Pn, -wordSize));
7621       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7622     }
7623 
7624     void post1() {
7625       block_comment("post1");
7626 
7627       // MACC(Ra, Rb, t0, t1, t2);
7628       // Ra = *++Pa;
7629       // Rb = *--Pb;
7630       umulh(Rhi_ab, Ra, Rb);
7631       mul(Rlo_ab, Ra, Rb);
7632       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7633       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7634 
7635       // *Pm = Rm = t0 * inv;
7636       mul(Rm, t0, inv);
7637       str(Rm, Address(Pm));
7638 
7639       // MACC(Rm, Rn, t0, t1, t2);
7640       // t0 = t1; t1 = t2; t2 = 0;
7641       umulh(Rhi_mn, Rm, Rn);
7642 
7643 #ifndef PRODUCT
7644       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7645       {
7646         mul(Rlo_mn, Rm, Rn);
7647         add(Rlo_mn, t0, Rlo_mn);
7648         Label ok;
7649         cbz(Rlo_mn, ok); {
7650           stop("broken Montgomery multiply");
7651         } bind(ok);
7652       }
7653 #endif
7654       // We have very carefully set things up so that
7655       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7656       // the lower half of Rm * Rn because we know the result already:
7657       // it must be -t0.  t0 + (-t0) must generate a carry iff
7658       // t0 != 0.  So, rather than do a mul and an adds we just set
7659       // the carry flag iff t0 is nonzero.
7660       //
7661       // mul(Rlo_mn, Rm, Rn);
7662       // adds(zr, t0, Rlo_mn);
7663       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7664       adcs(t0, t1, Rhi_mn);
7665       adc(t1, t2, zr);
7666       mov(t2, zr);
7667     }
7668 
7669     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7670       block_comment("pre2");
7671       // Pa = Pa_base + i-len;
7672       // Pb = Pb_base + len;
7673       // Pm = Pm_base + i-len;
7674       // Pn = Pn_base + len;
7675 
7676       if (i.is_register()) {
7677         sub(Rj, i.as_register(), len);
7678       } else {
7679         mov(Rj, i.as_constant());
7680         sub(Rj, Rj, len);
7681       }
7682       // Rj == i-len
7683 
7684       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7685       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7686       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7687       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7688 
7689       // Ra = *++Pa;
7690       // Rb = *--Pb;
7691       // Rm = *++Pm;
7692       // Rn = *--Pn;
7693       ldr(Ra, pre(Pa, wordSize));
7694       ldr(Rb, pre(Pb, -wordSize));
7695       ldr(Rm, pre(Pm, wordSize));
7696       ldr(Rn, pre(Pn, -wordSize));
7697 
7698       mov(Rhi_mn, zr);
7699       mov(Rlo_mn, zr);
7700     }
7701 
7702     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7703       block_comment("post2");
7704       if (i.is_constant()) {
7705         mov(Rj, i.as_constant()-len.as_constant());
7706       } else {
7707         sub(Rj, i.as_register(), len);
7708       }
7709 
7710       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7711 
7712       // As soon as we know the least significant digit of our result,
7713       // store it.
7714       // Pm_base[i-len] = t0;
7715       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7716 
7717       // t0 = t1; t1 = t2; t2 = 0;
7718       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7719       adc(t1, t2, zr);
7720       mov(t2, zr);
7721     }
7722 
7723     // A carry in t0 after Montgomery multiplication means that we
7724     // should subtract multiples of n from our result in m.  We'll
7725     // keep doing that until there is no carry.
7726     void normalize(RegisterOrConstant len) {
7727       block_comment("normalize");
7728       // while (t0)
7729       //   t0 = sub(Pm_base, Pn_base, t0, len);
7730       Label loop, post, again;
7731       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7732       cbz(t0, post); {
7733         bind(again); {
7734           mov(i, zr);
7735           mov(cnt, len);
7736           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7737           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7738           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7739           align(16);
7740           bind(loop); {
7741             sbcs(Rm, Rm, Rn);
7742             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7743             add(i, i, 1);
7744             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7745             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7746             sub(cnt, cnt, 1);
7747           } cbnz(cnt, loop);
7748           sbc(t0, t0, zr);
7749         } cbnz(t0, again);
7750       } bind(post);
7751     }
7752 
7753     // Move memory at s to d, reversing words.
7754     //    Increments d to end of copied memory
7755     //    Destroys tmp1, tmp2
7756     //    Preserves len
7757     //    Leaves s pointing to the address which was in d at start
7758     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7759       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7760       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7761 
7762       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7763       mov(tmp1, len);
7764       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7765       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7766     }
7767     // where
7768     void reverse1(Register d, Register s, Register tmp) {
7769       ldr(tmp, pre(s, -wordSize));
7770       ror(tmp, tmp, 32);
7771       str(tmp, post(d, wordSize));
7772     }
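         // In C, approximately (a sketch; len counts 64-bit words, and the
         // ror by 32 swaps the halves of each word, so the overall effect
         // is to reverse the array viewed as 32-bit ints):
         //
         //   for (i = 0; i < len; i++)
         //     d[i] = rotate_right_64(s[len - 1 - i], 32);
         //   // on return: s == the initial d, d == the initial d + len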
7773 
7774     void step_squaring() {
7775       // An extra ACC
7776       step();
7777       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7778     }
7779 
7780     void last_squaring(RegisterOrConstant i) {
7781       Label dont;
7782       // if ((i & 1) == 0) {
7783       tbnz(i.as_register(), 0, dont); {
7784         // MACC(Ra, Rb, t0, t1, t2);
7785         // Ra = *++Pa;
7786         // Rb = *--Pb;
7787         umulh(Rhi_ab, Ra, Rb);
7788         mul(Rlo_ab, Ra, Rb);
7789         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7790       } bind(dont);
7791     }
7792 
7793     void extra_step_squaring() {
7794       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7795 
7796       // MACC(Rm, Rn, t0, t1, t2);
7797       // Rm = *++Pm;
7798       // Rn = *--Pn;
7799       umulh(Rhi_mn, Rm, Rn);
7800       mul(Rlo_mn, Rm, Rn);
7801       ldr(Rm, pre(Pm, wordSize));
7802       ldr(Rn, pre(Pn, -wordSize));
7803     }
7804 
7805     void post1_squaring() {
7806       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7807 
7808       // *Pm = Rm = t0 * inv;
7809       mul(Rm, t0, inv);
7810       str(Rm, Address(Pm));
7811 
7812       // MACC(Rm, Rn, t0, t1, t2);
7813       // t0 = t1; t1 = t2; t2 = 0;
7814       umulh(Rhi_mn, Rm, Rn);
7815 
7816 #ifndef PRODUCT
7817       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7818       {
7819         mul(Rlo_mn, Rm, Rn);
7820         add(Rlo_mn, t0, Rlo_mn);
7821         Label ok;
7822         cbz(Rlo_mn, ok); {
7823           stop("broken Montgomery multiply");
7824         } bind(ok);
7825       }
7826 #endif
7827       // We have very carefully set things up so that
7828       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7829       // the lower half of Rm * Rn because we know the result already:
7830       // it must be -t0.  t0 + (-t0) must generate a carry iff
7831       // t0 != 0.  So, rather than do a mul and an adds we just set
7832       // the carry flag iff t0 is nonzero.
7833       //
7834       // mul(Rlo_mn, Rm, Rn);
7835       // adds(zr, t0, Rlo_mn);
7836       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7837       adcs(t0, t1, Rhi_mn);
7838       adc(t1, t2, zr);
7839       mov(t2, zr);
7840     }
7841 
7842     void acc(Register Rhi, Register Rlo,
7843              Register t0, Register t1, Register t2) {
7844       adds(t0, t0, Rlo);
7845       adcs(t1, t1, Rhi);
7846       adc(t2, t2, zr);
7847     }
7848 
7849   public:
7850     /**
7851      * Fast Montgomery multiplication.  The derivation of the
7852      * algorithm is in A Cryptographic Library for the Motorola
7853      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7854      *
7855      * Arguments:
7856      *
7857      * Inputs for multiplication:
7858      *   c_rarg0   - int array elements a
7859      *   c_rarg1   - int array elements b
7860      *   c_rarg2   - int array elements n (the modulus)
7861      *   c_rarg3   - int length
7862      *   c_rarg4   - int inv
7863      *   c_rarg5   - int array elements m (the result)
7864      *
7865      * Inputs for squaring:
7866      *   c_rarg0   - int array elements a
7867      *   c_rarg1   - int array elements n (the modulus)
7868      *   c_rarg2   - int length
7869      *   c_rarg3   - int inv
7870      *   c_rarg4   - int array elements m (the result)
7871      *
7872      */
7873     address generate_multiply() {
7874       Label argh, nothing;
7875       bind(argh);
7876       stop("MontgomeryMultiply total_allocation must be <= 8192");
7877 
7878       align(CodeEntryAlignment);
7879       address entry = pc();
7880 
7881       cbzw(Rlen, nothing);
7882 
7883       enter();
7884 
7885       // Make room.
7886       cmpw(Rlen, 512);
7887       br(Assembler::HI, argh);
7888       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7889       andr(sp, Ra, -2 * wordSize);
7890 
7891       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7892 
7893       {
7894         // Copy input args, reversing as we go.  We use Ra as a
7895         // temporary variable.
7896         reverse(Ra, Pa_base, Rlen, t0, t1);
7897         if (!_squaring)
7898           reverse(Ra, Pb_base, Rlen, t0, t1);
7899         reverse(Ra, Pn_base, Rlen, t0, t1);
7900       }
7901 
7902       // Push all call-saved registers and also Pm_base which we'll need
7903       // at the end.
7904       save_regs();
7905 
7906 #ifndef PRODUCT
7907       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7908       {
7909         ldr(Rn, Address(Pn_base, 0));
7910         mul(Rlo_mn, Rn, inv);
7911         subs(zr, Rlo_mn, -1);
7912         Label ok;
7913         br(EQ, ok); {
7914           stop("broken inverse in Montgomery multiply");
7915         } bind(ok);
7916       }
7917 #endif
7918 
7919       mov(Pm_base, Ra);
7920 
7921       mov(t0, zr);
7922       mov(t1, zr);
7923       mov(t2, zr);
7924 
7925       block_comment("for (int i = 0; i < len; i++) {");
7926       mov(Ri, zr); {
7927         Label loop, end;
7928         cmpw(Ri, Rlen);
7929         br(Assembler::GE, end);
7930 
7931         bind(loop);
7932         pre1(Ri);
7933 
7934         block_comment("  for (j = i; j; j--) {"); {
7935           movw(Rj, Ri);
7936           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7937         } block_comment("  } // j");
7938 
7939         post1();
7940         addw(Ri, Ri, 1);
7941         cmpw(Ri, Rlen);
7942         br(Assembler::LT, loop);
7943         bind(end);
7944         block_comment("} // i");
7945       }
7946 
7947       block_comment("for (int i = len; i < 2*len; i++) {");
7948       mov(Ri, Rlen); {
7949         Label loop, end;
7950         cmpw(Ri, Rlen, Assembler::LSL, 1);
7951         br(Assembler::GE, end);
7952 
7953         bind(loop);
7954         pre2(Ri, Rlen);
7955 
7956         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7957           lslw(Rj, Rlen, 1);
7958           subw(Rj, Rj, Ri);
7959           subw(Rj, Rj, 1);
7960           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7961         } block_comment("  } // j");
7962 
7963         post2(Ri, Rlen);
7964         addw(Ri, Ri, 1);
7965         cmpw(Ri, Rlen, Assembler::LSL, 1);
7966         br(Assembler::LT, loop);
7967         bind(end);
7968       }
7969       block_comment("} // i");
7970 
7971       normalize(Rlen);
7972 
7973       mov(Ra, Pm_base);  // Save Pm_base in Ra
7974       restore_regs();  // Restore caller's Pm_base
7975 
7976       // Copy our result into caller's Pm_base
7977       reverse(Pm_base, Ra, Rlen, t0, t1);
7978 
7979       leave();
7980       bind(nothing);
7981       ret(lr);
7982 
7983       return entry;
7984     }
7985     // In C, approximately:
7986 
7987     // void
7988     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7989     //                     julong Pn_base[], julong Pm_base[],
7990     //                     julong inv, int len) {
7991     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7992     //   julong *Pa, *Pb, *Pn, *Pm;
7993     //   julong Ra, Rb, Rn, Rm;
7994 
7995     //   int i;
7996 
7997     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7998 
7999     //   for (i = 0; i < len; i++) {
8000     //     int j;
8001 
8002     //     Pa = Pa_base;
8003     //     Pb = Pb_base + i;
8004     //     Pm = Pm_base;
8005     //     Pn = Pn_base + i;
8006 
8007     //     Ra = *Pa;
8008     //     Rb = *Pb;
8009     //     Rm = *Pm;
8010     //     Rn = *Pn;
8011 
8012     //     int iters = i;
8013     //     for (j = 0; iters--; j++) {
8014     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8015     //       MACC(Ra, Rb, t0, t1, t2);
8016     //       Ra = *++Pa;
8017     //       Rb = *--Pb;
8018     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8019     //       MACC(Rm, Rn, t0, t1, t2);
8020     //       Rm = *++Pm;
8021     //       Rn = *--Pn;
8022     //     }
8023 
8024     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8025     //     MACC(Ra, Rb, t0, t1, t2);
8026     //     *Pm = Rm = t0 * inv;
8027     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8028     //     MACC(Rm, Rn, t0, t1, t2);
8029 
8030     //     assert(t0 == 0, "broken Montgomery multiply");
8031 
8032     //     t0 = t1; t1 = t2; t2 = 0;
8033     //   }
8034 
8035     //   for (i = len; i < 2*len; i++) {
8036     //     int j;
8037 
8038     //     Pa = Pa_base + i-len;
8039     //     Pb = Pb_base + len;
8040     //     Pm = Pm_base + i-len;
8041     //     Pn = Pn_base + len;
8042 
8043     //     Ra = *++Pa;
8044     //     Rb = *--Pb;
8045     //     Rm = *++Pm;
8046     //     Rn = *--Pn;
8047 
8048     //     int iters = len*2-i-1;
8049     //     for (j = i-len+1; iters--; j++) {
8050     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8051     //       MACC(Ra, Rb, t0, t1, t2);
8052     //       Ra = *++Pa;
8053     //       Rb = *--Pb;
8054     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8055     //       MACC(Rm, Rn, t0, t1, t2);
8056     //       Rm = *++Pm;
8057     //       Rn = *--Pn;
8058     //     }
8059 
8060     //     Pm_base[i-len] = t0;
8061     //     t0 = t1; t1 = t2; t2 = 0;
8062     //   }
8063 
8064     //   while (t0)
8065     //     t0 = sub(Pm_base, Pn_base, t0, len);
8066     // }
8067 
8068     /**
8069      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8070      * multiplies than Montgomery multiplication so it should be up to
8071      * 25% faster.  However, its loop control is more complex and it
8072      * may actually run slower on some machines.
8073      *
8074      * Arguments:
8075      *
8076      * Inputs:
8077      *   c_rarg0   - int array elements a
8078      *   c_rarg1   - int array elements n (the modulus)
8079      *   c_rarg2   - int length
8080      *   c_rarg3   - int inv
8081      *   c_rarg4   - int array elements m (the result)
8082      *
8083      */
8084     address generate_square() {
8085       Label argh;
8086       bind(argh);
8087       stop("MontgomeryMultiply total_allocation must be <= 8192");
8088 
8089       align(CodeEntryAlignment);
8090       address entry = pc();
8091 
8092       enter();
8093 
8094       // Make room.
8095       cmpw(Rlen, 512);
8096       br(Assembler::HI, argh);
8097       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8098       andr(sp, Ra, -2 * wordSize);
8099 
8100       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8101 
8102       {
8103         // Copy input args, reversing as we go.  We use Ra as a
8104         // temporary variable.
8105         reverse(Ra, Pa_base, Rlen, t0, t1);
8106         reverse(Ra, Pn_base, Rlen, t0, t1);
8107       }
8108 
8109       // Push all call-saved registers and also Pm_base which we'll need
8110       // at the end.
8111       save_regs();
8112 
8113       mov(Pm_base, Ra);
8114 
8115       mov(t0, zr);
8116       mov(t1, zr);
8117       mov(t2, zr);
8118 
8119       block_comment("for (int i = 0; i < len; i++) {");
8120       mov(Ri, zr); {
8121         Label loop, end;
8122         bind(loop);
8123         cmp(Ri, Rlen);
8124         br(Assembler::GE, end);
8125 
8126         pre1(Ri);
8127 
8128         block_comment("for (j = (i+1)/2; j; j--) {"); {
8129           add(Rj, Ri, 1);
8130           lsr(Rj, Rj, 1);
8131           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8132         } block_comment("  } // j");
8133 
8134         last_squaring(Ri);
8135 
8136         block_comment("  for (j = i/2; j; j--) {"); {
8137           lsr(Rj, Ri, 1);
8138           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8139         } block_comment("  } // j");
8140 
8141         post1_squaring();
8142         add(Ri, Ri, 1);
8143         cmp(Ri, Rlen);
8144         br(Assembler::LT, loop);
8145 
8146         bind(end);
8147         block_comment("} // i");
8148       }
8149 
8150       block_comment("for (int i = len; i < 2*len; i++) {");
8151       mov(Ri, Rlen); {
8152         Label loop, end;
8153         bind(loop);
8154         cmp(Ri, Rlen, Assembler::LSL, 1);
8155         br(Assembler::GE, end);
8156 
8157         pre2(Ri, Rlen);
8158 
8159         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8160           lsl(Rj, Rlen, 1);
8161           sub(Rj, Rj, Ri);
8162           sub(Rj, Rj, 1);
8163           lsr(Rj, Rj, 1);
8164           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8165         } block_comment("  } // j");
8166 
8167         last_squaring(Ri);
8168 
8169         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8170           lsl(Rj, Rlen, 1);
8171           sub(Rj, Rj, Ri);
8172           lsr(Rj, Rj, 1);
8173           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8174         } block_comment("  } // j");
8175 
8176         post2(Ri, Rlen);
8177         add(Ri, Ri, 1);
8178         cmp(Ri, Rlen, Assembler::LSL, 1);
8179 
8180         br(Assembler::LT, loop);
8181         bind(end);
8182         block_comment("} // i");
8183       }
8184 
8185       normalize(Rlen);
8186 
8187       mov(Ra, Pm_base);  // Save Pm_base in Ra
8188       restore_regs();  // Restore caller's Pm_base
8189 
8190       // Copy our result into caller's Pm_base
8191       reverse(Pm_base, Ra, Rlen, t0, t1);
8192 
8193       leave();
8194       ret(lr);
8195 
8196       return entry;
8197     }
8198     // In C, approximately:
8199 
8200     // void
8201     // montgomery_square(julong Pa_base[], julong Pn_base[],
8202     //                   julong Pm_base[], julong inv, int len) {
8203     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8204     //   julong *Pa, *Pb, *Pn, *Pm;
8205     //   julong Ra, Rb, Rn, Rm;
8206 
8207     //   int i;
8208 
8209     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8210 
8211     //   for (i = 0; i < len; i++) {
8212     //     int j;
8213 
8214     //     Pa = Pa_base;
8215     //     Pb = Pa_base + i;
8216     //     Pm = Pm_base;
8217     //     Pn = Pn_base + i;
8218 
8219     //     Ra = *Pa;
8220     //     Rb = *Pb;
8221     //     Rm = *Pm;
8222     //     Rn = *Pn;
8223 
8224     //     int iters = (i+1)/2;
8225     //     for (j = 0; iters--; j++) {
8226     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8227     //       MACC2(Ra, Rb, t0, t1, t2);
8228     //       Ra = *++Pa;
8229     //       Rb = *--Pb;
8230     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8231     //       MACC(Rm, Rn, t0, t1, t2);
8232     //       Rm = *++Pm;
8233     //       Rn = *--Pn;
8234     //     }
8235     //     if ((i & 1) == 0) {
8236     //       assert(Ra == Pa_base[j], "must be");
8237     //       MACC(Ra, Ra, t0, t1, t2);
8238     //     }
8239     //     iters = i/2;
8240     //     assert(iters == i-j, "must be");
8241     //     for (; iters--; j++) {
8242     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8243     //       MACC(Rm, Rn, t0, t1, t2);
8244     //       Rm = *++Pm;
8245     //       Rn = *--Pn;
8246     //     }
8247 
8248     //     *Pm = Rm = t0 * inv;
8249     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8250     //     MACC(Rm, Rn, t0, t1, t2);
8251 
8252     //     assert(t0 == 0, "broken Montgomery multiply");
8253 
8254     //     t0 = t1; t1 = t2; t2 = 0;
8255     //   }
8256 
8257     //   for (i = len; i < 2*len; i++) {
8258     //     int start = i-len+1;
8259     //     int end = start + (len - start)/2;
8260     //     int j;
8261 
8262     //     Pa = Pa_base + i-len;
8263     //     Pb = Pa_base + len;
8264     //     Pm = Pm_base + i-len;
8265     //     Pn = Pn_base + len;
8266 
8267     //     Ra = *++Pa;
8268     //     Rb = *--Pb;
8269     //     Rm = *++Pm;
8270     //     Rn = *--Pn;
8271 
8272     //     int iters = (2*len-i-1)/2;
8273     //     assert(iters == end-start, "must be");
8274     //     for (j = start; iters--; j++) {
8275     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8276     //       MACC2(Ra, Rb, t0, t1, t2);
8277     //       Ra = *++Pa;
8278     //       Rb = *--Pb;
8279     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8280     //       MACC(Rm, Rn, t0, t1, t2);
8281     //       Rm = *++Pm;
8282     //       Rn = *--Pn;
8283     //     }
8284     //     if ((i & 1) == 0) {
8285     //       assert(Ra == Pa_base[j], "must be");
8286     //       MACC(Ra, Ra, t0, t1, t2);
8287     //     }
8288     //     iters =  (2*len-i)/2;
8289     //     assert(iters == len-j, "must be");
8290     //     for (; iters--; j++) {
8291     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8292     //       MACC(Rm, Rn, t0, t1, t2);
8293     //       Rm = *++Pm;
8294     //       Rn = *--Pn;
8295     //     }
8296     //     Pm_base[i-len] = t0;
8297     //     t0 = t1; t1 = t2; t2 = 0;
8298     //   }
8299 
8300     //   while (t0)
8301     //     t0 = sub(Pm_base, Pn_base, t0, len);
8302     // }
8303   };
8304 
8305 
8306   // Initialization
8307   void generate_initial_stubs() {
8308     // Generate initial stubs and initialize the entry points
8309 
8310     // Entry points that exist on all platforms. Note: this is code
8311     // that could be shared among different platforms - however the
8312     // benefit seems to be smaller than the disadvantage of having a
8313     // much more complicated generator structure. See also comment in
8314     // stubRoutines.hpp.
8315 
8316     StubRoutines::_forward_exception_entry = generate_forward_exception();
8317 
8318     StubRoutines::_call_stub_entry =
8319       generate_call_stub(StubRoutines::_call_stub_return_address);
8320 
8321     // is referenced by megamorphic call
8322     StubRoutines::_catch_exception_entry = generate_catch_exception();
8323 
8324     // Build this early so it's available for the interpreter.
8325     StubRoutines::_throw_StackOverflowError_entry =
8326       generate_throw_exception("StackOverflowError throw_exception",
8327                                CAST_FROM_FN_PTR(address,
8328                                                 SharedRuntime::throw_StackOverflowError));
8329     StubRoutines::_throw_delayed_StackOverflowError_entry =
8330       generate_throw_exception("delayed StackOverflowError throw_exception",
8331                                CAST_FROM_FN_PTR(address,
8332                                                 SharedRuntime::throw_delayed_StackOverflowError));
8333 
8334     // Initialize table for copy memory (arraycopy) check.
8335     if (UnsafeCopyMemory::_table == nullptr) {
8336       UnsafeCopyMemory::create_table(8);
8337     }
8338 
8339     if (UseCRC32Intrinsics) {
8340       // set table address before stub generation, which uses it
8341       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8342       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8343     }
8344 
8345     if (UseCRC32CIntrinsics) {
8346       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8347     }
8348 
8349     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8350       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8351     }
8352 
8353     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8354       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8355     }
8356 
8357     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8358         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8359       StubRoutines::_hf2f = generate_float16ToFloat();
8360       StubRoutines::_f2hf = generate_floatToFloat16();
8361     }
8362   }
8363 
8364   void generate_continuation_stubs() {
8365     // Continuation stubs:
8366     StubRoutines::_cont_thaw          = generate_cont_thaw();
8367     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8368     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8369 
    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
    }
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

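    // Generate out-of-line atomic entry points. When the toolchain already
    // targets LSE (__ARM_FEATURE_ATOMICS is defined), the compiler emits the
    // atomic instructions directly and these patched stubs are not needed.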
    generate_atomic_entry_points();

#endif // LINUX && !__ARM_FEATURE_ATOMICS

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs have been generated
  }

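  // Stubs used by C2/JVMCI-compiled code; the body is empty unless at least
  // one of those compilers is built in.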
  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // The wide variant is used in preference to generate_ghash_processBlocks().
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
      StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    }
  }
}; // end class declaration

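// Entry point used during stub-routine initialization: constructing the
// StubGenerator emits the stubs for the requested phase into the supplied
// CodeBuffer.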
void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

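// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// generate_atomic_entry_points() is expected to re-point these *_impl
// function pointers at the generated stubs once they are available.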
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX