1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/globalDefinitions.hpp"
  52 #include "utilities/powerOfTwo.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_ZGC
  57 #include "gc/z/zThreadLocalData.hpp"
  58 #endif
  59 
  60 // Declaration and definition of StubGenerator (no .hpp file).
  61 // For a more detailed description of the stub routine structure
  62 // see the comment in stubRoutines.hpp
  63 
  64 #undef __
  65 #define __ _masm->
  66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
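     // i.e. a sign-extended index scaled by the in-heap oop size:
     // sxtw #2 with compressed oops (4 byte oops), sxtw #3 otherwise (8 byte oops).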
  67 
  68 #ifdef PRODUCT
  69 #define BLOCK_COMMENT(str) /* nothing */
  70 #else
  71 #define BLOCK_COMMENT(str) __ block_comment(str)
  72 #endif
  73 
  74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  75 
  76 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  77 void fill_continuation_entry(MacroAssembler* masm);
  78 void continuation_enter_cleanup(MacroAssembler* masm);
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(int& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -27 [ argument word 1      ]
 145   // -26 [ saved v15            ] <--- sp_after_call
 146   // -25 [ saved v14            ]
 147   // -24 [ saved v13            ]
 148   // -23 [ saved v12            ]
 149   // -22 [ saved v11            ]
 150   // -21 [ saved v10            ]
 151   // -20 [ saved v9             ]
 152   // -19 [ saved v8             ]
 153   // -18 [ saved r28            ]
 154   // -17 [ saved r27            ]
 155   // -16 [ saved r26            ]
 156   // -15 [ saved r25            ]
 157   // -14 [ saved r24            ]
 158   // -13 [ saved r23            ]
 159   // -12 [ saved r22            ]
 160   // -11 [ saved r21            ]
 161   // -10 [ saved r20            ]
 162   //  -9 [ saved r19            ]
 163   //  -8 [ call wrapper    (r0) ]
 164   //  -7 [ result          (r1) ]
 165   //  -6 [ result type     (r2) ]
 166   //  -5 [ method          (r3) ]
 167   //  -4 [ entry point     (r4) ]
 168   //  -3 [ parameters      (r5) ]
 169   //  -2 [ parameter size  (r6) ]
 170   //  -1 [ thread (r7)          ]
 171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 172   //   1 [ saved lr       (r30) ]
 173 
 174   // Call stub stack layout word offsets from fp
 175   enum call_stub_layout {
 176     sp_after_call_off = -26,
 177 
 178     d15_off            = -26,
 179     d13_off            = -24,
 180     d11_off            = -22,
 181     d9_off             = -20,
 182 
 183     r28_off            = -18,
 184     r26_off            = -16,
 185     r24_off            = -14,
 186     r22_off            = -12,
 187     r20_off            = -10,
 188     call_wrapper_off   =  -8,
 189     result_off         =  -7,
 190     result_type_off    =  -6,
 191     method_off         =  -5,
 192     entry_point_off    =  -4,
 193     parameter_size_off =  -2,
 194     thread_off         =  -1,
 195     fp_f               =   0,
 196     retaddr_off        =   1,
 197   };
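       // For illustration only (nothing below relies on it): with wordSize == 8
       // these word offsets translate into byte displacements from rfp, e.g.
       //
       //   rfp + sp_after_call_off * wordSize  ==  rfp - 208   (start of save area)
       //   rfp + call_wrapper_off  * wordSize  ==  rfp -  64
       //   rfp + thread_off        * wordSize  ==  rfp -   8
       //   rfp + retaddr_off       * wordSize  ==  rfp +   8   (saved lr)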
 198 
 199   address generate_call_stub(address& return_address) {
 200     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 201            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 202            "adjust this code");
 203 
 204     StubCodeMark mark(this, "StubRoutines", "call_stub");
 205     address start = __ pc();
 206 
 207     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 208 
 209     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 210     const Address result        (rfp, result_off         * wordSize);
 211     const Address result_type   (rfp, result_type_off    * wordSize);
 212     const Address method        (rfp, method_off         * wordSize);
 213     const Address entry_point   (rfp, entry_point_off    * wordSize);
 214     const Address parameter_size(rfp, parameter_size_off * wordSize);
 215 
 216     const Address thread        (rfp, thread_off         * wordSize);
 217 
 218     const Address d15_save      (rfp, d15_off * wordSize);
 219     const Address d13_save      (rfp, d13_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d9_save       (rfp, d9_off * wordSize);
 222 
 223     const Address r28_save      (rfp, r28_off * wordSize);
 224     const Address r26_save      (rfp, r26_off * wordSize);
 225     const Address r24_save      (rfp, r24_off * wordSize);
 226     const Address r22_save      (rfp, r22_off * wordSize);
 227     const Address r20_save      (rfp, r20_off * wordSize);
 228 
 229     // stub code
 230 
 231     address aarch64_entry = __ pc();
 232 
 233     // set up frame and move sp to end of save area
 234     __ enter();
 235     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 236 
 237     // save register parameters and Java scratch/global registers
 238     // n.b. we save thread even though it gets installed in
 239     // rthread because we want to sanity check rthread later
 240     __ str(c_rarg7,  thread);
 241     __ strw(c_rarg6, parameter_size);
 242     __ stp(c_rarg4, c_rarg5,  entry_point);
 243     __ stp(c_rarg2, c_rarg3,  result_type);
 244     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 245 
 246     __ stp(r20, r19,   r20_save);
 247     __ stp(r22, r21,   r22_save);
 248     __ stp(r24, r23,   r24_save);
 249     __ stp(r26, r25,   r26_save);
 250     __ stp(r28, r27,   r28_save);
 251 
 252     __ stpd(v9,  v8,   d9_save);
 253     __ stpd(v11, v10,  d11_save);
 254     __ stpd(v13, v12,  d13_save);
 255     __ stpd(v15, v14,  d15_save);
 256 
 257     // install Java thread in global register now we have saved
 258     // whatever value it held
 259     __ mov(rthread, c_rarg7);
 260     // And method
 261     __ mov(rmethod, c_rarg3);
 262 
 263     // set up the heapbase register
 264     __ reinit_heapbase();
 265 
 266 #ifdef ASSERT
 267     // make sure we have no pending exceptions
 268     {
 269       Label L;
 270       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 271       __ cmp(rscratch1, (u1)NULL_WORD);
 272       __ br(Assembler::EQ, L);
 273       __ stop("StubRoutines::call_stub: entered with pending exception");
 274       __ BIND(L);
 275     }
 276 #endif
 277     // pass parameters if any
 278     __ mov(esp, sp);
 279     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 280     __ andr(sp, rscratch1, -2 * wordSize);
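         // (this also rounds sp down to a 16-byte boundary, keeping it aligned
         //  as AArch64 requires)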
 281 
 282     BLOCK_COMMENT("pass parameters if any");
 283     Label parameters_done;
 284     // parameter count is still in c_rarg6
 285     // and parameter pointer identifying param 1 is in c_rarg5
 286     __ cbzw(c_rarg6, parameters_done);
 287 
 288     address loop = __ pc();
 289     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 290     __ subsw(c_rarg6, c_rarg6, 1);
 291     __ push(rscratch1);
 292     __ br(Assembler::GT, loop);
 293 
 294     __ BIND(parameters_done);
 295 
 296     // call Java entry -- passing Method* and current sp
 297     //      rmethod: Method*
 298     //      r13: sender sp
 299     BLOCK_COMMENT("call Java function");
 300     __ mov(r13, sp);
 301     __ blr(c_rarg4);
 302 
 303     // we do this here because the notify will already have been done
 304     // if we get to the next instruction via an exception
 305     //
 306     // n.b. adding this instruction here affects the calculation of
 307     // whether or not a routine returns to the call stub (used when
 308     // doing stack walks) since the normal test is to check the return
 309     // pc against the address saved below. so we may need to allow for
 310     // this extra instruction in the check.
 311 
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, (u1)T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, (u1)T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, (u1)T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     __ pop_cont_fastpath(rthread);
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376     // leave frame and return to caller
 377     __ leave();
 378     __ ret(lr);
 379 
 380     // handle return types different from T_INT
 381 
 382     __ BIND(is_long);
 383     __ str(r0, Address(j_rarg2, 0));
 384     __ br(Assembler::AL, exit);
 385 
 386     __ BIND(is_float);
 387     __ strs(j_farg0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_double);
 391     __ strd(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     return start;
 395   }
 396 
 397   // Return point for a Java call if there's an exception thrown in
 398   // Java code.  The exception is caught and transformed into a
 399   // pending exception stored in JavaThread that can be tested from
 400   // within the VM.
 401   //
 402   // Note: Usually the parameters are removed by the callee. In case
 403   // of an exception crossing an activation frame boundary, that is
 404   // not the case if the callee is compiled code => need to setup the
 405   // rsp.
 406   //
 407   // r0: exception oop
 408 
 409   address generate_catch_exception() {
 410     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 411     address start = __ pc();
 412 
 413     // same as in generate_call_stub():
 414     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 415     const Address thread        (rfp, thread_off         * wordSize);
 416 
 417 #ifdef ASSERT
 418     // verify that threads correspond
 419     {
 420       Label L, S;
 421       __ ldr(rscratch1, thread);
 422       __ cmp(rthread, rscratch1);
 423       __ br(Assembler::NE, S);
 424       __ get_thread(rscratch1);
 425       __ cmp(rthread, rscratch1);
 426       __ br(Assembler::EQ, L);
 427       __ bind(S);
 428       __ stop("StubRoutines::catch_exception: threads must correspond");
 429       __ bind(L);
 430     }
 431 #endif
 432 
 433     // set pending exception
 434     __ verify_oop(r0);
 435 
 436     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 437     __ mov(rscratch1, (address)__FILE__);
 438     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 439     __ movw(rscratch1, (int)__LINE__);
 440     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 441 
 442     // complete return to VM
 443     assert(StubRoutines::_call_stub_return_address != NULL,
 444            "_call_stub_return_address must have been generated before");
 445     __ b(StubRoutines::_call_stub_return_address);
 446 
 447     return start;
 448   }
 449 
 450   // Continuation point for runtime calls returning with a pending
 451   // exception.  The pending exception check happened in the runtime
 452   // or native call stub.  The pending exception in Thread is
 453   // converted into a Java-level exception.
 454   //
 455   // Contract with Java-level exception handlers:
 456   // r0: exception
 457   // r3: throwing pc
 458   //
 459   // NOTE: At entry of this stub, exception-pc must be in LR !!
 460 
 461   // NOTE: this is always used as a jump target within generated code
 462   // so it just needs to be generated code with no prolog
 463 
 464   address generate_forward_exception() {
 465     StubCodeMark mark(this, "StubRoutines", "forward exception");
 466     address start = __ pc();
 467 
 468     // Upon entry, LR points to the return address returning into
 469     // Java (interpreted or compiled) code; i.e., the return address
 470     // becomes the throwing pc.
 471     //
 472     // Arguments pushed before the runtime call are still on the stack
 473     // but the exception handler will reset the stack pointer ->
 474     // ignore them.  A potential result in registers can be ignored as
 475     // well.
 476 
 477 #ifdef ASSERT
 478     // make sure this code is only executed if there is a pending exception
 479     {
 480       Label L;
 481       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 482       __ cbnz(rscratch1, L);
 483       __ stop("StubRoutines::forward exception: no pending exception (1)");
 484       __ bind(L);
 485     }
 486 #endif
 487 
 488     // compute exception handler into r19
 489 
 490     // call the VM to find the handler address associated with the
 491     // caller address. pass thread in r0 and caller pc (ret address)
 492     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 493     // the stack.
 494     __ mov(c_rarg1, lr);
 495     // lr will be trashed by the VM call so we move it to R19
 496     // (callee-saved) because we also need to pass it to the handler
 497     // returned by this call.
 498     __ mov(r19, lr);
 499     BLOCK_COMMENT("call exception_handler_for_return_address");
 500     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 501                          SharedRuntime::exception_handler_for_return_address),
 502                     rthread, c_rarg1);
 503     // Reinitialize the ptrue predicate register, in case the external runtime
 504     // call clobbers ptrue reg, as we may return to SVE compiled code.
 505     __ reinitialize_ptrue();
 506 
 507     // we should not really care that lr is no longer the callee
 508     // address. we saved the value the handler needs in r19 so we can
 509     // just copy it to r3. however, the C2 handler will push its own
 510     // frame and then call into the VM, and the VM code asserts that
 511     // the PC for the frame above the handler belongs to a compiled
 512     // Java method. So, we restore lr here to satisfy that assert.
 513     __ mov(lr, r19);
 514     // setup r0 & r3 & clear pending exception
 515     __ mov(r3, r19);
 516     __ mov(r19, r0);
 517     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 518     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 519 
 520 #ifdef ASSERT
 521     // make sure exception is set
 522     {
 523       Label L;
 524       __ cbnz(r0, L);
 525       __ stop("StubRoutines::forward exception: no pending exception (2)");
 526       __ bind(L);
 527     }
 528 #endif
 529 
 530     // continue at exception handler
 531     // r0: exception
 532     // r3: throwing pc
 533     // r19: exception handler
 534     __ verify_oop(r0);
 535     __ br(r19);
 536 
 537     return start;
 538   }
 539 
 540   // Non-destructive plausibility checks for oops
 541   //
 542   // Arguments:
 543   //    r0: oop to verify
 544   //    rscratch1: error message
 545   //
 546   // Stack after saving c_rarg3:
 547   //    [tos + 0]: saved c_rarg3
 548   //    [tos + 1]: saved c_rarg2
 549   //    [tos + 2]: saved lr
 550   //    [tos + 3]: saved rscratch2
 551   //    [tos + 4]: saved r0
 552   //    [tos + 5]: saved rscratch1
 553   address generate_verify_oop() {
 554 
 555     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 556     address start = __ pc();
 557 
 558     Label exit, error;
 559 
 560     // save c_rarg2 and c_rarg3
 561     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 562 
 563     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 564     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 565     __ ldr(c_rarg3, Address(c_rarg2));
 566     __ add(c_rarg3, c_rarg3, 1);
 567     __ str(c_rarg3, Address(c_rarg2));
 568 
 569     // object is in r0
 570     // make sure object is 'reasonable'
 571     __ cbz(r0, exit); // if obj is NULL it is OK
 572 
 573 #if INCLUDE_ZGC
 574     if (UseZGC) {
 575       // Check if mask is good.
 576       // verifies that ZAddressBadMask & r0 == 0
 577       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 578       __ andr(c_rarg2, r0, c_rarg3);
 579       __ cbnz(c_rarg2, error);
 580     }
 581 #endif
 582 
 583     // Check if the oop is in the right area of memory
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 585     __ andr(c_rarg2, r0, c_rarg3);
 586     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 587 
 588     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 589     // instruction here because the flags register is live.
 590     __ eor(c_rarg2, c_rarg2, c_rarg3);
 591     __ cbnz(c_rarg2, error);
 592 
 593     // make sure klass is 'reasonable', i.e. not zero.
 594     __ load_klass(r0, r0);  // get klass
 595     __ cbz(r0, error);      // if klass is NULL it is broken
 596 
 597     // return if everything seems ok
 598     __ bind(exit);
 599 
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601     __ ret(lr);
 602 
 603     // handle errors
 604     __ bind(error);
 605     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 606 
 607     __ push(RegSet::range(r0, r29), sp);
 608     // debug(char* msg, int64_t pc, int64_t regs[])
 609     __ mov(c_rarg0, rscratch1);      // pass address of error message
 610     __ mov(c_rarg1, lr);             // pass return address
 611     __ mov(c_rarg2, sp);             // pass address of regs on stack
 612 #ifndef PRODUCT
 613     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 614 #endif
 615     BLOCK_COMMENT("call MacroAssembler::debug");
 616     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 617     __ blr(rscratch1);
 618     __ hlt(0);
 619 
 620     return start;
 621   }
 622 
 623   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 624 
 625   // Generate indices for iota vector.
 626   address generate_iota_indices(const char *stub_name) {
 627     __ align(CodeEntryAlignment);
 628     StubCodeMark mark(this, "StubRoutines", stub_name);
 629     address start = __ pc();
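         // 16 consecutive byte-lane indices 0x00..0x0f, stored little-endian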
 630     __ emit_data64(0x0706050403020100, relocInfo::none);
 631     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 632     return start;
 633   }
 634 
 635   // The inner part of zero_words().  This is the bulk operation,
 636   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 637   // caller is responsible for zeroing the last few words.
 638   //
 639   // Inputs:
 640   // r10: the HeapWord-aligned base address of an array to zero.
 641   // r11: the count in HeapWords, r11 > 0.
 642   //
 643   // Returns r10 and r11, adjusted for the caller to clear.
 644   // r10: the base address of the tail of words left to clear.
 645   // r11: the number of words in the tail.
 646   //      r11 < MacroAssembler::zero_words_block_size.
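       //
       // For illustration only, a rough C sketch of that contract (an
       // interpretation, not the generated code, which may also use DC ZVA):
       //
       //   while (cnt >= MacroAssembler::zero_words_block_size) {
       //     memset(base, 0, MacroAssembler::zero_words_block_size * HeapWordSize);
       //     base += MacroAssembler::zero_words_block_size;
       //     cnt  -= MacroAssembler::zero_words_block_size;
       //   }
       //   // base/cnt now describe the tail the caller still has to clear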
 647 
 648   address generate_zero_blocks() {
 649     Label done;
 650     Label base_aligned;
 651 
 652     Register base = r10, cnt = r11;
 653 
 654     __ align(CodeEntryAlignment);
 655     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 656     address start = __ pc();
 657 
 658     if (UseBlockZeroing) {
 659       int zva_length = VM_Version::zva_length();
 660 
 661       // Ensure ZVA length can be divided by 16. This is required by
 662       // the subsequent operations.
 663       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 664 
 665       __ tbz(base, 3, base_aligned);
 666       __ str(zr, Address(__ post(base, 8)));
 667       __ sub(cnt, cnt, 1);
 668       __ bind(base_aligned);
 669 
 670       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 671       // alignment.
 672       Label small;
 673       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 674       __ subs(rscratch1, cnt, low_limit >> 3);
 675       __ br(Assembler::LT, small);
 676       __ zero_dcache_blocks(base, cnt);
 677       __ bind(small);
 678     }
 679 
 680     {
 681       // Number of stp instructions we'll unroll
 682       const int unroll =
 683         MacroAssembler::zero_words_block_size / 2;
 684       // Clear the remaining blocks.
 685       Label loop;
 686       __ subs(cnt, cnt, unroll * 2);
 687       __ br(Assembler::LT, done);
 688       __ bind(loop);
 689       for (int i = 0; i < unroll; i++)
 690         __ stp(zr, zr, __ post(base, 16));
 691       __ subs(cnt, cnt, unroll * 2);
 692       __ br(Assembler::GE, loop);
 693       __ bind(done);
 694       __ add(cnt, cnt, unroll * 2);
 695     }
 696 
 697     __ ret(lr);
 698 
 699     return start;
 700   }
 701 
 702 
 703   typedef enum {
 704     copy_forwards = 1,
 705     copy_backwards = -1
 706   } copy_direction;
 707 
 708   // Bulk copy of blocks of 8 words.
 709   //
 710   // count is a count of words.
 711   //
 712   // Precondition: count >= 8
 713   //
 714   // Postconditions:
 715   //
 716   // The least significant bit of count contains the remaining count
 717   // of words to copy.  The rest of count is trash.
 718   //
 719   // s and d are adjusted to point to the remaining words to copy
 720   //
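       // For example (illustration only): count == 13 copies 12 words (an
       // 8 word block plus a 4 word subblock); on exit bit 0 of count is 1
       // and s and d point at the single remaining word.
       //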
 721   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 722                            copy_direction direction) {
 723     int unit = wordSize * direction;
 724     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 725 
 726     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 727       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 728     const Register stride = r13;
 729 
 730     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 731     assert_different_registers(s, d, count, rscratch1);
 732 
 733     Label again, drain;
 734     const char *stub_name;
 735     if (direction == copy_forwards)
 736       stub_name = "forward_copy_longs";
 737     else
 738       stub_name = "backward_copy_longs";
 739 
 740     __ align(CodeEntryAlignment);
 741 
 742     StubCodeMark mark(this, "StubRoutines", stub_name);
 743 
 744     __ bind(start);
 745 
 746     Label unaligned_copy_long;
 747     if (AvoidUnalignedAccesses) {
 748       __ tbnz(d, 3, unaligned_copy_long);
 749     }
 750 
 751     if (direction == copy_forwards) {
 752       __ sub(s, s, bias);
 753       __ sub(d, d, bias);
 754     }
 755 
 756 #ifdef ASSERT
 757     // Make sure we are never given < 8 words
 758     {
 759       Label L;
 760       __ cmp(count, (u1)8);
 761       __ br(Assembler::GE, L);
 762       __ stop("generate_copy_longs called with < 8 words");
 763       __ bind(L);
 764     }
 765 #endif
 766 
 767     // Fill 8 registers
 768     if (UseSIMDForMemoryOps) {
 769       __ ldpq(v0, v1, Address(s, 4 * unit));
 770       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 771     } else {
 772       __ ldp(t0, t1, Address(s, 2 * unit));
 773       __ ldp(t2, t3, Address(s, 4 * unit));
 774       __ ldp(t4, t5, Address(s, 6 * unit));
 775       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 776     }
 777 
 778     __ subs(count, count, 16);
 779     __ br(Assembler::LO, drain);
 780 
 781     int prefetch = PrefetchCopyIntervalInBytes;
 782     bool use_stride = false;
 783     if (direction == copy_backwards) {
 784        use_stride = prefetch > 256;
 785        prefetch = -prefetch;
 786        if (use_stride) __ mov(stride, prefetch);
 787     }
 788 
 789     __ bind(again);
 790 
 791     if (PrefetchCopyIntervalInBytes > 0)
 792       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 793 
 794     if (UseSIMDForMemoryOps) {
 795       __ stpq(v0, v1, Address(d, 4 * unit));
 796       __ ldpq(v0, v1, Address(s, 4 * unit));
 797       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 798       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 799     } else {
 800       __ stp(t0, t1, Address(d, 2 * unit));
 801       __ ldp(t0, t1, Address(s, 2 * unit));
 802       __ stp(t2, t3, Address(d, 4 * unit));
 803       __ ldp(t2, t3, Address(s, 4 * unit));
 804       __ stp(t4, t5, Address(d, 6 * unit));
 805       __ ldp(t4, t5, Address(s, 6 * unit));
 806       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 807       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 808     }
 809 
 810     __ subs(count, count, 8);
 811     __ br(Assembler::HS, again);
 812 
 813     // Drain
 814     __ bind(drain);
 815     if (UseSIMDForMemoryOps) {
 816       __ stpq(v0, v1, Address(d, 4 * unit));
 817       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 818     } else {
 819       __ stp(t0, t1, Address(d, 2 * unit));
 820       __ stp(t2, t3, Address(d, 4 * unit));
 821       __ stp(t4, t5, Address(d, 6 * unit));
 822       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 823     }
 824 
 825     {
 826       Label L1, L2;
 827       __ tbz(count, exact_log2(4), L1);
 828       if (UseSIMDForMemoryOps) {
 829         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 830         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 831       } else {
 832         __ ldp(t0, t1, Address(s, 2 * unit));
 833         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 834         __ stp(t0, t1, Address(d, 2 * unit));
 835         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 836       }
 837       __ bind(L1);
 838 
 839       if (direction == copy_forwards) {
 840         __ add(s, s, bias);
 841         __ add(d, d, bias);
 842       }
 843 
 844       __ tbz(count, 1, L2);
 845       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 846       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 847       __ bind(L2);
 848     }
 849 
 850     __ ret(lr);
 851 
 852     if (AvoidUnalignedAccesses) {
 853       Label drain, again;
 854       // Register order for storing. Order is different for backward copy.
 855 
 856       __ bind(unaligned_copy_long);
 857 
 858       // source address is even aligned, target odd aligned
 859       //
 860       // when forward copying word pairs we read long pairs at offsets
 861       // {0, 2, 4, 6} (in long words). when backwards copying we read
 862       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 863       // address by -2 in the forwards case so we can compute the
 864       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 865       // or -1.
 866       //
 867       // when forward copying we need to store 1 word, 3 pairs and
 868       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 869       // zero offset we adjust the destination by -1, which means we
 870       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 871       //
 872       // When backwards copying we need to store 1 word, 3 pairs and
 873       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 874       // offsets {1, 3, 5, 7, 8} * unit.
 875 
 876       if (direction == copy_forwards) {
 877         __ sub(s, s, 16);
 878         __ sub(d, d, 8);
 879       }
 880 
 881       // Fill 8 registers
 882       //
 883       // for forwards copy s was offset by -16 from the original input
 884       // value of s so the register contents are at these offsets
 885       // relative to the 64 bit block addressed by that original input
 886       // and so on for each successive 64 byte block when s is updated
 887       //
 888       // t0 at offset 0,  t1 at offset 8
 889       // t2 at offset 16, t3 at offset 24
 890       // t4 at offset 32, t5 at offset 40
 891       // t6 at offset 48, t7 at offset 56
 892 
 893       // for backwards copy s was not offset so the register contents
 894       // are at these offsets into the preceding 64 byte block
 895       // relative to that original input and so on for each successive
 896       // preceding 64 byte block when s is updated. this explains the
 897       // slightly counter-intuitive looking pattern of register usage
 898       // in the stp instructions for backwards copy.
 899       //
 900       // t0 at offset -16, t1 at offset -8
 901       // t2 at offset -32, t3 at offset -24
 902       // t4 at offset -48, t5 at offset -40
 903       // t6 at offset -64, t7 at offset -56
 904 
 905       __ ldp(t0, t1, Address(s, 2 * unit));
 906       __ ldp(t2, t3, Address(s, 4 * unit));
 907       __ ldp(t4, t5, Address(s, 6 * unit));
 908       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 909 
 910       __ subs(count, count, 16);
 911       __ br(Assembler::LO, drain);
 912 
 913       int prefetch = PrefetchCopyIntervalInBytes;
 914       bool use_stride = false;
 915       if (direction == copy_backwards) {
 916          use_stride = prefetch > 256;
 917          prefetch = -prefetch;
 918          if (use_stride) __ mov(stride, prefetch);
 919       }
 920 
 921       __ bind(again);
 922 
 923       if (PrefetchCopyIntervalInBytes > 0)
 924         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 925 
 926       if (direction == copy_forwards) {
 927        // allowing for the offset of -8 the store instructions place
 928        // registers into the target 64 bit block at the following
 929        // offsets
 930        //
 931        // t0 at offset 0
 932        // t1 at offset 8,  t2 at offset 16
 933        // t3 at offset 24, t4 at offset 32
 934        // t5 at offset 40, t6 at offset 48
 935        // t7 at offset 56
 936 
 937         __ str(t0, Address(d, 1 * unit));
 938         __ stp(t1, t2, Address(d, 2 * unit));
 939         __ ldp(t0, t1, Address(s, 2 * unit));
 940         __ stp(t3, t4, Address(d, 4 * unit));
 941         __ ldp(t2, t3, Address(s, 4 * unit));
 942         __ stp(t5, t6, Address(d, 6 * unit));
 943         __ ldp(t4, t5, Address(s, 6 * unit));
 944         __ str(t7, Address(__ pre(d, 8 * unit)));
 945         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 946       } else {
 947        // d was not offset when we started so the registers are
 948        // written into the 64 bit block preceding d with the following
 949        // offsets
 950        //
 951        // t1 at offset -8
 952        // t3 at offset -24, t0 at offset -16
 953        // t5 at offset -40, t2 at offset -32
 954        // t7 at offset -56, t4 at offset -48
 955        //                   t6 at offset -64
 956        //
 957        // note that this matches the offsets previously noted for the
 958        // loads
 959 
 960         __ str(t1, Address(d, 1 * unit));
 961         __ stp(t3, t0, Address(d, 3 * unit));
 962         __ ldp(t0, t1, Address(s, 2 * unit));
 963         __ stp(t5, t2, Address(d, 5 * unit));
 964         __ ldp(t2, t3, Address(s, 4 * unit));
 965         __ stp(t7, t4, Address(d, 7 * unit));
 966         __ ldp(t4, t5, Address(s, 6 * unit));
 967         __ str(t6, Address(__ pre(d, 8 * unit)));
 968         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 969       }
 970 
 971       __ subs(count, count, 8);
 972       __ br(Assembler::HS, again);
 973 
 974       // Drain
 975       //
 976       // this uses the same pattern of offsets and register arguments
 977       // as above
 978       __ bind(drain);
 979       if (direction == copy_forwards) {
 980         __ str(t0, Address(d, 1 * unit));
 981         __ stp(t1, t2, Address(d, 2 * unit));
 982         __ stp(t3, t4, Address(d, 4 * unit));
 983         __ stp(t5, t6, Address(d, 6 * unit));
 984         __ str(t7, Address(__ pre(d, 8 * unit)));
 985       } else {
 986         __ str(t1, Address(d, 1 * unit));
 987         __ stp(t3, t0, Address(d, 3 * unit));
 988         __ stp(t5, t2, Address(d, 5 * unit));
 989         __ stp(t7, t4, Address(d, 7 * unit));
 990         __ str(t6, Address(__ pre(d, 8 * unit)));
 991       }
 992       // now we need to copy any remaining part block which may
 993       // include a 4 word subblock and/or a 2 word subblock.
 994       // bits 2 and 1 in the count are the tell-tale for whether we
 995       // have each such subblock
 996       {
 997         Label L1, L2;
 998         __ tbz(count, exact_log2(4), L1);
 999        // this is the same as above but copying only 4 longs hence
1000        // with only one intervening stp between the str instructions
1001        // but note that the offsets and registers still follow the
1002        // same pattern
1003         __ ldp(t0, t1, Address(s, 2 * unit));
1004         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1005         if (direction == copy_forwards) {
1006           __ str(t0, Address(d, 1 * unit));
1007           __ stp(t1, t2, Address(d, 2 * unit));
1008           __ str(t3, Address(__ pre(d, 4 * unit)));
1009         } else {
1010           __ str(t1, Address(d, 1 * unit));
1011           __ stp(t3, t0, Address(d, 3 * unit));
1012           __ str(t2, Address(__ pre(d, 4 * unit)));
1013         }
1014         __ bind(L1);
1015 
1016         __ tbz(count, 1, L2);
1017        // this is the same as above but copying only 2 longs hence
1018        // there is no intervening stp between the str instructions
1019        // but note that the offset and register patterns are still
1020        // the same
1021         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1022         if (direction == copy_forwards) {
1023           __ str(t0, Address(d, 1 * unit));
1024           __ str(t1, Address(__ pre(d, 2 * unit)));
1025         } else {
1026           __ str(t1, Address(d, 1 * unit));
1027           __ str(t0, Address(__ pre(d, 2 * unit)));
1028         }
1029         __ bind(L2);
1030 
1031        // for forwards copy we need to re-adjust the offsets we
1032        // applied so that s and d follow the last words written
1033 
1034        if (direction == copy_forwards) {
1035          __ add(s, s, 16);
1036          __ add(d, d, 8);
1037        }
1038 
1039       }
1040 
1041       __ ret(lr);
1042       }
1043   }
1044 
1045   // Small copy: less than 16 bytes.
1046   //
1047   // NB: Ignores all of the bits of count which represent more than 15
1048   // bytes, so a caller doesn't have to mask them.
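       //
       // For illustration only, a byte-level C sketch of the forward case
       // (the generated code works in units of the copy granularity and also
       // handles the backward direction):
       //
       //   size_t bytes = count * granularity;     // only bytes & 15 matters
       //   if (bytes & 8) { memcpy(d, s, 8); s += 8; d += 8; }
       //   if (bytes & 4) { memcpy(d, s, 4); s += 4; d += 4; }
       //   if (bytes & 2) { memcpy(d, s, 2); s += 2; d += 2; }
       //   if (bytes & 1) { *d = *s; }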
1049 
1050   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1051     bool is_backwards = step < 0;
1052     size_t granularity = uabs(step);
1053     int direction = is_backwards ? -1 : 1;
1054     int unit = wordSize * direction;
1055 
1056     Label Lword, Lint, Lshort, Lbyte;
1057 
1058     assert(granularity
1059            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1060 
1061     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1062 
1063     // ??? I don't know if this bit-test-and-branch is the right thing
1064     // to do.  It does a lot of jumping, resulting in several
1065     // mispredicted branches.  It might make more sense to do this
1066     // with something like Duff's device with a single computed branch.
1067 
1068     __ tbz(count, 3 - exact_log2(granularity), Lword);
1069     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1070     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1071     __ bind(Lword);
1072 
1073     if (granularity <= sizeof (jint)) {
1074       __ tbz(count, 2 - exact_log2(granularity), Lint);
1075       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1076       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1077       __ bind(Lint);
1078     }
1079 
1080     if (granularity <= sizeof (jshort)) {
1081       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1082       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1083       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1084       __ bind(Lshort);
1085     }
1086 
1087     if (granularity <= sizeof (jbyte)) {
1088       __ tbz(count, 0, Lbyte);
1089       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1090       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1091       __ bind(Lbyte);
1092     }
1093   }
1094 
1095   Label copy_f, copy_b;
1096 
1097   // All-singing all-dancing memory copy.
1098   //
1099   // Copy count units of memory from s to d.  The size of a unit is
1100   // step, which can be positive or negative depending on the direction
1101   // of copy.  If is_aligned is false, we align the source address.
1102   //
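       // In effect this is a memmove over count elements of |step| bytes: a
       // negative step selects the backward (high to low) direction so that
       // overlapping conjoint copies remain correct.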
1103 
1104   void copy_memory(bool is_aligned, Register s, Register d,
1105                    Register count, Register tmp, int step) {
1106     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1107     bool is_backwards = step < 0;
1108     unsigned int granularity = uabs(step);
1109     const Register t0 = r3, t1 = r4;
1110 
1111     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't
1112     // matter because we always load all the data before writing anything.
1113     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1114     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1115     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1116     const Register send = r17, dend = r16;
1117 
1118     if (PrefetchCopyIntervalInBytes > 0)
1119       __ prfm(Address(s, 0), PLDL1KEEP);
1120     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1121     __ br(Assembler::HI, copy_big);
1122 
1123     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1124     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1125 
1126     __ cmp(count, u1(16/granularity));
1127     __ br(Assembler::LS, copy16);
1128 
1129     __ cmp(count, u1(64/granularity));
1130     __ br(Assembler::HI, copy80);
1131 
1132     __ cmp(count, u1(32/granularity));
1133     __ br(Assembler::LS, copy32);
1134 
1135     // 33..64 bytes
1136     if (UseSIMDForMemoryOps) {
1137       __ ldpq(v0, v1, Address(s, 0));
1138       __ ldpq(v2, v3, Address(send, -32));
1139       __ stpq(v0, v1, Address(d, 0));
1140       __ stpq(v2, v3, Address(dend, -32));
1141     } else {
1142       __ ldp(t0, t1, Address(s, 0));
1143       __ ldp(t2, t3, Address(s, 16));
1144       __ ldp(t4, t5, Address(send, -32));
1145       __ ldp(t6, t7, Address(send, -16));
1146 
1147       __ stp(t0, t1, Address(d, 0));
1148       __ stp(t2, t3, Address(d, 16));
1149       __ stp(t4, t5, Address(dend, -32));
1150       __ stp(t6, t7, Address(dend, -16));
1151     }
1152     __ b(finish);
1153 
1154     // 17..32 bytes
1155     __ bind(copy32);
1156     __ ldp(t0, t1, Address(s, 0));
1157     __ ldp(t2, t3, Address(send, -16));
1158     __ stp(t0, t1, Address(d, 0));
1159     __ stp(t2, t3, Address(dend, -16));
1160     __ b(finish);
1161 
1162     // 65..80/96 bytes
1163     // (96 bytes if SIMD because we do 32 bytes per instruction)
1164     __ bind(copy80);
1165     if (UseSIMDForMemoryOps) {
1166       __ ldpq(v0, v1, Address(s, 0));
1167       __ ldpq(v2, v3, Address(s, 32));
1168       // Unaligned pointers can be an issue for copying.
1169       // The issue has more chances to happen when granularity of data is
1170     // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1171       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1172       // The most performance drop has been seen for the range 65-80 bytes.
1173       // For such cases using the pair of ldp/stp instead of the third pair of
1174       // ldpq/stpq fixes the performance issue.
1175       if (granularity < sizeof (jint)) {
1176         Label copy96;
1177         __ cmp(count, u1(80/granularity));
1178         __ br(Assembler::HI, copy96);
1179         __ ldp(t0, t1, Address(send, -16));
1180 
1181         __ stpq(v0, v1, Address(d, 0));
1182         __ stpq(v2, v3, Address(d, 32));
1183         __ stp(t0, t1, Address(dend, -16));
1184         __ b(finish);
1185 
1186         __ bind(copy96);
1187       }
1188       __ ldpq(v4, v5, Address(send, -32));
1189 
1190       __ stpq(v0, v1, Address(d, 0));
1191       __ stpq(v2, v3, Address(d, 32));
1192       __ stpq(v4, v5, Address(dend, -32));
1193     } else {
1194       __ ldp(t0, t1, Address(s, 0));
1195       __ ldp(t2, t3, Address(s, 16));
1196       __ ldp(t4, t5, Address(s, 32));
1197       __ ldp(t6, t7, Address(s, 48));
1198       __ ldp(t8, t9, Address(send, -16));
1199 
1200       __ stp(t0, t1, Address(d, 0));
1201       __ stp(t2, t3, Address(d, 16));
1202       __ stp(t4, t5, Address(d, 32));
1203       __ stp(t6, t7, Address(d, 48));
1204       __ stp(t8, t9, Address(dend, -16));
1205     }
1206     __ b(finish);
1207 
1208     // 0..16 bytes
1209     __ bind(copy16);
1210     __ cmp(count, u1(8/granularity));
1211     __ br(Assembler::LO, copy8);
1212 
1213     // 8..16 bytes
1214     __ ldr(t0, Address(s, 0));
1215     __ ldr(t1, Address(send, -8));
1216     __ str(t0, Address(d, 0));
1217     __ str(t1, Address(dend, -8));
1218     __ b(finish);
1219 
1220     if (granularity < 8) {
1221       // 4..7 bytes
1222       __ bind(copy8);
1223       __ tbz(count, 2 - exact_log2(granularity), copy4);
1224       __ ldrw(t0, Address(s, 0));
1225       __ ldrw(t1, Address(send, -4));
1226       __ strw(t0, Address(d, 0));
1227       __ strw(t1, Address(dend, -4));
1228       __ b(finish);
1229       if (granularity < 4) {
1230         // 0..3 bytes
1231         __ bind(copy4);
1232         __ cbz(count, finish); // get rid of 0 case
1233         if (granularity == 2) {
1234           __ ldrh(t0, Address(s, 0));
1235           __ strh(t0, Address(d, 0));
1236         } else { // granularity == 1
1237           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1238           // the first and last byte.
1239           // Handle the 3 byte case by loading and storing base + count/2
1240           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1241           // This does mean in the 1 byte case we load/store the same
1242           // byte 3 times.
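               // For example (illustration only): count == 3 copies s[0]->d[0],
               // s[2]->d[2] and s[1]->d[1]; count == 1 copies s[0]->d[0] three
               // times.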
1243           __ lsr(count, count, 1);
1244           __ ldrb(t0, Address(s, 0));
1245           __ ldrb(t1, Address(send, -1));
1246           __ ldrb(t2, Address(s, count));
1247           __ strb(t0, Address(d, 0));
1248           __ strb(t1, Address(dend, -1));
1249           __ strb(t2, Address(d, count));
1250         }
1251         __ b(finish);
1252       }
1253     }
1254 
1255     __ bind(copy_big);
1256     if (is_backwards) {
1257       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1258       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1259     }
1260 
1261     // Now we've got the small case out of the way we can align the
1262     // source address on a 2-word boundary.
1263 
1264     Label aligned;
1265 
1266     if (is_aligned) {
1267       // We may have to adjust by 1 word to get s 2-word-aligned.
1268       __ tbz(s, exact_log2(wordSize), aligned);
1269       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1270       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1271       __ sub(count, count, wordSize/granularity);
1272     } else {
1273       if (is_backwards) {
1274         __ andr(rscratch2, s, 2 * wordSize - 1);
1275       } else {
1276         __ neg(rscratch2, s);
1277         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1278       }
1279       // rscratch2 is the byte adjustment needed to align s.
1280       __ cbz(rscratch2, aligned);
1281       int shift = exact_log2(granularity);
1282       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1283       __ sub(count, count, rscratch2);
1284 
1285 #if 0
1286       // ?? This code is only correct for a disjoint copy.  It may or
1287       // may not make sense to use it in that case.
1288 
1289       // Copy the first pair; s and d may not be aligned.
1290       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1291       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1292 
1293       // Align s and d, adjust count
1294       if (is_backwards) {
1295         __ sub(s, s, rscratch2);
1296         __ sub(d, d, rscratch2);
1297       } else {
1298         __ add(s, s, rscratch2);
1299         __ add(d, d, rscratch2);
1300       }
1301 #else
1302       copy_memory_small(s, d, rscratch2, rscratch1, step);
1303 #endif
1304     }
1305 
1306     __ bind(aligned);
1307 
1308     // s is now 2-word-aligned.
1309 
1310     // We have a count of units and some trailing bytes.  Adjust the
1311     // count and do a bulk copy of words.
1312     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1313     if (direction == copy_forwards)
1314       __ bl(copy_f);
1315     else
1316       __ bl(copy_b);
1317 
1318     // And the tail.
1319     copy_memory_small(s, d, count, tmp, step);
1320 
1321     if (granularity >= 8) __ bind(copy8);
1322     if (granularity >= 4) __ bind(copy4);
1323     __ bind(finish);
1324   }
1325 
1326 
1327   void clobber_registers() {
1328 #ifdef ASSERT
1329     RegSet clobbered
1330       = MacroAssembler::call_clobbered_registers() - rscratch1;
1331     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1332     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1333     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1334       __ mov(*it, rscratch1);
1335     }
1336 #endif
1337 
1338   }
1339 
1340   // Scan over array at a for count oops, verifying each one.
1341   // Preserves a and count, clobbers rscratch1 and rscratch2.
1342   void verify_oop_array (int size, Register a, Register count, Register temp) {
1343     Label loop, end;
1344     __ mov(rscratch1, a);
1345     __ mov(rscratch2, zr);
1346     __ bind(loop);
1347     __ cmp(rscratch2, count);
1348     __ br(Assembler::HS, end);
1349     if (size == wordSize) {
1350       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1351       __ verify_oop(temp);
1352     } else {
1353       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1354       __ decode_heap_oop(temp); // calls verify_oop
1355     }
1356     __ add(rscratch2, rscratch2, 1);
1357     __ b(loop);
1358     __ bind(end);
1359   }
1360 
1361   // Arguments:
1362   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1363   //             ignored
1364   //   is_oop  - true => oop array, so generate store check code
1365   //   name    - stub name string
1366   //
1367   // Inputs:
1368   //   c_rarg0   - source array address
1369   //   c_rarg1   - destination array address
1370   //   c_rarg2   - element count, treated as ssize_t, can be zero
1371   //
1372   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1373   // the hardware handle it.  The two dwords within qwords that span
1374   // cache line boundaries will still be loaded and stored atomically.
1375   //
1376   // Side Effects:
1377   //   disjoint_int_copy_entry is set to the no-overlap entry point
1378   //   used by generate_conjoint_int_oop_copy().
1379   //
1380   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1381                                   const char *name, bool dest_uninitialized = false) {
1382     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1383     RegSet saved_reg = RegSet::of(s, d, count);
1384     __ align(CodeEntryAlignment);
1385     StubCodeMark mark(this, "StubRoutines", name);
1386     address start = __ pc();
1387     __ enter();
1388 
1389     if (entry != NULL) {
1390       *entry = __ pc();
1391       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1392       BLOCK_COMMENT("Entry:");
1393     }
1394 
1395     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1396     if (dest_uninitialized) {
1397       decorators |= IS_DEST_UNINITIALIZED;
1398     }
1399     if (aligned) {
1400       decorators |= ARRAYCOPY_ALIGNED;
1401     }
1402 
1403     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1404     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1405 
1406     if (is_oop) {
1407       // save regs before copy_memory
1408       __ push(RegSet::of(d, count), sp);
1409     }
1410     {
1411       // UnsafeCopyMemory page error: continue after ucm
1412       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1413       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1414       copy_memory(aligned, s, d, count, rscratch1, size);
1415     }
1416 
1417     if (is_oop) {
1418       __ pop(RegSet::of(d, count), sp);
1419       if (VerifyOops)
1420         verify_oop_array(size, d, count, r16);
1421     }
1422 
1423     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1424 
1425     __ leave();
1426     __ mov(r0, zr); // return 0
1427     __ ret(lr);
1428     return start;
1429   }
1430 
1431   // Arguments:
1432   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1433   //             ignored
1434   //   is_oop  - true => oop array, so generate store check code
1435   //   name    - stub name string
1436   //
1437   // Inputs:
1438   //   c_rarg0   - source array address
1439   //   c_rarg1   - destination array address
1440   //   c_rarg2   - element count, treated as ssize_t, can be zero
1441   //
1442   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1443   // the hardware handle it.  The two dwords within qwords that span
1444   // cache line boundaries will still be loaded and stored atomically.
1445   //
1446   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1447                                  address *entry, const char *name,
1448                                  bool dest_uninitialized = false) {
1449     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1450     RegSet saved_regs = RegSet::of(s, d, count);
1451     StubCodeMark mark(this, "StubRoutines", name);
1452     address start = __ pc();
1453     __ enter();
1454 
1455     if (entry != NULL) {
1456       *entry = __ pc();
1457       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1458       BLOCK_COMMENT("Entry:");
1459     }
1460 
1461     // use fwd copy when (d-s) above_equal (count*size)
1462     __ sub(rscratch1, d, s);
1463     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1464     __ br(Assembler::HS, nooverlap_target);
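         // (the compare is unsigned, so when d precedes s the difference wraps
         //  to a large value and we also take the forward-copy path, which is
         //  safe in that case)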
1465 
1466     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1467     if (dest_uninitialized) {
1468       decorators |= IS_DEST_UNINITIALIZED;
1469     }
1470     if (aligned) {
1471       decorators |= ARRAYCOPY_ALIGNED;
1472     }
1473 
1474     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1475     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1476 
1477     if (is_oop) {
1478       // save regs before copy_memory
1479       __ push(RegSet::of(d, count), sp);
1480     }
1481     {
1482       // UnsafeCopyMemory page error: continue after ucm
1483       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1484       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1485       copy_memory(aligned, s, d, count, rscratch1, -size);
1486     }
1487     if (is_oop) {
1488       __ pop(RegSet::of(d, count), sp);
1489       if (VerifyOops)
1490         verify_oop_array(size, d, count, r16);
1491     }
1492     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1493     __ leave();
1494     __ mov(r0, zr); // return 0
1495     __ ret(lr);
1496     return start;
1497   }
1498 
1499   // Arguments:
1500   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1501   //             ignored
1502   //   name    - stub name string
1503   //
1504   // Inputs:
1505   //   c_rarg0   - source array address
1506   //   c_rarg1   - destination array address
1507   //   c_rarg2   - element count, treated as ssize_t, can be zero
1508   //
1509   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1510   // we let the hardware handle it.  The one to eight bytes within words,
1511   // dwords or qwords that span cache line boundaries will still be loaded
1512   // and stored atomically.
1513   //
1520   //
1521   // Side Effects:
1522   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1523   //   used by generate_conjoint_byte_copy().
1524   //
1525   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1526     const bool not_oop = false;
1527     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1528   }
1529 
1530   // Arguments:
1531   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1532   //             ignored
1533   //   name    - stub name string
1534   //
1535   // Inputs:
1536   //   c_rarg0   - source array address
1537   //   c_rarg1   - destination array address
1538   //   c_rarg2   - element count, treated as ssize_t, can be zero
1539   //
1540   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1541   // we let the hardware handle it.  The one to eight bytes within words,
1542   // dwords or qwords that span cache line boundaries will still be loaded
1543   // and stored atomically.
1544   //
1545   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1546                                       address* entry, const char *name) {
1547     const bool not_oop = false;
1548     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1549   }
1550 
1551   // Arguments:
1552   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1553   //             ignored
1554   //   name    - stub name string
1555   //
1556   // Inputs:
1557   //   c_rarg0   - source array address
1558   //   c_rarg1   - destination array address
1559   //   c_rarg2   - element count, treated as ssize_t, can be zero
1560   //
1561   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1562   // let the hardware handle it.  The two or four words within dwords
1563   // or qwords that span cache line boundaries will still be loaded
1564   // and stored atomically.
1565   //
1566   // Side Effects:
1567   //   disjoint_short_copy_entry is set to the no-overlap entry point
1568   //   used by generate_conjoint_short_copy().
1569   //
1570   address generate_disjoint_short_copy(bool aligned,
1571                                        address* entry, const char *name) {
1572     const bool not_oop = false;
1573     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1574   }
1575 
1576   // Arguments:
1577   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1578   //             ignored
1579   //   name    - stub name string
1580   //
1581   // Inputs:
1582   //   c_rarg0   - source array address
1583   //   c_rarg1   - destination array address
1584   //   c_rarg2   - element count, treated as ssize_t, can be zero
1585   //
1586   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1587   // let the hardware handle it.  The two or four words within dwords
1588   // or qwords that span cache line boundaries will still be loaded
1589   // and stored atomically.
1590   //
1591   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1592                                        address *entry, const char *name) {
1593     const bool not_oop = false;
1594     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1595   }
1596 
1597   // Arguments:
1598   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1599   //             ignored
1600   //   name    - stub name string
1601   //
1602   // Inputs:
1603   //   c_rarg0   - source array address
1604   //   c_rarg1   - destination array address
1605   //   c_rarg2   - element count, treated as ssize_t, can be zero
1606   //
1607   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1608   // the hardware handle it.  The two dwords within qwords that span
1609   // cache line boundaries will still be loaded and stored atomically.
1610   //
1611   // Side Effects:
1612   //   disjoint_int_copy_entry is set to the no-overlap entry point
1613   //   used by generate_conjoint_int_oop_copy().
1614   //
1615   address generate_disjoint_int_copy(bool aligned, address *entry,
1616                                          const char *name, bool dest_uninitialized = false) {
1617     const bool not_oop = false;
1618     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1619   }
1620 
1621   // Arguments:
1622   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1623   //             ignored
1624   //   name    - stub name string
1625   //
1626   // Inputs:
1627   //   c_rarg0   - source array address
1628   //   c_rarg1   - destination array address
1629   //   c_rarg2   - element count, treated as ssize_t, can be zero
1630   //
1631   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1632   // the hardware handle it.  The two dwords within qwords that span
1633   // cache line boundaries will still be loaded and stored atomically.
1634   //
1635   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1636                                      address *entry, const char *name,
1637                                      bool dest_uninitialized = false) {
1638     const bool not_oop = false;
1639     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1640   }
1641 
1642 
1643   // Arguments:
1644   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1645   //             ignored
1646   //   name    - stub name string
1647   //
1648   // Inputs:
1649   //   c_rarg0   - source array address
1650   //   c_rarg1   - destination array address
1651   //   c_rarg2   - element count, treated as size_t, can be zero
1652   //
1653   // Side Effects:
1654   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1655   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1656   //
1657   address generate_disjoint_long_copy(bool aligned, address *entry,
1658                                           const char *name, bool dest_uninitialized = false) {
1659     const bool not_oop = false;
1660     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1661   }
1662 
1663   // Arguments:
1664   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1665   //             ignored
1666   //   name    - stub name string
1667   //
1668   // Inputs:
1669   //   c_rarg0   - source array address
1670   //   c_rarg1   - destination array address
1671   //   c_rarg2   - element count, treated as size_t, can be zero
1672   //
1673   address generate_conjoint_long_copy(bool aligned,
1674                                       address nooverlap_target, address *entry,
1675                                       const char *name, bool dest_uninitialized = false) {
1676     const bool not_oop = false;
1677     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1678   }
1679 
1680   // Arguments:
1681   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1682   //             ignored
1683   //   name    - stub name string
1684   //
1685   // Inputs:
1686   //   c_rarg0   - source array address
1687   //   c_rarg1   - destination array address
1688   //   c_rarg2   - element count, treated as size_t, can be zero
1689   //
1690   // Side Effects:
1691   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1692   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1693   //
1694   address generate_disjoint_oop_copy(bool aligned, address *entry,
1695                                      const char *name, bool dest_uninitialized) {
1696     const bool is_oop = true;
1697     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
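          // (narrow oops occupy 4 bytes in the heap, uncompressed oops 8 bytes)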
1698     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1699   }
1700 
1701   // Arguments:
1702   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1703   //             ignored
1704   //   name    - stub name string
1705   //
1706   // Inputs:
1707   //   c_rarg0   - source array address
1708   //   c_rarg1   - destination array address
1709   //   c_rarg2   - element count, treated as size_t, can be zero
1710   //
1711   address generate_conjoint_oop_copy(bool aligned,
1712                                      address nooverlap_target, address *entry,
1713                                      const char *name, bool dest_uninitialized) {
1714     const bool is_oop = true;
1715     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1716     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1717                                   name, dest_uninitialized);
1718   }
1719 
1720 
1721   // Helper for generating a dynamic type check.
1722   // Smashes rscratch1, rscratch2.
1723   void generate_type_check(Register sub_klass,
1724                            Register super_check_offset,
1725                            Register super_klass,
1726                            Label& L_success) {
1727     assert_different_registers(sub_klass, super_check_offset, super_klass);
1728 
1729     BLOCK_COMMENT("type_check:");
1730 
1731     Label L_miss;
1732 
1733     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1734                                      super_check_offset);
1735     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1736 
1737     // Fall through on failure!
1738     __ BIND(L_miss);
1739   }
1740 
1741   //
1742   //  Generate checkcasting array copy stub
1743   //
1744   //  Input:
1745   //    c_rarg0   - source array address
1746   //    c_rarg1   - destination array address
1747   //    c_rarg2   - element count, treated as ssize_t, can be zero
1748   //    c_rarg3   - size_t ckoff (super_check_offset)
1749   //    c_rarg4   - oop ckval (super_klass)
1750   //
1751   //  Output:
1752   //    r0 ==  0  -  success
1753   //    r0 == -1^K - failure, where K is partial transfer count
1754   //
1755   address generate_checkcast_copy(const char *name, address *entry,
1756                                   bool dest_uninitialized = false) {
1757 
1758     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1759 
1760     // Input registers (after setup_arg_regs)
1761     const Register from        = c_rarg0;   // source array address
1762     const Register to          = c_rarg1;   // destination array address
1763     const Register count       = c_rarg2;   // elements count
1764     const Register ckoff       = c_rarg3;   // super_check_offset
1765     const Register ckval       = c_rarg4;   // super_klass
1766 
1767     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1768     RegSet wb_post_saved_regs = RegSet::of(count);
1769 
1770     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1771     const Register copied_oop  = r22;       // actual oop copied
1772     const Register count_save  = r21;       // orig elements count
1773     const Register start_to    = r20;       // destination array start address
1774     const Register r19_klass   = r19;       // oop._klass
1775 
1776     //---------------------------------------------------------------
1777     // Assembler stub will be used for this call to arraycopy
1778     // if the two arrays are subtypes of Object[] but the
1779     // destination array type is not equal to or a supertype
1780     // of the source type.  Each element must be separately
1781     // checked.
1782 
1783     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1784                                copied_oop, r19_klass, count_save);
1785 
1786     __ align(CodeEntryAlignment);
1787     StubCodeMark mark(this, "StubRoutines", name);
1788     address start = __ pc();
1789 
1790     __ enter(); // required for proper stackwalking of RuntimeStub frame
1791 
1792 #ifdef ASSERT
1793     // caller guarantees that the arrays really are different
1794     // otherwise, we would have to make conjoint checks
1795     { Label L;
1796       array_overlap_test(L, TIMES_OOP);
1797       __ stop("checkcast_copy within a single array");
1798       __ bind(L);
1799     }
1800 #endif //ASSERT
1801 
1802     // Caller of this entry point must set up the argument registers.
1803     if (entry != NULL) {
1804       *entry = __ pc();
1805       BLOCK_COMMENT("Entry:");
1806     }
1807 
1808     // Empty array: nothing to do.
1809     __ cbz(count, L_done);
1810     __ push(RegSet::of(r19, r20, r21, r22), sp);
1811 
1812 #ifdef ASSERT
1813     BLOCK_COMMENT("assert consistent ckoff/ckval");
1814     // The ckoff and ckval must be mutually consistent,
1815     // even though caller generates both.
1816     { Label L;
1817       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1818       __ ldrw(start_to, Address(ckval, sco_offset));
1819       __ cmpw(ckoff, start_to);
1820       __ br(Assembler::EQ, L);
1821       __ stop("super_check_offset inconsistent");
1822       __ bind(L);
1823     }
1824 #endif //ASSERT
1825 
1826     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1827     bool is_oop = true;
1828     if (dest_uninitialized) {
1829       decorators |= IS_DEST_UNINITIALIZED;
1830     }
1831 
1832     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1833     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1834 
1835     // save the original count
1836     __ mov(count_save, count);
1837 
1838     // Copy from low to high addresses
1839     __ mov(start_to, to);              // Save destination array start address
1840     __ b(L_load_element);
1841 
1842     // ======== begin loop ========
1843     // (Loop is rotated; its entry is L_load_element.)
1844     // Loop control:
1845     //   for (; count != 0; count--) {
1846     //     copied_oop = load_heap_oop(from++);
1847     //     ... generate_type_check ...;
1848     //     store_heap_oop(to++, copied_oop);
1849     //   }
1850     __ align(OptoLoopAlignment);
1851 
1852     __ BIND(L_store_element);
1853     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1854     __ sub(count, count, 1);
1855     __ cbz(count, L_do_card_marks);
1856 
1857     // ======== loop entry is here ========
1858     __ BIND(L_load_element);
1859     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1860     __ cbz(copied_oop, L_store_element);
1861 
1862     __ load_klass(r19_klass, copied_oop);// query the object klass
1863     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1864     // ======== end loop ========
1865 
1866     // It was a real error; we must depend on the caller to finish the job.
1867     // Register count = remaining oops, count_save = total oops.
1868     // Emit GC store barriers for the oops we have copied and report
1869     // their number to the caller.
1870 
1871     __ subs(count, count_save, count);     // K = partially copied oop count
1872     __ eon(count, count, zr);                   // report (-1^K) to caller
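          // (eon with zr is a bitwise NOT, so count now holds ~K, which equals -1 ^ K)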
1873     __ br(Assembler::EQ, L_done_pop);
1874 
1875     __ BIND(L_do_card_marks);
1876     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1877 
1878     __ bind(L_done_pop);
1879     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1880     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1881 
1882     __ bind(L_done);
1883     __ mov(r0, count);
1884     __ leave();
1885     __ ret(lr);
1886 
1887     return start;
1888   }
1889 
1890   // Perform range checks on the proposed arraycopy.
1891   // Kills temp, but nothing else.
1892   // Also, clean the sign bits of src_pos and dst_pos.
1893   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1894                               Register src_pos, // source position (c_rarg1)
1895                               Register dst,     // destination array oop (c_rarg2)
1896                               Register dst_pos, // destination position (c_rarg3)
1897                               Register length,
1898                               Register temp,
1899                               Label& L_failed) {
1900     BLOCK_COMMENT("arraycopy_range_checks:");
1901 
1902     assert_different_registers(rscratch1, temp);
1903 
1904     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1905     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1906     __ addw(temp, length, src_pos);
1907     __ cmpw(temp, rscratch1);
1908     __ br(Assembler::HI, L_failed);
1909 
1910     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1911     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1912     __ addw(temp, length, dst_pos);
1913     __ cmpw(temp, rscratch1);
1914     __ br(Assembler::HI, L_failed);
1915 
1916     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1917     __ movw(src_pos, src_pos);
1918     __ movw(dst_pos, dst_pos);
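          // (the 32-bit register writes above zero-extend, clearing bits 63:32)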
1919 
1920     BLOCK_COMMENT("arraycopy_range_checks done");
1921   }
1922 
1923   // These stubs get called from some dumb test routine.
1924   // I'll write them properly when they're called from
1925   // something that's actually doing something.
1926   static void fake_arraycopy_stub(address src, address dst, int count) {
1927     assert(count == 0, "huh?");
1928   }
1929 
1930 
1931   //
1932   //  Generate 'unsafe' array copy stub
1933   //  Though just as safe as the other stubs, it takes an unscaled
1934   //  size_t argument instead of an element count.
1935   //
1936   //  Input:
1937   //    c_rarg0   - source array address
1938   //    c_rarg1   - destination array address
1939   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1940   //
1941   // Examines the alignment of the operands and dispatches
1942   // to a long, int, short, or byte copy loop.
1943   //
1944   address generate_unsafe_copy(const char *name,
1945                                address byte_copy_entry,
1946                                address short_copy_entry,
1947                                address int_copy_entry,
1948                                address long_copy_entry) {
1949     Label L_long_aligned, L_int_aligned, L_short_aligned;
1950     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1951 
1952     __ align(CodeEntryAlignment);
1953     StubCodeMark mark(this, "StubRoutines", name);
1954     address start = __ pc();
1955     __ enter(); // required for proper stackwalking of RuntimeStub frame
1956 
1957     // bump this on entry, not on exit:
1958     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1959 
1960     __ orr(rscratch1, s, d);
1961     __ orr(rscratch1, rscratch1, count);
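          // rscratch1 now has a bit set wherever any of s, d or count has one;
          // its low bits therefore give the coarsest alignment shared by all three.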
1962 
1963     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1964     __ cbz(rscratch1, L_long_aligned);
1965     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1966     __ cbz(rscratch1, L_int_aligned);
1967     __ tbz(rscratch1, 0, L_short_aligned);
1968     __ b(RuntimeAddress(byte_copy_entry));
1969 
1970     __ BIND(L_short_aligned);
1971     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1972     __ b(RuntimeAddress(short_copy_entry));
1973     __ BIND(L_int_aligned);
1974     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1975     __ b(RuntimeAddress(int_copy_entry));
1976     __ BIND(L_long_aligned);
1977     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1978     __ b(RuntimeAddress(long_copy_entry));
1979 
1980     return start;
1981   }
1982 
1983   //
1984   //  Generate generic array copy stubs
1985   //
1986   //  Input:
1987   //    c_rarg0    -  src oop
1988   //    c_rarg1    -  src_pos (32-bits)
1989   //    c_rarg2    -  dst oop
1990   //    c_rarg3    -  dst_pos (32-bits)
1991   //    c_rarg4    -  element count (32-bits)
1992   //
1993   //  Output:
1994   //    r0 ==  0  -  success
1995   //    r0 == -1^K - failure, where K is partial transfer count
1996   //
1997   address generate_generic_copy(const char *name,
1998                                 address byte_copy_entry, address short_copy_entry,
1999                                 address int_copy_entry, address oop_copy_entry,
2000                                 address long_copy_entry, address checkcast_copy_entry) {
2001 
2002     Label L_failed, L_objArray;
2003     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2004 
2005     // Input registers
2006     const Register src        = c_rarg0;  // source array oop
2007     const Register src_pos    = c_rarg1;  // source position
2008     const Register dst        = c_rarg2;  // destination array oop
2009     const Register dst_pos    = c_rarg3;  // destination position
2010     const Register length     = c_rarg4;
2011 
2012 
2013     // Registers used as temps
2014     const Register dst_klass  = c_rarg5;
2015 
2016     __ align(CodeEntryAlignment);
2017 
2018     StubCodeMark mark(this, "StubRoutines", name);
2019 
2020     address start = __ pc();
2021 
2022     __ enter(); // required for proper stackwalking of RuntimeStub frame
2023 
2024     // bump this on entry, not on exit:
2025     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2026 
2027     //-----------------------------------------------------------------------
2028     // Assembler stub will be used for this call to arraycopy
2029     // if the following conditions are met:
2030     //
2031     // (1) src and dst must not be null.
2032     // (2) src_pos must not be negative.
2033     // (3) dst_pos must not be negative.
2034     // (4) length  must not be negative.
2035     // (5) src klass and dst klass should be the same and not NULL.
2036     // (6) src and dst should be arrays.
2037     // (7) src_pos + length must not exceed length of src.
2038     // (8) dst_pos + length must not exceed length of dst.
2039     //
2040 
2041     //  if (src == NULL) return -1;
2042     __ cbz(src, L_failed);
2043 
2044     //  if (src_pos < 0) return -1;
2045     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2046 
2047     //  if (dst == NULL) return -1;
2048     __ cbz(dst, L_failed);
2049 
2050     //  if (dst_pos < 0) return -1;
2051     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2052 
2053     // registers used as temp
2054     const Register scratch_length    = r16; // elements count to copy
2055     const Register scratch_src_klass = r17; // array klass
2056     const Register lh                = r15; // layout helper
2057 
2058     //  if (length < 0) return -1;
2059     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2060     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2061 
2062     __ load_klass(scratch_src_klass, src);
2063 #ifdef ASSERT
2064     //  assert(src->klass() != NULL);
2065     {
2066       BLOCK_COMMENT("assert klasses not null {");
2067       Label L1, L2;
2068       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2069       __ bind(L1);
2070       __ stop("broken null klass");
2071       __ bind(L2);
2072       __ load_klass(rscratch1, dst);
2073       __ cbz(rscratch1, L1);     // this would be broken also
2074       BLOCK_COMMENT("} assert klasses not null done");
2075     }
2076 #endif
2077 
2078     // Load layout helper (32-bits)
2079     //
2080     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2081     // 32        30    24            16              8     2                 0
2082     //
2083     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2084     //
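          //   For example, an int[] has array_tag 0x3 (typeArray), element_type T_INT
          //   and log2_element_size 2, while an Object[] has array_tag 0x2 (objArray).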
2085 
2086     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2087 
2088     // Handle objArrays completely differently...
2089     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2090     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2091     __ movw(rscratch1, objArray_lh);
2092     __ eorw(rscratch2, lh, rscratch1);
2093     __ cbzw(rscratch2, L_objArray);
2094 
2095     //  if (src->klass() != dst->klass()) return -1;
2096     __ load_klass(rscratch2, dst);
2097     __ eor(rscratch2, rscratch2, scratch_src_klass);
2098     __ cbnz(rscratch2, L_failed);
2099 
2100     //  if (!src->is_Array()) return -1;
2101     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
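          // (array layout helpers are negative because the array tag occupies the
          //  top bits, so a non-negative lh means "not an array")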
2102 
2103     // At this point, it is known to be a typeArray (array_tag 0x3).
2104 #ifdef ASSERT
2105     {
2106       BLOCK_COMMENT("assert primitive array {");
2107       Label L;
2108       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2109       __ cmpw(lh, rscratch2);
2110       __ br(Assembler::GE, L);
2111       __ stop("must be a primitive array");
2112       __ bind(L);
2113       BLOCK_COMMENT("} assert primitive array done");
2114     }
2115 #endif
2116 
2117     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2118                            rscratch2, L_failed);
2119 
2120     // TypeArrayKlass
2121     //
2122     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2123     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2124     //
2125 
2126     const Register rscratch1_offset = rscratch1;    // array offset
2127     const Register r15_elsize = lh; // element size
2128 
2129     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2130            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2131     __ add(src, src, rscratch1_offset);           // src array offset
2132     __ add(dst, dst, rscratch1_offset);           // dst array offset
2133     BLOCK_COMMENT("choose copy loop based on element size");
2134 
2135     // next registers should be set before the jump to corresponding stub
2136     const Register from     = c_rarg0;  // source array address
2137     const Register to       = c_rarg1;  // destination array address
2138     const Register count    = c_rarg2;  // elements count
2139 
2140     // 'from', 'to' and 'count' must be set in this order: they occupy the same
2141     // registers as 'src', 'src_pos' and 'dst', which are still needed as inputs.
2142 
2143     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2144 
2145     // The possible values of elsize are 0-3, i.e. exact_log2(element
2146     // size in bytes).  We do a simple bitwise binary search.
2147   __ BIND(L_copy_bytes);
2148     __ tbnz(r15_elsize, 1, L_copy_ints);
2149     __ tbnz(r15_elsize, 0, L_copy_shorts);
2150     __ lea(from, Address(src, src_pos));// src_addr
2151     __ lea(to,   Address(dst, dst_pos));// dst_addr
2152     __ movw(count, scratch_length); // length
2153     __ b(RuntimeAddress(byte_copy_entry));
2154 
2155   __ BIND(L_copy_shorts);
2156     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(short_copy_entry));
2160 
2161   __ BIND(L_copy_ints);
2162     __ tbnz(r15_elsize, 0, L_copy_longs);
2163     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2164     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2165     __ movw(count, scratch_length); // length
2166     __ b(RuntimeAddress(int_copy_entry));
2167 
2168   __ BIND(L_copy_longs);
2169 #ifdef ASSERT
2170     {
2171       BLOCK_COMMENT("assert long copy {");
2172       Label L;
2173       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2174       __ cmpw(r15_elsize, LogBytesPerLong);
2175       __ br(Assembler::EQ, L);
2176       __ stop("must be long copy, but elsize is wrong");
2177       __ bind(L);
2178       BLOCK_COMMENT("} assert long copy done");
2179     }
2180 #endif
2181     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2182     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2183     __ movw(count, scratch_length); // length
2184     __ b(RuntimeAddress(long_copy_entry));
2185 
2186     // ObjArrayKlass
2187   __ BIND(L_objArray);
2188     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2189 
2190     Label L_plain_copy, L_checkcast_copy;
2191     //  test array classes for subtyping
2192     __ load_klass(r15, dst);
2193     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2194     __ br(Assembler::NE, L_checkcast_copy);
2195 
2196     // Identically typed arrays can be copied without element-wise checks.
2197     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2198                            rscratch2, L_failed);
2199 
2200     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2201     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2202     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2203     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2204     __ movw(count, scratch_length); // length
2205   __ BIND(L_plain_copy);
2206     __ b(RuntimeAddress(oop_copy_entry));
2207 
2208   __ BIND(L_checkcast_copy);
2209     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2210     {
2211       // Before looking at dst.length, make sure dst is also an objArray.
2212       __ ldrw(rscratch1, Address(r15, lh_offset));
2213       __ movw(rscratch2, objArray_lh);
2214       __ eorw(rscratch1, rscratch1, rscratch2);
2215       __ cbnzw(rscratch1, L_failed);
2216 
2217       // It is safe to examine both src.length and dst.length.
2218       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2219                              r15, L_failed);
2220 
2221       __ load_klass(dst_klass, dst); // reload
2222 
2223       // Marshal the base address arguments now, freeing registers.
2224       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2225       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2226       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2227       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2228       __ movw(count, length);           // length (reloaded)
2229       Register sco_temp = c_rarg3;      // this register is free now
2230       assert_different_registers(from, to, count, sco_temp,
2231                                  dst_klass, scratch_src_klass);
2232       // assert_clean_int(count, sco_temp);
2233 
2234       // Generate the type check.
2235       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2236       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2237 
2238       // Smashes rscratch1, rscratch2
2239       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2240 
2241       // Fetch destination element klass from the ObjArrayKlass header.
2242       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2243       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2244       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2245 
2246       // the checkcast_copy loop needs two extra arguments:
2247       assert(c_rarg3 == sco_temp, "#3 already in place");
2248       // Set up arguments for checkcast_copy_entry.
2249       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2250       __ b(RuntimeAddress(checkcast_copy_entry));
2251     }
2252 
2253   __ BIND(L_failed);
2254     __ mov(r0, -1);
2255     __ leave();   // required for proper stackwalking of RuntimeStub frame
2256     __ ret(lr);
2257 
2258     return start;
2259   }
2260 
2261   //
2262   // Generate stub for array fill. If "aligned" is true, the
2263   // "to" address is assumed to be heapword aligned.
2264   //
2265   // Arguments for generated stub:
2266   //   to:    c_rarg0
2267   //   value: c_rarg1
2268   //   count: c_rarg2 treated as signed
2269   //
2270   address generate_fill(BasicType t, bool aligned, const char *name) {
2271     __ align(CodeEntryAlignment);
2272     StubCodeMark mark(this, "StubRoutines", name);
2273     address start = __ pc();
2274 
2275     BLOCK_COMMENT("Entry:");
2276 
2277     const Register to        = c_rarg0;  // destination array address
2278     const Register value     = c_rarg1;  // value
2279     const Register count     = c_rarg2;  // elements count
2280 
2281     const Register bz_base = r10;        // base for block_zero routine
2282     const Register cnt_words = r11;      // temp register
2283 
2284     __ enter();
2285 
2286     Label L_fill_elements, L_exit1;
2287 
2288     int shift = -1;
2289     switch (t) {
2290       case T_BYTE:
2291         shift = 0;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2294         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2295         __ br(Assembler::LO, L_fill_elements);
2296         break;
2297       case T_SHORT:
2298         shift = 1;
2299         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2300         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2301         __ br(Assembler::LO, L_fill_elements);
2302         break;
2303       case T_INT:
2304         shift = 2;
2305         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2306         __ br(Assembler::LO, L_fill_elements);
2307         break;
2308       default: ShouldNotReachHere();
2309     }
2310 
2311     // Align the destination address to an 8-byte boundary.
2312     Label L_skip_align1, L_skip_align2, L_skip_align4;
2313     if (!aligned) {
2314       switch (t) {
2315         case T_BYTE:
2316           // One byte misalignment happens only for byte arrays.
2317           __ tbz(to, 0, L_skip_align1);
2318           __ strb(value, Address(__ post(to, 1)));
2319           __ subw(count, count, 1);
2320           __ bind(L_skip_align1);
2321           // Fallthrough
2322         case T_SHORT:
2323           // Two bytes misalignment happens only for byte and short (char) arrays.
2324           __ tbz(to, 1, L_skip_align2);
2325           __ strh(value, Address(__ post(to, 2)));
2326           __ subw(count, count, 2 >> shift);
2327           __ bind(L_skip_align2);
2328           // Fallthrough
2329         case T_INT:
2330           // Align to 8 bytes, we know we are 4 byte aligned to start.
2331           __ tbz(to, 2, L_skip_align4);
2332           __ strw(value, Address(__ post(to, 4)));
2333           __ subw(count, count, 4 >> shift);
2334           __ bind(L_skip_align4);
2335           break;
2336         default: ShouldNotReachHere();
2337       }
2338     }
2339 
2340     //
2341     //  Fill large chunks
2342     //
2343     __ lsrw(cnt_words, count, 3 - shift); // number of words
2344     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
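          // Worked example: a byte fill value of 0xAB was widened above to 0xABAB and
          // then 0xABABABAB; the bfi above completes it to 0xABABABABABABABAB so every
          // 8-byte store writes eight copies of the byte.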
2345     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2346     if (UseBlockZeroing) {
2347       Label non_block_zeroing, rest;
2348       // If the fill value is zero we can use the fast zero_words().
2349       __ cbnz(value, non_block_zeroing);
2350       __ mov(bz_base, to);
2351       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2352       __ zero_words(bz_base, cnt_words);
2353       __ b(rest);
2354       __ bind(non_block_zeroing);
2355       __ fill_words(to, cnt_words, value);
2356       __ bind(rest);
2357     } else {
2358       __ fill_words(to, cnt_words, value);
2359     }
2360 
2361     // Remaining count is less than 8 bytes. Fill it by a single store.
2362     // Note that the total length is no less than 8 bytes.
2363     if (t == T_BYTE || t == T_SHORT) {
2364       Label L_exit1;
2365       __ cbzw(count, L_exit1);
2366       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2367       __ str(value, Address(to, -8));    // overwrite some elements
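            // e.g. with three trailing bytes this 8-byte store starts five bytes back
            // into already-filled memory and rewrites those bytes with the same value.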
2368       __ bind(L_exit1);
2369       __ leave();
2370       __ ret(lr);
2371     }
2372 
2373     // Handle fills of less than 8 bytes.
2374     Label L_fill_2, L_fill_4, L_exit2;
2375     __ bind(L_fill_elements);
2376     switch (t) {
2377       case T_BYTE:
2378         __ tbz(count, 0, L_fill_2);
2379         __ strb(value, Address(__ post(to, 1)));
2380         __ bind(L_fill_2);
2381         __ tbz(count, 1, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 2, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_SHORT:
2388         __ tbz(count, 0, L_fill_4);
2389         __ strh(value, Address(__ post(to, 2)));
2390         __ bind(L_fill_4);
2391         __ tbz(count, 1, L_exit2);
2392         __ strw(value, Address(to));
2393         break;
2394       case T_INT:
2395         __ cbzw(count, L_exit2);
2396         __ strw(value, Address(to));
2397         break;
2398       default: ShouldNotReachHere();
2399     }
2400     __ bind(L_exit2);
2401     __ leave();
2402     __ ret(lr);
2403     return start;
2404   }
2405 
2406   address generate_data_cache_writeback() {
2407     const Register line        = c_rarg0;  // address of line to write back
2408 
2409     __ align(CodeEntryAlignment);
2410 
2411     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2412 
2413     address start = __ pc();
2414     __ enter();
2415     __ cache_wb(Address(line, 0));
2416     __ leave();
2417     __ ret(lr);
2418 
2419     return start;
2420   }
2421 
2422   address generate_data_cache_writeback_sync() {
2423     const Register is_pre     = c_rarg0;  // pre or post sync
2424 
2425     __ align(CodeEntryAlignment);
2426 
2427     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2428 
2429     // pre wbsync is a no-op
2430     // post wbsync emits a memory barrier, ordering the preceding cache write-backs
2431 
2432     Label skip;
2433     address start = __ pc();
2434     __ enter();
2435     __ cbnz(is_pre, skip);
2436     __ cache_wbsync(false);
2437     __ bind(skip);
2438     __ leave();
2439     __ ret(lr);
2440 
2441     return start;
2442   }
2443 
2444   void generate_arraycopy_stubs() {
2445     address entry;
2446     address entry_jbyte_arraycopy;
2447     address entry_jshort_arraycopy;
2448     address entry_jint_arraycopy;
2449     address entry_oop_arraycopy;
2450     address entry_jlong_arraycopy;
2451     address entry_checkcast_arraycopy;
2452 
2453     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2454     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2455 
2456     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2457 
2458     //*** jbyte
2459     // Always need aligned and unaligned versions
2460     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2461                                                                                   "jbyte_disjoint_arraycopy");
2462     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2463                                                                                   &entry_jbyte_arraycopy,
2464                                                                                   "jbyte_arraycopy");
2465     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2466                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2467     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2468                                                                                   "arrayof_jbyte_arraycopy");
2469 
2470     //*** jshort
2471     // Always need aligned and unaligned versions
2472     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2473                                                                                     "jshort_disjoint_arraycopy");
2474     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2475                                                                                     &entry_jshort_arraycopy,
2476                                                                                     "jshort_arraycopy");
2477     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2478                                                                                     "arrayof_jshort_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2480                                                                                     "arrayof_jshort_arraycopy");
2481 
2482     //*** jint
2483     // Aligned versions
2484     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2485                                                                                 "arrayof_jint_disjoint_arraycopy");
2486     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2487                                                                                 "arrayof_jint_arraycopy");
2488     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2489     // entry_jint_arraycopy always points to the unaligned version
2490     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2491                                                                                 "jint_disjoint_arraycopy");
2492     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2493                                                                                 &entry_jint_arraycopy,
2494                                                                                 "jint_arraycopy");
2495 
2496     //*** jlong
2497     // It is always aligned
2498     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2499                                                                                   "arrayof_jlong_disjoint_arraycopy");
2500     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2501                                                                                   "arrayof_jlong_arraycopy");
2502     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2503     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2504 
2505     //*** oops
2506     {
2507       // With compressed oops we need unaligned versions; notice that
2508       // we overwrite entry_oop_arraycopy.
2509       bool aligned = !UseCompressedOops;
2510 
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2513                                      /*dest_uninitialized*/false);
2514       StubRoutines::_arrayof_oop_arraycopy
2515         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2516                                      /*dest_uninitialized*/false);
2517       // Aligned versions without pre-barriers
2518       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2519         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2520                                      /*dest_uninitialized*/true);
2521       StubRoutines::_arrayof_oop_arraycopy_uninit
2522         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2523                                      /*dest_uninitialized*/true);
2524     }
2525 
2526     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2527     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2528     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2529     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2530 
2531     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2532     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2533                                                                         /*dest_uninitialized*/true);
2534 
2535     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2536                                                               entry_jbyte_arraycopy,
2537                                                               entry_jshort_arraycopy,
2538                                                               entry_jint_arraycopy,
2539                                                               entry_jlong_arraycopy);
2540 
2541     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2542                                                                entry_jbyte_arraycopy,
2543                                                                entry_jshort_arraycopy,
2544                                                                entry_jint_arraycopy,
2545                                                                entry_oop_arraycopy,
2546                                                                entry_jlong_arraycopy,
2547                                                                entry_checkcast_arraycopy);
2548 
2549     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2550     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2551     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2552     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2553     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2554     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2555   }
2556 
2557   void generate_math_stubs() { Unimplemented(); }
2558 
2559   // Arguments:
2560   //
2561   // Inputs:
2562   //   c_rarg0   - source byte array address
2563   //   c_rarg1   - destination byte array address
2564   //   c_rarg2   - K (key) in little endian int array
2565   //
2566   address generate_aescrypt_encryptBlock() {
2567     __ align(CodeEntryAlignment);
2568     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2569 
2570     const Register from        = c_rarg0;  // source array address
2571     const Register to          = c_rarg1;  // destination array address
2572     const Register key         = c_rarg2;  // key array address
2573     const Register keylen      = rscratch1;
2574 
2575     address start = __ pc();
2576     __ enter();
2577 
2578     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2579 
2580     __ aesenc_loadkeys(key, keylen);
2581     __ aesecb_encrypt(from, to, keylen);
2582 
2583     __ mov(r0, 0);
2584 
2585     __ leave();
2586     __ ret(lr);
2587 
2588     return start;
2589   }
2590 
2591   // Arguments:
2592   //
2593   // Inputs:
2594   //   c_rarg0   - source byte array address
2595   //   c_rarg1   - destination byte array address
2596   //   c_rarg2   - K (key) in little endian int array
2597   //
2598   address generate_aescrypt_decryptBlock() {
2599     assert(UseAES, "need AES cryptographic extension support");
2600     __ align(CodeEntryAlignment);
2601     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2602     Label L_doLast;
2603 
2604     const Register from        = c_rarg0;  // source array address
2605     const Register to          = c_rarg1;  // destination array address
2606     const Register key         = c_rarg2;  // key array address
2607     const Register keylen      = rscratch1;
2608 
2609     address start = __ pc();
2610     __ enter(); // required for proper stackwalking of RuntimeStub frame
2611 
2612     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2613 
2614     __ aesecb_decrypt(from, to, key, keylen);
2615 
2616     __ mov(r0, 0);
2617 
2618     __ leave();
2619     __ ret(lr);
2620 
2621     return start;
2622   }
2623 
2624   // Arguments:
2625   //
2626   // Inputs:
2627   //   c_rarg0   - source byte array address
2628   //   c_rarg1   - destination byte array address
2629   //   c_rarg2   - K (key) in little endian int array
2630   //   c_rarg3   - r vector byte array address
2631   //   c_rarg4   - input length
2632   //
2633   // Output:
2634   //   r0        - input length
2635   //
2636   address generate_cipherBlockChaining_encryptAESCrypt() {
2637     assert(UseAES, "need AES cryptographic extension support");
2638     __ align(CodeEntryAlignment);
2639     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2640 
2641     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2642 
2643     const Register from        = c_rarg0;  // source array address
2644     const Register to          = c_rarg1;  // destination array address
2645     const Register key         = c_rarg2;  // key array address
2646     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2647                                            // address and left holding the last encrypted block
2648     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2649     const Register keylen      = rscratch1;
2650 
2651     address start = __ pc();
2652 
2653       __ enter();
2654 
2655       __ movw(rscratch2, len_reg);
2656 
2657       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2658 
2659       __ ld1(v0, __ T16B, rvec);
2660 
2661       __ cmpw(keylen, 52);
2662       __ br(Assembler::CC, L_loadkeys_44);
2663       __ br(Assembler::EQ, L_loadkeys_52);
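            // keylen counts 32-bit round-key words: 44 for AES-128, 52 for AES-192
            // and 60 for AES-256, so the branches above skip loading the round keys
            // that the shorter key schedules do not have.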
2664 
2665       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2666       __ rev32(v17, __ T16B, v17);
2667       __ rev32(v18, __ T16B, v18);
2668     __ BIND(L_loadkeys_52);
2669       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2670       __ rev32(v19, __ T16B, v19);
2671       __ rev32(v20, __ T16B, v20);
2672     __ BIND(L_loadkeys_44);
2673       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2674       __ rev32(v21, __ T16B, v21);
2675       __ rev32(v22, __ T16B, v22);
2676       __ rev32(v23, __ T16B, v23);
2677       __ rev32(v24, __ T16B, v24);
2678       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2679       __ rev32(v25, __ T16B, v25);
2680       __ rev32(v26, __ T16B, v26);
2681       __ rev32(v27, __ T16B, v27);
2682       __ rev32(v28, __ T16B, v28);
2683       __ ld1(v29, v30, v31, __ T16B, key);
2684       __ rev32(v29, __ T16B, v29);
2685       __ rev32(v30, __ T16B, v30);
2686       __ rev32(v31, __ T16B, v31);
2687 
2688     __ BIND(L_aes_loop);
2689       __ ld1(v1, __ T16B, __ post(from, 16));
2690       __ eor(v0, __ T16B, v0, v1);
2691 
2692       __ br(Assembler::CC, L_rounds_44);
2693       __ br(Assembler::EQ, L_rounds_52);
2694 
2695       __ aese(v0, v17); __ aesmc(v0, v0);
2696       __ aese(v0, v18); __ aesmc(v0, v0);
2697     __ BIND(L_rounds_52);
2698       __ aese(v0, v19); __ aesmc(v0, v0);
2699       __ aese(v0, v20); __ aesmc(v0, v0);
2700     __ BIND(L_rounds_44);
2701       __ aese(v0, v21); __ aesmc(v0, v0);
2702       __ aese(v0, v22); __ aesmc(v0, v0);
2703       __ aese(v0, v23); __ aesmc(v0, v0);
2704       __ aese(v0, v24); __ aesmc(v0, v0);
2705       __ aese(v0, v25); __ aesmc(v0, v0);
2706       __ aese(v0, v26); __ aesmc(v0, v0);
2707       __ aese(v0, v27); __ aesmc(v0, v0);
2708       __ aese(v0, v28); __ aesmc(v0, v0);
2709       __ aese(v0, v29); __ aesmc(v0, v0);
2710       __ aese(v0, v30);
2711       __ eor(v0, __ T16B, v0, v31);
2712 
2713       __ st1(v0, __ T16B, __ post(to, 16));
2714 
2715       __ subw(len_reg, len_reg, 16);
2716       __ cbnzw(len_reg, L_aes_loop);
2717 
2718       __ st1(v0, __ T16B, rvec);
2719 
2720       __ mov(r0, rscratch2);
2721 
2722       __ leave();
2723       __ ret(lr);
2724 
2725       return start;
2726   }
2727 
2728   // Arguments:
2729   //
2730   // Inputs:
2731   //   c_rarg0   - source byte array address
2732   //   c_rarg1   - destination byte array address
2733   //   c_rarg2   - K (key) in little endian int array
2734   //   c_rarg3   - r vector byte array address
2735   //   c_rarg4   - input length
2736   //
2737   // Output:
2738   //   r0        - input length
2739   //
2740   address generate_cipherBlockChaining_decryptAESCrypt() {
2741     assert(UseAES, "need AES cryptographic extension support");
2742     __ align(CodeEntryAlignment);
2743     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2744 
2745     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2746 
2747     const Register from        = c_rarg0;  // source array address
2748     const Register to          = c_rarg1;  // destination array address
2749     const Register key         = c_rarg2;  // key array address
2750     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2751                                            // address and left holding the last ciphertext block (the next IV)
2752     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2753     const Register keylen      = rscratch1;
2754 
2755     address start = __ pc();
2756 
2757       __ enter();
2758 
2759       __ movw(rscratch2, len_reg);
2760 
2761       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2762 
2763       __ ld1(v2, __ T16B, rvec);
2764 
2765       __ ld1(v31, __ T16B, __ post(key, 16));
2766       __ rev32(v31, __ T16B, v31);
2767 
2768       __ cmpw(keylen, 52);
2769       __ br(Assembler::CC, L_loadkeys_44);
2770       __ br(Assembler::EQ, L_loadkeys_52);
2771 
2772       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2773       __ rev32(v17, __ T16B, v17);
2774       __ rev32(v18, __ T16B, v18);
2775     __ BIND(L_loadkeys_52);
2776       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2777       __ rev32(v19, __ T16B, v19);
2778       __ rev32(v20, __ T16B, v20);
2779     __ BIND(L_loadkeys_44);
2780       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2781       __ rev32(v21, __ T16B, v21);
2782       __ rev32(v22, __ T16B, v22);
2783       __ rev32(v23, __ T16B, v23);
2784       __ rev32(v24, __ T16B, v24);
2785       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2786       __ rev32(v25, __ T16B, v25);
2787       __ rev32(v26, __ T16B, v26);
2788       __ rev32(v27, __ T16B, v27);
2789       __ rev32(v28, __ T16B, v28);
2790       __ ld1(v29, v30, __ T16B, key);
2791       __ rev32(v29, __ T16B, v29);
2792       __ rev32(v30, __ T16B, v30);
2793 
2794     __ BIND(L_aes_loop);
2795       __ ld1(v0, __ T16B, __ post(from, 16));
2796       __ orr(v1, __ T16B, v0, v0);
2797 
2798       __ br(Assembler::CC, L_rounds_44);
2799       __ br(Assembler::EQ, L_rounds_52);
2800 
2801       __ aesd(v0, v17); __ aesimc(v0, v0);
2802       __ aesd(v0, v18); __ aesimc(v0, v0);
2803     __ BIND(L_rounds_52);
2804       __ aesd(v0, v19); __ aesimc(v0, v0);
2805       __ aesd(v0, v20); __ aesimc(v0, v0);
2806     __ BIND(L_rounds_44);
2807       __ aesd(v0, v21); __ aesimc(v0, v0);
2808       __ aesd(v0, v22); __ aesimc(v0, v0);
2809       __ aesd(v0, v23); __ aesimc(v0, v0);
2810       __ aesd(v0, v24); __ aesimc(v0, v0);
2811       __ aesd(v0, v25); __ aesimc(v0, v0);
2812       __ aesd(v0, v26); __ aesimc(v0, v0);
2813       __ aesd(v0, v27); __ aesimc(v0, v0);
2814       __ aesd(v0, v28); __ aesimc(v0, v0);
2815       __ aesd(v0, v29); __ aesimc(v0, v0);
2816       __ aesd(v0, v30);
2817       __ eor(v0, __ T16B, v0, v31);
2818       __ eor(v0, __ T16B, v0, v2);
2819 
2820       __ st1(v0, __ T16B, __ post(to, 16));
2821       __ orr(v2, __ T16B, v1, v1);
2822 
2823       __ subw(len_reg, len_reg, 16);
2824       __ cbnzw(len_reg, L_aes_loop);
2825 
2826       __ st1(v2, __ T16B, rvec);
2827 
2828       __ mov(r0, rscratch2);
2829 
2830       __ leave();
2831       __ ret(lr);
2832 
2833     return start;
2834   }
2835 
2836   // CTR AES crypt.
2837   // Arguments:
2838   //
2839   // Inputs:
2840   //   c_rarg0   - source byte array address
2841   //   c_rarg1   - destination byte array address
2842   //   c_rarg2   - K (key) in little endian int array
2843   //   c_rarg3   - counter vector byte array address
2844   //   c_rarg4   - input length
2845   //   c_rarg5   - saved encryptedCounter start
2846   //   c_rarg6   - saved used length
2847   //
2848   // Output:
2849   //   r0       - input length
2850   //
2851   address generate_counterMode_AESCrypt() {
2852     const Register in = c_rarg0;
2853     const Register out = c_rarg1;
2854     const Register key = c_rarg2;
2855     const Register counter = c_rarg3;
2856     const Register saved_len = c_rarg4, len = r10;
2857     const Register saved_encrypted_ctr = c_rarg5;
2858     const Register used_ptr = c_rarg6, used = r12;
2859 
2860     const Register offset = r7;
2861     const Register keylen = r11;
2862 
2863     const unsigned char block_size = 16;
2864     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8.  8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't taken until there are at least 8 blocks of
    // input, so up to 127 bytes end up on the byte-at-a-time slow
    // path.  For that reason, and also so as not to blow away too much
    // icache, 4 blocks seems like a sensible compromise.
2871 
2872     // Algorithm:
2873     //
2874     //    if (len == 0) {
2875     //        goto DONE;
2876     //    }
2877     //    int result = len;
2878     //    do {
2879     //        if (used >= blockSize) {
2880     //            if (len >= bulk_width * blockSize) {
2881     //                CTR_large_block();
2882     //                if (len == 0)
2883     //                    goto DONE;
2884     //            }
2885     //            for (;;) {
2886     //                16ByteVector v0 = counter;
2887     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2888     //                used = 0;
2889     //                if (len < blockSize)
2890     //                    break;    /* goto NEXT */
2891     //                16ByteVector v1 = load16Bytes(in, offset);
2892     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
2894     //                used = blockSize;
2895     //                offset += blockSize;
2896     //                len -= blockSize;
2897     //                if (len == 0)
2898     //                    goto DONE;
2899     //            }
2900     //        }
2901     //      NEXT:
2902     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2903     //        len--;
2904     //    } while (len != 0);
2905     //  DONE:
2906     //    return result;
2907     //
2908     // CTR_large_block()
2909     //    Wide bulk encryption of whole blocks.
2910 
2911     __ align(CodeEntryAlignment);
2912     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2913     const address start = __ pc();
2914     __ enter();
2915 
2916     Label DONE, CTR_large_block, large_block_return;
2917     __ ldrw(used, Address(used_ptr));
2918     __ cbzw(saved_len, DONE);
2919 
2920     __ mov(len, saved_len);
2921     __ mov(offset, 0);
2922 
2923     // Compute #rounds for AES based on the length of the key array
2924     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2925 
2926     __ aesenc_loadkeys(key, keylen);
2927 
2928     {
2929       Label L_CTR_loop, NEXT;
2930 
2931       __ bind(L_CTR_loop);
2932 
2933       __ cmp(used, block_size);
2934       __ br(__ LO, NEXT);
2935 
2936       // Maybe we have a lot of data
2937       __ subsw(rscratch1, len, bulk_width * block_size);
2938       __ br(__ HS, CTR_large_block);
2939       __ BIND(large_block_return);
2940       __ cbzw(len, DONE);
2941 
2942       // Setup the counter
2943       __ movi(v4, __ T4S, 0);
2944       __ movi(v5, __ T4S, 1);
2945       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2946 
2947       __ ld1(v0, __ T16B, counter); // Load the counter into v0
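      // The rev32/addv/rev32 sequence below adds 1 to the last 32-bit word
      // of the 16-byte counter block, treating that word as big-endian;
      // carries do not propagate into the upper words.  Roughly:
      //   counter[12..15] = to_be32(from_be32(counter[12..15]) + 1);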
2948       __ rev32(v16, __ T16B, v0);
2949       __ addv(v16, __ T4S, v16, v4);
2950       __ rev32(v16, __ T16B, v16);
2951       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2952 
2953       {
2954         // We have fewer than bulk_width blocks of data left. Encrypt
2955         // them one by one until there is less than a full block
2956         // remaining, being careful to save both the encrypted counter
2957         // and the counter.
2958 
2959         Label inner_loop;
2960         __ bind(inner_loop);
2961         // Counter to encrypt is in v0
2962         __ aesecb_encrypt(noreg, noreg, keylen);
2963         __ st1(v0, __ T16B, saved_encrypted_ctr);
2964 
2965         // Do we have a remaining full block?
2966 
2967         __ mov(used, 0);
2968         __ cmp(len, block_size);
2969         __ br(__ LO, NEXT);
2970 
2971         // Yes, we have a full block
2972         __ ldrq(v1, Address(in, offset));
2973         __ eor(v1, __ T16B, v1, v0);
2974         __ strq(v1, Address(out, offset));
2975         __ mov(used, block_size);
2976         __ add(offset, offset, block_size);
2977 
2978         __ subw(len, len, block_size);
2979         __ cbzw(len, DONE);
2980 
2981         // Increment the counter, store it back
2982         __ orr(v0, __ T16B, v16, v16);
2983         __ rev32(v16, __ T16B, v16);
2984         __ addv(v16, __ T4S, v16, v4);
2985         __ rev32(v16, __ T16B, v16);
2986         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2987 
2988         __ b(inner_loop);
2989       }
2990 
2991       __ BIND(NEXT);
2992 
2993       // Encrypt a single byte, and loop.
2994       // We expect this to be a rare event.
2995       __ ldrb(rscratch1, Address(in, offset));
2996       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2997       __ eor(rscratch1, rscratch1, rscratch2);
2998       __ strb(rscratch1, Address(out, offset));
2999       __ add(offset, offset, 1);
3000       __ add(used, used, 1);
      __ subw(len, len, 1);
3002       __ cbnzw(len, L_CTR_loop);
3003     }
3004 
3005     __ bind(DONE);
3006     __ strw(used, Address(used_ptr));
3007     __ mov(r0, saved_len);
3008 
3009     __ leave(); // required for proper stackwalking of RuntimeStub frame
3010     __ ret(lr);
3011 
3012     // Bulk encryption
3013 
    __ BIND(CTR_large_block);
3015     assert(bulk_width == 4 || bulk_width == 8, "must be");
3016 
3017     if (bulk_width == 8) {
3018       __ sub(sp, sp, 4 * 16);
3019       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3020     }
3021     __ sub(sp, sp, 4 * 16);
3022     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3023     RegSet saved_regs = (RegSet::of(in, out, offset)
3024                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3025     __ push(saved_regs, sp);
3026     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3027     __ add(in, in, offset);
3028     __ add(out, out, offset);
3029 
3030     // Keys should already be loaded into the correct registers
3031 
3032     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3033     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3034 
3035     // AES/CTR loop
3036     {
3037       Label L_CTR_loop;
3038       __ BIND(L_CTR_loop);
3039 
3040       // Setup the counters
3041       __ movi(v8, __ T4S, 0);
3042       __ movi(v9, __ T4S, 1);
3043       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3044 
3045       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3046         __ rev32(f, __ T16B, v16);
3047         __ addv(v16, __ T4S, v16, v8);
3048       }
3049 
3050       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3051 
3052       // Encrypt the counters
3053       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3054 
3055       if (bulk_width == 8) {
3056         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3057       }
3058 
3059       // XOR the encrypted counters with the inputs
3060       for (int i = 0; i < bulk_width; i++) {
3061         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3062       }
3063 
3064       // Write the encrypted data
3065       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3066       if (bulk_width == 8) {
3067         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3068       }
3069 
3070       __ subw(len, len, 16 * bulk_width);
3071       __ cbnzw(len, L_CTR_loop);
3072     }
3073 
3074     // Save the counter back where it goes
3075     __ rev32(v16, __ T16B, v16);
3076     __ st1(v16, __ T16B, counter);
3077 
3078     __ pop(saved_regs, sp);
3079 
3080     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3081     if (bulk_width == 8) {
3082       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3083     }
3084 
3085     __ andr(rscratch1, len, -16 * bulk_width);
3086     __ sub(len, len, rscratch1);
3087     __ add(offset, offset, rscratch1);
3088     __ mov(used, 16);
3089     __ strw(used, Address(used_ptr));
3090     __ b(large_block_return);
3091 
3092     return start;
3093   }
3094 
3095   // Vector AES Galois Counter Mode implementation. Parameters:
3096   //
3097   // in = c_rarg0
3098   // len = c_rarg1
3099   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3100   // out = c_rarg3
3101   // key = c_rarg4
3102   // state = c_rarg5 - GHASH.state
3103   // subkeyHtbl = c_rarg6 - powers of H
3104   // subkeyHtbl_48_entries = c_rarg7 (not used)
3105   // counter = [sp, #0] pointer to 16 bytes of CTR
3106   // return - number of processed bytes
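  //
  // At a high level the stub processes the whole multiple-of-128-bytes
  // prefix of the input roughly as follows (a simplified sketch; tag
  // finalization and any tail bytes are handled by the Java caller):
  //
  //   for each 16-byte block i:
  //     out[i]  = in[i] ^ AES_encrypt(key, counter);         // CTR part
  //     counter = counter with its last big-endian 32-bit word incremented;
  //     state   = (state ^ ct[i]) * H  mod  x^128 + x^7 + x^2 + x + 1;  // GHASH
  //   return the number of bytes processed;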
3107   address generate_galoisCounterMode_AESCrypt() {
3108     address ghash_polynomial = __ pc();
3109     __ emit_int64(0x87);  // The low-order bits of the field
3110                           // polynomial (i.e. p = z^7+z^2+z+1)
3111                           // repeated in the low and high parts of a
3112                           // 128-bit vector
3113     __ emit_int64(0x87);
3114 
3115     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3117     address start = __ pc();
3118     __ enter();
3119 
3120     const Register in = c_rarg0;
3121     const Register len = c_rarg1;
3122     const Register ct = c_rarg2;
3123     const Register out = c_rarg3;
3124     // and updated with the incremented counter in the end
3125 
3126     const Register key = c_rarg4;
3127     const Register state = c_rarg5;
3128 
3129     const Register subkeyHtbl = c_rarg6;
3130 
    // The eighth argument, a pointer to the 16-byte CTR block, is passed on
    // the stack; after enter() it sits just above the saved (fp, lr) pair.
3132     const Address counter_mem(sp, 2 * wordSize);
3133     const Register counter = c_rarg7;
3134     __ ldr(counter, counter_mem);
3135 
3136     const Register keylen = r10;
3137     // Save state before entering routine
3138     __ sub(sp, sp, 4 * 16);
3139     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3140     __ sub(sp, sp, 4 * 16);
3141     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3142 
3144     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3145     __ str(len, __ pre(sp, -2 * wordSize));
3146 
3147     Label DONE;
3148     __ cbz(len, DONE);
3149 
3150     // Compute #rounds for AES based on the length of the key array
3151     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3152 
3153     __ aesenc_loadkeys(key, keylen);
3154     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3155     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3156 
3157     // AES/CTR loop
3158     {
3159       Label L_CTR_loop;
3160       __ BIND(L_CTR_loop);
3161 
3162       // Setup the counters
3163       __ movi(v8, __ T4S, 0);
3164       __ movi(v9, __ T4S, 1);
3165       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3166       for (FloatRegister f = v0; f < v8; f++) {
3167         __ rev32(f, __ T16B, v16);
3168         __ addv(v16, __ T4S, v16, v8);
3169       }
3170 
3171       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3172 
3173       // Encrypt the counters
3174       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3175 
3176       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3177 
3178       // XOR the encrypted counters with the inputs
3179       for (int i = 0; i < 8; i++) {
3180         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3181       }
3182       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3183       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3184 
3185       __ subw(len, len, 16 * 8);
3186       __ cbnzw(len, L_CTR_loop);
3187     }
3188 
3189     __ rev32(v16, __ T16B, v16);
3190     __ st1(v16, __ T16B, counter);
3191 
3192     __ ldr(len, Address(sp));
3193     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3194 
3195     // GHASH/CTR loop
3196     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3197                                 len, /*unrolls*/4);
3198 
3199 #ifdef ASSERT
3200     { Label L;
3201       __ cmp(len, (unsigned char)0);
3202       __ br(Assembler::EQ, L);
3203       __ stop("stubGenerator: abort");
3204       __ bind(L);
3205   }
3206 #endif
3207 
    __ bind(DONE);
3209     // Return the number of bytes processed
3210     __ ldr(r0, __ post(sp, 2 * wordSize));
3211 
3212     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3213     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3214 
3215     __ leave(); // required for proper stackwalking of RuntimeStub frame
3216     __ ret(lr);
    return start;
3218   }
3219 
3220   // Arguments:
3221   //
3222   // Inputs:
3223   //   c_rarg0   - byte[]  source+offset
3224   //   c_rarg1   - int[]   SHA.state
3225   //   c_rarg2   - int     offset
3226   //   c_rarg3   - int     limit
3227   //
3228   address generate_sha1_implCompress(bool multi_block, const char *name) {
3229     __ align(CodeEntryAlignment);
3230     StubCodeMark mark(this, "StubRoutines", name);
3231     address start = __ pc();
3232 
3233     Register buf   = c_rarg0;
3234     Register state = c_rarg1;
3235     Register ofs   = c_rarg2;
3236     Register limit = c_rarg3;
3237 
3238     Label keys;
3239     Label sha1_loop;
3240 
3241     // load the keys into v0..v3
3242     __ adr(rscratch1, keys);
3243     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3244     // load 5 words state into v6, v7
3245     __ ldrq(v6, Address(state, 0));
3246     __ ldrs(v7, Address(state, 16));
3247 
3248 
3249     __ BIND(sha1_loop);
3250     // load 64 bytes of data into v16..v19
3251     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3252     __ rev32(v16, __ T16B, v16);
3253     __ rev32(v17, __ T16B, v17);
3254     __ rev32(v18, __ T16B, v18);
3255     __ rev32(v19, __ T16B, v19);
3256 
3257     // do the sha1
3258     __ addv(v4, __ T4S, v16, v0);
3259     __ orr(v20, __ T16B, v6, v6);
3260 
3261     FloatRegister d0 = v16;
3262     FloatRegister d1 = v17;
3263     FloatRegister d2 = v18;
3264     FloatRegister d3 = v19;
3265 
3266     for (int round = 0; round < 20; round++) {
3267       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3268       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3269       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3270       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3271       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3272 
3273       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3274       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3275       __ sha1h(tmp2, __ T4S, v20);
3276       if (round < 5)
3277         __ sha1c(v20, __ T4S, tmp3, tmp4);
3278       else if (round < 10 || round >= 15)
3279         __ sha1p(v20, __ T4S, tmp3, tmp4);
3280       else
3281         __ sha1m(v20, __ T4S, tmp3, tmp4);
3282       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3283 
3284       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3285     }
3286 
3287     __ addv(v7, __ T2S, v7, v21);
3288     __ addv(v6, __ T4S, v6, v20);
3289 
3290     if (multi_block) {
3291       __ add(ofs, ofs, 64);
3292       __ cmp(ofs, limit);
3293       __ br(Assembler::LE, sha1_loop);
3294       __ mov(c_rarg0, ofs); // return ofs
3295     }
3296 
3297     __ strq(v6, Address(state, 0));
3298     __ strs(v7, Address(state, 16));
3299 
3300     __ ret(lr);
3301 
3302     __ bind(keys);
3303     __ emit_int32(0x5a827999);
3304     __ emit_int32(0x6ed9eba1);
3305     __ emit_int32(0x8f1bbcdc);
3306     __ emit_int32(0xca62c1d6);
3307 
3308     return start;
3309   }
3310 
3311 
3312   // Arguments:
3313   //
3314   // Inputs:
3315   //   c_rarg0   - byte[]  source+offset
3316   //   c_rarg1   - int[]   SHA.state
3317   //   c_rarg2   - int     offset
3318   //   c_rarg3   - int     limit
3319   //
3320   address generate_sha256_implCompress(bool multi_block, const char *name) {
3321     static const uint32_t round_consts[64] = {
3322       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3323       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3324       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3325       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3326       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3327       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3328       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3329       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3330       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3331       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3332       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3333       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3334       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3335       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3336       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3337       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3338     };
3339     __ align(CodeEntryAlignment);
3340     StubCodeMark mark(this, "StubRoutines", name);
3341     address start = __ pc();
3342 
3343     Register buf   = c_rarg0;
3344     Register state = c_rarg1;
3345     Register ofs   = c_rarg2;
3346     Register limit = c_rarg3;
3347 
3348     Label sha1_loop;
3349 
3350     __ stpd(v8, v9, __ pre(sp, -32));
3351     __ stpd(v10, v11, Address(sp, 16));
3352 
3353 // dga == v0
3354 // dgb == v1
3355 // dg0 == v2
3356 // dg1 == v3
3357 // dg2 == v4
3358 // t0 == v6
3359 // t1 == v7
3360 
3361     // load 16 keys to v16..v31
3362     __ lea(rscratch1, ExternalAddress((address)round_consts));
3363     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3364     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3365     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3366     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3367 
3368     // load 8 words (256 bits) state
3369     __ ldpq(v0, v1, state);
3370 
3371     __ BIND(sha1_loop);
3372     // load 64 bytes of data into v8..v11
3373     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3374     __ rev32(v8, __ T16B, v8);
3375     __ rev32(v9, __ T16B, v9);
3376     __ rev32(v10, __ T16B, v10);
3377     __ rev32(v11, __ T16B, v11);
3378 
3379     __ addv(v6, __ T4S, v8, v16);
3380     __ orr(v2, __ T16B, v0, v0);
3381     __ orr(v3, __ T16B, v1, v1);
3382 
3383     FloatRegister d0 = v8;
3384     FloatRegister d1 = v9;
3385     FloatRegister d2 = v10;
3386     FloatRegister d3 = v11;
3387 
3388 
3389     for (int round = 0; round < 16; round++) {
3390       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3391       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3392       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3393       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3394 
3395       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3396        __ orr(v4, __ T16B, v2, v2);
3397       if (round < 15)
3398         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3399       __ sha256h(v2, __ T4S, v3, tmp2);
3400       __ sha256h2(v3, __ T4S, v4, tmp2);
3401       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3402 
3403       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3404     }
3405 
3406     __ addv(v0, __ T4S, v0, v2);
3407     __ addv(v1, __ T4S, v1, v3);
3408 
3409     if (multi_block) {
3410       __ add(ofs, ofs, 64);
3411       __ cmp(ofs, limit);
3412       __ br(Assembler::LE, sha1_loop);
3413       __ mov(c_rarg0, ofs); // return ofs
3414     }
3415 
3416     __ ldpd(v10, v11, Address(sp, 16));
3417     __ ldpd(v8, v9, __ post(sp, 32));
3418 
3419     __ stpq(v0, v1, state);
3420 
3421     __ ret(lr);
3422 
3423     return start;
3424   }
3425 
3426   // Arguments:
3427   //
3428   // Inputs:
3429   //   c_rarg0   - byte[]  source+offset
3430   //   c_rarg1   - int[]   SHA.state
3431   //   c_rarg2   - int     offset
3432   //   c_rarg3   - int     limit
3433   //
3434   address generate_sha512_implCompress(bool multi_block, const char *name) {
3435     static const uint64_t round_consts[80] = {
3436       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3437       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3438       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3439       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3440       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3441       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3442       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3443       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3444       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3445       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3446       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3447       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3448       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3449       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3450       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3451       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3452       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3453       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3454       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3455       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3456       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3457       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3458       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3459       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3460       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3461       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3462       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3463     };
3464 
3465     // Double rounds for sha512.
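    // Each invocation performs two SHA-512 rounds.  The parameters are
    // register numbers: i0..i4 rotate through the working state (v0..v4),
    // rc0/rc1 are the current and the next-to-be-loaded round-constant
    // registers (v20..v31), and in0..in4 rotate through the message
    // schedule (v12..v19).  New round constants are loaded while dr < 36
    // and the schedule is extended while dr < 32; afterwards the
    // corresponding parameters are passed as 0 since they are unused.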
3466     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3467       if (dr < 36)                                                                   \
3468         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3469       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3470       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3471       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3472       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3473       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3474       if (dr < 32) {                                                                 \
3475         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3476         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3477       }                                                                              \
3478       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3479       if (dr < 32)                                                                   \
3480         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3481       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3482       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3483 
3484     __ align(CodeEntryAlignment);
3485     StubCodeMark mark(this, "StubRoutines", name);
3486     address start = __ pc();
3487 
3488     Register buf   = c_rarg0;
3489     Register state = c_rarg1;
3490     Register ofs   = c_rarg2;
3491     Register limit = c_rarg3;
3492 
3493     __ stpd(v8, v9, __ pre(sp, -64));
3494     __ stpd(v10, v11, Address(sp, 16));
3495     __ stpd(v12, v13, Address(sp, 32));
3496     __ stpd(v14, v15, Address(sp, 48));
3497 
3498     Label sha512_loop;
3499 
3500     // load state
3501     __ ld1(v8, v9, v10, v11, __ T2D, state);
3502 
3503     // load first 4 round constants
3504     __ lea(rscratch1, ExternalAddress((address)round_consts));
3505     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3506 
3507     __ BIND(sha512_loop);
3508     // load 128B of data into v12..v19
3509     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3510     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3511     __ rev64(v12, __ T16B, v12);
3512     __ rev64(v13, __ T16B, v13);
3513     __ rev64(v14, __ T16B, v14);
3514     __ rev64(v15, __ T16B, v15);
3515     __ rev64(v16, __ T16B, v16);
3516     __ rev64(v17, __ T16B, v17);
3517     __ rev64(v18, __ T16B, v18);
3518     __ rev64(v19, __ T16B, v19);
3519 
3520     __ mov(rscratch2, rscratch1);
3521 
3522     __ mov(v0, __ T16B, v8);
3523     __ mov(v1, __ T16B, v9);
3524     __ mov(v2, __ T16B, v10);
3525     __ mov(v3, __ T16B, v11);
3526 
3527     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3528     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3529     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3530     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3531     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3532     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3533     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3534     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3535     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3536     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3537     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3538     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3539     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3540     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3541     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3542     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3543     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3544     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3545     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3546     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3547     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3548     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3549     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3550     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3551     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3552     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3553     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3554     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3555     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3556     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3557     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3558     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3559     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3560     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3561     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3562     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3563     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3564     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3565     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3566     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3567 
3568     __ addv(v8, __ T2D, v8, v0);
3569     __ addv(v9, __ T2D, v9, v1);
3570     __ addv(v10, __ T2D, v10, v2);
3571     __ addv(v11, __ T2D, v11, v3);
3572 
3573     if (multi_block) {
3574       __ add(ofs, ofs, 128);
3575       __ cmp(ofs, limit);
3576       __ br(Assembler::LE, sha512_loop);
3577       __ mov(c_rarg0, ofs); // return ofs
3578     }
3579 
3580     __ st1(v8, v9, v10, v11, __ T2D, state);
3581 
3582     __ ldpd(v14, v15, Address(sp, 48));
3583     __ ldpd(v12, v13, Address(sp, 32));
3584     __ ldpd(v10, v11, Address(sp, 16));
3585     __ ldpd(v8, v9, __ post(sp, 64));
3586 
3587     __ ret(lr);
3588 
3589     return start;
3590   }
3591 
3592   // Arguments:
3593   //
3594   // Inputs:
3595   //   c_rarg0   - byte[]  source+offset
3596   //   c_rarg1   - byte[]   SHA.state
3597   //   c_rarg2   - int     digest_length
3598   //   c_rarg3   - int     offset
3599   //   c_rarg4   - int     limit
3600   //
3601   address generate_sha3_implCompress(bool multi_block, const char *name) {
3602     static const uint64_t round_consts[24] = {
3603       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3604       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3605       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3606       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3607       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3608       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3609       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3610       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3611     };
3612 
3613     __ align(CodeEntryAlignment);
3614     StubCodeMark mark(this, "StubRoutines", name);
3615     address start = __ pc();
3616 
3617     Register buf           = c_rarg0;
3618     Register state         = c_rarg1;
3619     Register digest_length = c_rarg2;
3620     Register ofs           = c_rarg3;
3621     Register limit         = c_rarg4;
3622 
3623     Label sha3_loop, rounds24_loop;
3624     Label sha3_512, sha3_384_or_224, sha3_256;
3625 
3626     __ stpd(v8, v9, __ pre(sp, -64));
3627     __ stpd(v10, v11, Address(sp, 16));
3628     __ stpd(v12, v13, Address(sp, 32));
3629     __ stpd(v14, v15, Address(sp, 48));
3630 
3631     // load state
3632     __ add(rscratch1, state, 32);
3633     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3634     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3635     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3636     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3637     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3638     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3639     __ ld1(v24, __ T1D, rscratch1);
3640 
3641     __ BIND(sha3_loop);
3642 
3643     // 24 keccak rounds
3644     __ movw(rscratch2, 24);
3645 
3646     // load round_constants base
3647     __ lea(rscratch1, ExternalAddress((address) round_consts));
3648 
3649     // load input
3650     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3651     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3652     __ eor(v0, __ T8B, v0, v25);
3653     __ eor(v1, __ T8B, v1, v26);
3654     __ eor(v2, __ T8B, v2, v27);
3655     __ eor(v3, __ T8B, v3, v28);
3656     __ eor(v4, __ T8B, v4, v29);
3657     __ eor(v5, __ T8B, v5, v30);
3658     __ eor(v6, __ T8B, v6, v31);
3659 
3660     // digest_length == 64, SHA3-512
3661     __ tbnz(digest_length, 6, sha3_512);
3662 
3663     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3664     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3665     __ eor(v7, __ T8B, v7, v25);
3666     __ eor(v8, __ T8B, v8, v26);
3667     __ eor(v9, __ T8B, v9, v27);
3668     __ eor(v10, __ T8B, v10, v28);
3669     __ eor(v11, __ T8B, v11, v29);
3670     __ eor(v12, __ T8B, v12, v30);
3671 
3672     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3673     __ tbnz(digest_length, 4, sha3_384_or_224);
3674 
3675     // SHA3-256
3676     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3677     __ eor(v13, __ T8B, v13, v25);
3678     __ eor(v14, __ T8B, v14, v26);
3679     __ eor(v15, __ T8B, v15, v27);
3680     __ eor(v16, __ T8B, v16, v28);
3681     __ b(rounds24_loop);
3682 
3683     __ BIND(sha3_384_or_224);
3684     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384
3685 
3686     // SHA3-224
3687     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3688     __ ld1(v29, __ T8B, __ post(buf, 8));
3689     __ eor(v13, __ T8B, v13, v25);
3690     __ eor(v14, __ T8B, v14, v26);
3691     __ eor(v15, __ T8B, v15, v27);
3692     __ eor(v16, __ T8B, v16, v28);
3693     __ eor(v17, __ T8B, v17, v29);
3694     __ b(rounds24_loop);
3695 
3696     __ BIND(sha3_512);
3697     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3698     __ eor(v7, __ T8B, v7, v25);
3699     __ eor(v8, __ T8B, v8, v26);
3700 
3701     __ BIND(rounds24_loop);
3702     __ subw(rscratch2, rscratch2, 1);
3703 
3704     __ eor3(v29, __ T16B, v4, v9, v14);
3705     __ eor3(v26, __ T16B, v1, v6, v11);
3706     __ eor3(v28, __ T16B, v3, v8, v13);
3707     __ eor3(v25, __ T16B, v0, v5, v10);
3708     __ eor3(v27, __ T16B, v2, v7, v12);
3709     __ eor3(v29, __ T16B, v29, v19, v24);
3710     __ eor3(v26, __ T16B, v26, v16, v21);
3711     __ eor3(v28, __ T16B, v28, v18, v23);
3712     __ eor3(v25, __ T16B, v25, v15, v20);
3713     __ eor3(v27, __ T16B, v27, v17, v22);
3714 
3715     __ rax1(v30, __ T2D, v29, v26);
3716     __ rax1(v26, __ T2D, v26, v28);
3717     __ rax1(v28, __ T2D, v28, v25);
3718     __ rax1(v25, __ T2D, v25, v27);
3719     __ rax1(v27, __ T2D, v27, v29);
3720 
3721     __ eor(v0, __ T16B, v0, v30);
3722     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3723     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3724     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3725     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3726     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3727     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3728     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3729     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3730     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3731     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3732     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3733     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3734     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3735     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3736     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3737     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3738     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3739     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3740     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3741     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3742     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3743     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3744     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3745     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3746 
3747     __ bcax(v20, __ T16B, v31, v22, v8);
3748     __ bcax(v21, __ T16B, v8,  v23, v22);
3749     __ bcax(v22, __ T16B, v22, v24, v23);
3750     __ bcax(v23, __ T16B, v23, v31, v24);
3751     __ bcax(v24, __ T16B, v24, v8,  v31);
3752 
3753     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3754 
3755     __ bcax(v17, __ T16B, v25, v19, v3);
3756     __ bcax(v18, __ T16B, v3,  v15, v19);
3757     __ bcax(v19, __ T16B, v19, v16, v15);
3758     __ bcax(v15, __ T16B, v15, v25, v16);
3759     __ bcax(v16, __ T16B, v16, v3,  v25);
3760 
3761     __ bcax(v10, __ T16B, v29, v12, v26);
3762     __ bcax(v11, __ T16B, v26, v13, v12);
3763     __ bcax(v12, __ T16B, v12, v14, v13);
3764     __ bcax(v13, __ T16B, v13, v29, v14);
3765     __ bcax(v14, __ T16B, v14, v26, v29);
3766 
3767     __ bcax(v7, __ T16B, v30, v9,  v4);
3768     __ bcax(v8, __ T16B, v4,  v5,  v9);
3769     __ bcax(v9, __ T16B, v9,  v6,  v5);
3770     __ bcax(v5, __ T16B, v5,  v30, v6);
3771     __ bcax(v6, __ T16B, v6,  v4,  v30);
3772 
3773     __ bcax(v3, __ T16B, v27, v0,  v28);
3774     __ bcax(v4, __ T16B, v28, v1,  v0);
3775     __ bcax(v0, __ T16B, v0,  v2,  v1);
3776     __ bcax(v1, __ T16B, v1,  v27, v2);
3777     __ bcax(v2, __ T16B, v2,  v28, v27);
3778 
3779     __ eor(v0, __ T16B, v0, v31);
3780 
3781     __ cbnzw(rscratch2, rounds24_loop);
3782 
3783     if (multi_block) {
3784       // block_size =  200 - 2 * digest_length, ofs += block_size
3785       __ add(ofs, ofs, 200);
3786       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3787 
3788       __ cmp(ofs, limit);
3789       __ br(Assembler::LE, sha3_loop);
3790       __ mov(c_rarg0, ofs); // return ofs
3791     }
3792 
3793     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3794     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3795     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3796     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3797     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3798     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3799     __ st1(v24, __ T1D, state);
3800 
3801     __ ldpd(v14, v15, Address(sp, 48));
3802     __ ldpd(v12, v13, Address(sp, 32));
3803     __ ldpd(v10, v11, Address(sp, 16));
3804     __ ldpd(v8, v9, __ post(sp, 64));
3805 
3806     __ ret(lr);
3807 
3808     return start;
3809   }
3810 
3811   // Safefetch stubs.
3812   void generate_safefetch(const char* name, int size, address* entry,
3813                           address* fault_pc, address* continuation_pc) {
3814     // safefetch signatures:
3815     //   int      SafeFetch32(int*      adr, int      errValue);
3816     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3817     //
3818     // arguments:
3819     //   c_rarg0 = adr
3820     //   c_rarg1 = errValue
3821     //
3822     // result:
    //   r0       = *adr or errValue
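    //
    // If the load below faults, the signal handler recognises fault_pc and
    // resumes execution at continuation_pc; c_rarg1 still holds errValue at
    // that point, so the caller observes, conceptually:
    //
    //   int SafeFetch32(int* adr, int errValue) {
    //     return can_read(adr) ? *adr : errValue;   // can_read is conceptual
    //   }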
3824 
3825     StubCodeMark mark(this, "StubRoutines", name);
3826 
3827     // Entry point, pc or function descriptor.
3828     *entry = __ pc();
3829 
3830     // Load *adr into c_rarg1, may fault.
3831     *fault_pc = __ pc();
3832     switch (size) {
3833       case 4:
3834         // int32_t
3835         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3836         break;
3837       case 8:
3838         // int64_t
3839         __ ldr(c_rarg1, Address(c_rarg0, 0));
3840         break;
3841       default:
3842         ShouldNotReachHere();
3843     }
3844 
3845     // return errValue or *adr
3846     *continuation_pc = __ pc();
3847     __ mov(r0, c_rarg1);
3848     __ ret(lr);
3849   }
3850 
3851   /**
3852    *  Arguments:
3853    *
3854    * Inputs:
3855    *   c_rarg0   - int crc
3856    *   c_rarg1   - byte* buf
3857    *   c_rarg2   - int length
3858    *
   * Output:
   *       r0    - int crc result
3861    */
3862   address generate_updateBytesCRC32() {
3863     assert(UseCRC32Intrinsics, "what are we doing here?");
3864 
3865     __ align(CodeEntryAlignment);
3866     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3867 
3868     address start = __ pc();
3869 
3870     const Register crc   = c_rarg0;  // crc
3871     const Register buf   = c_rarg1;  // source java byte array address
3872     const Register len   = c_rarg2;  // length
3873     const Register table0 = c_rarg3; // crc_table address
3874     const Register table1 = c_rarg4;
3875     const Register table2 = c_rarg5;
3876     const Register table3 = c_rarg6;
3877     const Register tmp3 = c_rarg7;
3878 
3879     BLOCK_COMMENT("Entry:");
3880     __ enter(); // required for proper stackwalking of RuntimeStub frame
3881 
3882     __ kernel_crc32(crc, buf, len,
3883               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3884 
3885     __ leave(); // required for proper stackwalking of RuntimeStub frame
3886     __ ret(lr);
3887 
3888     return start;
3889   }
3890 
3891   /**
3892    *  Arguments:
3893    *
3894    * Inputs:
3895    *   c_rarg0   - int crc
3896    *   c_rarg1   - byte* buf
3897    *   c_rarg2   - int length
3898    *   c_rarg3   - int* table
3899    *
   * Output:
3901    *       r0   - int crc result
3902    */
3903   address generate_updateBytesCRC32C() {
3904     assert(UseCRC32CIntrinsics, "what are we doing here?");
3905 
3906     __ align(CodeEntryAlignment);
3907     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3908 
3909     address start = __ pc();
3910 
3911     const Register crc   = c_rarg0;  // crc
3912     const Register buf   = c_rarg1;  // source java byte array address
3913     const Register len   = c_rarg2;  // length
3914     const Register table0 = c_rarg3; // crc_table address
3915     const Register table1 = c_rarg4;
3916     const Register table2 = c_rarg5;
3917     const Register table3 = c_rarg6;
3918     const Register tmp3 = c_rarg7;
3919 
3920     BLOCK_COMMENT("Entry:");
3921     __ enter(); // required for proper stackwalking of RuntimeStub frame
3922 
3923     __ kernel_crc32c(crc, buf, len,
3924               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3925 
3926     __ leave(); // required for proper stackwalking of RuntimeStub frame
3927     __ ret(lr);
3928 
3929     return start;
3930   }
3931 
3932   /***
3933    *  Arguments:
3934    *
3935    *  Inputs:
3936    *   c_rarg0   - int   adler
3937    *   c_rarg1   - byte* buff
3938    *   c_rarg2   - int   len
3939    *
3940    * Output:
3941    *   c_rarg0   - int adler result
3942    */
3943   address generate_updateBytesAdler32() {
3944     __ align(CodeEntryAlignment);
3945     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3946     address start = __ pc();
3947 
3948     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3949 
3950     // Aliases
3951     Register adler  = c_rarg0;
3952     Register s1     = c_rarg0;
3953     Register s2     = c_rarg3;
3954     Register buff   = c_rarg1;
3955     Register len    = c_rarg2;
3956     Register nmax  = r4;
3957     Register base  = r5;
3958     Register count = r6;
3959     Register temp0 = rscratch1;
3960     Register temp1 = rscratch2;
3961     FloatRegister vbytes = v0;
3962     FloatRegister vs1acc = v1;
3963     FloatRegister vs2acc = v2;
3964     FloatRegister vtable = v3;
3965 
3966     // Max number of bytes we can process before having to take the mod
3967     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3968     uint64_t BASE = 0xfff1;
3969     uint64_t NMAX = 0x15B0;
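    // BASE is 65521, the largest prime below 2^16.  Since 2^16 mod BASE == 15,
    // the reductions below fold x into (x & 0xffff) + 15 * (x >> 16) (applied
    // twice where needed) and finish with a single conditional subtraction
    // of BASE.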
3970 
3971     __ mov(base, BASE);
3972     __ mov(nmax, NMAX);
3973 
3974     // Load accumulation coefficients for the upper 16 bits
3975     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3976     __ ld1(vtable, __ T16B, Address(temp0));
3977 
3978     // s1 is initialized to the lower 16 bits of adler
3979     // s2 is initialized to the upper 16 bits of adler
3980     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3981     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3982 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but for short inputs it is more efficient to skip straight to the cleanup loop.
3985     __ cmp(len, (u1)16);
3986     __ br(Assembler::HS, L_nmax);
3987     __ cbz(len, L_combine);
3988 
3989     __ bind(L_simple_by1_loop);
3990     __ ldrb(temp0, Address(__ post(buff, 1)));
3991     __ add(s1, s1, temp0);
3992     __ add(s2, s2, s1);
3993     __ subs(len, len, 1);
3994     __ br(Assembler::HI, L_simple_by1_loop);
3995 
3996     // s1 = s1 % BASE
3997     __ subs(temp0, s1, base);
3998     __ csel(s1, temp0, s1, Assembler::HS);
3999 
4000     // s2 = s2 % BASE
4001     __ lsr(temp0, s2, 16);
4002     __ lsl(temp1, temp0, 4);
4003     __ sub(temp1, temp1, temp0);
4004     __ add(s2, temp1, s2, ext::uxth);
4005 
4006     __ subs(temp0, s2, base);
4007     __ csel(s2, temp0, s2, Assembler::HS);
4008 
4009     __ b(L_combine);
4010 
4011     __ bind(L_nmax);
4012     __ subs(len, len, nmax);
4013     __ sub(count, nmax, 16);
4014     __ br(Assembler::LO, L_by16);
4015 
4016     __ bind(L_nmax_loop);
4017 
4018     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4019                                       vbytes, vs1acc, vs2acc, vtable);
4020 
4021     __ subs(count, count, 16);
4022     __ br(Assembler::HS, L_nmax_loop);
4023 
4024     // s1 = s1 % BASE
4025     __ lsr(temp0, s1, 16);
4026     __ lsl(temp1, temp0, 4);
4027     __ sub(temp1, temp1, temp0);
4028     __ add(temp1, temp1, s1, ext::uxth);
4029 
4030     __ lsr(temp0, temp1, 16);
4031     __ lsl(s1, temp0, 4);
4032     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4034 
4035     __ subs(temp0, s1, base);
4036     __ csel(s1, temp0, s1, Assembler::HS);
4037 
4038     // s2 = s2 % BASE
4039     __ lsr(temp0, s2, 16);
4040     __ lsl(temp1, temp0, 4);
4041     __ sub(temp1, temp1, temp0);
4042     __ add(temp1, temp1, s2, ext::uxth);
4043 
4044     __ lsr(temp0, temp1, 16);
4045     __ lsl(s2, temp0, 4);
4046     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4048 
4049     __ subs(temp0, s2, base);
4050     __ csel(s2, temp0, s2, Assembler::HS);
4051 
4052     __ subs(len, len, nmax);
4053     __ sub(count, nmax, 16);
4054     __ br(Assembler::HS, L_nmax_loop);
4055 
4056     __ bind(L_by16);
4057     __ adds(len, len, count);
4058     __ br(Assembler::LO, L_by1);
4059 
4060     __ bind(L_by16_loop);
4061 
4062     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4063                                       vbytes, vs1acc, vs2acc, vtable);
4064 
4065     __ subs(len, len, 16);
4066     __ br(Assembler::HS, L_by16_loop);
4067 
4068     __ bind(L_by1);
4069     __ adds(len, len, 15);
4070     __ br(Assembler::LO, L_do_mod);
4071 
4072     __ bind(L_by1_loop);
4073     __ ldrb(temp0, Address(__ post(buff, 1)));
4074     __ add(s1, temp0, s1);
4075     __ add(s2, s2, s1);
4076     __ subs(len, len, 1);
4077     __ br(Assembler::HS, L_by1_loop);
4078 
4079     __ bind(L_do_mod);
4080     // s1 = s1 % BASE
4081     __ lsr(temp0, s1, 16);
4082     __ lsl(temp1, temp0, 4);
4083     __ sub(temp1, temp1, temp0);
4084     __ add(temp1, temp1, s1, ext::uxth);
4085 
4086     __ lsr(temp0, temp1, 16);
4087     __ lsl(s1, temp0, 4);
4088     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4090 
4091     __ subs(temp0, s1, base);
4092     __ csel(s1, temp0, s1, Assembler::HS);
4093 
4094     // s2 = s2 % BASE
4095     __ lsr(temp0, s2, 16);
4096     __ lsl(temp1, temp0, 4);
4097     __ sub(temp1, temp1, temp0);
4098     __ add(temp1, temp1, s2, ext::uxth);
4099 
4100     __ lsr(temp0, temp1, 16);
4101     __ lsl(s2, temp0, 4);
4102     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4104 
4105     __ subs(temp0, s2, base);
4106     __ csel(s2, temp0, s2, Assembler::HS);
4107 
4108     // Combine lower bits and higher bits
4109     __ bind(L_combine);
4110     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4111 
4112     __ ret(lr);
4113 
4114     return start;
4115   }
4116 
4117   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4118           Register temp0, Register temp1, FloatRegister vbytes,
4119           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4120     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4121     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4122     // In non-vectorized code, we update s1 and s2 as:
4123     //   s1 <- s1 + b1
4124     //   s2 <- s2 + s1
4125     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
4127     //   ...
4128     //   s1 <- s1 + b16
4129     //   s2 <- s2 + s1
4130     // Putting above assignments together, we have:
4131     //   s1_new = s1 + b1 + b2 + ... + b16
4132     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4133     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4134     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4135     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4136 
4137     // s2 = s2 + s1 * 16
4138     __ add(s2, s2, s1, Assembler::LSL, 4);
4139 
4140     // vs1acc = b1 + b2 + b3 + ... + b16
4141     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4142     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4143     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4144     __ uaddlv(vs1acc, __ T16B, vbytes);
4145     __ uaddlv(vs2acc, __ T8H, vs2acc);
4146 
4147     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4148     __ fmovd(temp0, vs1acc);
4149     __ fmovd(temp1, vs2acc);
4150     __ add(s1, s1, temp0);
4151     __ add(s2, s2, temp1);
4152   }
4153 
4154   /**
4155    *  Arguments:
4156    *
4157    *  Input:
4158    *    c_rarg0   - x address
4159    *    c_rarg1   - x length
4160    *    c_rarg2   - y address
   *    c_rarg3   - y length
4162    *    c_rarg4   - z address
4163    *    c_rarg5   - z length
4164    */
4165   address generate_multiplyToLen() {
4166     __ align(CodeEntryAlignment);
4167     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4168 
4169     address start = __ pc();
4170     const Register x     = r0;
4171     const Register xlen  = r1;
4172     const Register y     = r2;
4173     const Register ylen  = r3;
4174     const Register z     = r4;
4175     const Register zlen  = r5;
4176 
4177     const Register tmp1  = r10;
4178     const Register tmp2  = r11;
4179     const Register tmp3  = r12;
4180     const Register tmp4  = r13;
4181     const Register tmp5  = r14;
4182     const Register tmp6  = r15;
4183     const Register tmp7  = r16;
4184 
4185     BLOCK_COMMENT("Entry:");
4186     __ enter(); // required for proper stackwalking of RuntimeStub frame
4187     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4188     __ leave(); // required for proper stackwalking of RuntimeStub frame
4189     __ ret(lr);
4190 
4191     return start;
4192   }
4193 
4194   address generate_squareToLen() {
    // The dedicated squareToLen algorithm for sizes 1..127 (described in the
    // Java code) is faster than multiply_to_len on some CPUs and slower on
    // others, but multiply_to_len gives slightly better results overall, so
    // we simply delegate to it here.
4198     __ align(CodeEntryAlignment);
4199     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4200     address start = __ pc();
4201 
4202     const Register x     = r0;
4203     const Register xlen  = r1;
4204     const Register z     = r2;
4205     const Register zlen  = r3;
4206     const Register y     = r4; // == x
4207     const Register ylen  = r5; // == xlen
4208 
4209     const Register tmp1  = r10;
4210     const Register tmp2  = r11;
4211     const Register tmp3  = r12;
4212     const Register tmp4  = r13;
4213     const Register tmp5  = r14;
4214     const Register tmp6  = r15;
4215     const Register tmp7  = r16;
4216 
4217     RegSet spilled_regs = RegSet::of(y, ylen);
4218     BLOCK_COMMENT("Entry:");
4219     __ enter();
4220     __ push(spilled_regs, sp);
4221     __ mov(y, x);
4222     __ mov(ylen, xlen);
4223     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4224     __ pop(spilled_regs, sp);
4225     __ leave();
4226     __ ret(lr);
4227     return start;
4228   }
4229 
4230   address generate_mulAdd() {
4231     __ align(CodeEntryAlignment);
4232     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4233 
4234     address start = __ pc();
4235 
4236     const Register out     = r0;
4237     const Register in      = r1;
4238     const Register offset  = r2;
4239     const Register len     = r3;
4240     const Register k       = r4;
4241 
4242     BLOCK_COMMENT("Entry:");
4243     __ enter();
4244     __ mul_add(out, in, offset, len, k);
4245     __ leave();
4246     __ ret(lr);
4247 
4248     return start;
4249   }
4250 
4251   // Arguments:
4252   //
4253   // Input:
4254   //   c_rarg0   - newArr address
4255   //   c_rarg1   - oldArr address
4256   //   c_rarg2   - newIdx
4257   //   c_rarg3   - shiftCount
4258   //   c_rarg4   - numIter
4259   //
4260   address generate_bigIntegerRightShift() {
4261     __ align(CodeEntryAlignment);
4262     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4263     address start = __ pc();
4264 
4265     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4266 
4267     Register newArr        = c_rarg0;
4268     Register oldArr        = c_rarg1;
4269     Register newIdx        = c_rarg2;
4270     Register shiftCount    = c_rarg3;
4271     Register numIter       = c_rarg4;
4272     Register idx           = numIter;
4273 
4274     Register newArrCur     = rscratch1;
4275     Register shiftRevCount = rscratch2;
4276     Register oldArrCur     = r13;
4277     Register oldArrNext    = r14;
4278 
4279     FloatRegister oldElem0        = v0;
4280     FloatRegister oldElem1        = v1;
4281     FloatRegister newElem         = v2;
4282     FloatRegister shiftVCount     = v3;
4283     FloatRegister shiftVRevCount  = v4;
4284 
4285     __ cbz(idx, Exit);
4286 
4287     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4288 
4289     // left shift count
4290     __ movw(shiftRevCount, 32);
4291     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4292 
    // numIter is too small for the 4-word SIMD loop; fall back to the scalar tail
4294     __ cmp(numIter, (u1)4);
4295     __ br(Assembler::LT, ShiftThree);
4296 
4297     __ dup(shiftVCount,    __ T4S, shiftCount);
4298     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4299     __ negr(shiftVCount,   __ T4S, shiftVCount);
4300 
4301     __ BIND(ShiftSIMDLoop);
4302 
4303     // Calculate the load addresses
4304     __ sub(idx, idx, 4);
4305     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4306     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4307     __ add(oldArrCur,  oldArrNext, 4);
4308 
4309     // Load 4 words and process
4310     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4311     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4312     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4313     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4314     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4315     __ st1(newElem,   __ T4S,  Address(newArrCur));
4316 
4317     __ cmp(idx, (u1)4);
4318     __ br(Assembler::LT, ShiftTwoLoop);
4319     __ b(ShiftSIMDLoop);
4320 
4321     __ BIND(ShiftTwoLoop);
4322     __ cbz(idx, Exit);
4323     __ cmp(idx, (u1)1);
4324     __ br(Assembler::EQ, ShiftOne);
4325 
4326     // Calculate the load addresses
4327     __ sub(idx, idx, 2);
4328     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4329     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4330     __ add(oldArrCur,  oldArrNext, 4);
4331 
4332     // Load 2 words and process
4333     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4334     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4335     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4336     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4337     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4338     __ st1(newElem,   __ T2S, Address(newArrCur));
4339     __ b(ShiftTwoLoop);
4340 
4341     __ BIND(ShiftThree);
4342     __ tbz(idx, 1, ShiftOne);
4343     __ tbz(idx, 0, ShiftTwo);
4344     __ ldrw(r10,  Address(oldArr, 12));
4345     __ ldrw(r11,  Address(oldArr, 8));
4346     __ lsrvw(r10, r10, shiftCount);
4347     __ lslvw(r11, r11, shiftRevCount);
4348     __ orrw(r12,  r10, r11);
4349     __ strw(r12,  Address(newArr, 8));
4350 
4351     __ BIND(ShiftTwo);
4352     __ ldrw(r10,  Address(oldArr, 8));
4353     __ ldrw(r11,  Address(oldArr, 4));
4354     __ lsrvw(r10, r10, shiftCount);
4355     __ lslvw(r11, r11, shiftRevCount);
4356     __ orrw(r12,  r10, r11);
4357     __ strw(r12,  Address(newArr, 4));
4358 
4359     __ BIND(ShiftOne);
4360     __ ldrw(r10,  Address(oldArr, 4));
4361     __ ldrw(r11,  Address(oldArr));
4362     __ lsrvw(r10, r10, shiftCount);
4363     __ lslvw(r11, r11, shiftRevCount);
4364     __ orrw(r12,  r10, r11);
4365     __ strw(r12,  Address(newArr));
4366 
4367     __ BIND(Exit);
4368     __ ret(lr);
4369 
4370     return start;
4371   }
4372 
4373   // Arguments:
4374   //
4375   // Input:
4376   //   c_rarg0   - newArr address
4377   //   c_rarg1   - oldArr address
4378   //   c_rarg2   - newIdx
4379   //   c_rarg3   - shiftCount
4380   //   c_rarg4   - numIter
4381   //
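  // Conceptually (a sketch of the scalar equivalent, not the exact Java
  // source): for i in [0, numIter)
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                      | (oldArr[i + 1] >>> (32 - shiftCount));
  // The SIMD loop below processes 4 such 32-bit words per iteration.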
4382   address generate_bigIntegerLeftShift() {
4383     __ align(CodeEntryAlignment);
4384     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4385     address start = __ pc();
4386 
4387     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4388 
4389     Register newArr        = c_rarg0;
4390     Register oldArr        = c_rarg1;
4391     Register newIdx        = c_rarg2;
4392     Register shiftCount    = c_rarg3;
4393     Register numIter       = c_rarg4;
4394 
4395     Register shiftRevCount = rscratch1;
4396     Register oldArrNext    = rscratch2;
4397 
4398     FloatRegister oldElem0        = v0;
4399     FloatRegister oldElem1        = v1;
4400     FloatRegister newElem         = v2;
4401     FloatRegister shiftVCount     = v3;
4402     FloatRegister shiftVRevCount  = v4;
4403 
4404     __ cbz(numIter, Exit);
4405 
4406     __ add(oldArrNext, oldArr, 4);
4407     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4408 
4409     // right shift count
4410     __ movw(shiftRevCount, 32);
4411     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4412 
    // numIter is too small for the 4-word SIMD loop; fall back to the scalar tail
4414     __ cmp(numIter, (u1)4);
4415     __ br(Assembler::LT, ShiftThree);
4416 
4417     __ dup(shiftVCount,     __ T4S, shiftCount);
4418     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4419     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4420 
4421     __ BIND(ShiftSIMDLoop);
4422 
4423     // load 4 words and process
4424     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4425     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4426     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4427     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4428     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4429     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4430     __ sub(numIter,   numIter, 4);
4431 
4432     __ cmp(numIter, (u1)4);
4433     __ br(Assembler::LT, ShiftTwoLoop);
4434     __ b(ShiftSIMDLoop);
4435 
4436     __ BIND(ShiftTwoLoop);
4437     __ cbz(numIter, Exit);
4438     __ cmp(numIter, (u1)1);
4439     __ br(Assembler::EQ, ShiftOne);
4440 
4441     // load 2 words and process
4442     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4443     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4444     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4445     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4446     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4447     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4448     __ sub(numIter,   numIter, 2);
4449     __ b(ShiftTwoLoop);
4450 
4451     __ BIND(ShiftThree);
4452     __ ldrw(r10,  __ post(oldArr, 4));
4453     __ ldrw(r11,  __ post(oldArrNext, 4));
4454     __ lslvw(r10, r10, shiftCount);
4455     __ lsrvw(r11, r11, shiftRevCount);
4456     __ orrw(r12,  r10, r11);
4457     __ strw(r12,  __ post(newArr, 4));
4458     __ tbz(numIter, 1, Exit);
4459     __ tbz(numIter, 0, ShiftOne);
4460 
4461     __ BIND(ShiftTwo);
4462     __ ldrw(r10,  __ post(oldArr, 4));
4463     __ ldrw(r11,  __ post(oldArrNext, 4));
4464     __ lslvw(r10, r10, shiftCount);
4465     __ lsrvw(r11, r11, shiftRevCount);
4466     __ orrw(r12,  r10, r11);
4467     __ strw(r12,  __ post(newArr, 4));
4468 
4469     __ BIND(ShiftOne);
4470     __ ldrw(r10,  Address(oldArr));
4471     __ ldrw(r11,  Address(oldArrNext));
4472     __ lslvw(r10, r10, shiftCount);
4473     __ lsrvw(r11, r11, shiftRevCount);
4474     __ orrw(r12,  r10, r11);
4475     __ strw(r12,  Address(newArr));
4476 
4477     __ BIND(Exit);
4478     __ ret(lr);
4479 
4480     return start;
4481   }
4482 
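  // has_negatives(ary1, len) returns 1 in r0 if any byte in ary1[0, len) has
  // its sign bit set (i.e. is negative when treated as a signed byte), and 0
  // otherwise. Bytes are tested a whole word (or several words) at a time
  // against UPPER_BIT_MASK, so most of the work needs only one branch per
  // 8..64 bytes checked.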
4483   address generate_has_negatives(address &has_negatives_long) {
4484     const u1 large_loop_size = 64;
4485     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4486     int dcache_line = VM_Version::dcache_line_size();
4487 
4488     Register ary1 = r1, len = r2, result = r0;
4489 
4490     __ align(CodeEntryAlignment);
4491 
4492     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4493 
4494     address entry = __ pc();
4495 
4496     __ enter();
4497 
4498   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4499         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4500 
4501   __ cmp(len, (u1)15);
4502   __ br(Assembler::GT, LEN_OVER_15);
  // The only case in which execution falls into this code is when the pointer
  // is near the end of a memory page and we have to avoid reading past it
4505   __ add(ary1, ary1, len);
4506   __ subs(len, len, 8);
4507   __ br(Assembler::GT, LEN_OVER_8);
4508   __ ldr(rscratch2, Address(ary1, -8));
4509   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4510   __ lsrv(rscratch2, rscratch2, rscratch1);
4511   __ tst(rscratch2, UPPER_BIT_MASK);
4512   __ cset(result, Assembler::NE);
4513   __ leave();
4514   __ ret(lr);
4515   __ bind(LEN_OVER_8);
4516   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4518   __ tst(rscratch2, UPPER_BIT_MASK);
4519   __ br(Assembler::NE, RET_TRUE_NO_POP);
4520   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4521   __ lsrv(rscratch1, rscratch1, rscratch2);
4522   __ tst(rscratch1, UPPER_BIT_MASK);
4523   __ cset(result, Assembler::NE);
4524   __ leave();
4525   __ ret(lr);
4526 
4527   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4528   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4529 
4530   has_negatives_long = __ pc(); // 2nd entry point
4531 
4532   __ enter();
4533 
4534   __ bind(LEN_OVER_15);
4535     __ push(spilled_regs, sp);
4536     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4537     __ cbz(rscratch2, ALIGNED);
4538     __ ldp(tmp6, tmp1, Address(ary1));
4539     __ mov(tmp5, 16);
4540     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4541     __ add(ary1, ary1, rscratch1);
4542     __ sub(len, len, rscratch1);
4543     __ orr(tmp6, tmp6, tmp1);
4544     __ tst(tmp6, UPPER_BIT_MASK);
4545     __ br(Assembler::NE, RET_TRUE);
4546 
4547   __ bind(ALIGNED);
4548     __ cmp(len, large_loop_size);
4549     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // case where an initially aligned large array has negative values among
    // its first bytes; otherwise LARGE_LOOP would do up to 4 reads instead of
    // 1 in the worst case, which is slower. Cases with negative bytes further
    // ahead are not affected much; in fact they get faster thanks to the early
    // loads and to LARGE_LOOP having fewer instructions and branches.
4556     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4557     __ sub(len, len, 16);
4558     __ orr(tmp6, tmp6, tmp1);
4559     __ tst(tmp6, UPPER_BIT_MASK);
4560     __ br(Assembler::NE, RET_TRUE);
4561     __ cmp(len, large_loop_size);
4562     __ br(Assembler::LT, CHECK_16);
4563 
4564     if (SoftwarePrefetchHintDistance >= 0
4565         && SoftwarePrefetchHintDistance >= dcache_line) {
4566       // initial prefetch
4567       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4568     }
4569   __ bind(LARGE_LOOP);
4570     if (SoftwarePrefetchHintDistance >= 0) {
4571       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4572     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches; the drawback is that early return is impossible, so all 64
    // bytes are loaded and checked every time.
4578     __ ldp(tmp2, tmp3, Address(ary1));
4579     __ ldp(tmp4, tmp5, Address(ary1, 16));
4580     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4581     __ ldp(tmp6, tmp1, Address(ary1, 48));
4582     __ add(ary1, ary1, large_loop_size);
4583     __ sub(len, len, large_loop_size);
4584     __ orr(tmp2, tmp2, tmp3);
4585     __ orr(tmp4, tmp4, tmp5);
4586     __ orr(rscratch1, rscratch1, rscratch2);
4587     __ orr(tmp6, tmp6, tmp1);
4588     __ orr(tmp2, tmp2, tmp4);
4589     __ orr(rscratch1, rscratch1, tmp6);
4590     __ orr(tmp2, tmp2, rscratch1);
4591     __ tst(tmp2, UPPER_BIT_MASK);
4592     __ br(Assembler::NE, RET_TRUE);
4593     __ cmp(len, large_loop_size);
4594     __ br(Assembler::GE, LARGE_LOOP);
4595 
4596   __ bind(CHECK_16); // small 16-byte load pre-loop
4597     __ cmp(len, (u1)16);
4598     __ br(Assembler::LT, POST_LOOP16);
4599 
4600   __ bind(LOOP16); // small 16-byte load loop
4601     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4602     __ sub(len, len, 16);
4603     __ orr(tmp2, tmp2, tmp3);
4604     __ tst(tmp2, UPPER_BIT_MASK);
4605     __ br(Assembler::NE, RET_TRUE);
4606     __ cmp(len, (u1)16);
4607     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4608 
4609   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4610     __ cmp(len, (u1)8);
4611     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4612     __ ldr(tmp3, Address(__ post(ary1, 8)));
4613     __ sub(len, len, 8);
4614     __ tst(tmp3, UPPER_BIT_MASK);
4615     __ br(Assembler::NE, RET_TRUE);
4616 
4617   __ bind(POST_LOOP16_LOAD_TAIL);
4618     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4619     __ ldr(tmp1, Address(ary1));
4620     __ mov(tmp2, 64);
4621     __ sub(tmp4, tmp2, len, __ LSL, 3);
4622     __ lslv(tmp1, tmp1, tmp4);
4623     __ tst(tmp1, UPPER_BIT_MASK);
4624     __ br(Assembler::NE, RET_TRUE);
4625     // Fallthrough
4626 
4627   __ bind(RET_FALSE);
4628     __ pop(spilled_regs, sp);
4629     __ leave();
4630     __ mov(result, zr);
4631     __ ret(lr);
4632 
4633   __ bind(RET_TRUE);
4634     __ pop(spilled_regs, sp);
4635   __ bind(RET_TRUE_NO_POP);
4636     __ leave();
4637     __ mov(result, 1);
4638     __ ret(lr);
4639 
4640   __ bind(DONE);
4641     __ pop(spilled_regs, sp);
4642     __ leave();
4643     __ ret(lr);
4644     return entry;
4645   }
4646 
4647   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4648         bool usePrefetch, Label &NOT_EQUAL) {
4649     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4650         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4651         tmp7 = r12, tmp8 = r13;
4652     Label LOOP;
4653 
4654     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4655     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4656     __ bind(LOOP);
4657     if (usePrefetch) {
4658       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4659       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4660     }
4661     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4662     __ eor(tmp1, tmp1, tmp2);
4663     __ eor(tmp3, tmp3, tmp4);
4664     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4665     __ orr(tmp1, tmp1, tmp3);
4666     __ cbnz(tmp1, NOT_EQUAL);
4667     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4668     __ eor(tmp5, tmp5, tmp6);
4669     __ eor(tmp7, tmp7, tmp8);
4670     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4671     __ orr(tmp5, tmp5, tmp7);
4672     __ cbnz(tmp5, NOT_EQUAL);
4673     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4674     __ eor(tmp1, tmp1, tmp2);
4675     __ eor(tmp3, tmp3, tmp4);
4676     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4677     __ orr(tmp1, tmp1, tmp3);
4678     __ cbnz(tmp1, NOT_EQUAL);
4679     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4680     __ eor(tmp5, tmp5, tmp6);
4681     __ sub(cnt1, cnt1, 8 * wordSize);
4682     __ eor(tmp7, tmp7, tmp8);
4683     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
4686     __ subs(tmp6, cnt1, loopThreshold);
4687     __ orr(tmp5, tmp5, tmp7);
4688     __ cbnz(tmp5, NOT_EQUAL);
4689     __ br(__ GE, LOOP);
4690     // post-loop
4691     __ eor(tmp1, tmp1, tmp2);
4692     __ eor(tmp3, tmp3, tmp4);
4693     __ orr(tmp1, tmp1, tmp3);
4694     __ sub(cnt1, cnt1, 2 * wordSize);
4695     __ cbnz(tmp1, NOT_EQUAL);
4696   }
4697 
4698   void generate_large_array_equals_loop_simd(int loopThreshold,
4699         bool usePrefetch, Label &NOT_EQUAL) {
4700     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4701         tmp2 = rscratch2;
4702     Label LOOP;
4703 
4704     __ bind(LOOP);
4705     if (usePrefetch) {
4706       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4707       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4708     }
4709     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4710     __ sub(cnt1, cnt1, 8 * wordSize);
4711     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4712     __ subs(tmp1, cnt1, loopThreshold);
4713     __ eor(v0, __ T16B, v0, v4);
4714     __ eor(v1, __ T16B, v1, v5);
4715     __ eor(v2, __ T16B, v2, v6);
4716     __ eor(v3, __ T16B, v3, v7);
4717     __ orr(v0, __ T16B, v0, v1);
4718     __ orr(v1, __ T16B, v2, v3);
4719     __ orr(v0, __ T16B, v0, v1);
4720     __ umov(tmp1, v0, __ D, 0);
4721     __ umov(tmp2, v0, __ D, 1);
4722     __ orr(tmp1, tmp1, tmp2);
4723     __ cbnz(tmp1, NOT_EQUAL);
4724     __ br(__ GE, LOOP);
4725   }
4726 
4727   // a1 = r1 - array1 address
4728   // a2 = r2 - array2 address
4729   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
4731   // r3-r5 are reserved temporary registers
4732   address generate_large_array_equals() {
4733     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4734         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4735         tmp7 = r12, tmp8 = r13;
4736     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4737         SMALL_LOOP, POST_LOOP;
4738     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // threshold ensuring that at least 32 of the prefetched bytes are actually used
4740     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4741     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4742     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4743     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4744         tmp5, tmp6, tmp7, tmp8);
4745 
4746     __ align(CodeEntryAlignment);
4747 
4748     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4749 
4750     address entry = __ pc();
4751     __ enter();
4752     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4753     // also advance pointers to use post-increment instead of pre-increment
4754     __ add(a1, a1, wordSize);
4755     __ add(a2, a2, wordSize);
4756     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, do one extra
      // 8-byte load to make at least the first address 16-byte aligned.
4762       Label ALIGNED16;
4763       __ tbz(a1, 3, ALIGNED16);
4764       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4765       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4766       __ sub(cnt1, cnt1, wordSize);
4767       __ eor(tmp1, tmp1, tmp2);
4768       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4769       __ bind(ALIGNED16);
4770     }
4771     if (UseSIMDForArrayEquals) {
4772       if (SoftwarePrefetchHintDistance >= 0) {
4773         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4774         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4775         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4776             /* prfm = */ true, NOT_EQUAL);
4777         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4778         __ br(__ LT, TAIL);
4779       }
4780       __ bind(NO_PREFETCH_LARGE_LOOP);
4781       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4782           /* prfm = */ false, NOT_EQUAL);
4783     } else {
4784       __ push(spilled_regs, sp);
4785       if (SoftwarePrefetchHintDistance >= 0) {
4786         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4787         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4788         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4789             /* prfm = */ true, NOT_EQUAL);
4790         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4791         __ br(__ LT, TAIL);
4792       }
4793       __ bind(NO_PREFETCH_LARGE_LOOP);
4794       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4795           /* prfm = */ false, NOT_EQUAL);
4796     }
4797     __ bind(TAIL);
4798       __ cbz(cnt1, EQUAL);
4799       __ subs(cnt1, cnt1, wordSize);
4800       __ br(__ LE, POST_LOOP);
4801     __ bind(SMALL_LOOP);
4802       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4803       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4804       __ subs(cnt1, cnt1, wordSize);
4805       __ eor(tmp1, tmp1, tmp2);
4806       __ cbnz(tmp1, NOT_EQUAL);
4807       __ br(__ GT, SMALL_LOOP);
4808     __ bind(POST_LOOP);
4809       __ ldr(tmp1, Address(a1, cnt1));
4810       __ ldr(tmp2, Address(a2, cnt1));
4811       __ eor(tmp1, tmp1, tmp2);
4812       __ cbnz(tmp1, NOT_EQUAL);
4813     __ bind(EQUAL);
4814       __ mov(result, true);
4815     __ bind(NOT_EQUAL);
4816       if (!UseSIMDForArrayEquals) {
4817         __ pop(spilled_regs, sp);
4818       }
4819     __ bind(NOT_EQUAL_NO_POP);
4820     __ leave();
4821     __ ret(lr);
4822     return entry;
4823   }
4824 
4825   address generate_dsin_dcos(bool isCos) {
4826     __ align(CodeEntryAlignment);
4827     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4828     address start = __ pc();
4829     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4830         (address)StubRoutines::aarch64::_two_over_pi,
4831         (address)StubRoutines::aarch64::_pio2,
4832         (address)StubRoutines::aarch64::_dsin_coef,
4833         (address)StubRoutines::aarch64::_dcos_coef);
4834     return start;
4835   }
4836 
4837   address generate_dlog() {
4838     __ align(CodeEntryAlignment);
4839     StubCodeMark mark(this, "StubRoutines", "dlog");
4840     address entry = __ pc();
4841     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4842         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4843     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4844     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4845         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4846     return entry;
4847   }
4848 
4849 
4850   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4851   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4852       Label &DIFF2) {
4853     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4854     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4855 
4856     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4857     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4858     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4859     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4860 
4861     __ fmovd(tmpL, vtmp3);
4862     __ eor(rscratch2, tmp3, tmpL);
4863     __ cbnz(rscratch2, DIFF2);
4864 
4865     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4866     __ umov(tmpL, vtmp3, __ D, 1);
4867     __ eor(rscratch2, tmpU, tmpL);
4868     __ cbnz(rscratch2, DIFF1);
4869 
4870     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4871     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4872     __ fmovd(tmpL, vtmp);
4873     __ eor(rscratch2, tmp3, tmpL);
4874     __ cbnz(rscratch2, DIFF2);
4875 
4876     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4877     __ umov(tmpL, vtmp, __ D, 1);
4878     __ eor(rscratch2, tmpU, tmpL);
4879     __ cbnz(rscratch2, DIFF1);
4880   }
4881 
4882   // r0  = result
4883   // r1  = str1
4884   // r2  = cnt1
4885   // r3  = str2
4886   // r4  = cnt2
4887   // r10 = tmp1
4888   // r11 = tmp2
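  // isLU == true means str1 is Latin-1 and str2 is UTF-16; isLU == false is
  // the opposite. The Latin-1 side is inflated to UTF-16 on the fly (zip1
  // with a zero register) and the strings are then compared 16 characters at
  // a time (see compare_string_16_x_LU above).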
4889   address generate_compare_long_string_different_encoding(bool isLU) {
4890     __ align(CodeEntryAlignment);
4891     StubCodeMark mark(this, "StubRoutines", isLU
4892         ? "compare_long_string_different_encoding LU"
4893         : "compare_long_string_different_encoding UL");
4894     address entry = __ pc();
4895     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4896         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4897         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4898     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4899         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4900     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4901     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4902 
4903     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4904 
4905     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4908     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4909     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4910     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4911     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4912     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4913     __ eor(rscratch2, tmp1, tmp2);
4914     __ mov(rscratch1, tmp2);
4915     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4916     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4917              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4918     __ push(spilled_regs, sp);
4919     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4920     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4921 
4922     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4923 
4924     if (SoftwarePrefetchHintDistance >= 0) {
4925       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4926       __ br(__ LT, NO_PREFETCH);
4927       __ bind(LARGE_LOOP_PREFETCH);
4928         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4929         __ mov(tmp4, 2);
4930         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4931         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4932           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4933           __ subs(tmp4, tmp4, 1);
4934           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4935           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4936           __ mov(tmp4, 2);
4937         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4938           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4939           __ subs(tmp4, tmp4, 1);
4940           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4941           __ sub(cnt2, cnt2, 64);
4942           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4943           __ br(__ GE, LARGE_LOOP_PREFETCH);
4944     }
4945     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4946     __ bind(NO_PREFETCH);
4947     __ subs(cnt2, cnt2, 16);
4948     __ br(__ LT, TAIL);
4949     __ align(OptoLoopAlignment);
4950     __ bind(SMALL_LOOP); // smaller loop
4951       __ subs(cnt2, cnt2, 16);
4952       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4953       __ br(__ GE, SMALL_LOOP);
4954       __ cmn(cnt2, (u1)16);
4955       __ br(__ EQ, LOAD_LAST);
4956     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4957       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4958       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4959       __ ldr(tmp3, Address(cnt1, -8));
4960       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4961       __ b(LOAD_LAST);
4962     __ bind(DIFF2);
4963       __ mov(tmpU, tmp3);
4964     __ bind(DIFF1);
4965       __ pop(spilled_regs, sp);
4966       __ b(CALCULATE_DIFFERENCE);
4967     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
4970       __ mov(tmpU, tmp3);
4971       __ pop(spilled_regs, sp);
4972 
4973       // tmp2 points to the address of the last 4 Latin1 characters right now
4974       __ ldrs(vtmp, Address(tmp2));
4975       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4976       __ fmovd(tmpL, vtmp);
4977 
4978       __ eor(rscratch2, tmpU, tmpL);
4979       __ cbz(rscratch2, DONE);
4980 
4981     // Find the first different characters in the longwords and
4982     // compute their difference.
4983     __ bind(CALCULATE_DIFFERENCE);
4984       __ rev(rscratch2, rscratch2);
4985       __ clz(rscratch2, rscratch2);
4986       __ andr(rscratch2, rscratch2, -16);
4987       __ lsrv(tmp1, tmp1, rscratch2);
4988       __ uxthw(tmp1, tmp1);
4989       __ lsrv(rscratch1, rscratch1, rscratch2);
4990       __ uxthw(rscratch1, rscratch1);
4991       __ subw(result, tmp1, rscratch1);
4992     __ bind(DONE);
4993       __ ret(lr);
4994     return entry;
4995   }
4996 
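  // nmethod entry barrier stub: it calls
  // BarrierSetNMethod::nmethod_stub_entry_barrier and, if that call returns a
  // non-zero value, continues at the {sp, fp, lr, pc} stored in the four
  // stack words reserved below (the deoptimization path); otherwise it
  // returns to the nmethod normally.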
  address generate_method_entry_barrier() {
4998     __ align(CodeEntryAlignment);
4999     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5000 
5001     Label deoptimize_label;
5002 
5003     address start = __ pc();
5004 
5005     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5006 
5007     __ enter();
5008     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5009 
5010     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5011 
5012     __ push_call_clobbered_registers();
5013 
5014     __ mov(c_rarg0, rscratch2);
5015     __ call_VM_leaf
5016          (CAST_FROM_FN_PTR
5017           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5018 
5019     __ reset_last_Java_frame(true);
5020 
5021     __ mov(rscratch1, r0);
5022 
5023     __ pop_call_clobbered_registers();
5024 
5025     __ cbnz(rscratch1, deoptimize_label);
5026 
5027     __ leave();
5028     __ ret(lr);
5029 
5030     __ BIND(deoptimize_label);
5031 
5032     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5033     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5034 
5035     __ mov(sp, rscratch1);
5036     __ br(rscratch2);
5037 
5038     return start;
5039   }
5040 
5041   // r0  = result
5042   // r1  = str1
5043   // r2  = cnt1
5044   // r3  = str2
5045   // r4  = cnt2
5046   // r10 = tmp1
5047   // r11 = tmp2
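  // isLL == true means both strings are Latin-1 (one byte per character);
  // isLL == false means both are UTF-16 (two bytes per character). cnt2 is
  // measured in characters, so the byte counts below are scaled accordingly.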
5048   address generate_compare_long_string_same_encoding(bool isLL) {
5049     __ align(CodeEntryAlignment);
5050     StubCodeMark mark(this, "StubRoutines", isLL
5051         ? "compare_long_string_same_encoding LL"
5052         : "compare_long_string_same_encoding UU");
5053     address entry = __ pc();
5054     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5055         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5056 
5057     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5058 
    // exit the large loop when fewer than 64 bytes are left to read, or when we
    // are about to prefetch memory beyond the end of the array
5061     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5062 
    // the caller pre-loads 8 bytes before jumping to this stub, so compare them directly
5064     __ eor(rscratch2, tmp1, tmp2);
5065     __ cbnz(rscratch2, CAL_DIFFERENCE);
5066 
5067     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5068     // update pointers, because of previous read
5069     __ add(str1, str1, wordSize);
5070     __ add(str2, str2, wordSize);
5071     if (SoftwarePrefetchHintDistance >= 0) {
5072       __ bind(LARGE_LOOP_PREFETCH);
5073         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5074         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5075 
5076         __ align(OptoLoopAlignment);
5077         for (int i = 0; i < 4; i++) {
5078           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5079           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5080           __ cmp(tmp1, tmp2);
5081           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5082           __ br(Assembler::NE, DIFF);
5083         }
5084         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5085         __ add(str1, str1, 64);
5086         __ add(str2, str2, 64);
5087         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5088         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5089         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5090     }
5091 
5092     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5093     __ br(Assembler::LE, LESS16);
5094     __ align(OptoLoopAlignment);
5095     __ bind(LOOP_COMPARE16);
5096       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5097       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5098       __ cmp(tmp1, tmp2);
5099       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5100       __ br(Assembler::NE, DIFF);
5101       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5102       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5103       __ br(Assembler::LT, LESS16);
5104 
5105       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5106       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5107       __ cmp(tmp1, tmp2);
5108       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5109       __ br(Assembler::NE, DIFF);
5110       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5111       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5112       __ br(Assembler::GE, LOOP_COMPARE16);
5113       __ cbz(cnt2, LENGTH_DIFF);
5114 
5115     __ bind(LESS16);
      // compare the next 8 bytes (8 Latin-1 or 4 UTF-16 characters) if present
5117       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5118       __ br(Assembler::LE, LESS8);
5119       __ ldr(tmp1, Address(__ post(str1, 8)));
5120       __ ldr(tmp2, Address(__ post(str2, 8)));
5121       __ eor(rscratch2, tmp1, tmp2);
5122       __ cbnz(rscratch2, CAL_DIFFERENCE);
5123       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5124 
5125     __ bind(LESS8); // directly load last 8 bytes
5126       if (!isLL) {
5127         __ add(cnt2, cnt2, cnt2);
5128       }
5129       __ ldr(tmp1, Address(str1, cnt2));
5130       __ ldr(tmp2, Address(str2, cnt2));
5131       __ eor(rscratch2, tmp1, tmp2);
5132       __ cbz(rscratch2, LENGTH_DIFF);
5133       __ b(CAL_DIFFERENCE);
5134 
5135     __ bind(DIFF);
5136       __ cmp(tmp1, tmp2);
5137       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5138       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5139       // reuse rscratch2 register for the result of eor instruction
5140       __ eor(rscratch2, tmp1, tmp2);
5141 
5142     __ bind(CAL_DIFFERENCE);
5143       __ rev(rscratch2, rscratch2);
5144       __ clz(rscratch2, rscratch2);
5145       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5146       __ lsrv(tmp1, tmp1, rscratch2);
5147       __ lsrv(tmp2, tmp2, rscratch2);
5148       if (isLL) {
5149         __ uxtbw(tmp1, tmp1);
5150         __ uxtbw(tmp2, tmp2);
5151       } else {
5152         __ uxthw(tmp1, tmp1);
5153         __ uxthw(tmp2, tmp2);
5154       }
5155       __ subw(result, tmp1, tmp2);
5156 
5157     __ bind(LENGTH_DIFF);
5158       __ ret(lr);
5159     return entry;
5160   }
5161 
5162   void generate_compare_long_strings() {
5163       StubRoutines::aarch64::_compare_long_string_LL
5164           = generate_compare_long_string_same_encoding(true);
5165       StubRoutines::aarch64::_compare_long_string_UU
5166           = generate_compare_long_string_same_encoding(false);
5167       StubRoutines::aarch64::_compare_long_string_LU
5168           = generate_compare_long_string_different_encoding(true);
5169       StubRoutines::aarch64::_compare_long_string_UL
5170           = generate_compare_long_string_different_encoding(false);
5171   }
5172 
5173   // R0 = result
5174   // R1 = str2
5175   // R2 = cnt1
5176   // R3 = str1
5177   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8), which lets us skip the initial load (helps on systems with
  // a single load pipeline)
  // 2) we can use a "fast" algorithm for finding a single character, so the
  // first symbol is searched for with fewer branches (one branch per loaded
  // register instead of one branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after the 1st register of the source string has been loaded and
  // analyzed, it can be reused to search for every occurrence of the 1st
  // character, saving a few loads compared to a simpler-but-slower
  // implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
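  // Note on the constants: they implement the standard SWAR zero-byte test.
  // With x = <loaded source chunk> ^ <first pattern character replicated by
  // multiplying with 0x0101...01 (or 0x00010001...0001 for UTF-16)>, the
  // sub/orr/bics sequence below computes
  //   (x - 0x0101...01) & ~x & 0x8080...80
  // (or its 16-bit counterpart), which is non-zero exactly when some byte
  // (halfword) of x is zero, i.e. when the first pattern character occurs
  // somewhere in the loaded chunk.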
5192   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5193     const char* stubName = str1_isL
5194         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5195         : "indexof_linear_uu";
5196     __ align(CodeEntryAlignment);
5197     StubCodeMark mark(this, "StubRoutines", stubName);
5198     address entry = __ pc();
5199 
5200     int str1_chr_size = str1_isL ? 1 : 2;
5201     int str2_chr_size = str2_isL ? 1 : 2;
5202     int str1_chr_shift = str1_isL ? 0 : 1;
5203     int str2_chr_shift = str2_isL ? 0 : 1;
5204     bool isL = str1_isL && str2_isL;
    // parameters
5206     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5207     // temporary registers
5208     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5209     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5210     // redefinitions
5211     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5212 
5213     __ push(spilled_regs, sp);
5214     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5215         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5216         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5217         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5218         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5219         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5220     // Read whole register from str1. It is safe, because length >=8 here
5221     __ ldr(ch1, Address(str1));
5222     // Read whole register from str2. It is safe, because length >=8 here
5223     __ ldr(ch2, Address(str2));
5224     __ sub(cnt2, cnt2, cnt1);
5225     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5226     if (str1_isL != str2_isL) {
5227       __ eor(v0, __ T16B, v0, v0);
5228     }
5229     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5230     __ mul(first, first, tmp1);
    // check whether less than one register's worth of characters is left to check
5232     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5233     if (str1_isL != str2_isL) {
5234       __ fmovd(v1, ch1);
5235     }
5236     __ br(__ LE, L_SMALL);
5237     __ eor(ch2, first, ch2);
5238     if (str1_isL != str2_isL) {
5239       __ zip1(v1, __ T16B, v1, v0);
5240     }
5241     __ sub(tmp2, ch2, tmp1);
5242     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5243     __ bics(tmp2, tmp2, ch2);
5244     if (str1_isL != str2_isL) {
5245       __ fmovd(ch1, v1);
5246     }
5247     __ br(__ NE, L_HAS_ZERO);
5248     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5249     __ add(result, result, wordSize/str2_chr_size);
5250     __ add(str2, str2, wordSize);
5251     __ br(__ LT, L_POST_LOOP);
5252     __ BIND(L_LOOP);
5253       __ ldr(ch2, Address(str2));
5254       __ eor(ch2, first, ch2);
5255       __ sub(tmp2, ch2, tmp1);
5256       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5257       __ bics(tmp2, tmp2, ch2);
5258       __ br(__ NE, L_HAS_ZERO);
5259     __ BIND(L_LOOP_PROCEED);
5260       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5261       __ add(str2, str2, wordSize);
5262       __ add(result, result, wordSize/str2_chr_size);
5263       __ br(__ GE, L_LOOP);
5264     __ BIND(L_POST_LOOP);
5265       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5266       __ br(__ LE, NOMATCH);
5267       __ ldr(ch2, Address(str2));
5268       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5269       __ eor(ch2, first, ch2);
5270       __ sub(tmp2, ch2, tmp1);
5271       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5272       __ mov(tmp4, -1); // all bits set
5273       __ b(L_SMALL_PROCEED);
5274     __ align(OptoLoopAlignment);
5275     __ BIND(L_SMALL);
5276       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5277       __ eor(ch2, first, ch2);
5278       if (str1_isL != str2_isL) {
5279         __ zip1(v1, __ T16B, v1, v0);
5280       }
5281       __ sub(tmp2, ch2, tmp1);
5282       __ mov(tmp4, -1); // all bits set
5283       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5284       if (str1_isL != str2_isL) {
5285         __ fmovd(ch1, v1); // move converted 4 symbols
5286       }
5287     __ BIND(L_SMALL_PROCEED);
5288       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5289       __ bic(tmp2, tmp2, ch2);
5290       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5291       __ rbit(tmp2, tmp2);
5292       __ br(__ EQ, NOMATCH);
5293     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5294       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5295       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5296       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5297       if (str2_isL) { // LL
5298         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5299         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5300         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5301         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5302         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5303       } else {
5304         __ mov(ch2, 0xE); // all bits in byte set except last one
5305         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5306         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5307         __ lslv(tmp2, tmp2, tmp4);
5308         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5309         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5310         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5311         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5312       }
5313       __ cmp(ch1, ch2);
5314       __ mov(tmp4, wordSize/str2_chr_size);
5315       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5316     __ BIND(L_SMALL_CMP_LOOP);
5317       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5318                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5319       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5320                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5321       __ add(tmp4, tmp4, 1);
5322       __ cmp(tmp4, cnt1);
5323       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5324       __ cmp(first, ch2);
5325       __ br(__ EQ, L_SMALL_CMP_LOOP);
5326     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5327       __ cbz(tmp2, NOMATCH); // no more matches. exit
5328       __ clz(tmp4, tmp2);
5329       __ add(result, result, 1); // advance index
5330       __ add(str2, str2, str2_chr_size); // advance pointer
5331       __ b(L_SMALL_HAS_ZERO_LOOP);
5332     __ align(OptoLoopAlignment);
5333     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5334       __ cmp(first, ch2);
5335       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5336       __ b(DONE);
5337     __ align(OptoLoopAlignment);
5338     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5339       if (str2_isL) { // LL
5340         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5341         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5342         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5343         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5344         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5345       } else {
5346         __ mov(ch2, 0xE); // all bits in byte set except last one
5347         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5348         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5349         __ lslv(tmp2, tmp2, tmp4);
5350         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5351         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5352         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5353         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5354       }
5355       __ cmp(ch1, ch2);
5356       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5357       __ b(DONE);
5358     __ align(OptoLoopAlignment);
5359     __ BIND(L_HAS_ZERO);
5360       __ rbit(tmp2, tmp2);
5361       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in the loop.
5365       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5366       __ sub(result, result, 1);
5367     __ BIND(L_HAS_ZERO_LOOP);
5368       __ mov(cnt1, wordSize/str2_chr_size);
5369       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5370       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5371       if (str2_isL) {
5372         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5373         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5374         __ lslv(tmp2, tmp2, tmp4);
5375         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5376         __ add(tmp4, tmp4, 1);
5377         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5378         __ lsl(tmp2, tmp2, 1);
5379         __ mov(tmp4, wordSize/str2_chr_size);
5380       } else {
5381         __ mov(ch2, 0xE);
5382         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5383         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5384         __ lslv(tmp2, tmp2, tmp4);
5385         __ add(tmp4, tmp4, 1);
5386         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5387         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5388         __ lsl(tmp2, tmp2, 1);
5389         __ mov(tmp4, wordSize/str2_chr_size);
5390         __ sub(str2, str2, str2_chr_size);
5391       }
5392       __ cmp(ch1, ch2);
5393       __ mov(tmp4, wordSize/str2_chr_size);
5394       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5395     __ BIND(L_CMP_LOOP);
5396       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5397                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5398       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5399                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5400       __ add(tmp4, tmp4, 1);
5401       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5402       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5403       __ cmp(cnt1, ch2);
5404       __ br(__ EQ, L_CMP_LOOP);
5405     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
5407       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5408       __ clz(tmp4, tmp2);
5409       __ add(str2, str2, str2_chr_size); // advance pointer
5410       __ b(L_HAS_ZERO_LOOP);
5411     __ align(OptoLoopAlignment);
5412     __ BIND(L_CMP_LOOP_LAST_CMP);
5413       __ cmp(cnt1, ch2);
5414       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5415       __ b(DONE);
5416     __ align(OptoLoopAlignment);
5417     __ BIND(L_CMP_LOOP_LAST_CMP2);
5418       if (str2_isL) {
5419         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5420         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5421         __ lslv(tmp2, tmp2, tmp4);
5422         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5423         __ add(tmp4, tmp4, 1);
5424         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5425         __ lsl(tmp2, tmp2, 1);
5426       } else {
5427         __ mov(ch2, 0xE);
5428         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5429         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5430         __ lslv(tmp2, tmp2, tmp4);
5431         __ add(tmp4, tmp4, 1);
5432         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5433         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5434         __ lsl(tmp2, tmp2, 1);
5435         __ sub(str2, str2, str2_chr_size);
5436       }
5437       __ cmp(ch1, ch2);
5438       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5439       __ b(DONE);
5440     __ align(OptoLoopAlignment);
5441     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address and needs to be advanced to the
      // next octet.
5452       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5453       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5454       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5455       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5456       __ movw(cnt2, cnt2);
5457       __ b(L_LOOP_PROCEED);
5458     __ align(OptoLoopAlignment);
5459     __ BIND(NOMATCH);
5460       __ mov(result, -1);
5461     __ BIND(DONE);
5462       __ pop(spilled_regs, sp);
5463       __ ret(lr);
5464     return entry;
5465   }
5466 
5467   void generate_string_indexof_stubs() {
5468     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5469     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5470     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5471   }
5472 
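  // Inflate 32 Latin-1 bytes (src1:src2) into 64 bytes of UTF-16 characters
  // and store them at dst (r1), advancing dst by 64: each source byte is
  // interleaved with a zero byte taken from v0, which is expected to hold
  // zero (see the "V0 = 0" note below).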
5473   void inflate_and_store_2_fp_registers(bool generatePrfm,
5474       FloatRegister src1, FloatRegister src2) {
5475     Register dst = r1;
5476     __ zip1(v1, __ T16B, src1, v0);
5477     __ zip2(v2, __ T16B, src1, v0);
5478     if (generatePrfm) {
5479       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5480     }
5481     __ zip1(v3, __ T16B, src2, v0);
5482     __ zip2(v4, __ T16B, src2, v0);
5483     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5484   }
5485 
5486   // R0 = src
5487   // R1 = dst
5488   // R2 = len
5489   // R3 = len >> 3
5490   // V0 = 0
5491   // v1 = loaded 8 bytes
5492   address generate_large_byte_array_inflate() {
5493     __ align(CodeEntryAlignment);
5494     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5495     address entry = __ pc();
5496     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5497     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5498     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5499 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also allows a single store instruction to be used.
5502     __ ldrd(v2, __ post(src, 8));
5503     __ sub(octetCounter, octetCounter, 2);
5504     __ zip1(v1, __ T16B, v1, v0);
5505     __ zip1(v2, __ T16B, v2, v0);
5506     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5507     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5508     __ subs(rscratch1, octetCounter, large_loop_threshold);
5509     __ br(__ LE, LOOP_START);
5510     __ b(LOOP_PRFM_START);
5511     __ bind(LOOP_PRFM);
5512       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5513     __ bind(LOOP_PRFM_START);
5514       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5515       __ sub(octetCounter, octetCounter, 8);
5516       __ subs(rscratch1, octetCounter, large_loop_threshold);
5517       inflate_and_store_2_fp_registers(true, v3, v4);
5518       inflate_and_store_2_fp_registers(true, v5, v6);
5519       __ br(__ GT, LOOP_PRFM);
5520       __ cmp(octetCounter, (u1)8);
5521       __ br(__ LT, DONE);
5522     __ bind(LOOP);
5523       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5524       __ bind(LOOP_START);
5525       __ sub(octetCounter, octetCounter, 8);
5526       __ cmp(octetCounter, (u1)8);
5527       inflate_and_store_2_fp_registers(false, v3, v4);
5528       inflate_and_store_2_fp_registers(false, v5, v6);
5529       __ br(__ GE, LOOP);
5530     __ bind(DONE);
5531       __ ret(lr);
5532     return entry;
5533   }
5534 
5535   /**
5536    *  Arguments:
5537    *
5538    *  Input:
5539    *  c_rarg0   - current state address
5540    *  c_rarg1   - H key address
5541    *  c_rarg2   - data address
5542    *  c_rarg3   - number of blocks
5543    *
5544    *  Output:
5545    *  Updated state at c_rarg0
5546    */
5547   address generate_ghash_processBlocks() {
5548     // Bafflingly, GCM uses little-endian for the byte order, but
5549     // big-endian for the bit order.  For example, the polynomial 1 is
5550     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5551     //
5552     // So, we must either reverse the bytes in each word and do
5553     // everything big-endian or reverse the bits in each byte and do
5554     // it little-endian.  On AArch64 it's more idiomatic to reverse
5555     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
5557     // calculation, bit-reversing the inputs and outputs.
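    // For each 128-bit block X, the loop below computes
    //   state = (state ^ X) * H   in GF(2^128),
    // reduced modulo the field polynomial x^128 + x^7 + x^2 + x + 1, which is
    // the standard GHASH recurrence.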
5558 
5559     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5560     __ align(wordSize * 2);
5561     address p = __ pc();
5562     __ emit_int64(0x87);  // The low-order bits of the field
5563                           // polynomial (i.e. p = z^7+z^2+z+1)
5564                           // repeated in the low and high parts of a
5565                           // 128-bit vector
5566     __ emit_int64(0x87);
5567 
5568     __ align(CodeEntryAlignment);
5569     address start = __ pc();
5570 
5571     Register state   = c_rarg0;
5572     Register subkeyH = c_rarg1;
5573     Register data    = c_rarg2;
5574     Register blocks  = c_rarg3;
5575 
5576     FloatRegister vzr = v30;
5577     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5578 
5579     __ ldrq(v24, p);    // The field polynomial
5580 
5581     __ ldrq(v0, Address(state));
5582     __ ldrq(v1, Address(subkeyH));
5583 
5584     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5585     __ rbit(v0, __ T16B, v0);
5586     __ rev64(v1, __ T16B, v1);
5587     __ rbit(v1, __ T16B, v1);
5588 
5589     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
5590     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5591 
5592     {
5593       Label L_ghash_loop;
5594       __ bind(L_ghash_loop);
5595 
5596       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5597                                                  // reversing each byte
5598       __ rbit(v2, __ T16B, v2);
5599       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5600 
5601       // Multiply state in v2 by subkey in v1
5602       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5603                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5604                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5605       // Reduce v7:v5 by the field polynomial
5606       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5607 
5608       __ sub(blocks, blocks, 1);
5609       __ cbnz(blocks, L_ghash_loop);
5610     }
5611 
5612     // The bit-reversed result is at this point in v0
5613     __ rev64(v0, __ T16B, v0);
5614     __ rbit(v0, __ T16B, v0);
5615 
5616     __ st1(v0, __ T16B, state);
5617     __ ret(lr);
5618 
5619     return start;
5620   }
5621 
5622   address generate_ghash_processBlocks_wide() {
5623     address small = generate_ghash_processBlocks();
5624 
5625     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5626     __ align(wordSize * 2);
5627     address p = __ pc();
5628     __ emit_int64(0x87);  // The low-order bits of the field
5629                           // polynomial (i.e. p = z^7+z^2+z+1)
5630                           // repeated in the low and high parts of a
5631                           // 128-bit vector
5632     __ emit_int64(0x87);
5633 
5634     __ align(CodeEntryAlignment);
5635     address start = __ pc();
5636 
5637     Register state   = c_rarg0;
5638     Register subkeyH = c_rarg1;
5639     Register data    = c_rarg2;
5640     Register blocks  = c_rarg3;
5641 
5642     const int unroll = 4;
5643 
5644     __ cmp(blocks, (unsigned char)(unroll * 2));
5645     __ br(__ LT, small);
5646 
5647     if (unroll > 1) {
5648       // Save the callee-saved SIMD registers v8..v15 before entering the routine
5649       __ sub(sp, sp, 4 * 16);
5650       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5651       __ sub(sp, sp, 4 * 16);
5652       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5653     }
5654 
5655     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5656 
5657     if (unroll > 1) {
5658       // And restore state
5659       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5660       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5661     }
5662 
5663     __ cmp(blocks, (unsigned char)0);
5664     __ br(__ GT, small);
5665 
5666     __ ret(lr);
5667 
5668     return start;
5669   }
5670 
5671   void generate_base64_encode_simdround(Register src, Register dst,
5672         FloatRegister codec, u8 size) {
5673 
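    // Each round reads 3 * size input bytes from src and writes 4 * size
    // Base64 characters to dst, post-incrementing both pointers.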
5674     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5675     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5676     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5677 
5678     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5679 
5680     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5681 
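    // Split each 3-byte group (b0, b1, b2) into four 6-bit indices:
    //   ind0 =   b0 >> 2
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
    //   ind3 =   b2 & 0x3f
    // The shift/or sequences below compute these without explicit masks,
    // because the byte-wise shifts discard the unwanted bits.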
5682     __ ushr(ind0, arrangement, in0,  2);
5683 
5684     __ ushr(ind1, arrangement, in1,  2);
5685     __ shl(in0,   arrangement, in0,  6);
5686     __ orr(ind1,  arrangement, ind1, in0);
5687     __ ushr(ind1, arrangement, ind1, 2);
5688 
5689     __ ushr(ind2, arrangement, in2,  4);
5690     __ shl(in1,   arrangement, in1,  4);
5691     __ orr(ind2,  arrangement, in1,  ind2);
5692     __ ushr(ind2, arrangement, ind2, 2);
5693 
5694     __ shl(ind3,  arrangement, in2,  2);
5695     __ ushr(ind3, arrangement, ind3, 2);
5696 
5697     __ tbl(out0,  arrangement, codec,  4, ind0);
5698     __ tbl(out1,  arrangement, codec,  4, ind1);
5699     __ tbl(out2,  arrangement, codec,  4, ind2);
5700     __ tbl(out3,  arrangement, codec,  4, ind3);
5701 
5702     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5703   }
5704 
5705    /**
5706    *  Arguments:
5707    *
5708    *  Input:
5709    *  c_rarg0   - src_start
5710    *  c_rarg1   - src_offset
5711    *  c_rarg2   - src_length
5712    *  c_rarg3   - dest_start
5713    *  c_rarg4   - dest_offset
5714    *  c_rarg5   - isURL
5715    *
5716    */
5717   address generate_base64_encodeBlock() {
5718 
5719     static const char toBase64[64] = {
5720       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5721       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5722       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5723       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5724       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5725     };
5726 
5727     static const char toBase64URL[64] = {
5728       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5729       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5730       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5731       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5732       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5733     };
5734 
5735     __ align(CodeEntryAlignment);
5736     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5737     address start = __ pc();
5738 
5739     Register src   = c_rarg0;  // source array
5740     Register soff  = c_rarg1;  // source start offset
5741     Register send  = c_rarg2;  // source end offset
5742     Register dst   = c_rarg3;  // dest array
5743     Register doff  = c_rarg4;  // position for writing to dest array
5744     Register isURL = c_rarg5;  // Base64 or URL character set
5745 
5746     // c_rarg6 and c_rarg7 are free to use as temps
5747     Register codec  = c_rarg6;
5748     Register length = c_rarg7;
5749 
5750     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5751 
5752     __ add(src, src, soff);
5753     __ add(dst, dst, doff);
5754     __ sub(length, send, soff);
5755 
5756     // load the codec base address
5757     __ lea(codec, ExternalAddress((address) toBase64));
5758     __ cbz(isURL, ProcessData);
5759     __ lea(codec, ExternalAddress((address) toBase64URL));
5760 
5761     __ BIND(ProcessData);
5762 
5763     // input too short to set up a SIMD loop; fall back to the byte-wise path below
5764     __ cmp(length, (u1)24);
5765     __ br(Assembler::LT, Process3B);
5766 
5767     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5768 
5769     __ BIND(Process48B);
5770     __ cmp(length, (u1)48);
5771     __ br(Assembler::LT, Process24B);
5772     generate_base64_encode_simdround(src, dst, v0, 16);
5773     __ sub(length, length, 48);
5774     __ b(Process48B);
5775 
5776     __ BIND(Process24B);
5777     __ cmp(length, (u1)24);
5778     __ br(Assembler::LT, SIMDExit);
5779     generate_base64_encode_simdround(src, dst, v0, 8);
5780     __ sub(length, length, 24);
5781 
5782     __ BIND(SIMDExit);
5783     __ cbz(length, Exit);
5784 
5785     __ BIND(Process3B);
5786     //  3 src bytes, 24 bits
5787     __ ldrb(r10, __ post(src, 1));
5788     __ ldrb(r11, __ post(src, 1));
5789     __ ldrb(r12, __ post(src, 1));
5790     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5791     __ orrw(r12, r12, r11, Assembler::LSL, 8);
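    // r12 now holds the packed 24-bit group (b0 << 16) | (b1 << 8) | b2;
    // extract its four 6-bit fields, most significant first.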
5792     // codec index
5793     __ ubfmw(r15, r12, 18, 23);
5794     __ ubfmw(r14, r12, 12, 17);
5795     __ ubfmw(r13, r12, 6,  11);
5796     __ andw(r12,  r12, 63);
5797     // get the code based on the codec
5798     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5799     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5800     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5801     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5802     __ strb(r15, __ post(dst, 1));
5803     __ strb(r14, __ post(dst, 1));
5804     __ strb(r13, __ post(dst, 1));
5805     __ strb(r12, __ post(dst, 1));
5806     __ sub(length, length, 3);
5807     __ cbnz(length, Process3B);
5808 
5809     __ BIND(Exit);
5810     __ ret(lr);
5811 
5812     return start;
5813   }
5814 
5815   void generate_base64_decode_simdround(Register src, Register dst,
5816         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5817 
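    // Each round reads 4 * size Base64 characters from src and normally
    // writes 3 * size decoded bytes to dst.  If an illegal character is
    // found, the bytes decoded before it are stored and we branch to Exit.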
5818     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5819     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5820 
5821     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5822     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5823 
5824     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5825 
5826     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5827 
5828     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5829 
5830     // we need an unsigned saturating subtract to make sure all input values
5831     // in the range [0, 63] yield 0 in the higher-half lookup
5832     __ uqsubv(decH0, __ T16B, in0, v27);
5833     __ uqsubv(decH1, __ T16B, in1, v27);
5834     __ uqsubv(decH2, __ T16B, in2, v27);
5835     __ uqsubv(decH3, __ T16B, in3, v27);
5836 
5837     // lower half lookup
5838     __ tbl(decL0, arrangement, codecL, 4, in0);
5839     __ tbl(decL1, arrangement, codecL, 4, in1);
5840     __ tbl(decL2, arrangement, codecL, 4, in2);
5841     __ tbl(decL3, arrangement, codecL, 4, in3);
5842 
5843     // higher half lookup
5844     __ tbx(decH0, arrangement, codecH, 4, decH0);
5845     __ tbx(decH1, arrangement, codecH, 4, decH1);
5846     __ tbx(decH2, arrangement, codecH, 4, decH2);
5847     __ tbx(decH3, arrangement, codecH, 4, decH3);
5848 
5849     // combine lower and higher
5850     __ orr(decL0, arrangement, decL0, decH0);
5851     __ orr(decL1, arrangement, decL1, decH1);
5852     __ orr(decL2, arrangement, decL2, decH2);
5853     __ orr(decL3, arrangement, decL3, decH3);
5854 
5855     // check illegal inputs, value larger than 63 (maximum of 6 bits)
5856     __ cmhi(decH0, arrangement, decL0, v27);
5857     __ cmhi(decH1, arrangement, decL1, v27);
5858     __ cmhi(decH2, arrangement, decL2, v27);
5859     __ cmhi(decH3, arrangement, decL3, v27);
5860     __ orr(in0, arrangement, decH0, decH1);
5861     __ orr(in1, arrangement, decH2, decH3);
5862     __ orr(in2, arrangement, in0,   in1);
5863     __ umaxv(in3, arrangement, in2);
5864     __ umov(rscratch2, in3, __ B, 0);
5865 
5866     // get the data to output
5867     __ shl(out0,  arrangement, decL0, 2);
5868     __ ushr(out1, arrangement, decL1, 4);
5869     __ orr(out0,  arrangement, out0,  out1);
5870     __ shl(out1,  arrangement, decL1, 4);
5871     __ ushr(out2, arrangement, decL2, 2);
5872     __ orr(out1,  arrangement, out1,  out2);
5873     __ shl(out2,  arrangement, decL2, 6);
5874     __ orr(out2,  arrangement, out2,  decL3);
5875 
5876     __ cbz(rscratch2, NoIllegalData);
5877 
5878     // handle illegal input
5879     __ umov(r10, in2, __ D, 0);
5880     if (size == 16) {
5881       __ cbnz(r10, ErrorInLowerHalf);
5882 
5883       // illegal input is in higher half, store the lower half now.
5884       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5885 
5886       __ umov(r10, in2,  __ D, 1);
5887       __ umov(r11, out0, __ D, 1);
5888       __ umov(r12, out1, __ D, 1);
5889       __ umov(r13, out2, __ D, 1);
5890       __ b(StoreLegalData);
5891 
5892       __ BIND(ErrorInLowerHalf);
5893     }
5894     __ umov(r11, out0, __ D, 0);
5895     __ umov(r12, out1, __ D, 0);
5896     __ umov(r13, out2, __ D, 0);
5897 
5898     __ BIND(StoreLegalData);
5899     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5900     __ strb(r11, __ post(dst, 1));
5901     __ strb(r12, __ post(dst, 1));
5902     __ strb(r13, __ post(dst, 1));
5903     __ lsr(r10, r10, 8);
5904     __ lsr(r11, r11, 8);
5905     __ lsr(r12, r12, 8);
5906     __ lsr(r13, r13, 8);
5907     __ b(StoreLegalData);
5908 
5909     __ BIND(NoIllegalData);
5910     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5911   }
5912 
5913 
5914    /**
5915    *  Arguments:
5916    *
5917    *  Input:
5918    *  c_rarg0   - src_start
5919    *  c_rarg1   - src_offset
5920    *  c_rarg2   - src_length
5921    *  c_rarg3   - dest_start
5922    *  c_rarg4   - dest_offset
5923    *  c_rarg5   - isURL
5924    *  c_rarg6   - isMIME
5925    *
5926    */
5927   address generate_base64_decodeBlock() {
5928 
5929     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
5930     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
5931     // titled "Base64 decoding".
5932 
5933     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
5934     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
5935     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
5936     static const uint8_t fromBase64ForNoSIMD[256] = {
5937       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5938       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5940        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5941       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5942        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5943       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5944        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5945       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5946       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5947       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5948       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5949       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5950       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5951       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5952       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5953     };
5954 
5955     static const uint8_t fromBase64URLForNoSIMD[256] = {
5956       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5957       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5958       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5959        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5960       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5961        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5962       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5963        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5964       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5965       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5966       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5967       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5968       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5969       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5970       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5971       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5972     };
5973 
5974     // A legal Base64 code value is in the range [0, 127].  We need two table
5975     // lookups with tbl/tbx and combine the results to get the decoded data.
5976     // The 1st table vector lookup uses tbl, which sets out-of-range indices to 0
5977     // in the destination.  The 2nd table vector lookup uses tbx, which leaves
5978     // out-of-range indices unchanged in the destination.  Input [64, 126] is
5979     // mapped to index [65, 127] in the second lookup.  The value at index 64 is 0,
5980     // so inputs already decoded by the 1st lookup are left unchanged when the
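    // two results are ORed together.
    //
    // For example, '0' (0x30): the 1st lookup yields table[48] = 52 and the
    // saturating subtract yields index 0, whose 2nd lookup yields table[64] = 0,
    // so the combined value is 52.  For 'z' (0x7a): the 1st lookup yields 0
    // (out of range), the subtract yields 122 - 63 = 59, and the 2nd lookup
    // yields table[64 + 59] = 51.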
5981     static const uint8_t fromBase64ForSIMD[128] = {
5982       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5983       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5984       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5985        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5986         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5987        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5988       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5989        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5990     };
5991 
5992     static const uint8_t fromBase64URLForSIMD[128] = {
5993       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5994       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5995       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5996        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5997         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5998        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5999        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6000        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6001     };
6002 
6003     __ align(CodeEntryAlignment);
6004     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6005     address start = __ pc();
6006 
6007     Register src    = c_rarg0;  // source array
6008     Register soff   = c_rarg1;  // source start offset
6009     Register send   = c_rarg2;  // source end offset
6010     Register dst    = c_rarg3;  // dest array
6011     Register doff   = c_rarg4;  // position for writing to dest array
6012     Register isURL  = c_rarg5;  // Base64 or URL character set
6013     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6014 
6015     Register length = send;    // reuse send as length of source data to process
6016 
6017     Register simd_codec   = c_rarg6;
6018     Register nosimd_codec = c_rarg7;
6019 
6020     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6021 
6022     __ enter();
6023 
6024     __ add(src, src, soff);
6025     __ add(dst, dst, doff);
6026 
6027     __ mov(doff, dst);
6028 
6029     __ sub(length, send, soff);
6030     __ bfm(length, zr, 0, 1); // round length down to a multiple of 4 (clear bits 1:0)
6031 
6032     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6033     __ cbz(isURL, ProcessData);
6034     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6035 
6036     __ BIND(ProcessData);
6037     __ mov(rscratch1, length);
6038     __ cmp(length, (u1)144); // 144 = 80 + 64
6039     __ br(Assembler::LT, Process4B);
6040 
6041     // In the MIME case, the line length cannot be more than 76
6042     // bytes (see RFC 2045). This is too short a block for SIMD
6043     // to be worthwhile, so we use non-SIMD here.
6044     __ movw(rscratch1, 79); // the byte-wise loop below decodes the first 80 bytes
6045 
6046     __ BIND(Process4B);
6047     __ ldrw(r14, __ post(src, 4));
6048     __ ubfxw(r10, r14, 0,  8);
6049     __ ubfxw(r11, r14, 8,  8);
6050     __ ubfxw(r12, r14, 16, 8);
6051     __ ubfxw(r13, r14, 24, 8);
6052     // look up the decoded values
6053     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6054     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6055     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6056     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6057     // error detection, 255u indicates an illegal input
6058     __ orrw(r14, r10, r11);
6059     __ orrw(r15, r12, r13);
6060     __ orrw(r14, r14, r15);
6061     __ tbnz(r14, 7, Exit);
6062     // recover the data
6063     __ lslw(r14, r10, 10);
6064     __ bfiw(r14, r11, 4, 6);
6065     __ bfmw(r14, r12, 2, 5);
6066     __ rev16w(r14, r14);
6067     __ bfiw(r13, r12, 6, 2);
6068     __ strh(r14, __ post(dst, 2));
6069     __ strb(r13, __ post(dst, 1));
6070     // non-simd loop
6071     __ subsw(rscratch1, rscratch1, 4);
6072     __ br(Assembler::GT, Process4B);
6073 
6074     // if we exit the loop after the 80-byte pre-pass above, rscratch1 == -1;
6075     // otherwise (the whole input was decoded byte-wise), rscratch1 == 0.
6076     __ cbzw(rscratch1, Exit);
6077     __ sub(length, length, 80); // the byte-wise pre-pass consumed 80 bytes
6078 
6079     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6080     __ cbz(isURL, SIMDEnter);
6081     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6082 
6083     __ BIND(SIMDEnter);
6084     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6085     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6086     __ mov(rscratch1, 63);
6087     __ dup(v27, __ T16B, rscratch1); // v27 = 63 in every byte lane, for range checks
6088 
6089     __ BIND(Process64B);
6090     __ cmp(length, (u1)64);
6091     __ br(Assembler::LT, Process32B);
6092     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6093     __ sub(length, length, 64);
6094     __ b(Process64B);
6095 
6096     __ BIND(Process32B);
6097     __ cmp(length, (u1)32);
6098     __ br(Assembler::LT, SIMDExit);
6099     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6100     __ sub(length, length, 32);
6101     __ b(Process32B);
6102 
6103     __ BIND(SIMDExit);
6104     __ cbz(length, Exit);
6105     __ movw(rscratch1, length);
6106     __ b(Process4B);
6107 
6108     __ BIND(Exit);
6109     __ sub(c_rarg0, dst, doff);
6110 
6111     __ leave();
6112     __ ret(lr);
6113 
6114     return start;
6115   }
6116 
6117 #ifdef LINUX
6118 
6119   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6120   //
6121   // If LSE is in use, generate LSE versions of all the stubs. The
6122   // non-LSE versions are in atomic_aarch64.S.
6123 
6124   // class AtomicStubMark records the entry point of a stub and the
6125   // stub pointer which will point to it. The stub pointer is set to
6126   // the entry point when ~AtomicStubMark() is called, which must be
6127   // after ICache::invalidate_range. This ensures safe publication of
6128   // the generated code.
6129   class AtomicStubMark {
6130     address _entry_point;
6131     aarch64_atomic_stub_t *_stub;
6132     MacroAssembler *_masm;
6133   public:
6134     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6135       _masm = masm;
6136       __ align(32);
6137       _entry_point = __ pc();
6138       _stub = stub;
6139     }
6140     ~AtomicStubMark() {
6141       *_stub = (aarch64_atomic_stub_t)_entry_point;
6142     }
6143   };
6144 
6145   // NB: For memory_order_conservative we need a trailing membar after
6146   // LSE atomic operations but not a leading membar.
6147   //
6148   // We don't need a leading membar because a clause in the Arm ARM
6149   // says:
6150   //
6151   //   Barrier-ordered-before
6152   //
6153   //   Barrier instructions order prior Memory effects before subsequent
6154   //   Memory effects generated by the same Observer. A read or a write
6155   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6156   //   Observer if and only if RW1 appears in program order before RW2
6157   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6158   //   instruction with both Acquire and Release semantics.
6159   //
6160   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6161   // and Release semantics, therefore we don't need a leading
6162   // barrier. However, there is no corresponding Barrier-ordered-after
6163   // relationship, therefore we need a trailing membar to prevent a
6164   // later store or load from being reordered with the store in an
6165   // atomic instruction.
6166   //
6167   // This was checked by using the herd7 consistency model simulator
6168   // (http://diy.inria.fr/) with this test case:
6169   //
6170   // AArch64 LseCas
6171   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6172   // P0 | P1;
6173   // LDR W4, [X2] | MOV W3, #0;
6174   // DMB LD       | MOV W4, #1;
6175   // LDR W3, [X1] | CASAL W3, W4, [X1];
6176   //              | DMB ISH;
6177   //              | STR W4, [X2];
6178   // exists
6179   // (0:X3=0 /\ 0:X4=1)
6180   //
6181   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6182   // with the store to x in P1. Without the DMB in P1 this may happen.
6183   //
6184   // At the time of writing we don't know of any AArch64 hardware that
6185   // reorders stores in this way, but the Reference Manual permits it.
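  //
  // As an illustration (a rough sketch, not the assembler's exact output),
  // the conservative word-sized CAS stub generated below amounts to:
  //
  //   mov    x3, x1          // prev = compare_val
  //   casal  w3, w2, [x0]    // compare-and-swap with Acquire and Release
  //   dmb    ish             // trailing barrier for memory_order_conservative
  //   mov    w0, w3          // return the previous value
  //   ret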
6186 
6187   void gen_cas_entry(Assembler::operand_size size,
6188                      atomic_memory_order order) {
6189     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6190       exchange_val = c_rarg2;
6191     bool acquire, release;
6192     switch (order) {
6193       case memory_order_relaxed:
6194         acquire = false;
6195         release = false;
6196         break;
6197       case memory_order_release:
6198         acquire = false;
6199         release = true;
6200         break;
6201       default:
6202         acquire = true;
6203         release = true;
6204         break;
6205     }
6206     __ mov(prev, compare_val);
6207     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6208     if (order == memory_order_conservative) {
6209       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6210     }
6211     if (size == Assembler::xword) {
6212       __ mov(r0, prev);
6213     } else {
6214       __ movw(r0, prev);
6215     }
6216     __ ret(lr);
6217   }
6218 
6219   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6220     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6221     // If not relaxed, then default to conservative.  Relaxed is the only
6222     // case we use enough to be worth specializing.
6223     if (order == memory_order_relaxed) {
6224       __ ldadd(size, incr, prev, addr);
6225     } else {
6226       __ ldaddal(size, incr, prev, addr);
6227       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6228     }
6229     if (size == Assembler::xword) {
6230       __ mov(r0, prev);
6231     } else {
6232       __ movw(r0, prev);
6233     }
6234     __ ret(lr);
6235   }
6236 
6237   void gen_swpal_entry(Assembler::operand_size size) {
6238     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6239     __ swpal(size, incr, prev, addr);
6240     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6241     if (size == Assembler::xword) {
6242       __ mov(r0, prev);
6243     } else {
6244       __ movw(r0, prev);
6245     }
6246     __ ret(lr);
6247   }
6248 
6249   void generate_atomic_entry_points() {
6250     if (! UseLSE) {
6251       return;
6252     }
6253 
6254     __ align(CodeEntryAlignment);
6255     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6256     address first_entry = __ pc();
6257 
6258     // ADD, memory_order_conservative
6259     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6260     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6261     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6262     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6263 
6264     // ADD, memory_order_relaxed
6265     AtomicStubMark mark_fetch_add_4_relaxed
6266       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6267     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6268     AtomicStubMark mark_fetch_add_8_relaxed
6269       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6270     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6271 
6272     // XCHG, memory_order_conservative
6273     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6274     gen_swpal_entry(Assembler::word);
6275     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6276     gen_swpal_entry(Assembler::xword);
6277 
6278     // CAS, memory_order_conservative
6279     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6280     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6281     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6282     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6283     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6284     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6285 
6286     // CAS, memory_order_relaxed
6287     AtomicStubMark mark_cmpxchg_1_relaxed
6288       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6289     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6290     AtomicStubMark mark_cmpxchg_4_relaxed
6291       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6292     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6293     AtomicStubMark mark_cmpxchg_8_relaxed
6294       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6295     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6296 
6297     AtomicStubMark mark_cmpxchg_4_release
6298       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6299     gen_cas_entry(MacroAssembler::word, memory_order_release);
6300     AtomicStubMark mark_cmpxchg_8_release
6301       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6302     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6303 
6304     AtomicStubMark mark_cmpxchg_4_seq_cst
6305       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6306     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6307     AtomicStubMark mark_cmpxchg_8_seq_cst
6308       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6309     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6310 
6311     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6312   }
6313 #endif // LINUX
6314 
6315   RuntimeStub* generate_cont_doYield() {
6316     const char *name = "cont_doYield";
6317 
6318     enum layout {
6319       rfp_off1,
6320       rfp_off2,
6321       lr_off,
6322       lr_off2,
6323       framesize // inclusive of return address
6324     };
6325     // assert(is_even(framesize/2), "sp not 16-byte aligned");
6326     
6327     int insts_size = 512;
6328     int locs_size  = 64;
6329     CodeBuffer code(name, insts_size, locs_size);
6330     OopMapSet* oop_maps  = new OopMapSet();
6331     MacroAssembler* masm = new MacroAssembler(&code);
6332     MacroAssembler* _masm = masm;
6333 
6334     address start = __ pc();
6335 
6336     __ enter();
6337 
6338     __ mov(c_rarg1, sp);
6339 
6340     int frame_complete = __ pc() - start;
6341     address the_pc = __ pc();
6342 
6343     __ post_call_nop(); // must come immediately after the pc value pushed into the frame info; we use this nop for fast CodeBlob lookup
6344 
6345     __ mov(c_rarg0, rthread);
6346     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6347 
6348     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::freeze), 2);
6349       
6350     __ reset_last_Java_frame(true);
6351 
6352     Label pinned;
6353 
6354     __ cbnz(r0, pinned);
6355 
6356     __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6357     __ mov(sp, rscratch1);
6358     continuation_enter_cleanup(masm);
6359 
6360     __ bind(pinned); // pinned -- return to caller
6361     
6362     __ leave();
6363     __ ret(lr);
6364 
6365     OopMap* map = new OopMap(framesize, 1);
6366     // map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
6367     oop_maps->add_gc_map(the_pc - start, map);
6368 
6369     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6370     RuntimeStub::new_runtime_stub(name,
6371                                   &code,
6372                                   frame_complete,
6373                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6374                                   oop_maps, false);
6375     return stub;
6376   }
6377 
6378   address generate_cont_jump_from_safepoint() {
6379     __ align(CodeEntryAlignment);
6380     StubCodeMark mark(this, "StubRoutines","Continuation jump from safepoint");
6381 
6382     address start = __ pc();
6383 
6384 #ifdef ASSERT
6385     { // verify that threads correspond
6386       Label L;
6387       __ get_thread(rscratch1);
6388       __ cmp(rthread, rscratch1);
6389       __ br(Assembler::EQ, L);
6390       __ stop("StubRoutines::cont_jump_from_safepoint: threads must correspond");
6391       __ BIND(L);
6392     }
6393 #endif
6394 
6395     __ reset_last_Java_frame(true); // false would be fine, too, I guess
6396     __ reinit_heapbase();
6397     
6398     __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6399     __ mov(sp, rscratch1);
6400     continuation_enter_cleanup(_masm);
6401     __ leave();
6402     __ ret(lr);
6403 
6404     return start;
6405   }
6406 
6407   address generate_cont_thaw(bool return_barrier, bool exception) {
6408     assert (return_barrier || !exception, "must be");
6409 
6410     address start = __ pc();
6411 
6412     if (return_barrier) {
6413       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6414       __ mov(sp, rscratch1);
6415     }
6416     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6417 
6418     if (return_barrier) {
6419       // preserve possible return value from a method returning to the return barrier
6420       __ fmovd(rscratch1, v0);
6421       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6422     }
6423 
6424     __ movw(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
6425     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6426     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6427 
6428     if (return_barrier) {
6429       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6430       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6431       __ fmovd(v0, rscratch1);
6432     }
6433     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6434 
6435 
6436     Label thaw_success;
6437     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6438     __ cbnz(rscratch2, thaw_success);
6439     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6440     __ br(rscratch1);
6441     __ bind(thaw_success);
6442     
6443     // make room for the thawed frames
6444     __ sub(rscratch1, sp, rscratch2);
6445     __ andr(rscratch1, rscratch1, -16); // align
6446     __ mov(sp, rscratch1);
6447     
6448     if (return_barrier) {
6449       // save original return value -- again
6450       __ fmovd(rscratch1, v0);
6451       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6452     }
6453 
6454     __ movw(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
6455     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), rthread, c_rarg1);
6456     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6457 
6458     if (return_barrier) {
6459       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6460       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6461       __ fmovd(v0, rscratch1);
6462     } else {
6463       __ mov(r0, zr); // return 0 (success) from doYield
6464     }
6465 
6466     // we're now on the yield frame (which is at a higher address than the current sp, because sp has been moved down)
6467     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6468     __ mov(rfp, sp);
6469 
6470     if (exception) {
6471       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6472       __ verify_oop(r0);
6473       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6474 
6475       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6476 
6477       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6478       // __ reinitialize_ptrue();
6479 
6480       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6481 
6482       __ mov(r1, r0); // the exception handler
6483       __ mov(r0, r19); // restore return value containing the exception oop
6484       __ verify_oop(r0);
6485 
6486       __ leave();
6487       __ mov(r3, lr);
6488       __ br(r1); // the exception handler
6489     }
6490 
6491     // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6492     __ leave();
6493     __ ret(lr);
6494 
6495     return start;
6496   }
6497 
6498   address generate_cont_thaw() {
6499     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6500     address start = __ pc();
6501     generate_cont_thaw(false, false);
6502     return start;
6503   }
6504 
6505   address generate_cont_returnBarrier() {
6506     // TODO: will probably need multiple return barriers depending on return type
6507     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6508     address start = __ pc();
6509 
6510     generate_cont_thaw(true, false);
6511 
6512     return start;
6513   }
6514 
6515   address generate_cont_returnBarrier_exception() {
6516     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6517     address start = __ pc();
6518 
6519     generate_cont_thaw(true, true);
6520 
6521     return start;
6522   }
6523 
6524   address generate_cont_interpreter_forced_preempt_return() {
6525       StubCodeMark mark(this, "StubRoutines", "cont interpreter forced preempt return");
6526       address start = __ pc();
6527 
6528       // This is necessary for forced yields: the return address is captured in a call_VM, which skips the restoration of rbcp and the locals
6529 
6530       assert_asm(_masm, __ cmp(sp, rfp), Assembler::EQ, "sp != fp"); // __ mov(rfp, sp);
6531       __ leave(); // we're now on the last thawed frame
6532 
6533       __ ldr(rbcp,    Address(rfp, frame::interpreter_frame_bcp_offset    * wordSize)); // InterpreterMacroAssembler::restore_bcp()
6534       __ ldr(rlocals, Address(rfp, frame::interpreter_frame_locals_offset * wordSize)); // InterpreterMacroAssembler::restore_locals()
6535       __ ldr(rcpool,  Address(rfp, frame::interpreter_frame_cache_offset  * wordSize)); // InterpreterMacroAssembler::restore_constant_pool_cache()
6536       __ ldr(rmethod, Address(rfp, frame::interpreter_frame_method_offset * wordSize)); // InterpreterMacroAssembler::get_method(rmethod) -- might not be necessary
6537       // __ reinit_heapbase();
6538 
6539       // Restore the stack bottom in case i2c adjusted the stack, and clear that slot as a marker that esp is now tos until the next Java call
6540       __ ldr(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
6541       __ str(zr,  Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
6542 
6543       __ ret(lr);
6544 
6545       return start;
6546     }
6547 
6548 #if INCLUDE_JFR
6549 
6550   static void jfr_set_last_java_frame(MacroAssembler* _masm, Register thread) {
6551     Register last_java_pc = c_rarg0;
6552     Register last_java_sp = c_rarg2;
6553     __ ldr(last_java_pc, Address(sp, 0));
6554     __ lea(last_java_sp, Address(sp, wordSize));
6555     // __ vzeroupper();
6556     Address anchor_java_pc(thread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
6557     __ str(last_java_pc, anchor_java_pc);
6558     __ str(last_java_sp, Address(thread, JavaThread::last_Java_sp_offset()));
6559   }
6560 
6561   static void jfr_prologue(MacroAssembler* _masm, Register thread) {
6562     jfr_set_last_java_frame(_masm, thread);
6563     __ mov(c_rarg0, rthread);
6564   }
6565 
6566   // The handle is dereferenced here using the correct load constructs.
6567   static void jfr_epilogue(MacroAssembler* _masm, Register thread) {
6568     __ reset_last_Java_frame(false);
6569     Label null_jobject;
6570     __ cbz(r0, null_jobject);
6571     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6572     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6573     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rthread);
6574     __ bind(null_jobject);
6575   }
6576 
6577   // For c2: c_rarg0 is junk, c_rarg1 is the thread id. Calls into the runtime to write a checkpoint.
6578   // The runtime returns a jobject handle to the event writer. The handle is dereferenced and the return value
6579   // is the event writer oop.
6580   address generate_jfr_write_checkpoint() {
6581     StubCodeMark mark(this, "jfr_write_checkpoint", "JFR C2 support for Virtual Threads");
6582     address start = __ pc();
6583 
6584     __ enter();
6585     jfr_prologue(_masm, rthread);
6586     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_WRITE_CHECKPOINT_FUNCTION), 2);
6587     jfr_epilogue(_masm, rthread);
6588     __ leave();
6589     __ ret(lr);
6590 
6591     return start;
6592   }
6593 
6594   // For c1: call the corresponding runtime routine, it returns a jobject handle to the event writer.
6595   // The handle is dereferenced and the return value is the event writer oop.
6596   address generate_jfr_get_event_writer() {
6597     StubCodeMark mark(this, "jfr_get_event_writer", "JFR C1 support for Virtual Threads");
6598     address start = __ pc();
6599 
6600     __ enter();
6601     jfr_prologue(_masm, rthread);
6602     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_GET_EVENT_WRITER_FUNCTION), 1);
6603     jfr_epilogue(_masm, rthread);
6604     __ leave();
6605     __ ret(lr);
6606 
6607     return start;
6608   }
6609 
6610 #endif // INCLUDE_JFR
6611 
6612   // Continuation point for throwing of implicit exceptions that are
6613   // not handled in the current activation. Fabricates an exception
6614   // oop and initiates normal exception dispatching in this
6615   // frame. Since we need to preserve callee-saved values (currently
6616   // only for C2, but done for C1 as well) we need a callee-saved oop
6617   // map and therefore have to make these stubs into RuntimeStubs
6618   // rather than BufferBlobs.  If the compiler needs all registers to
6619   // be preserved between the fault point and the exception handler
6620   // then it must assume responsibility for that in
6621   // AbstractCompiler::continuation_for_implicit_null_exception or
6622   // continuation_for_implicit_division_by_zero_exception. All other
6623   // implicit exceptions (e.g., NullPointerException or
6624   // AbstractMethodError on entry) are either at call sites or
6625   // otherwise assume that stack unwinding will be initiated, so
6626   // caller saved registers were assumed volatile in the compiler.
6627 
6628 #undef __
6629 #define __ masm->
6630 
6631   address generate_throw_exception(const char* name,
6632                                    address runtime_entry,
6633                                    Register arg1 = noreg,
6634                                    Register arg2 = noreg) {
6635     // Information about frame layout at time of blocking runtime call.
6636     // Note that we only have to preserve callee-saved registers since
6637     // the compilers are responsible for supplying a continuation point
6638     // if they expect all registers to be preserved.
6639     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6640     enum layout {
6641       rfp_off = 0,
6642       rfp_off2,
6643       return_off,
6644       return_off2,
6645       framesize // inclusive of return address
6646     };
6647 
6648     int insts_size = 512;
6649     int locs_size  = 64;
6650 
6651     CodeBuffer code(name, insts_size, locs_size);
6652     OopMapSet* oop_maps  = new OopMapSet();
6653     MacroAssembler* masm = new MacroAssembler(&code);
6654 
6655     address start = __ pc();
6656 
6657     // This is an inlined and slightly modified version of call_VM
6658     // which has the ability to fetch the return PC out of
6659     // thread-local storage and also sets up last_Java_sp slightly
6660     // differently than the real call_VM
6661 
6662     __ enter(); // Save FP and LR before call
6663 
6664     assert(is_even(framesize/2), "sp not 16-byte aligned");
6665 
6666     // lr and fp are already in place
6667     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6668 
6669     int frame_complete = __ pc() - start;
6670 
6671     // Set up last_Java_sp and last_Java_fp
6672     address the_pc = __ pc();
6673     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6674 
6675     // Call runtime
6676     if (arg1 != noreg) {
6677       assert(arg2 != c_rarg1, "clobbered");
6678       __ mov(c_rarg1, arg1);
6679     }
6680     if (arg2 != noreg) {
6681       __ mov(c_rarg2, arg2);
6682     }
6683     __ mov(c_rarg0, rthread);
6684     BLOCK_COMMENT("call runtime_entry");
6685     __ mov(rscratch1, runtime_entry);
6686     __ blr(rscratch1);
6687 
6688     // Generate oop map
6689     OopMap* map = new OopMap(framesize, 0);
6690 
6691     oop_maps->add_gc_map(the_pc - start, map);
6692 
6693     __ reset_last_Java_frame(true);
6694 
6695     // Reinitialize the ptrue predicate register, in case the external runtime
6696     // call clobbers ptrue reg, as we may return to SVE compiled code.
6697     __ reinitialize_ptrue();
6698 
6699     __ leave();
6700 
6701     // check for pending exceptions
6702 #ifdef ASSERT
6703     Label L;
6704     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6705     __ cbnz(rscratch1, L);
6706     __ should_not_reach_here();
6707     __ bind(L);
6708 #endif // ASSERT
6709     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6710 
6711 
6712     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6713     RuntimeStub* stub =
6714       RuntimeStub::new_runtime_stub(name,
6715                                     &code,
6716                                     frame_complete,
6717                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6718                                     oop_maps, false);
6719     return stub->entry_point();
6720   }
6721 
6722   class MontgomeryMultiplyGenerator : public MacroAssembler {
6723 
6724     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6725       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6726 
6727     RegSet _toSave;
6728     bool _squaring;
6729 
6730   public:
6731     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6732       : MacroAssembler(as->code()), _squaring(squaring) {
6733 
6734       // Register allocation
6735 
6736       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6737       Pa_base = *regs;       // Argument registers
6738       if (squaring)
6739         Pb_base = Pa_base;
6740       else
6741         Pb_base = *++regs;
6742       Pn_base = *++regs;
6743       Rlen= *++regs;
6744       inv = *++regs;
6745       Pm_base = *++regs;
6746 
6747                           // Working registers:
6748       Ra =  *++regs;        // The current digit of a, b, n, and m.
6749       Rb =  *++regs;
6750       Rm =  *++regs;
6751       Rn =  *++regs;
6752 
6753       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6754       Pb =  *++regs;
6755       Pm =  *++regs;
6756       Pn =  *++regs;
6757 
6758       t0 =  *++regs;        // Three registers which form a
6759       t1 =  *++regs;        // triple-precision accumulator.
6760       t2 =  *++regs;
6761 
6762       Ri =  *++regs;        // Inner and outer loop indexes.
6763       Rj =  *++regs;
6764 
6765       Rhi_ab = *++regs;     // Product registers: low and high parts
6766       Rlo_ab = *++regs;     // of a*b and m*n.
6767       Rhi_mn = *++regs;
6768       Rlo_mn = *++regs;
6769 
6770       // r19 and up are callee-saved.
6771       _toSave = RegSet::range(r19, *regs) + Pm_base;
6772     }
6773 
6774   private:
6775     void save_regs() {
6776       push(_toSave, sp);
6777     }
6778 
6779     void restore_regs() {
6780       pop(_toSave, sp);
6781     }
6782 
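    // Emit the block body twice per loop iteration.  If the count is odd we
    // first branch into the middle of the unrolled body, so exactly 'count'
    // copies of the block are executed.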
6783     template <typename T>
6784     void unroll_2(Register count, T block) {
6785       Label loop, end, odd;
6786       tbnz(count, 0, odd);
6787       cbz(count, end);
6788       align(16);
6789       bind(loop);
6790       (this->*block)();
6791       bind(odd);
6792       (this->*block)();
6793       subs(count, count, 2);
6794       br(Assembler::GT, loop);
6795       bind(end);
6796     }
6797 
6798     template <typename T>
6799     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6800       Label loop, end, odd;
6801       tbnz(count, 0, odd);
6802       cbz(count, end);
6803       align(16);
6804       bind(loop);
6805       (this->*block)(d, s, tmp);
6806       bind(odd);
6807       (this->*block)(d, s, tmp);
6808       subs(count, count, 2);
6809       br(Assembler::GT, loop);
6810       bind(end);
6811     }
6812 
6813     void pre1(RegisterOrConstant i) {
6814       block_comment("pre1");
6815       // Pa = Pa_base;
6816       // Pb = Pb_base + i;
6817       // Pm = Pm_base;
6818       // Pn = Pn_base + i;
6819       // Ra = *Pa;
6820       // Rb = *Pb;
6821       // Rm = *Pm;
6822       // Rn = *Pn;
6823       ldr(Ra, Address(Pa_base));
6824       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6825       ldr(Rm, Address(Pm_base));
6826       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6827       lea(Pa, Address(Pa_base));
6828       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6829       lea(Pm, Address(Pm_base));
6830       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6831 
6832       // Zero the m*n result.
6833       mov(Rhi_mn, zr);
6834       mov(Rlo_mn, zr);
6835     }
6836 
6837     // The core multiply-accumulate step of a Montgomery
6838     // multiplication.  The idea is to schedule operations as a
6839     // pipeline so that instructions with long latencies (loads and
6840     // multiplies) have time to complete before their results are
6841     // used.  This most benefits in-order implementations of the
6842     // architecture but out-of-order ones also benefit.
6843     void step() {
6844       block_comment("step");
6845       // MACC(Ra, Rb, t0, t1, t2);
6846       // Ra = *++Pa;
6847       // Rb = *--Pb;
6848       umulh(Rhi_ab, Ra, Rb);
6849       mul(Rlo_ab, Ra, Rb);
6850       ldr(Ra, pre(Pa, wordSize));
6851       ldr(Rb, pre(Pb, -wordSize));
6852       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6853                                        // previous iteration.
6854       // MACC(Rm, Rn, t0, t1, t2);
6855       // Rm = *++Pm;
6856       // Rn = *--Pn;
6857       umulh(Rhi_mn, Rm, Rn);
6858       mul(Rlo_mn, Rm, Rn);
6859       ldr(Rm, pre(Pm, wordSize));
6860       ldr(Rn, pre(Pn, -wordSize));
6861       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6862     }
6863 
6864     void post1() {
6865       block_comment("post1");
6866 
6867       // MACC(Ra, Rb, t0, t1, t2);
6868       // Ra = *++Pa;
6869       // Rb = *--Pb;
6870       umulh(Rhi_ab, Ra, Rb);
6871       mul(Rlo_ab, Ra, Rb);
6872       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6873       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6874 
6875       // *Pm = Rm = t0 * inv;
6876       mul(Rm, t0, inv);
6877       str(Rm, Address(Pm));
6878 
6879       // MACC(Rm, Rn, t0, t1, t2);
6880       // t0 = t1; t1 = t2; t2 = 0;
6881       umulh(Rhi_mn, Rm, Rn);
6882 
6883 #ifndef PRODUCT
6884       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6885       {
6886         mul(Rlo_mn, Rm, Rn);
6887         add(Rlo_mn, t0, Rlo_mn);
6888         Label ok;
6889         cbz(Rlo_mn, ok); {
6890           stop("broken Montgomery multiply");
6891         } bind(ok);
6892       }
6893 #endif
6894       // We have very carefully set things up so that
6895       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6896       // the lower half of Rm * Rn because we know the result already:
6897       // it must be -t0.  t0 + (-t0) must generate a carry iff
6898       // t0 != 0.  So, rather than do a mul and an adds we just set
6899       // the carry flag iff t0 is nonzero.
6900       //
6901       // mul(Rlo_mn, Rm, Rn);
6902       // adds(zr, t0, Rlo_mn);
6903       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6904       adcs(t0, t1, Rhi_mn);
6905       adc(t1, t2, zr);
6906       mov(t2, zr);
6907     }
6908 
6909     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6910       block_comment("pre2");
6911       // Pa = Pa_base + i-len;
6912       // Pb = Pb_base + len;
6913       // Pm = Pm_base + i-len;
6914       // Pn = Pn_base + len;
6915 
6916       if (i.is_register()) {
6917         sub(Rj, i.as_register(), len);
6918       } else {
6919         mov(Rj, i.as_constant());
6920         sub(Rj, Rj, len);
6921       }
6922       // Rj == i-len
6923 
6924       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6925       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6926       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6927       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6928 
6929       // Ra = *++Pa;
6930       // Rb = *--Pb;
6931       // Rm = *++Pm;
6932       // Rn = *--Pn;
6933       ldr(Ra, pre(Pa, wordSize));
6934       ldr(Rb, pre(Pb, -wordSize));
6935       ldr(Rm, pre(Pm, wordSize));
6936       ldr(Rn, pre(Pn, -wordSize));
6937 
6938       mov(Rhi_mn, zr);
6939       mov(Rlo_mn, zr);
6940     }
6941 
6942     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6943       block_comment("post2");
6944       if (i.is_constant()) {
6945         mov(Rj, i.as_constant()-len.as_constant());
6946       } else {
6947         sub(Rj, i.as_register(), len);
6948       }
6949 
6950       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6951 
6952       // As soon as we know the least significant digit of our result,
6953       // store it.
6954       // Pm_base[i-len] = t0;
6955       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6956 
6957       // t0 = t1; t1 = t2; t2 = 0;
6958       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6959       adc(t1, t2, zr);
6960       mov(t2, zr);
6961     }
6962 
6963     // A carry in t0 after Montgomery multiplication means that we
6964     // should subtract multiples of n from our result in m.  We'll
6965     // keep doing that until there is no carry.
6966     void normalize(RegisterOrConstant len) {
6967       block_comment("normalize");
6968       // while (t0)
6969       //   t0 = sub(Pm_base, Pn_base, t0, len);
6970       Label loop, post, again;
6971       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6972       cbz(t0, post); {
6973         bind(again); {
6974           mov(i, zr);
6975           mov(cnt, len);
6976           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6977           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6978           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6979           align(16);
6980           bind(loop); {
6981             sbcs(Rm, Rm, Rn);
6982             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6983             add(i, i, 1);
6984             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6985             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6986             sub(cnt, cnt, 1);
6987           } cbnz(cnt, loop);
6988           sbc(t0, t0, zr);
6989         } cbnz(t0, again);
6990       } bind(post);
6991     }
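         // In C, the subtraction step performed by the loop above is
         // approximately the following (an illustrative sketch; "sub" is the
         // helper referred to in the C reference code further down, not a
         // function defined in this file):
         //
         // static julong sub(julong Pm[], julong Pn[], julong t0, int len) {
         //   julong borrow = 0;
         //   for (int i = 0; i < len; i++) {
         //     julong a = Pm[i], b = Pn[i];
         //     julong ab = a - b;
         //     Pm[i] = ab - borrow;
         //     borrow = (a < b) | (ab < borrow); // at most one of these is set
         //   }
         //   return t0 - borrow;
         // }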
6992 
6993     // Move memory at s to d, reversing words.
6994     //    Increments d to end of copied memory
6995     //    Destroys tmp1, tmp2
6996     //    Preserves len
6997     //    Leaves s pointing to the address which was in d at start
6998     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6999       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
7000 
7001       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7002       mov(tmp1, len);
7003       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7004       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7005     }
7006     // where
7007     void reverse1(Register d, Register s, Register tmp) {
7008       ldr(tmp, pre(s, -wordSize));
7009       ror(tmp, tmp, 32);
7010       str(tmp, post(d, wordSize));
7011     }
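         // In C, approximately (a sketch: len counts 64-bit words, and the
         // rotate by 32 swaps the two 32-bit halves of each word, converting
         // between the caller's int-array digit order and the 64-bit word
         // order used internally):
         //
         // void reverse(julong *d, const julong *s, int len) {
         //   for (int i = 0; i < len; i++) {
         //     julong w = s[len - 1 - i];
         //     d[i] = (w << 32) | (w >> 32);
         //   }
         // }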
7012 
7013     void step_squaring() {
7014       // An extra ACC
7015       step();
7016       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7017     }
7018 
7019     void last_squaring(RegisterOrConstant i) {
7020       Label dont;
7021       // if ((i & 1) == 0) {
7022       tbnz(i.as_register(), 0, dont); {
7023         // MACC(Ra, Rb, t0, t1, t2);
7024         // Ra = *++Pa;
7025         // Rb = *--Pb;
7026         umulh(Rhi_ab, Ra, Rb);
7027         mul(Rlo_ab, Ra, Rb);
7028         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7029       } bind(dont);
7030     }
7031 
7032     void extra_step_squaring() {
7033       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7034 
7035       // MACC(Rm, Rn, t0, t1, t2);
7036       // Rm = *++Pm;
7037       // Rn = *--Pn;
7038       umulh(Rhi_mn, Rm, Rn);
7039       mul(Rlo_mn, Rm, Rn);
7040       ldr(Rm, pre(Pm, wordSize));
7041       ldr(Rn, pre(Pn, -wordSize));
7042     }
7043 
7044     void post1_squaring() {
7045       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7046 
7047       // *Pm = Rm = t0 * inv;
7048       mul(Rm, t0, inv);
7049       str(Rm, Address(Pm));
7050 
7051       // MACC(Rm, Rn, t0, t1, t2);
7052       // t0 = t1; t1 = t2; t2 = 0;
7053       umulh(Rhi_mn, Rm, Rn);
7054 
7055 #ifndef PRODUCT
7056       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7057       {
7058         mul(Rlo_mn, Rm, Rn);
7059         add(Rlo_mn, t0, Rlo_mn);
7060         Label ok;
7061         cbz(Rlo_mn, ok); {
7062           stop("broken Montgomery multiply");
7063         } bind(ok);
7064       }
7065 #endif
7066       // We have very carefully set things up so that
7067       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7068       // the lower half of Rm * Rn because we know the result already:
7069       // it must be -t0.  t0 + (-t0) must generate a carry iff
7070       // t0 != 0.  So, rather than do a mul and an adds we just set
7071       // the carry flag iff t0 is nonzero.
7072       //
7073       // mul(Rlo_mn, Rm, Rn);
7074       // adds(zr, t0, Rlo_mn);
7075       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7076       adcs(t0, t1, Rhi_mn);
7077       adc(t1, t2, zr);
7078       mov(t2, zr);
7079     }
7080 
7081     void acc(Register Rhi, Register Rlo,
7082              Register t0, Register t1, Register t2) {
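           // (t2:t1:t0) += (Rhi:Rlo): add a 128-bit value into the 192-bit
           // triple-precision accumulator, propagating the carries.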
7083       adds(t0, t0, Rlo);
7084       adcs(t1, t1, Rhi);
7085       adc(t2, t2, zr);
7086     }
7087 
7088   public:
7089     /**
7090      * Fast Montgomery multiplication.  The derivation of the
7091      * algorithm is in A Cryptographic Library for the Motorola
7092      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7093      *
7094      * Arguments:
7095      *
7096      * Inputs for multiplication:
7097      *   c_rarg0   - int array elements a
7098      *   c_rarg1   - int array elements b
7099      *   c_rarg2   - int array elements n (the modulus)
7100      *   c_rarg3   - int length
7101      *   c_rarg4   - int inv
7102      *   c_rarg5   - int array elements m (the result)
7103      *
7104      * Inputs for squaring:
7105      *   c_rarg0   - int array elements a
7106      *   c_rarg1   - int array elements n (the modulus)
7107      *   c_rarg2   - int length
7108      *   c_rarg3   - int inv
7109      *   c_rarg4   - int array elements m (the result)
7110      *
7111      */
7112     address generate_multiply() {
7113       Label argh, nothing;
7114       bind(argh);
7115       stop("MontgomeryMultiply total_allocation must be <= 8192");
7116 
7117       align(CodeEntryAlignment);
7118       address entry = pc();
7119 
7120       cbzw(Rlen, nothing);
7121 
7122       enter();
7123 
7124       // Make room.
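           // We need scratch space for reversed copies of the inputs and for
           // the result: at most four arrays (a, b, n and m) of Rlen ints
           // each, i.e. Rlen * 4 * sizeof(jint) bytes, hence the 512-int
           // (8192-byte) cap checked below.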
7125       cmpw(Rlen, 512);
7126       br(Assembler::HI, argh);
7127       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7128       andr(sp, Ra, -2 * wordSize);
7129 
7130       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7131 
7132       {
7133         // Copy input args, reversing as we go.  We use Ra as a
7134         // temporary variable.
7135         reverse(Ra, Pa_base, Rlen, t0, t1);
7136         if (!_squaring)
7137           reverse(Ra, Pb_base, Rlen, t0, t1);
7138         reverse(Ra, Pn_base, Rlen, t0, t1);
7139       }
7140 
7141       // Push all call-saved registers and also Pm_base which we'll need
7142       // at the end.
7143       save_regs();
7144 
7145 #ifndef PRODUCT
7146       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7147       {
7148         ldr(Rn, Address(Pn_base, 0));
7149         mul(Rlo_mn, Rn, inv);
7150         subs(zr, Rlo_mn, -1);
7151         Label ok;
7152         br(EQ, ok); {
7153           stop("broken inverse in Montgomery multiply");
7154         } bind(ok);
7155       }
7156 #endif
7157 
7158       mov(Pm_base, Ra);
7159 
7160       mov(t0, zr);
7161       mov(t1, zr);
7162       mov(t2, zr);
7163 
7164       block_comment("for (int i = 0; i < len; i++) {");
7165       mov(Ri, zr); {
7166         Label loop, end;
7167         cmpw(Ri, Rlen);
7168         br(Assembler::GE, end);
7169 
7170         bind(loop);
7171         pre1(Ri);
7172 
7173         block_comment("  for (j = i; j; j--) {"); {
7174           movw(Rj, Ri);
7175           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7176         } block_comment("  } // j");
7177 
7178         post1();
7179         addw(Ri, Ri, 1);
7180         cmpw(Ri, Rlen);
7181         br(Assembler::LT, loop);
7182         bind(end);
7183         block_comment("} // i");
7184       }
7185 
7186       block_comment("for (int i = len; i < 2*len; i++) {");
7187       mov(Ri, Rlen); {
7188         Label loop, end;
7189         cmpw(Ri, Rlen, Assembler::LSL, 1);
7190         br(Assembler::GE, end);
7191 
7192         bind(loop);
7193         pre2(Ri, Rlen);
7194 
7195         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7196           lslw(Rj, Rlen, 1);
7197           subw(Rj, Rj, Ri);
7198           subw(Rj, Rj, 1);
7199           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7200         } block_comment("  } // j");
7201 
7202         post2(Ri, Rlen);
7203         addw(Ri, Ri, 1);
7204         cmpw(Ri, Rlen, Assembler::LSL, 1);
7205         br(Assembler::LT, loop);
7206         bind(end);
7207       }
7208       block_comment("} // i");
7209 
7210       normalize(Rlen);
7211 
7212       mov(Ra, Pm_base);  // Save Pm_base in Ra
7213       restore_regs();  // Restore caller's Pm_base
7214 
7215       // Copy our result into caller's Pm_base
7216       reverse(Pm_base, Ra, Rlen, t0, t1);
7217 
7218       leave();
7219       bind(nothing);
7220       ret(lr);
7221 
7222       return entry;
7223     }
7224     // In C, approximately:
7225 
7226     // void
7227     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7228     //                     julong Pn_base[], julong Pm_base[],
7229     //                     julong inv, int len) {
7230     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7231     //   julong *Pa, *Pb, *Pn, *Pm;
7232     //   julong Ra, Rb, Rn, Rm;
7233 
7234     //   int i;
7235 
7236     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7237 
7238     //   for (i = 0; i < len; i++) {
7239     //     int j;
7240 
7241     //     Pa = Pa_base;
7242     //     Pb = Pb_base + i;
7243     //     Pm = Pm_base;
7244     //     Pn = Pn_base + i;
7245 
7246     //     Ra = *Pa;
7247     //     Rb = *Pb;
7248     //     Rm = *Pm;
7249     //     Rn = *Pn;
7250 
7251     //     int iters = i;
7252     //     for (j = 0; iters--; j++) {
7253     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7254     //       MACC(Ra, Rb, t0, t1, t2);
7255     //       Ra = *++Pa;
7256     //       Rb = *--Pb;
7257     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7258     //       MACC(Rm, Rn, t0, t1, t2);
7259     //       Rm = *++Pm;
7260     //       Rn = *--Pn;
7261     //     }
7262 
7263     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7264     //     MACC(Ra, Rb, t0, t1, t2);
7265     //     *Pm = Rm = t0 * inv;
7266     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7267     //     MACC(Rm, Rn, t0, t1, t2);
7268 
7269     //     assert(t0 == 0, "broken Montgomery multiply");
7270 
7271     //     t0 = t1; t1 = t2; t2 = 0;
7272     //   }
7273 
7274     //   for (i = len; i < 2*len; i++) {
7275     //     int j;
7276 
7277     //     Pa = Pa_base + i-len;
7278     //     Pb = Pb_base + len;
7279     //     Pm = Pm_base + i-len;
7280     //     Pn = Pn_base + len;
7281 
7282     //     Ra = *++Pa;
7283     //     Rb = *--Pb;
7284     //     Rm = *++Pm;
7285     //     Rn = *--Pn;
7286 
7287     //     int iters = len*2-i-1;
7288     //     for (j = i-len+1; iters--; j++) {
7289     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7290     //       MACC(Ra, Rb, t0, t1, t2);
7291     //       Ra = *++Pa;
7292     //       Rb = *--Pb;
7293     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7294     //       MACC(Rm, Rn, t0, t1, t2);
7295     //       Rm = *++Pm;
7296     //       Rn = *--Pn;
7297     //     }
7298 
7299     //     Pm_base[i-len] = t0;
7300     //     t0 = t1; t1 = t2; t2 = 0;
7301     //   }
7302 
7303     //   while (t0)
7304     //     t0 = sub(Pm_base, Pn_base, t0, len);
7305     // }
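         // For reference, MACC(A, B, t0, t1, t2) in the pseudo-code above
         // accumulates the 128-bit product A*B into the triple-precision
         // accumulator (t0, t1, t2); the generated code implements it with
         // umulh/mul plus the acc() carry chain.  A C sketch, assuming the
         // compiler provides unsigned __int128:
         //
         // static void MACC(julong A, julong B, julong &t0, julong &t1, julong &t2) {
         //   unsigned __int128 p = (unsigned __int128)A * B + t0;
         //   t0 = (julong)p;
         //   p = (p >> 64) + t1;
         //   t1 = (julong)p;
         //   t2 += (julong)(p >> 64);
         // }
         //
         // MACC2, used by the squaring code below, accumulates the product
         // twice (it handles the doubled cross terms a[i]*a[j] + a[j]*a[i]).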
7306 
7307     /**
7308      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7309      * multiplies than Montgomery multiplication so it should be up to
7310      * 25% faster.  However, its loop control is more complex and it
7311      * may actually run slower on some machines.
7312      *
7313      * Arguments:
7314      *
7315      * Inputs:
7316      *   c_rarg0   - int array elements a
7317      *   c_rarg1   - int array elements n (the modulus)
7318      *   c_rarg2   - int length
7319      *   c_rarg3   - int inv
7320      *   c_rarg4   - int array elements m (the result)
7321      *
7322      */
7323     address generate_square() {
7324       Label argh;
7325       bind(argh);
7326       stop("MontgomeryMultiply total_allocation must be <= 8192");
7327 
7328       align(CodeEntryAlignment);
7329       address entry = pc();
7330 
7331       enter();
7332 
7333       // Make room.
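           // (Same stack budget as in generate_multiply: at most
           // Rlen * 16 bytes, capped at 8192 bytes.)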
7334       cmpw(Rlen, 512);
7335       br(Assembler::HI, argh);
7336       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7337       andr(sp, Ra, -2 * wordSize);
7338 
7339       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7340 
7341       {
7342         // Copy input args, reversing as we go.  We use Ra as a
7343         // temporary variable.
7344         reverse(Ra, Pa_base, Rlen, t0, t1);
7345         reverse(Ra, Pn_base, Rlen, t0, t1);
7346       }
7347 
7348       // Push all call-saved registers and also Pm_base which we'll need
7349       // at the end.
7350       save_regs();
7351 
7352       mov(Pm_base, Ra);
7353 
7354       mov(t0, zr);
7355       mov(t1, zr);
7356       mov(t2, zr);
7357 
7358       block_comment("for (int i = 0; i < len; i++) {");
7359       mov(Ri, zr); {
7360         Label loop, end;
7361         bind(loop);
7362         cmp(Ri, Rlen);
7363         br(Assembler::GE, end);
7364 
7365         pre1(Ri);
7366 
7367         block_comment("for (j = (i+1)/2; j; j--) {"); {
7368           add(Rj, Ri, 1);
7369           lsr(Rj, Rj, 1);
7370           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7371         } block_comment("  } // j");
7372 
7373         last_squaring(Ri);
7374 
7375         block_comment("  for (j = i/2; j; j--) {"); {
7376           lsr(Rj, Ri, 1);
7377           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7378         } block_comment("  } // j");
7379 
7380         post1_squaring();
7381         add(Ri, Ri, 1);
7382         cmp(Ri, Rlen);
7383         br(Assembler::LT, loop);
7384 
7385         bind(end);
7386         block_comment("} // i");
7387       }
7388 
7389       block_comment("for (int i = len; i < 2*len; i++) {");
7390       mov(Ri, Rlen); {
7391         Label loop, end;
7392         bind(loop);
7393         cmp(Ri, Rlen, Assembler::LSL, 1);
7394         br(Assembler::GE, end);
7395 
7396         pre2(Ri, Rlen);
7397 
7398         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7399           lsl(Rj, Rlen, 1);
7400           sub(Rj, Rj, Ri);
7401           sub(Rj, Rj, 1);
7402           lsr(Rj, Rj, 1);
7403           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7404         } block_comment("  } // j");
7405 
7406         last_squaring(Ri);
7407 
7408         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7409           lsl(Rj, Rlen, 1);
7410           sub(Rj, Rj, Ri);
7411           lsr(Rj, Rj, 1);
7412           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7413         } block_comment("  } // j");
7414 
7415         post2(Ri, Rlen);
7416         add(Ri, Ri, 1);
7417         cmp(Ri, Rlen, Assembler::LSL, 1);
7418 
7419         br(Assembler::LT, loop);
7420         bind(end);
7421         block_comment("} // i");
7422       }
7423 
7424       normalize(Rlen);
7425 
7426       mov(Ra, Pm_base);  // Save Pm_base in Ra
7427       restore_regs();  // Restore caller's Pm_base
7428 
7429       // Copy our result into caller's Pm_base
7430       reverse(Pm_base, Ra, Rlen, t0, t1);
7431 
7432       leave();
7433       ret(lr);
7434 
7435       return entry;
7436     }
7437     // In C, approximately:
7438 
7439     // void
7440     // montgomery_square(julong Pa_base[], julong Pn_base[],
7441     //                   julong Pm_base[], julong inv, int len) {
7442     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7443     //   julong *Pa, *Pb, *Pn, *Pm;
7444     //   julong Ra, Rb, Rn, Rm;
7445 
7446     //   int i;
7447 
7448     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7449 
7450     //   for (i = 0; i < len; i++) {
7451     //     int j;
7452 
7453     //     Pa = Pa_base;
7454     //     Pb = Pa_base + i;
7455     //     Pm = Pm_base;
7456     //     Pn = Pn_base + i;
7457 
7458     //     Ra = *Pa;
7459     //     Rb = *Pb;
7460     //     Rm = *Pm;
7461     //     Rn = *Pn;
7462 
7463     //     int iters = (i+1)/2;
7464     //     for (j = 0; iters--; j++) {
7465     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7466     //       MACC2(Ra, Rb, t0, t1, t2);
7467     //       Ra = *++Pa;
7468     //       Rb = *--Pb;
7469     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7470     //       MACC(Rm, Rn, t0, t1, t2);
7471     //       Rm = *++Pm;
7472     //       Rn = *--Pn;
7473     //     }
7474     //     if ((i & 1) == 0) {
7475     //       assert(Ra == Pa_base[j], "must be");
7476     //       MACC(Ra, Ra, t0, t1, t2);
7477     //     }
7478     //     iters = i/2;
7479     //     assert(iters == i-j, "must be");
7480     //     for (; iters--; j++) {
7481     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7482     //       MACC(Rm, Rn, t0, t1, t2);
7483     //       Rm = *++Pm;
7484     //       Rn = *--Pn;
7485     //     }
7486 
7487     //     *Pm = Rm = t0 * inv;
7488     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7489     //     MACC(Rm, Rn, t0, t1, t2);
7490 
7491     //     assert(t0 == 0, "broken Montgomery multiply");
7492 
7493     //     t0 = t1; t1 = t2; t2 = 0;
7494     //   }
7495 
7496     //   for (i = len; i < 2*len; i++) {
7497     //     int start = i-len+1;
7498     //     int end = start + (len - start)/2;
7499     //     int j;
7500 
7501     //     Pa = Pa_base + i-len;
7502     //     Pb = Pa_base + len;
7503     //     Pm = Pm_base + i-len;
7504     //     Pn = Pn_base + len;
7505 
7506     //     Ra = *++Pa;
7507     //     Rb = *--Pb;
7508     //     Rm = *++Pm;
7509     //     Rn = *--Pn;
7510 
7511     //     int iters = (2*len-i-1)/2;
7512     //     assert(iters == end-start, "must be");
7513     //     for (j = start; iters--; j++) {
7514     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7515     //       MACC2(Ra, Rb, t0, t1, t2);
7516     //       Ra = *++Pa;
7517     //       Rb = *--Pb;
7518     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7519     //       MACC(Rm, Rn, t0, t1, t2);
7520     //       Rm = *++Pm;
7521     //       Rn = *--Pn;
7522     //     }
7523     //     if ((i & 1) == 0) {
7524     //       assert(Ra == Pa_base[j], "must be");
7525     //       MACC(Ra, Ra, t0, t1, t2);
7526     //     }
7527     //     iters =  (2*len-i)/2;
7528     //     assert(iters == len-j, "must be");
7529     //     for (; iters--; j++) {
7530     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7531     //       MACC(Rm, Rn, t0, t1, t2);
7532     //       Rm = *++Pm;
7533     //       Rn = *--Pn;
7534     //     }
7535     //     Pm_base[i-len] = t0;
7536     //     t0 = t1; t1 = t2; t2 = 0;
7537     //   }
7538 
7539     //   while (t0)
7540     //     t0 = sub(Pm_base, Pn_base, t0, len);
7541     // }
7542   };
7543 
7544 
7545   // Initialization
7546   void generate_initial() {
7547     // Generate the initial stubs and initialize the entry points.
7548 
7549     // Entry points that exist on all platforms.  Note: this is code
7550     // that could be shared among different platforms; however, the
7551     // benefit seems smaller than the disadvantage of having a much
7552     // more complicated generator structure.  See also the comment in
7553     // stubRoutines.hpp.
7554 
7555     StubRoutines::_forward_exception_entry = generate_forward_exception();
7556 
7557     StubRoutines::_call_stub_entry =
7558       generate_call_stub(StubRoutines::_call_stub_return_address);
7559 
7560     // Referenced by megamorphic calls.
7561     StubRoutines::_catch_exception_entry = generate_catch_exception();
7562 
7563     // Build this early so it's available for the interpreter.
7564     StubRoutines::_throw_StackOverflowError_entry =
7565       generate_throw_exception("StackOverflowError throw_exception",
7566                                CAST_FROM_FN_PTR(address,
7567                                                 SharedRuntime::throw_StackOverflowError));
7568     StubRoutines::_throw_delayed_StackOverflowError_entry =
7569       generate_throw_exception("delayed StackOverflowError throw_exception",
7570                                CAST_FROM_FN_PTR(address,
7571                                                 SharedRuntime::throw_delayed_StackOverflowError));
7572     if (UseCRC32Intrinsics) {
7573       // Set the table address before generating the stubs that use it.
7574       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7575       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7576     }
7577 
7578     if (UseCRC32CIntrinsics) {
7579       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7580     }
7581 
7582     // Disabled until JDK-8210858 is fixed
7583     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7584     //   StubRoutines::_dlog = generate_dlog();
7585     // }
7586 
7587     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7588       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7589     }
7590 
7591     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7592       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7593     }
7594 
7595     // Safefetch stubs.
7596     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7597                                                        &StubRoutines::_safefetch32_fault_pc,
7598                                                        &StubRoutines::_safefetch32_continuation_pc);
7599     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7600                                                        &StubRoutines::_safefetchN_fault_pc,
7601                                                        &StubRoutines::_safefetchN_continuation_pc);
7602   }
7603 
7604   void generate_phase1() {
7605     // Continuation stubs:
7606     StubRoutines::_cont_thaw          = generate_cont_thaw();
7607     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7608     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7609     StubRoutines::_cont_doYield_stub = generate_cont_doYield();
7610     StubRoutines::_cont_doYield    = StubRoutines::_cont_doYield_stub->entry_point();
7611     StubRoutines::_cont_jump_from_sp = generate_cont_jump_from_safepoint();
7612     StubRoutines::_cont_interpreter_forced_preempt_return = generate_cont_interpreter_forced_preempt_return();
7613 
7614     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = generate_jfr_write_checkpoint();)
7615     JFR_ONLY(StubRoutines::_jfr_get_event_writer = generate_jfr_get_event_writer();)
7616   }
7617 
7618   void generate_all() {
7619     // support for verify_oop (must happen after universe_init)
7620     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7621     StubRoutines::_throw_AbstractMethodError_entry =
7622       generate_throw_exception("AbstractMethodError throw_exception",
7623                                CAST_FROM_FN_PTR(address,
7624                                                 SharedRuntime::
7625                                                 throw_AbstractMethodError));
7626 
7627     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7628       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7629                                CAST_FROM_FN_PTR(address,
7630                                                 SharedRuntime::
7631                                                 throw_IncompatibleClassChangeError));
7632 
7633     StubRoutines::_throw_NullPointerException_at_call_entry =
7634       generate_throw_exception("NullPointerException at call throw_exception",
7635                                CAST_FROM_FN_PTR(address,
7636                                                 SharedRuntime::
7637                                                 throw_NullPointerException_at_call));
7638 
7639     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7640 
7641     // arraycopy stubs used by compilers
7642     generate_arraycopy_stubs();
7643 
7644     // has_negatives stub for large arrays.
7645     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7646 
7647     // array equals stub for large arrays.
7648     if (!UseSimpleArrayEquals) {
7649       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7650     }
7651 
7652     generate_compare_long_strings();
7653 
7654     generate_string_indexof_stubs();
7655 
7656     // byte_array_inflate stub for large arrays.
7657     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7658 
7659     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7660     if (bs_nm != NULL) {
7661       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7662     }
7663 #ifdef COMPILER2
7664     if (UseMultiplyToLenIntrinsic) {
7665       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7666     }
7667 
7668     if (UseSquareToLenIntrinsic) {
7669       StubRoutines::_squareToLen = generate_squareToLen();
7670     }
7671 
7672     if (UseMulAddIntrinsic) {
7673       StubRoutines::_mulAdd = generate_mulAdd();
7674     }
7675 
7676     if (UseSIMDForBigIntegerShiftIntrinsics) {
7677       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7678       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7679     }
7680 
7681     if (UseMontgomeryMultiplyIntrinsic) {
7682       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7683       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7684       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7685     }
7686 
7687     if (UseMontgomerySquareIntrinsic) {
7688       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7689       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7690       // We use generate_multiply() rather than generate_square()
7691       // because it's faster for the sizes of modulus we care about.
7692       StubRoutines::_montgomerySquare = g.generate_multiply();
7693     }
7694 #endif // COMPILER2
7695 
7696     // generate GHASH intrinsics code
7697     if (UseGHASHIntrinsics) {
7698       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7699       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7700     }
7701 
7702     if (UseBASE64Intrinsics) {
7703         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7704         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7705     }
7706 
7707     // data cache line writeback
7708     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7709     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7710 
7711     if (UseAESIntrinsics) {
7712       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7713       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7714       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7715       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7716       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7717       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7718     }
7719 
7720     if (UseSHA1Intrinsics) {
7721       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7722       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7723     }
7724     if (UseSHA256Intrinsics) {
7725       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7726       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7727     }
7728     if (UseSHA512Intrinsics) {
7729       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7730       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7731     }
7732     if (UseSHA3Intrinsics) {
7733       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7734       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7735     }
7736 
7737     // generate Adler32 intrinsics code
7738     if (UseAdler32Intrinsics) {
7739       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7740     }
7741 
7742 #ifdef LINUX
7743 
7744     generate_atomic_entry_points();
7745 
7746 #endif // LINUX
7747 
7748     StubRoutines::aarch64::set_completed();
7749   }
7750 
7751  public:
7752   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
7753     if (phase == 0) {
7754       generate_initial();
7755     } else if (phase == 1) {
7756       generate_phase1(); // stubs that must be available for the interpreter
7757     } else {
7758       generate_all();
7759     }
7760   }
7761 }; // end class declaration
7762 
7763 #define UCM_TABLE_MAX_ENTRIES 8
7764 void StubGenerator_generate(CodeBuffer* code, int phase) {
7765   if (UnsafeCopyMemory::_table == NULL) {
7766     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7767   }
7768   StubGenerator g(code, phase);
7769 }
7770 
7771 
7772 #ifdef LINUX
7773 
7774 // Define pointers to atomic stubs and initialize them to point to the
7775 // code in atomic_aarch64.S.
7776 
7777 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7778   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7779     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7780   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7781     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
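     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, roughly, to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;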
7782 
7783 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7784 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7785 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7786 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7787 DEFAULT_ATOMIC_OP(xchg, 4, )
7788 DEFAULT_ATOMIC_OP(xchg, 8, )
7789 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7790 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7791 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7792 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7793 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7794 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7795 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7796 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7797 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7798 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7799 
7800 #undef DEFAULT_ATOMIC_OP
7801 
7802 #endif // LINUX
7803 
7804 
7805 #undef __
7806 #define __ masm->
7807 
7808 // on exit, sp points to the ContinuationEntry
7809 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
7810   assert (ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
7811   assert (in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
7812   assert (in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
7813 
7814   stack_slots += (int)ContinuationEntry::size()/wordSize;
7815   __ sub(sp, sp, (int)ContinuationEntry::size()); // place Continuation metadata
7816 
7817   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize) / VMRegImpl::stack_slot_size, 0 /* arg_slots */);
7818   ContinuationEntry::setup_oopmap(map);
7819 
7820   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7821   __ str(rscratch1, Address(sp, ContinuationEntry::parent_offset()));
7822   __ mov(rscratch1, sp); // we can't use sp as the source in str
7823   __ str(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7824 
7825   return map;
7826 }
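     // The function above, in pseudo-C (field names are illustrative):
     //
     //   sp -= ContinuationEntry::size();          // entry == new sp
     //   entry->parent      = thread->cont_entry;
     //   thread->cont_entry = entry;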
7827 
7828 // on entry, c_rarg1 points to the continuation
7829 //           sp points to the ContinuationEntry
7830 void fill_continuation_entry(MacroAssembler* masm) {
7831 #ifdef ASSERT
7832   __ movw(rscratch1, 0x1234);
7833   __ strw(rscratch1, Address(sp, ContinuationEntry::cookie_offset()));
7834 #endif
7835 
7836   __ str(c_rarg1, Address(sp, ContinuationEntry::cont_offset()));
7837   __ str(zr, Address(sp, ContinuationEntry::chunk_offset()));
7838   __ strw(zr, Address(sp, ContinuationEntry::argsize_offset()));
7839 
7840   __ ldr(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
7841   __ str(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
7842   __ ldr(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
7843   __ str(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
7844   
7845   __ str(zr, Address(rthread, JavaThread::cont_fastpath_offset()));
7846   __ reset_held_monitor_count(rthread);
7847 }
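     // The function above, in pseudo-C (field names are illustrative):
     //
     //   entry->cont    = continuation;            // c_rarg1
     //   entry->chunk   = NULL;
     //   entry->argsize = 0;
     //   entry->parent_cont_fastpath      = thread->cont_fastpath;
     //   entry->parent_held_monitor_count = thread->held_monitor_count;
     //   thread->cont_fastpath = NULL;
     //   thread->held_monitor_count = 0;           // reset_held_monitor_count()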
7848 
7849 // on entry, sp points to the ContinuationEntry
7850 // on exit, rfp points to the spilled rfp in the entry frame
7851 void continuation_enter_cleanup(MacroAssembler* masm) {
7852 #ifndef PRODUCT
7853   Label OK;
7854   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7855   __ cmp(sp, rscratch1);
7856   __ br(Assembler::EQ, OK);
7857   __ stop("incorrect sp1");
7858   __ bind(OK);
7859 #endif
7860   
7861   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
7862   __ str(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
7863   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
7864   __ str(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
7865 
7866   __ ldr(rscratch2, Address(sp, ContinuationEntry::parent_offset()));
7867   __ str(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7868   __ add(rfp, sp, (int)ContinuationEntry::size());
7869 }
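     // The function above, in pseudo-C (field names are illustrative):
     //
     //   thread->cont_fastpath      = entry->parent_cont_fastpath;
     //   thread->held_monitor_count = entry->parent_held_monitor_count;
     //   thread->cont_entry         = entry->parent;
     //   rfp = sp + ContinuationEntry::size();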
7870 
7871 #undef __