1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
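// TIMES_OOP sign-extends a 32-bit index register and scales it by the heap
// oop size: shift by 2 for 4-byte compressed oops, by 3 for 8-byte oops.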
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
 108   //
 109   // we save r30 (lr) as the return PC at the base of the frame and
 110   // link r29 (fp) below it as the frame pointer installing sp (r31)
 111   // into fp.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
 126   // we don't need to save r16-18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     address aarch64_entry = __ pc();
 227 
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
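    // -sp_after_call_off is 26, so sp drops 26 words below rfp, reserving
    // the register save area; sp now corresponds to sp_after_call above.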
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (u1)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
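    // reserve space for the incoming parameters and round sp down to a
    // 16-byte boundary, as required for the AArch64 stack pointer.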
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
    // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // we do this here because the notify will already have been done
 299     // if we get to the next instruction via an exception
 300     //
 301     // n.b. adding this instruction here affects the calculation of
 302     // whether or not a routine returns to the call stub (used when
 303     // doing stack walks) since the normal test is to check the return
 304     // pc against the address saved below. so we may need to allow for
 305     // this extra instruction in the check.
 306 
 307     // save current address for use by exception handling code
 308 
 309     return_address = __ pc();
 310 
 311     // store result depending on type (everything that is not
 312     // T_OBJECT, T_INLINE_TYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 313     // n.b. this assumes Java returns an integral result in r0
 314     // and a floating result in j_farg0
 315     // All of j_rargN may be used to return inline type fields so be careful
 316     // not to clobber those.
 317     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 318     // assignment of Rresult below.
 319     Register Rresult = r14, Rresult_type = r15;
 320     __ ldr(Rresult, result);
 321     Label is_long, is_float, is_double, is_value, exit;
 322     __ ldr(Rresult_type, result_type);
 323     __ cmp(Rresult_type, (u1)T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(Rresult_type, (u1)T_INLINE_TYPE);
 326     __ br(Assembler::EQ, is_value);
 327     __ cmp(Rresult_type, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(Rresult_type, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(Rresult_type, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(Rresult));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376     // leave frame and return to caller
 377     __ leave();
 378     __ ret(lr);
 379 
 380     // handle return types different from T_INT
 381     __ BIND(is_value);
 382     if (InlineTypeReturnedAsFields) {
 383       // Check for flattened return value
 384       __ tbz(r0, 0, is_long);
 385       // Load pack handler address
 386       __ andr(rscratch1, r0, -2);
 387       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 388       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 389       __ blr(rscratch1);
 390       __ b(exit);
 391     }
 392 
 393     __ BIND(is_long);
 394     __ str(r0, Address(Rresult, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     __ BIND(is_float);
 398     __ strs(j_farg0, Address(Rresult, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_double);
 402     __ strd(j_farg0, Address(Rresult, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     return start;
 406   }
 407 
 408   // Return point for a Java call if there's an exception thrown in
 409   // Java code.  The exception is caught and transformed into a
 410   // pending exception stored in JavaThread that can be tested from
 411   // within the VM.
 412   //
 413   // Note: Usually the parameters are removed by the callee. In case
 414   // of an exception crossing an activation frame boundary, that is
 415   // not the case if the callee is compiled code => need to setup the
 416   // rsp.
 417   //
 418   // r0: exception oop
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
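    // this resumes execution in generate_call_stub() at return_address, so
    // the entry frame is unwound by the normal return path and the caller
    // finds the pending exception.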
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // Reinitialize the ptrue predicate register, in case the external runtime
 515     // call clobbers ptrue reg, as we may return to SVE compiled code.
 516     __ reinitialize_ptrue();
 517 
 518     // we should not really care that lr is no longer the callee
 519     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler pushes its own
    // frame and then calls into the VM, and the VM code asserts that
 522     // the PC for the frame above the handler belongs to a compiled
 523     // Java method. So, we restore lr here to satisfy that assert.
 524     __ mov(lr, r19);
 525     // setup r0 & r3 & clear pending exception
 526     __ mov(r3, r19);
 527     __ mov(r19, r0);
 528     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 529     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 530 
 531 #ifdef ASSERT
 532     // make sure exception is set
 533     {
 534       Label L;
 535       __ cbnz(r0, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (2)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // continue at exception handler
 542     // r0: exception
 543     // r3: throwing pc
 544     // r19: exception handler
 545     __ verify_oop(r0);
 546     __ br(r19);
 547 
 548     return start;
 549   }
 550 
 551   // Non-destructive plausibility checks for oops
 552   //
 553   // Arguments:
 554   //    r0: oop to verify
 555   //    rscratch1: error message
 556   //
 557   // Stack after saving c_rarg3:
 558   //    [tos + 0]: saved c_rarg3
 559   //    [tos + 1]: saved c_rarg2
 560   //    [tos + 2]: saved lr
 561   //    [tos + 3]: saved rscratch2
 562   //    [tos + 4]: saved r0
 563   //    [tos + 5]: saved rscratch1
 564   address generate_verify_oop() {
 565 
 566     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 567     address start = __ pc();
 568 
 569     Label exit, error;
 570 
 571     // save c_rarg2 and c_rarg3
 572     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 573 
 574     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ ldr(c_rarg3, Address(c_rarg2));
 577     __ add(c_rarg3, c_rarg3, 1);
 578     __ str(c_rarg3, Address(c_rarg2));
 579 
 580     // object is in r0
 581     // make sure object is 'reasonable'
 582     __ cbz(r0, exit); // if obj is NULL it is OK
 583 
 584 #if INCLUDE_ZGC
 585     if (UseZGC) {
 586       // Check if mask is good.
 587       // verifies that ZAddressBadMask & r0 == 0
 588       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 589       __ andr(c_rarg2, r0, c_rarg3);
 590       __ cbnz(c_rarg2, error);
 591     }
 592 #endif
 593 
 594     // Check if the oop is in the right area of memory
 595     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 596     __ andr(c_rarg2, r0, c_rarg3);
 597     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 598 
 599     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 600     // instruction here because the flags register is live.
 601     __ eor(c_rarg2, c_rarg2, c_rarg3);
 602     __ cbnz(c_rarg2, error);
 603 
    // make sure klass is 'reasonable', i.e. not zero.
 605     __ load_klass(r0, r0);  // get klass
 606     __ cbz(r0, error);      // if klass is NULL it is broken
 607 
 608     // return if everything seems ok
 609     __ bind(exit);
 610 
 611     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 612     __ ret(lr);
 613 
 614     // handle errors
 615     __ bind(error);
 616     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 617 
 618     __ push(RegSet::range(r0, r29), sp);
 619     // debug(char* msg, int64_t pc, int64_t regs[])
 620     __ mov(c_rarg0, rscratch1);      // pass address of error message
 621     __ mov(c_rarg1, lr);             // pass return address
 622     __ mov(c_rarg2, sp);             // pass address of regs on stack
 623 #ifndef PRODUCT
 624     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 625 #endif
 626     BLOCK_COMMENT("call MacroAssembler::debug");
 627     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 628     __ blr(rscratch1);
 629     __ hlt(0);
 630 
 631     return start;
 632   }
 633 
 634   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 635 
 636   // Generate indices for iota vector.
 637   address generate_iota_indices(const char *stub_name) {
 638     __ align(CodeEntryAlignment);
 639     StubCodeMark mark(this, "StubRoutines", stub_name);
 640     address start = __ pc();
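    // two little-endian 64-bit constants lay down the byte sequence
    // 0x00..0x0F, so a 16-byte vector load of this table yields the lane
    // indices 0..15.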
 641     __ emit_data64(0x0706050403020100, relocInfo::none);
 642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 643     return start;
 644   }
 645 
 646   // The inner part of zero_words().  This is the bulk operation,
 647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 648   // caller is responsible for zeroing the last few words.
 649   //
 650   // Inputs:
 651   // r10: the HeapWord-aligned base address of an array to zero.
 652   // r11: the count in HeapWords, r11 > 0.
 653   //
 654   // Returns r10 and r11, adjusted for the caller to clear.
 655   // r10: the base address of the tail of words left to clear.
 656   // r11: the number of words in the tail.
 657   //      r11 < MacroAssembler::zero_words_block_size.
 658 
 659   address generate_zero_blocks() {
 660     Label done;
 661     Label base_aligned;
 662 
 663     Register base = r10, cnt = r11;
 664 
 665     __ align(CodeEntryAlignment);
 666     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 667     address start = __ pc();
 668 
 669     if (UseBlockZeroing) {
 670       int zva_length = VM_Version::zva_length();
 671 
 672       // Ensure ZVA length can be divided by 16. This is required by
 673       // the subsequent operations.
 674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 675 
 676       __ tbz(base, 3, base_aligned);
 677       __ str(zr, Address(__ post(base, 8)));
 678       __ sub(cnt, cnt, 1);
 679       __ bind(base_aligned);
 680 
 681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 682       // alignment.
 683       Label small;
 684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 685       __ subs(rscratch1, cnt, low_limit >> 3);
 686       __ br(Assembler::LT, small);
 687       __ zero_dcache_blocks(base, cnt);
 688       __ bind(small);
 689     }
 690 
 691     {
 692       // Number of stp instructions we'll unroll
 693       const int unroll =
 694         MacroAssembler::zero_words_block_size / 2;
 695       // Clear the remaining blocks.
 696       Label loop;
 697       __ subs(cnt, cnt, unroll * 2);
 698       __ br(Assembler::LT, done);
 699       __ bind(loop);
 700       for (int i = 0; i < unroll; i++)
 701         __ stp(zr, zr, __ post(base, 16));
 702       __ subs(cnt, cnt, unroll * 2);
 703       __ br(Assembler::GE, loop);
 704       __ bind(done);
 705       __ add(cnt, cnt, unroll * 2);
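      // the loop exits having over-subtracted one block, so add it back:
      // cnt is now the tail count, 0 <= cnt < zero_words_block_size, which
      // the caller must clear.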
 706     }
 707 
 708     __ ret(lr);
 709 
 710     return start;
 711   }
 712 
 713 
 714   typedef enum {
 715     copy_forwards = 1,
 716     copy_backwards = -1
 717   } copy_direction;
 718 
 719   // Bulk copy of blocks of 8 words.
 720   //
 721   // count is a count of words.
 722   //
 723   // Precondition: count >= 8
 724   //
 725   // Postconditions:
 726   //
 727   // The least significant bit of count contains the remaining count
 728   // of words to copy.  The rest of count is trash.
 729   //
 730   // s and d are adjusted to point to the remaining words to copy
 731   //
 732   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 733                            copy_direction direction) {
 734     int unit = wordSize * direction;
 735     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
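    // for forward copies s and d are pre-biased downwards by 'bias' so that
    // the first ldp/ldpq offset (2 words, or 4 words with SIMD) lands on the
    // original s/d; backward copies use negative offsets from the original
    // addresses and need no bias.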
 736 
 737     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 738       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 739     const Register stride = r13;
 740 
 741     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 742     assert_different_registers(s, d, count, rscratch1);
 743 
 744     Label again, drain;
 745     const char *stub_name;
 746     if (direction == copy_forwards)
 747       stub_name = "forward_copy_longs";
 748     else
 749       stub_name = "backward_copy_longs";
 750 
 751     __ align(CodeEntryAlignment);
 752 
 753     StubCodeMark mark(this, "StubRoutines", stub_name);
 754 
 755     __ bind(start);
 756 
 757     Label unaligned_copy_long;
 758     if (AvoidUnalignedAccesses) {
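      // a destination with bit 3 set is not 16-byte aligned; send it to the
      // variant below, which staggers the stores (str, stp pairs, str) so
      // the paired stores can stay 16-byte aligned.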
 759       __ tbnz(d, 3, unaligned_copy_long);
 760     }
 761 
 762     if (direction == copy_forwards) {
 763       __ sub(s, s, bias);
 764       __ sub(d, d, bias);
 765     }
 766 
 767 #ifdef ASSERT
 768     // Make sure we are never given < 8 words
 769     {
 770       Label L;
 771       __ cmp(count, (u1)8);
 772       __ br(Assembler::GE, L);
 773       __ stop("genrate_copy_longs called with < 8 words");
 774       __ bind(L);
 775     }
 776 #endif
 777 
 778     // Fill 8 registers
 779     if (UseSIMDForMemoryOps) {
 780       __ ldpq(v0, v1, Address(s, 4 * unit));
 781       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 782     } else {
 783       __ ldp(t0, t1, Address(s, 2 * unit));
 784       __ ldp(t2, t3, Address(s, 4 * unit));
 785       __ ldp(t4, t5, Address(s, 6 * unit));
 786       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 787     }
 788 
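    // if the original count was less than 16 there is no full loop
    // iteration to run: skip to the drain and just store the 8 words
    // already loaded.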
 789     __ subs(count, count, 16);
 790     __ br(Assembler::LO, drain);
 791 
 792     int prefetch = PrefetchCopyIntervalInBytes;
 793     bool use_stride = false;
 794     if (direction == copy_backwards) {
 795        use_stride = prefetch > 256;
 796        prefetch = -prefetch;
 797        if (use_stride) __ mov(stride, prefetch);
 798     }
 799 
 800     __ bind(again);
 801 
 802     if (PrefetchCopyIntervalInBytes > 0)
 803       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 804 
 805     if (UseSIMDForMemoryOps) {
 806       __ stpq(v0, v1, Address(d, 4 * unit));
 807       __ ldpq(v0, v1, Address(s, 4 * unit));
 808       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 809       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 810     } else {
 811       __ stp(t0, t1, Address(d, 2 * unit));
 812       __ ldp(t0, t1, Address(s, 2 * unit));
 813       __ stp(t2, t3, Address(d, 4 * unit));
 814       __ ldp(t2, t3, Address(s, 4 * unit));
 815       __ stp(t4, t5, Address(d, 6 * unit));
 816       __ ldp(t4, t5, Address(s, 6 * unit));
 817       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 818       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 819     }
 820 
 821     __ subs(count, count, 8);
 822     __ br(Assembler::HS, again);
 823 
 824     // Drain
 825     __ bind(drain);
 826     if (UseSIMDForMemoryOps) {
 827       __ stpq(v0, v1, Address(d, 4 * unit));
 828       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 829     } else {
 830       __ stp(t0, t1, Address(d, 2 * unit));
 831       __ stp(t2, t3, Address(d, 4 * unit));
 832       __ stp(t4, t5, Address(d, 6 * unit));
 833       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 834     }
 835 
 836     {
 837       Label L1, L2;
 838       __ tbz(count, exact_log2(4), L1);
 839       if (UseSIMDForMemoryOps) {
 840         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 841         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 842       } else {
 843         __ ldp(t0, t1, Address(s, 2 * unit));
 844         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 845         __ stp(t0, t1, Address(d, 2 * unit));
 846         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 847       }
 848       __ bind(L1);
 849 
 850       if (direction == copy_forwards) {
 851         __ add(s, s, bias);
 852         __ add(d, d, bias);
 853       }
 854 
 855       __ tbz(count, 1, L2);
 856       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 857       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 858       __ bind(L2);
 859     }
 860 
 861     __ ret(lr);
 862 
 863     if (AvoidUnalignedAccesses) {
 864       Label drain, again;
 865       // Register order for storing. Order is different for backward copy.
 866 
 867       __ bind(unaligned_copy_long);
 868 
 869       // source address is even aligned, target odd aligned
 870       //
 871       // when forward copying word pairs we read long pairs at offsets
 872       // {0, 2, 4, 6} (in long words). when backwards copying we read
 873       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 874       // address by -2 in the forwards case so we can compute the
 875       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 876       // or -1.
 877       //
 878       // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
 881       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 882       //
      // When backwards copying we need to store 1 word, 3 pairs and
 884       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 885       // offsets {1, 3, 5, 7, 8} * unit.
 886 
 887       if (direction == copy_forwards) {
 888         __ sub(s, s, 16);
 889         __ sub(d, d, 8);
 890       }
 891 
 892       // Fill 8 registers
 893       //
 894       // for forwards copy s was offset by -16 from the original input
 895       // value of s so the register contents are at these offsets
 896       // relative to the 64 bit block addressed by that original input
 897       // and so on for each successive 64 byte block when s is updated
 898       //
 899       // t0 at offset 0,  t1 at offset 8
 900       // t2 at offset 16, t3 at offset 24
 901       // t4 at offset 32, t5 at offset 40
 902       // t6 at offset 48, t7 at offset 56
 903 
 904       // for backwards copy s was not offset so the register contents
 905       // are at these offsets into the preceding 64 byte block
 906       // relative to that original input and so on for each successive
 907       // preceding 64 byte block when s is updated. this explains the
 908       // slightly counter-intuitive looking pattern of register usage
 909       // in the stp instructions for backwards copy.
 910       //
 911       // t0 at offset -16, t1 at offset -8
 912       // t2 at offset -32, t3 at offset -24
 913       // t4 at offset -48, t5 at offset -40
 914       // t6 at offset -64, t7 at offset -56
 915 
 916       __ ldp(t0, t1, Address(s, 2 * unit));
 917       __ ldp(t2, t3, Address(s, 4 * unit));
 918       __ ldp(t4, t5, Address(s, 6 * unit));
 919       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 920 
 921       __ subs(count, count, 16);
 922       __ br(Assembler::LO, drain);
 923 
 924       int prefetch = PrefetchCopyIntervalInBytes;
 925       bool use_stride = false;
 926       if (direction == copy_backwards) {
 927          use_stride = prefetch > 256;
 928          prefetch = -prefetch;
 929          if (use_stride) __ mov(stride, prefetch);
 930       }
 931 
 932       __ bind(again);
 933 
 934       if (PrefetchCopyIntervalInBytes > 0)
 935         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 936 
 937       if (direction == copy_forwards) {
 938        // allowing for the offset of -8 the store instructions place
 939        // registers into the target 64 bit block at the following
 940        // offsets
 941        //
 942        // t0 at offset 0
 943        // t1 at offset 8,  t2 at offset 16
 944        // t3 at offset 24, t4 at offset 32
 945        // t5 at offset 40, t6 at offset 48
 946        // t7 at offset 56
 947 
 948         __ str(t0, Address(d, 1 * unit));
 949         __ stp(t1, t2, Address(d, 2 * unit));
 950         __ ldp(t0, t1, Address(s, 2 * unit));
 951         __ stp(t3, t4, Address(d, 4 * unit));
 952         __ ldp(t2, t3, Address(s, 4 * unit));
 953         __ stp(t5, t6, Address(d, 6 * unit));
 954         __ ldp(t4, t5, Address(s, 6 * unit));
 955         __ str(t7, Address(__ pre(d, 8 * unit)));
 956         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 957       } else {
 958        // d was not offset when we started so the registers are
 959        // written into the 64 bit block preceding d with the following
 960        // offsets
 961        //
 962        // t1 at offset -8
 963        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 965        // t7 at offset -56, t4 at offset -48
 966        //                   t6 at offset -64
 967        //
 968        // note that this matches the offsets previously noted for the
 969        // loads
 970 
 971         __ str(t1, Address(d, 1 * unit));
 972         __ stp(t3, t0, Address(d, 3 * unit));
 973         __ ldp(t0, t1, Address(s, 2 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ ldp(t2, t3, Address(s, 4 * unit));
 976         __ stp(t7, t4, Address(d, 7 * unit));
 977         __ ldp(t4, t5, Address(s, 6 * unit));
 978         __ str(t6, Address(__ pre(d, 8 * unit)));
 979         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 980       }
 981 
 982       __ subs(count, count, 8);
 983       __ br(Assembler::HS, again);
 984 
 985       // Drain
 986       //
 987       // this uses the same pattern of offsets and register arguments
 988       // as above
 989       __ bind(drain);
 990       if (direction == copy_forwards) {
 991         __ str(t0, Address(d, 1 * unit));
 992         __ stp(t1, t2, Address(d, 2 * unit));
 993         __ stp(t3, t4, Address(d, 4 * unit));
 994         __ stp(t5, t6, Address(d, 6 * unit));
 995         __ str(t7, Address(__ pre(d, 8 * unit)));
 996       } else {
 997         __ str(t1, Address(d, 1 * unit));
 998         __ stp(t3, t0, Address(d, 3 * unit));
 999         __ stp(t5, t2, Address(d, 5 * unit));
1000         __ stp(t7, t4, Address(d, 7 * unit));
1001         __ str(t6, Address(__ pre(d, 8 * unit)));
1002       }
1003       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
1006       // have each such subblock
1007       {
1008         Label L1, L2;
1009         __ tbz(count, exact_log2(4), L1);
1010        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1012        // but note that the offsets and registers still follow the
1013        // same pattern
1014         __ ldp(t0, t1, Address(s, 2 * unit));
1015         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1016         if (direction == copy_forwards) {
1017           __ str(t0, Address(d, 1 * unit));
1018           __ stp(t1, t2, Address(d, 2 * unit));
1019           __ str(t3, Address(__ pre(d, 4 * unit)));
1020         } else {
1021           __ str(t1, Address(d, 1 * unit));
1022           __ stp(t3, t0, Address(d, 3 * unit));
1023           __ str(t2, Address(__ pre(d, 4 * unit)));
1024         }
1025         __ bind(L1);
1026 
1027         __ tbz(count, 1, L2);
1028        // this is the same as above but copying only 2 longs hence
1029        // there is no intervening stp between the str instructions
1030        // but note that the offset and register patterns are still
1031        // the same
1032         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1033         if (direction == copy_forwards) {
1034           __ str(t0, Address(d, 1 * unit));
1035           __ str(t1, Address(__ pre(d, 2 * unit)));
1036         } else {
1037           __ str(t1, Address(d, 1 * unit));
1038           __ str(t0, Address(__ pre(d, 2 * unit)));
1039         }
1040         __ bind(L2);
1041 
1042        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1044 
1045        if (direction == copy_forwards) {
1046          __ add(s, s, 16);
1047          __ add(d, d, 8);
1048        }
1049 
1050       }
1051 
1052       __ ret(lr);
1053       }
1054   }
1055 
1056   // Small copy: less than 16 bytes.
1057   //
1058   // NB: Ignores all of the bits of count which represent more than 15
1059   // bytes, so a caller doesn't have to mask them.
1060 
1061   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1062     bool is_backwards = step < 0;
1063     size_t granularity = uabs(step);
1064     int direction = is_backwards ? -1 : 1;
1065     int unit = wordSize * direction;
1066 
1067     Label Lword, Lint, Lshort, Lbyte;
1068 
1069     assert(granularity
1070            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1071 
1072     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1073 
1074     // ??? I don't know if this bit-test-and-branch is the right thing
1075     // to do.  It does a lot of jumping, resulting in several
1076     // mispredicted branches.  It might make more sense to do this
1077     // with something like Duff's device with a single computed branch.
1078 
1079     __ tbz(count, 3 - exact_log2(granularity), Lword);
1080     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1081     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1082     __ bind(Lword);
1083 
1084     if (granularity <= sizeof (jint)) {
1085       __ tbz(count, 2 - exact_log2(granularity), Lint);
1086       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1087       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1088       __ bind(Lint);
1089     }
1090 
1091     if (granularity <= sizeof (jshort)) {
1092       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1093       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1094       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1095       __ bind(Lshort);
1096     }
1097 
1098     if (granularity <= sizeof (jbyte)) {
1099       __ tbz(count, 0, Lbyte);
1100       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1101       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1102       __ bind(Lbyte);
1103     }
1104   }
1105 
1106   Label copy_f, copy_b;
1107 
1108   // All-singing all-dancing memory copy.
1109   //
1110   // Copy count units of memory from s to d.  The size of a unit is
1111   // step, which can be positive or negative depending on the direction
1112   // of copy.  If is_aligned is false, we align the source address.
1113   //
1114 
1115   void copy_memory(bool is_aligned, Register s, Register d,
1116                    Register count, Register tmp, int step) {
1117     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1118     bool is_backwards = step < 0;
1119     unsigned int granularity = uabs(step);
1120     const Register t0 = r3, t1 = r4;
1121 
    // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction
    // doesn't matter because we always load all the data before writing anything
1124     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1125     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1126     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1127     const Register send = r17, dend = r16;
1128 
1129     if (PrefetchCopyIntervalInBytes > 0)
1130       __ prfm(Address(s, 0), PLDL1KEEP);
1131     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1132     __ br(Assembler::HI, copy_big);
1133 
1134     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1135     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
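    // send/dend point just past the end of the source/destination regions.
    // The cases below mix accesses relative to s/d with accesses relative
    // to send/dend, so the head and tail may overlap and the whole range is
    // covered without a separate tail fix-up.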
1136 
1137     __ cmp(count, u1(16/granularity));
1138     __ br(Assembler::LS, copy16);
1139 
1140     __ cmp(count, u1(64/granularity));
1141     __ br(Assembler::HI, copy80);
1142 
1143     __ cmp(count, u1(32/granularity));
1144     __ br(Assembler::LS, copy32);
1145 
1146     // 33..64 bytes
1147     if (UseSIMDForMemoryOps) {
1148       __ ldpq(v0, v1, Address(s, 0));
1149       __ ldpq(v2, v3, Address(send, -32));
1150       __ stpq(v0, v1, Address(d, 0));
1151       __ stpq(v2, v3, Address(dend, -32));
1152     } else {
1153       __ ldp(t0, t1, Address(s, 0));
1154       __ ldp(t2, t3, Address(s, 16));
1155       __ ldp(t4, t5, Address(send, -32));
1156       __ ldp(t6, t7, Address(send, -16));
1157 
1158       __ stp(t0, t1, Address(d, 0));
1159       __ stp(t2, t3, Address(d, 16));
1160       __ stp(t4, t5, Address(dend, -32));
1161       __ stp(t6, t7, Address(dend, -16));
1162     }
1163     __ b(finish);
1164 
1165     // 17..32 bytes
1166     __ bind(copy32);
1167     __ ldp(t0, t1, Address(s, 0));
1168     __ ldp(t2, t3, Address(send, -16));
1169     __ stp(t0, t1, Address(d, 0));
1170     __ stp(t2, t3, Address(dend, -16));
1171     __ b(finish);
1172 
1173     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1175     __ bind(copy80);
1176     if (UseSIMDForMemoryOps) {
1177       __ ldpq(v0, v1, Address(s, 0));
1178       __ ldpq(v2, v3, Address(s, 32));
1179       // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is less
      // than 4 (sizeof(jint)): pointers to arrays of jint are at least
      // 4 byte aligned and pointers to arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
1184       // For such cases using the pair of ldp/stp instead of the third pair of
1185       // ldpq/stpq fixes the performance issue.
1186       if (granularity < sizeof (jint)) {
1187         Label copy96;
1188         __ cmp(count, u1(80/granularity));
1189         __ br(Assembler::HI, copy96);
1190         __ ldp(t0, t1, Address(send, -16));
1191 
1192         __ stpq(v0, v1, Address(d, 0));
1193         __ stpq(v2, v3, Address(d, 32));
1194         __ stp(t0, t1, Address(dend, -16));
1195         __ b(finish);
1196 
1197         __ bind(copy96);
1198       }
1199       __ ldpq(v4, v5, Address(send, -32));
1200 
1201       __ stpq(v0, v1, Address(d, 0));
1202       __ stpq(v2, v3, Address(d, 32));
1203       __ stpq(v4, v5, Address(dend, -32));
1204     } else {
1205       __ ldp(t0, t1, Address(s, 0));
1206       __ ldp(t2, t3, Address(s, 16));
1207       __ ldp(t4, t5, Address(s, 32));
1208       __ ldp(t6, t7, Address(s, 48));
1209       __ ldp(t8, t9, Address(send, -16));
1210 
1211       __ stp(t0, t1, Address(d, 0));
1212       __ stp(t2, t3, Address(d, 16));
1213       __ stp(t4, t5, Address(d, 32));
1214       __ stp(t6, t7, Address(d, 48));
1215       __ stp(t8, t9, Address(dend, -16));
1216     }
1217     __ b(finish);
1218 
1219     // 0..16 bytes
1220     __ bind(copy16);
1221     __ cmp(count, u1(8/granularity));
1222     __ br(Assembler::LO, copy8);
1223 
1224     // 8..16 bytes
1225     __ ldr(t0, Address(s, 0));
1226     __ ldr(t1, Address(send, -8));
1227     __ str(t0, Address(d, 0));
1228     __ str(t1, Address(dend, -8));
1229     __ b(finish);
1230 
1231     if (granularity < 8) {
1232       // 4..7 bytes
1233       __ bind(copy8);
1234       __ tbz(count, 2 - exact_log2(granularity), copy4);
1235       __ ldrw(t0, Address(s, 0));
1236       __ ldrw(t1, Address(send, -4));
1237       __ strw(t0, Address(d, 0));
1238       __ strw(t1, Address(dend, -4));
1239       __ b(finish);
1240       if (granularity < 4) {
1241         // 0..3 bytes
1242         __ bind(copy4);
1243         __ cbz(count, finish); // get rid of 0 case
1244         if (granularity == 2) {
1245           __ ldrh(t0, Address(s, 0));
1246           __ strh(t0, Address(d, 0));
1247         } else { // granularity == 1
1248           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1249           // the first and last byte.
1250           // Handle the 3 byte case by loading and storing base + count/2
1251           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1253           // byte 3 times.
1254           __ lsr(count, count, 1);
1255           __ ldrb(t0, Address(s, 0));
1256           __ ldrb(t1, Address(send, -1));
1257           __ ldrb(t2, Address(s, count));
1258           __ strb(t0, Address(d, 0));
1259           __ strb(t1, Address(dend, -1));
1260           __ strb(t2, Address(d, count));
1261         }
1262         __ b(finish);
1263       }
1264     }
1265 
1266     __ bind(copy_big);
1267     if (is_backwards) {
1268       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1269       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1270     }
1271 
    // Now that we've got the small case out of the way, we can align the
1273     // source address on a 2-word boundary.
1274 
1275     Label aligned;
1276 
1277     if (is_aligned) {
1278       // We may have to adjust by 1 word to get s 2-word-aligned.
1279       __ tbz(s, exact_log2(wordSize), aligned);
1280       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1281       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1282       __ sub(count, count, wordSize/granularity);
1283     } else {
1284       if (is_backwards) {
1285         __ andr(rscratch2, s, 2 * wordSize - 1);
1286       } else {
1287         __ neg(rscratch2, s);
1288         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1289       }
1290       // rscratch2 is the byte adjustment needed to align s.
1291       __ cbz(rscratch2, aligned);
1292       int shift = exact_log2(granularity);
1293       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1294       __ sub(count, count, rscratch2);
1295 
1296 #if 0
1297       // ?? This code is only correct for a disjoint copy.  It may or
1298       // may not make sense to use it in that case.
1299 
1300       // Copy the first pair; s and d may not be aligned.
1301       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1302       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1303 
1304       // Align s and d, adjust count
1305       if (is_backwards) {
1306         __ sub(s, s, rscratch2);
1307         __ sub(d, d, rscratch2);
1308       } else {
1309         __ add(s, s, rscratch2);
1310         __ add(d, d, rscratch2);
1311       }
1312 #else
1313       copy_memory_small(s, d, rscratch2, rscratch1, step);
1314 #endif
1315     }
1316 
1317     __ bind(aligned);
1318 
1319     // s is now 2-word-aligned.
1320 
1321     // We have a count of units and some trailing bytes.  Adjust the
1322     // count and do a bulk copy of words.
1323     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
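    // rscratch2 now holds the count in whole words for the bulk copy
    // (assumed to be the count register that copy_f/copy_b were generated
    // with); the sub-word tail remains in the low bits of count and is
    // handled by copy_memory_small() below.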
1324     if (direction == copy_forwards)
1325       __ bl(copy_f);
1326     else
1327       __ bl(copy_b);
1328 
1329     // And the tail.
1330     copy_memory_small(s, d, count, tmp, step);
1331 
1332     if (granularity >= 8) __ bind(copy8);
1333     if (granularity >= 4) __ bind(copy4);
1334     __ bind(finish);
1335   }
1336 
1337 
1338   void clobber_registers() {
1339 #ifdef ASSERT
1340     RegSet clobbered
1341       = MacroAssembler::call_clobbered_registers() - rscratch1;
1342     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1343     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1344     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1345       __ mov(*it, rscratch1);
1346     }
1347 #endif
1348 
1349   }
1350 
1351   // Scan over array at a for count oops, verifying each one.
1352   // Preserves a and count, clobbers rscratch1 and rscratch2.
1353   void verify_oop_array (int size, Register a, Register count, Register temp) {
1354     Label loop, end;
1355     __ mov(rscratch1, a);
1356     __ mov(rscratch2, zr);
1357     __ bind(loop);
1358     __ cmp(rscratch2, count);
1359     __ br(Assembler::HS, end);
1360     if (size == wordSize) {
1361       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1362       __ verify_oop(temp);
1363     } else {
1364       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1365       __ decode_heap_oop(temp); // calls verify_oop
1366     }
1367     __ add(rscratch2, rscratch2, 1);
1368     __ b(loop);
1369     __ bind(end);
1370   }
1371 
1372   // Arguments:
1373   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1374   //             ignored
1375   //   is_oop  - true => oop array, so generate store check code
1376   //   name    - stub name string
1377   //
1378   // Inputs:
1379   //   c_rarg0   - source array address
1380   //   c_rarg1   - destination array address
1381   //   c_rarg2   - element count, treated as ssize_t, can be zero
1382   //
1383   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1384   // the hardware handle it.  The two dwords within qwords that span
1385   // cache line boundaries will still be loaded and stored atomically.
1386   //
1387   // Side Effects:
1388   //   disjoint_int_copy_entry is set to the no-overlap entry point
1389   //   used by generate_conjoint_int_oop_copy().
1390   //
1391   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1392                                   const char *name, bool dest_uninitialized = false) {
1393     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1394     RegSet saved_reg = RegSet::of(s, d, count);
1395     __ align(CodeEntryAlignment);
1396     StubCodeMark mark(this, "StubRoutines", name);
1397     address start = __ pc();
1398     __ enter();
1399 
1400     if (entry != NULL) {
1401       *entry = __ pc();
1402       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1403       BLOCK_COMMENT("Entry:");
1404     }
1405 
1406     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1407     if (dest_uninitialized) {
1408       decorators |= IS_DEST_UNINITIALIZED;
1409     }
1410     if (aligned) {
1411       decorators |= ARRAYCOPY_ALIGNED;
1412     }
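    // the decorators describe this copy (in-heap array copy, disjoint,
    // destination possibly uninitialized, possibly HeapWord-aligned) so the
    // GC's BarrierSetAssembler can emit the right barriers below.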
1413 
1414     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1415     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1416 
1417     if (is_oop) {
1418       // save regs before copy_memory
1419       __ push(RegSet::of(d, count), sp);
1420     }
1421     {
1422       // UnsafeCopyMemory page error: continue after ucm
1423       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1424       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1425       copy_memory(aligned, s, d, count, rscratch1, size);
1426     }
1427 
1428     if (is_oop) {
1429       __ pop(RegSet::of(d, count), sp);
1430       if (VerifyOops)
1431         verify_oop_array(size, d, count, r16);
1432     }
1433 
1434     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1435 
1436     __ leave();
1437     __ mov(r0, zr); // return 0
1438     __ ret(lr);
1439     return start;
1440   }
1441 
1442   // Arguments:
1443   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1444   //             ignored
1445   //   is_oop  - true => oop array, so generate store check code
1446   //   name    - stub name string
1447   //
1448   // Inputs:
1449   //   c_rarg0   - source array address
1450   //   c_rarg1   - destination array address
1451   //   c_rarg2   - element count, treated as ssize_t, can be zero
1452   //
1453   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1454   // the hardware handle it.  The two dwords within qwords that span
1455   // cache line boundaries will still be loaded and stored atomically.
1456   //
1457   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1458                                  address *entry, const char *name,
1459                                  bool dest_uninitialized = false) {
1460     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1461     RegSet saved_regs = RegSet::of(s, d, count);
1462     StubCodeMark mark(this, "StubRoutines", name);
1463     address start = __ pc();
1464     __ enter();
1465 
1466     if (entry != NULL) {
1467       *entry = __ pc();
1468       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1469       BLOCK_COMMENT("Entry:");
1470     }
1471 
1472     // use fwd copy when (d-s) above_equal (count*size)
1473     __ sub(rscratch1, d, s);
1474     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1475     __ br(Assembler::HS, nooverlap_target);
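         // The comparison is unsigned: when d < s the subtraction wraps to a
         // large value, so that case also takes the forward-copy path.  Only a
         // destination starting inside the source region
         // (0 <= d - s < count*size) falls through to the backward copy below.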
1476 
1477     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1478     if (dest_uninitialized) {
1479       decorators |= IS_DEST_UNINITIALIZED;
1480     }
1481     if (aligned) {
1482       decorators |= ARRAYCOPY_ALIGNED;
1483     }
1484 
1485     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1486     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1487 
1488     if (is_oop) {
1489       // save regs before copy_memory
1490       __ push(RegSet::of(d, count), sp);
1491     }
1492     {
1493       // UnsafeCopyMemory page error: continue after ucm
1494       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1495       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1496       copy_memory(aligned, s, d, count, rscratch1, -size);
1497     }
1498     if (is_oop) {
1499       __ pop(RegSet::of(d, count), sp);
1500       if (VerifyOops)
1501         verify_oop_array(size, d, count, r16);
1502     }
1503     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1504     __ leave();
1505     __ mov(r0, zr); // return 0
1506     __ ret(lr);
1507     return start;
1508   }
1509 
1510   // Arguments:
1511   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1512   //             ignored
1513   //   name    - stub name string
1514   //
1515   // Inputs:
1516   //   c_rarg0   - source array address
1517   //   c_rarg1   - destination array address
1518   //   c_rarg2   - element count, treated as ssize_t, can be zero
1519   //
1520   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1521   // we let the hardware handle it.  The one to eight bytes within words,
1522   // dwords or qwords that span cache line boundaries will still be loaded
1523   // and stored atomically.
1524   //
1532   // Side Effects:
1533   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1534   //   used by generate_conjoint_byte_copy().
1535   //
1536   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1552   // we let the hardware handle it.  The one to eight bytes within words,
1553   // dwords or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1557                                       address* entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1560   }
1561 
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1573   // let the hardware handle it.  The two or four words within dwords
1574   // or qwords that span cache line boundaries will still be loaded
1575   // and stored atomically.
1576   //
1577   // Side Effects:
1578   //   disjoint_short_copy_entry is set to the no-overlap entry point
1579   //   used by generate_conjoint_short_copy().
1580   //
1581   address generate_disjoint_short_copy(bool aligned,
1582                                        address* entry, const char *name) {
1583     const bool not_oop = false;
1584     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1585   }
1586 
1587   // Arguments:
1588   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1589   //             ignored
1590   //   name    - stub name string
1591   //
1592   // Inputs:
1593   //   c_rarg0   - source array address
1594   //   c_rarg1   - destination array address
1595   //   c_rarg2   - element count, treated as ssize_t, can be zero
1596   //
1597   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1598   // let the hardware handle it.  The two or four words within dwords
1599   // or qwords that span cache line boundaries will still be loaded
1600   // and stored atomically.
1601   //
1602   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1603                                        address *entry, const char *name) {
1604     const bool not_oop = false;
1605     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1606   }
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as ssize_t, can be zero
1617   //
1618   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1619   // the hardware handle it.  The two dwords within qwords that span
1620   // cache line boundaries will still be loaded and stored atomically.
1621   //
1622   // Side Effects:
1623   //   disjoint_int_copy_entry is set to the no-overlap entry point
1624   //   used by generate_conjoint_int_oop_copy().
1625   //
1626   address generate_disjoint_int_copy(bool aligned, address *entry,
1627                                          const char *name, bool dest_uninitialized = false) {
1628     const bool not_oop = false;
1629     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1630   }
1631 
1632   // Arguments:
1633   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1634   //             ignored
1635   //   name    - stub name string
1636   //
1637   // Inputs:
1638   //   c_rarg0   - source array address
1639   //   c_rarg1   - destination array address
1640   //   c_rarg2   - element count, treated as ssize_t, can be zero
1641   //
1642   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1643   // the hardware handle it.  The two dwords within qwords that span
1644   // cache line boundaries will still be loaded and stored atomically.
1645   //
1646   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1647                                      address *entry, const char *name,
1648                                      bool dest_uninitialized = false) {
1649     const bool not_oop = false;
1650     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1651   }
1652 
1653 
1654   // Arguments:
1655   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1656   //             ignored
1657   //   name    - stub name string
1658   //
1659   // Inputs:
1660   //   c_rarg0   - source array address
1661   //   c_rarg1   - destination array address
1662   //   c_rarg2   - element count, treated as size_t, can be zero
1663   //
1664   // Side Effects:
1665   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1666   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1667   //
1668   address generate_disjoint_long_copy(bool aligned, address *entry,
1669                                           const char *name, bool dest_uninitialized = false) {
1670     const bool not_oop = false;
1671     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1672   }
1673 
1674   // Arguments:
1675   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1676   //             ignored
1677   //   name    - stub name string
1678   //
1679   // Inputs:
1680   //   c_rarg0   - source array address
1681   //   c_rarg1   - destination array address
1682   //   c_rarg2   - element count, treated as size_t, can be zero
1683   //
1684   address generate_conjoint_long_copy(bool aligned,
1685                                       address nooverlap_target, address *entry,
1686                                       const char *name, bool dest_uninitialized = false) {
1687     const bool not_oop = false;
1688     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1689   }
1690 
1691   // Arguments:
1692   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1693   //             ignored
1694   //   name    - stub name string
1695   //
1696   // Inputs:
1697   //   c_rarg0   - source array address
1698   //   c_rarg1   - destination array address
1699   //   c_rarg2   - element count, treated as size_t, can be zero
1700   //
1701   // Side Effects:
1702   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1703   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1704   //
1705   address generate_disjoint_oop_copy(bool aligned, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1710   }
1711 
1712   // Arguments:
1713   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1714   //             ignored
1715   //   name    - stub name string
1716   //
1717   // Inputs:
1718   //   c_rarg0   - source array address
1719   //   c_rarg1   - destination array address
1720   //   c_rarg2   - element count, treated as size_t, can be zero
1721   //
1722   address generate_conjoint_oop_copy(bool aligned,
1723                                      address nooverlap_target, address *entry,
1724                                      const char *name, bool dest_uninitialized) {
1725     const bool is_oop = true;
1726     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1727     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1728                                   name, dest_uninitialized);
1729   }
1730 
1731 
1732   // Helper for generating a dynamic type check.
1733   // Smashes rscratch1, rscratch2.
1734   void generate_type_check(Register sub_klass,
1735                            Register super_check_offset,
1736                            Register super_klass,
1737                            Label& L_success) {
1738     assert_different_registers(sub_klass, super_check_offset, super_klass);
1739 
1740     BLOCK_COMMENT("type_check:");
1741 
1742     Label L_miss;
1743 
1744     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1745                                      super_check_offset);
1746     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
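         // The fast path consults the cached slot at super_check_offset in
         // sub_klass; the slow path scans the secondary supers array.  Either
         // can branch to L_success; on failure control reaches L_miss below.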
1747 
1748     // Fall through on failure!
1749     __ BIND(L_miss);
1750   }
1751 
1752   //
1753   //  Generate checkcasting array copy stub
1754   //
1755   //  Input:
1756   //    c_rarg0   - source array address
1757   //    c_rarg1   - destination array address
1758   //    c_rarg2   - element count, treated as ssize_t, can be zero
1759   //    c_rarg3   - size_t ckoff (super_check_offset)
1760   //    c_rarg4   - oop ckval (super_klass)
1761   //
1762   //  Output:
1763   //    r0 ==  0  -  success
1764   //    r0 == -1^K - failure, where K is partial transfer count
1765   //
1766   address generate_checkcast_copy(const char *name, address *entry,
1767                                   bool dest_uninitialized = false) {
1768 
1769     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1770 
1771     // Input registers (after setup_arg_regs)
1772     const Register from        = c_rarg0;   // source array address
1773     const Register to          = c_rarg1;   // destination array address
1774     const Register count       = c_rarg2;   // elements count
1775     const Register ckoff       = c_rarg3;   // super_check_offset
1776     const Register ckval       = c_rarg4;   // super_klass
1777 
1778     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1779     RegSet wb_post_saved_regs = RegSet::of(count);
1780 
1781     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1782     const Register copied_oop  = r22;       // actual oop copied
1783     const Register count_save  = r21;       // orig elements count
1784     const Register start_to    = r20;       // destination array start address
1785     const Register r19_klass   = r19;       // oop._klass
1786 
1787     //---------------------------------------------------------------
1788     // Assembler stub will be used for this call to arraycopy
1789     // if the two arrays are subtypes of Object[] but the
1790     // destination array type is not equal to or a supertype
1791     // of the source type.  Each element must be separately
1792     // checked.
1793 
1794     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1795                                copied_oop, r19_klass, count_save);
1796 
1797     __ align(CodeEntryAlignment);
1798     StubCodeMark mark(this, "StubRoutines", name);
1799     address start = __ pc();
1800 
1801     __ enter(); // required for proper stackwalking of RuntimeStub frame
1802 
1803 #ifdef ASSERT
1804     // caller guarantees that the arrays really are different
1805     // otherwise, we would have to make conjoint checks
1806     { Label L;
1807       array_overlap_test(L, TIMES_OOP);
1808       __ stop("checkcast_copy within a single array");
1809       __ bind(L);
1810     }
1811 #endif //ASSERT
1812 
1813     // Caller of this entry point must set up the argument registers.
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816       BLOCK_COMMENT("Entry:");
1817     }
1818 
1819      // Empty array:  Nothing to do.
1820     __ cbz(count, L_done);
1821     __ push(RegSet::of(r19, r20, r21, r22), sp);
1822 
1823 #ifdef ASSERT
1824     BLOCK_COMMENT("assert consistent ckoff/ckval");
1825     // The ckoff and ckval must be mutually consistent,
1826     // even though caller generates both.
1827     { Label L;
1828       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1829       __ ldrw(start_to, Address(ckval, sco_offset));
1830       __ cmpw(ckoff, start_to);
1831       __ br(Assembler::EQ, L);
1832       __ stop("super_check_offset inconsistent");
1833       __ bind(L);
1834     }
1835 #endif //ASSERT
1836 
1837     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1838     bool is_oop = true;
1839     if (dest_uninitialized) {
1840       decorators |= IS_DEST_UNINITIALIZED;
1841     }
1842 
1843     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1844     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1845 
1846     // save the original count
1847     __ mov(count_save, count);
1848 
1849     // Copy from low to high addresses
1850     __ mov(start_to, to);              // Save destination array start address
1851     __ b(L_load_element);
1852 
1853     // ======== begin loop ========
1854     // (Loop is rotated; its entry is L_load_element.)
1855     // Loop control:
1856     //   for (; count != 0; count--) {
1857     //     copied_oop = load_heap_oop(from++);
1858     //     ... generate_type_check ...;
1859     //     store_heap_oop(to++, copied_oop);
1860     //   }
1861     __ align(OptoLoopAlignment);
1862 
1863     __ BIND(L_store_element);
1864     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1865     __ sub(count, count, 1);
1866     __ cbz(count, L_do_card_marks);
1867 
1868     // ======== loop entry is here ========
1869     __ BIND(L_load_element);
1870     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1871     __ cbz(copied_oop, L_store_element);
1872 
1873     __ load_klass(r19_klass, copied_oop);// query the object klass
1874     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1875     // ======== end loop ========
1876 
1877     // It was a real error; we must depend on the caller to finish the job.
1878     // Register count = remaining oops, count_save = total oops.
1879     // Emit GC store barriers for the oops we have copied and report
1880     // their number to the caller.
1881 
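         // eon with zr is a bitwise NOT, so we return ~K == -1-K and the caller
         // recovers the partial transfer count as K = ~r0.  The flags still
         // reflect the subs (eon does not set them), so EQ below means K == 0
         // and the card-marking epilogue can be skipped.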
1882     __ subs(count, count_save, count);     // K = partially copied oop count
1883     __ eon(count, count, zr);                   // report (-1^K) to caller
1884     __ br(Assembler::EQ, L_done_pop);
1885 
1886     __ BIND(L_do_card_marks);
1887     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1888 
1889     __ bind(L_done_pop);
1890     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1891     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1892 
1893     __ bind(L_done);
1894     __ mov(r0, count);
1895     __ leave();
1896     __ ret(lr);
1897 
1898     return start;
1899   }
1900 
1901   // Perform range checks on the proposed arraycopy.
1902   // Kills temp, but nothing else.
1903   // Also, clean the sign bits of src_pos and dst_pos.
1904   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1905                               Register src_pos, // source position (c_rarg1)
1906                               Register dst,     // destination array oop (c_rarg2)
1907                               Register dst_pos, // destination position (c_rarg3)
1908                               Register length,
1909                               Register temp,
1910                               Label& L_failed) {
1911     BLOCK_COMMENT("arraycopy_range_checks:");
1912 
1913     assert_different_registers(rscratch1, temp);
1914 
1915     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1916     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1917     __ addw(temp, length, src_pos);
1918     __ cmpw(temp, rscratch1);
1919     __ br(Assembler::HI, L_failed);
1920 
1921     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1922     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1923     __ addw(temp, length, dst_pos);
1924     __ cmpw(temp, rscratch1);
1925     __ br(Assembler::HI, L_failed);
1926 
1927     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1928     __ movw(src_pos, src_pos);
1929     __ movw(dst_pos, dst_pos);
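         // Writing the W form zero-extends into bits 63:32, so src_pos and
         // dst_pos can be used directly in 64-bit address arithmetic.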
1930 
1931     BLOCK_COMMENT("arraycopy_range_checks done");
1932   }
1933 
1934   // These stubs get called from some dumb test routine.
1935   // I'll write them properly when they're called from
1936   // something that's actually doing something.
1937   static void fake_arraycopy_stub(address src, address dst, int count) {
1938     assert(count == 0, "huh?");
1939   }
1940 
1941 
1942   //
1943   //  Generate 'unsafe' array copy stub
1944   //  Though just as safe as the other stubs, it takes an unscaled
1945   //  size_t argument instead of an element count.
1946   //
1947   //  Input:
1948   //    c_rarg0   - source array address
1949   //    c_rarg1   - destination array address
1950   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1951   //
1952   // Examines the alignment of the operands and dispatches
1953   // to a long, int, short, or byte copy loop.
1954   //
1955   address generate_unsafe_copy(const char *name,
1956                                address byte_copy_entry,
1957                                address short_copy_entry,
1958                                address int_copy_entry,
1959                                address long_copy_entry) {
1960     Label L_long_aligned, L_int_aligned, L_short_aligned;
1961     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1962 
1963     __ align(CodeEntryAlignment);
1964     StubCodeMark mark(this, "StubRoutines", name);
1965     address start = __ pc();
1966     __ enter(); // required for proper stackwalking of RuntimeStub frame
1967 
1968     // bump this on entry, not on exit:
1969     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1970 
1971     __ orr(rscratch1, s, d);
1972     __ orr(rscratch1, rscratch1, count);
1973 
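         // s | d | count has its low three bits clear only if all three values
         // are multiples of 8 (similarly for 4 and 2), so testing the low bits
         // of the OR picks the widest element size that source, destination
         // and byte count can all support.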
1974     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1975     __ cbz(rscratch1, L_long_aligned);
1976     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1977     __ cbz(rscratch1, L_int_aligned);
1978     __ tbz(rscratch1, 0, L_short_aligned);
1979     __ b(RuntimeAddress(byte_copy_entry));
1980 
1981     __ BIND(L_short_aligned);
1982     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1983     __ b(RuntimeAddress(short_copy_entry));
1984     __ BIND(L_int_aligned);
1985     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1986     __ b(RuntimeAddress(int_copy_entry));
1987     __ BIND(L_long_aligned);
1988     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1989     __ b(RuntimeAddress(long_copy_entry));
1990 
1991     return start;
1992   }
1993 
1994   //
1995   //  Generate generic array copy stubs
1996   //
1997   //  Input:
1998   //    c_rarg0    -  src oop
1999   //    c_rarg1    -  src_pos (32-bits)
2000   //    c_rarg2    -  dst oop
2001   //    c_rarg3    -  dst_pos (32-bits)
2002   //    c_rarg4    -  element count (32-bits)
2003   //
2004   //  Output:
2005   //    r0 ==  0  -  success
2006   //    r0 == -1^K - failure, where K is partial transfer count
2007   //
2008   address generate_generic_copy(const char *name,
2009                                 address byte_copy_entry, address short_copy_entry,
2010                                 address int_copy_entry, address oop_copy_entry,
2011                                 address long_copy_entry, address checkcast_copy_entry) {
2012 
2013     Label L_failed, L_objArray;
2014     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2015 
2016     // Input registers
2017     const Register src        = c_rarg0;  // source array oop
2018     const Register src_pos    = c_rarg1;  // source position
2019     const Register dst        = c_rarg2;  // destination array oop
2020     const Register dst_pos    = c_rarg3;  // destination position
2021     const Register length     = c_rarg4;
2022 
2023 
2024     // Registers used as temps
2025     const Register dst_klass  = c_rarg5;
2026 
2027     __ align(CodeEntryAlignment);
2028 
2029     StubCodeMark mark(this, "StubRoutines", name);
2030 
2031     address start = __ pc();
2032 
2033     __ enter(); // required for proper stackwalking of RuntimeStub frame
2034 
2035     // bump this on entry, not on exit:
2036     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2037 
2038     //-----------------------------------------------------------------------
2039     // Assembler stub will be used for this call to arraycopy
2040     // if the following conditions are met:
2041     //
2042     // (1) src and dst must not be null.
2043     // (2) src_pos must not be negative.
2044     // (3) dst_pos must not be negative.
2045     // (4) length  must not be negative.
2046     // (5) src klass and dst klass should be the same and not NULL.
2047     // (6) src and dst should be arrays.
2048     // (7) src_pos + length must not exceed length of src.
2049     // (8) dst_pos + length must not exceed length of dst.
2050     //
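         // Any failed check returns -1 (no elements copied), leaving the
         // caller to perform the copy by a slower path.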
2051 
2052     //  if (src == NULL) return -1;
2053     __ cbz(src, L_failed);
2054 
2055     //  if (src_pos < 0) return -1;
2056     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2057 
2058     //  if (dst == NULL) return -1;
2059     __ cbz(dst, L_failed);
2060 
2061     //  if (dst_pos < 0) return -1;
2062     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2063 
2064     // registers used as temp
2065     const Register scratch_length    = r16; // elements count to copy
2066     const Register scratch_src_klass = r17; // array klass
2067     const Register lh                = r15; // layout helper
2068 
2069     //  if (length < 0) return -1;
2070     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2071     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2072 
2073     __ load_klass(scratch_src_klass, src);
2074 #ifdef ASSERT
2075     //  assert(src->klass() != NULL);
2076     {
2077       BLOCK_COMMENT("assert klasses not null {");
2078       Label L1, L2;
2079       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2080       __ bind(L1);
2081       __ stop("broken null klass");
2082       __ bind(L2);
2083       __ load_klass(rscratch1, dst);
2084       __ cbz(rscratch1, L1);     // this would be broken also
2085       BLOCK_COMMENT("} assert klasses not null done");
2086     }
2087 #endif
2088 
2089     // Load layout helper (32-bits)
2090     //
2091     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2092     // 32        30    24            16              8     2                 0
2093     //
2094     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2095     //
2096 
2097     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2098 
2099     // Handle objArrays completely differently...
2100     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2101     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2102     __ movw(rscratch1, objArray_lh);
2103     __ eorw(rscratch2, lh, rscratch1);
2104     __ cbzw(rscratch2, L_objArray);
2105 
2106     //  if (src->klass() != dst->klass()) return -1;
2107     __ load_klass(rscratch2, dst);
2108     __ eor(rscratch2, rscratch2, scratch_src_klass);
2109     __ cbnz(rscratch2, L_failed);
2110 
2111     // Check for flat inline type array -> return -1
2112     __ tst(lh, Klass::_lh_array_tag_vt_value_bit_inplace);
2113     __ br(Assembler::NE, L_failed);
2114 
2115     // Check for null-free (non-flat) inline type array -> handle as object array
2116     __ tst(lh, Klass::_lh_null_free_bit_inplace);
2117     __ br(Assembler::NE, L_objArray);
2118 
2119     //  if (!src->is_Array()) return -1;
2120     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2121 
2122     // At this point, it is known to be a typeArray (array_tag 0x3).
2123 #ifdef ASSERT
2124     {
2125       BLOCK_COMMENT("assert primitive array {");
2126       Label L;
2127       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2128       __ cmpw(lh, rscratch2);
2129       __ br(Assembler::GE, L);
2130       __ stop("must be a primitive array");
2131       __ bind(L);
2132       BLOCK_COMMENT("} assert primitive array done");
2133     }
2134 #endif
2135 
2136     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2137                            rscratch2, L_failed);
2138 
2139     // TypeArrayKlass
2140     //
2141     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2142     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2143     //
2144 
2145     const Register rscratch1_offset = rscratch1;    // array offset
2146     const Register r15_elsize = lh; // element size
2147 
2148     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2149            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2150     __ add(src, src, rscratch1_offset);           // src array offset
2151     __ add(dst, dst, rscratch1_offset);           // dst array offset
2152     BLOCK_COMMENT("choose copy loop based on element size");
2153 
2154     // next registers should be set before the jump to corresponding stub
2155     const Register from     = c_rarg0;  // source array address
2156     const Register to       = c_rarg1;  // destination array address
2157     const Register count    = c_rarg2;  // elements count
2158 
2159     // 'from', 'to', 'count' must be set in this order because they alias
2160     // 'src', 'src_pos', 'dst' (c_rarg0..c_rarg2), which are still live.
2161 
2162     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2163 
2164     // The possible values of elsize are 0-3, i.e. exact_log2(element
2165     // size in bytes).  We do a simple bitwise binary search.
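         // Since _lh_log2_element_size_shift == 0, bits 1:0 of lh are the log2
         // element size: bit 1 separates {byte,short} from {int,long}, and
         // bit 0 then picks the narrower or wider type within each pair.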
2166   __ BIND(L_copy_bytes);
2167     __ tbnz(r15_elsize, 1, L_copy_ints);
2168     __ tbnz(r15_elsize, 0, L_copy_shorts);
2169     __ lea(from, Address(src, src_pos));// src_addr
2170     __ lea(to,   Address(dst, dst_pos));// dst_addr
2171     __ movw(count, scratch_length); // length
2172     __ b(RuntimeAddress(byte_copy_entry));
2173 
2174   __ BIND(L_copy_shorts);
2175     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(short_copy_entry));
2179 
2180   __ BIND(L_copy_ints);
2181     __ tbnz(r15_elsize, 0, L_copy_longs);
2182     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2183     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2184     __ movw(count, scratch_length); // length
2185     __ b(RuntimeAddress(int_copy_entry));
2186 
2187   __ BIND(L_copy_longs);
2188 #ifdef ASSERT
2189     {
2190       BLOCK_COMMENT("assert long copy {");
2191       Label L;
2192       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2193       __ cmpw(r15_elsize, LogBytesPerLong);
2194       __ br(Assembler::EQ, L);
2195       __ stop("must be long copy, but elsize is wrong");
2196       __ bind(L);
2197       BLOCK_COMMENT("} assert long copy done");
2198     }
2199 #endif
2200     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2201     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2202     __ movw(count, scratch_length); // length
2203     __ b(RuntimeAddress(long_copy_entry));
2204 
2205     // ObjArrayKlass
2206   __ BIND(L_objArray);
2207     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2208 
2209     Label L_plain_copy, L_checkcast_copy;
2210     //  test array classes for subtyping
2211     __ load_klass(r15, dst);
2212     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2213     __ br(Assembler::NE, L_checkcast_copy);
2214 
2215     // Identically typed arrays can be copied without element-wise checks.
2216     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2217                            rscratch2, L_failed);
2218 
2219     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2220     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2222     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2223     __ movw(count, scratch_length); // length
2224   __ BIND(L_plain_copy);
2225     __ b(RuntimeAddress(oop_copy_entry));
2226 
2227   __ BIND(L_checkcast_copy);
2228     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2229     {
2230       // Before looking at dst.length, make sure dst is also an objArray.
2231       __ ldrw(rscratch1, Address(r15, lh_offset));
2232       __ movw(rscratch2, objArray_lh);
2233       __ eorw(rscratch1, rscratch1, rscratch2);
2234       __ cbnzw(rscratch1, L_failed);
2235 
2236       // It is safe to examine both src.length and dst.length.
2237       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2238                              r15, L_failed);
2239 
2240       __ load_klass(dst_klass, dst); // reload
2241 
2242       // Marshal the base address arguments now, freeing registers.
2243       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2244       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2245       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2246       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2247       __ movw(count, length);           // length (reloaded)
2248       Register sco_temp = c_rarg3;      // this register is free now
2249       assert_different_registers(from, to, count, sco_temp,
2250                                  dst_klass, scratch_src_klass);
2251       // assert_clean_int(count, sco_temp);
2252 
2253       // Generate the type check.
2254       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2255       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2256 
2257       // Smashes rscratch1, rscratch2
2258       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2259 
2260       // Fetch destination element klass from the ObjArrayKlass header.
2261       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2262       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2263       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2264 
2265       // the checkcast_copy loop needs two extra arguments:
2266       assert(c_rarg3 == sco_temp, "#3 already in place");
2267       // Set up arguments for checkcast_copy_entry.
2268       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2269       __ b(RuntimeAddress(checkcast_copy_entry));
2270     }
2271 
2272   __ BIND(L_failed);
2273     __ mov(r0, -1);
2274     __ leave();   // required for proper stackwalking of RuntimeStub frame
2275     __ ret(lr);
2276 
2277     return start;
2278   }
2279 
2280   //
2281   // Generate stub for array fill. If "aligned" is true, the
2282   // "to" address is assumed to be heapword aligned.
2283   //
2284   // Arguments for generated stub:
2285   //   to:    c_rarg0
2286   //   value: c_rarg1
2287   //   count: c_rarg2 treated as signed
2288   //
2289   address generate_fill(BasicType t, bool aligned, const char *name) {
2290     __ align(CodeEntryAlignment);
2291     StubCodeMark mark(this, "StubRoutines", name);
2292     address start = __ pc();
2293 
2294     BLOCK_COMMENT("Entry:");
2295 
2296     const Register to        = c_rarg0;  // source array address
2297     const Register value     = c_rarg1;  // value
2298     const Register count     = c_rarg2;  // elements count
2299 
2300     const Register bz_base = r10;        // base for block_zero routine
2301     const Register cnt_words = r11;      // temp register
2302 
2303     __ enter();
2304 
2305     Label L_fill_elements, L_exit1;
2306 
2307     int shift = -1;
2308     switch (t) {
2309       case T_BYTE:
2310         shift = 0;
2311         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2312         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2313         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2314         __ br(Assembler::LO, L_fill_elements);
2315         break;
2316       case T_SHORT:
2317         shift = 1;
2318         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2319         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2320         __ br(Assembler::LO, L_fill_elements);
2321         break;
2322       case T_INT:
2323         shift = 2;
2324         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2325         __ br(Assembler::LO, L_fill_elements);
2326         break;
2327       default: ShouldNotReachHere();
2328     }
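         // At this point 'value' holds the fill pattern replicated to 32 bits
         // (each bfi copies the low bits into the next wider lane); it is
         // widened to 64 bits further down before the word-at-a-time fill.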
2329 
2330     // Align source address at 8 bytes address boundary.
2331     Label L_skip_align1, L_skip_align2, L_skip_align4;
2332     if (!aligned) {
2333       switch (t) {
2334         case T_BYTE:
2335           // One byte misalignment happens only for byte arrays.
2336           __ tbz(to, 0, L_skip_align1);
2337           __ strb(value, Address(__ post(to, 1)));
2338           __ subw(count, count, 1);
2339           __ bind(L_skip_align1);
2340           // Fallthrough
2341         case T_SHORT:
2342           // Two bytes misalignment happens only for byte and short (char) arrays.
2343           __ tbz(to, 1, L_skip_align2);
2344           __ strh(value, Address(__ post(to, 2)));
2345           __ subw(count, count, 2 >> shift);
2346           __ bind(L_skip_align2);
2347           // Fallthrough
2348         case T_INT:
2349           // Align to 8 bytes, we know we are 4 byte aligned to start.
2350           __ tbz(to, 2, L_skip_align4);
2351           __ strw(value, Address(__ post(to, 4)));
2352           __ subw(count, count, 4 >> shift);
2353           __ bind(L_skip_align4);
2354           break;
2355         default: ShouldNotReachHere();
2356       }
2357     }
2358 
2359     //
2360     //  Fill large chunks
2361     //
2362     __ lsrw(cnt_words, count, 3 - shift); // number of words
2363     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2364     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2365     if (UseBlockZeroing) {
2366       Label non_block_zeroing, rest;
2367       // If the fill value is zero we can use the fast zero_words().
2368       __ cbnz(value, non_block_zeroing);
2369       __ mov(bz_base, to);
2370       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2371       __ zero_words(bz_base, cnt_words);
2372       __ b(rest);
2373       __ bind(non_block_zeroing);
2374       __ fill_words(to, cnt_words, value);
2375       __ bind(rest);
2376     } else {
2377       __ fill_words(to, cnt_words, value);
2378     }
2379 
2380     // Remaining count is less than 8 bytes. Fill it by a single store.
2381     // Note that the total length is no less than 8 bytes.
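         // Backing up to (end - 8) is safe: the initial length check guarantees
         // at least 8 bytes in total, and the 8-byte store merely rewrites some
         // already-filled bytes with the same pattern.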
2382     if (t == T_BYTE || t == T_SHORT) {
2383       Label L_exit1;
2384       __ cbzw(count, L_exit1);
2385       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2386       __ str(value, Address(to, -8));    // overwrite some elements
2387       __ bind(L_exit1);
2388       __ leave();
2389       __ ret(lr);
2390     }
2391 
2392     // Handle copies less than 8 bytes.
2393     Label L_fill_2, L_fill_4, L_exit2;
2394     __ bind(L_fill_elements);
2395     switch (t) {
2396       case T_BYTE:
2397         __ tbz(count, 0, L_fill_2);
2398         __ strb(value, Address(__ post(to, 1)));
2399         __ bind(L_fill_2);
2400         __ tbz(count, 1, L_fill_4);
2401         __ strh(value, Address(__ post(to, 2)));
2402         __ bind(L_fill_4);
2403         __ tbz(count, 2, L_exit2);
2404         __ strw(value, Address(to));
2405         break;
2406       case T_SHORT:
2407         __ tbz(count, 0, L_fill_4);
2408         __ strh(value, Address(__ post(to, 2)));
2409         __ bind(L_fill_4);
2410         __ tbz(count, 1, L_exit2);
2411         __ strw(value, Address(to));
2412         break;
2413       case T_INT:
2414         __ cbzw(count, L_exit2);
2415         __ strw(value, Address(to));
2416         break;
2417       default: ShouldNotReachHere();
2418     }
2419     __ bind(L_exit2);
2420     __ leave();
2421     __ ret(lr);
2422     return start;
2423   }
2424 
2425   address generate_data_cache_writeback() {
2426     const Register line        = c_rarg0;  // address of line to write back
2427 
2428     __ align(CodeEntryAlignment);
2429 
2430     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2431 
2432     address start = __ pc();
2433     __ enter();
2434     __ cache_wb(Address(line, 0));
2435     __ leave();
2436     __ ret(lr);
2437 
2438     return start;
2439   }
2440 
2441   address generate_data_cache_writeback_sync() {
2442     const Register is_pre     = c_rarg0;  // pre or post sync
2443 
2444     __ align(CodeEntryAlignment);
2445 
2446     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2447 
2448     // pre wbsync is a no-op
2449     // post wbsync issues a full memory barrier
2450 
2451     Label skip;
2452     address start = __ pc();
2453     __ enter();
2454     __ cbnz(is_pre, skip);
2455     __ cache_wbsync(false);
2456     __ bind(skip);
2457     __ leave();
2458     __ ret(lr);
2459 
2460     return start;
2461   }
2462 
2463   void generate_arraycopy_stubs() {
2464     address entry;
2465     address entry_jbyte_arraycopy;
2466     address entry_jshort_arraycopy;
2467     address entry_jint_arraycopy;
2468     address entry_oop_arraycopy;
2469     address entry_jlong_arraycopy;
2470     address entry_checkcast_arraycopy;
2471 
2472     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2473     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2474 
2475     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
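         // copy_f/copy_b are the shared bulk forward/backward copy routines
         // invoked by copy_memory() for large blocks; _zero_blocks is the
         // corresponding helper used by MacroAssembler::zero_words().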
2476 
2477     //*** jbyte
2478     // Always need aligned and unaligned versions
2479     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2480                                                                                   "jbyte_disjoint_arraycopy");
2481     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2482                                                                                   &entry_jbyte_arraycopy,
2483                                                                                   "jbyte_arraycopy");
2484     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2485                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2486     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2487                                                                                   "arrayof_jbyte_arraycopy");
2488 
2489     //*** jshort
2490     // Always need aligned and unaligned versions
2491     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2492                                                                                     "jshort_disjoint_arraycopy");
2493     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2494                                                                                     &entry_jshort_arraycopy,
2495                                                                                     "jshort_arraycopy");
2496     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2497                                                                                     "arrayof_jshort_disjoint_arraycopy");
2498     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2499                                                                                     "arrayof_jshort_arraycopy");
2500 
2501     //*** jint
2502     // Aligned versions
2503     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2504                                                                                 "arrayof_jint_disjoint_arraycopy");
2505     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2506                                                                                 "arrayof_jint_arraycopy");
2507     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2508     // entry_jint_arraycopy always points to the unaligned version
2509     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2510                                                                                 "jint_disjoint_arraycopy");
2511     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2512                                                                                 &entry_jint_arraycopy,
2513                                                                                 "jint_arraycopy");
2514 
2515     //*** jlong
2516     // It is always aligned
2517     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2518                                                                                   "arrayof_jlong_disjoint_arraycopy");
2519     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2520                                                                                   "arrayof_jlong_arraycopy");
2521     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2522     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2523 
2524     //*** oops
2525     {
2526       // With compressed oops we need unaligned versions; notice that
2527       // we overwrite entry_oop_arraycopy.
2528       bool aligned = !UseCompressedOops;
2529 
2530       StubRoutines::_arrayof_oop_disjoint_arraycopy
2531         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2532                                      /*dest_uninitialized*/false);
2533       StubRoutines::_arrayof_oop_arraycopy
2534         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2535                                      /*dest_uninitialized*/false);
2536       // Aligned versions without pre-barriers
2537       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2538         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2539                                      /*dest_uninitialized*/true);
2540       StubRoutines::_arrayof_oop_arraycopy_uninit
2541         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2542                                      /*dest_uninitialized*/true);
2543     }
2544 
2545     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2546     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2547     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2548     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2549 
2550     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2551     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2552                                                                         /*dest_uninitialized*/true);
2553 
2554     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2555                                                               entry_jbyte_arraycopy,
2556                                                               entry_jshort_arraycopy,
2557                                                               entry_jint_arraycopy,
2558                                                               entry_jlong_arraycopy);
2559 
2560     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2561                                                                entry_jbyte_arraycopy,
2562                                                                entry_jshort_arraycopy,
2563                                                                entry_jint_arraycopy,
2564                                                                entry_oop_arraycopy,
2565                                                                entry_jlong_arraycopy,
2566                                                                entry_checkcast_arraycopy);
2567 
2568     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2569     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2570     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2571     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2572     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2573     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2574   }
2575 
2576   void generate_math_stubs() { Unimplemented(); }
2577 
2578   // Arguments:
2579   //
2580   // Inputs:
2581   //   c_rarg0   - source byte array address
2582   //   c_rarg1   - destination byte array address
2583   //   c_rarg2   - K (key) in little endian int array
2584   //
2585   address generate_aescrypt_encryptBlock() {
2586     __ align(CodeEntryAlignment);
2587     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2588 
2589     const Register from        = c_rarg0;  // source array address
2590     const Register to          = c_rarg1;  // destination array address
2591     const Register key         = c_rarg2;  // key array address
2592     const Register keylen      = rscratch1;
2593 
2594     address start = __ pc();
2595     __ enter();
2596 
2597     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
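         // 'key' is expected to point at the first int of the key array, so
         // stepping back by the element base offset reaches the arrayOop
         // length field, i.e. the expanded key length in 32-bit words.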
2598 
2599     __ aesenc_loadkeys(key, keylen);
2600     __ aesecb_encrypt(from, to, keylen);
2601 
2602     __ mov(r0, 0);
2603 
2604     __ leave();
2605     __ ret(lr);
2606 
2607     return start;
2608   }
2609 
2610   // Arguments:
2611   //
2612   // Inputs:
2613   //   c_rarg0   - source byte array address
2614   //   c_rarg1   - destination byte array address
2615   //   c_rarg2   - K (key) in little endian int array
2616   //
2617   address generate_aescrypt_decryptBlock() {
2618     assert(UseAES, "need AES cryptographic extension support");
2619     __ align(CodeEntryAlignment);
2620     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2621     Label L_doLast;
2622 
2623     const Register from        = c_rarg0;  // source array address
2624     const Register to          = c_rarg1;  // destination array address
2625     const Register key         = c_rarg2;  // key array address
2626     const Register keylen      = rscratch1;
2627 
2628     address start = __ pc();
2629     __ enter(); // required for proper stackwalking of RuntimeStub frame
2630 
2631     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2632 
2633     __ aesecb_decrypt(from, to, key, keylen);
2634 
2635     __ mov(r0, 0);
2636 
2637     __ leave();
2638     __ ret(lr);
2639 
2640     return start;
2641   }
2642 
2643   // Arguments:
2644   //
2645   // Inputs:
2646   //   c_rarg0   - source byte array address
2647   //   c_rarg1   - destination byte array address
2648   //   c_rarg2   - K (key) in little endian int array
2649   //   c_rarg3   - r vector byte array address
2650   //   c_rarg4   - input length
2651   //
2652   // Output:
2653   //   r0        - input length
2654   //
2655   address generate_cipherBlockChaining_encryptAESCrypt() {
2656     assert(UseAES, "need AES cryptographic extension support");
2657     __ align(CodeEntryAlignment);
2658     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2659 
2660     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2661 
2662     const Register from        = c_rarg0;  // source array address
2663     const Register to          = c_rarg1;  // destination array address
2664     const Register key         = c_rarg2;  // key array address
2665     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2666                                            // and left with the results of the last encryption block
2667     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2668     const Register keylen      = rscratch1;
2669 
2670     address start = __ pc();
2671 
2672       __ enter();
2673 
2674       __ movw(rscratch2, len_reg);
2675 
2676       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
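           // keylen is the expanded key length in 32-bit words: 44, 52 or 60
           // for AES-128, AES-192 and AES-256, hence the three-way dispatch
           // on the comparison with 52 below.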
2677 
2678       __ ld1(v0, __ T16B, rvec);
2679 
2680       __ cmpw(keylen, 52);
2681       __ br(Assembler::CC, L_loadkeys_44);
2682       __ br(Assembler::EQ, L_loadkeys_52);
2683 
2684       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2685       __ rev32(v17, __ T16B, v17);
2686       __ rev32(v18, __ T16B, v18);
2687     __ BIND(L_loadkeys_52);
2688       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2689       __ rev32(v19, __ T16B, v19);
2690       __ rev32(v20, __ T16B, v20);
2691     __ BIND(L_loadkeys_44);
2692       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2693       __ rev32(v21, __ T16B, v21);
2694       __ rev32(v22, __ T16B, v22);
2695       __ rev32(v23, __ T16B, v23);
2696       __ rev32(v24, __ T16B, v24);
2697       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2698       __ rev32(v25, __ T16B, v25);
2699       __ rev32(v26, __ T16B, v26);
2700       __ rev32(v27, __ T16B, v27);
2701       __ rev32(v28, __ T16B, v28);
2702       __ ld1(v29, v30, v31, __ T16B, key);
2703       __ rev32(v29, __ T16B, v29);
2704       __ rev32(v30, __ T16B, v30);
2705       __ rev32(v31, __ T16B, v31);
2706 
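           // The loop runs 10, 12 or 14 rounds.  The flags set by the keylen
           // comparison above survive the loop body (no instruction here sets
           // them), so shorter keys enter each iteration at L_rounds_52 or
           // L_rounds_44 and skip the extra leading rounds.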
2707     __ BIND(L_aes_loop);
2708       __ ld1(v1, __ T16B, __ post(from, 16));
2709       __ eor(v0, __ T16B, v0, v1);
2710 
2711       __ br(Assembler::CC, L_rounds_44);
2712       __ br(Assembler::EQ, L_rounds_52);
2713 
2714       __ aese(v0, v17); __ aesmc(v0, v0);
2715       __ aese(v0, v18); __ aesmc(v0, v0);
2716     __ BIND(L_rounds_52);
2717       __ aese(v0, v19); __ aesmc(v0, v0);
2718       __ aese(v0, v20); __ aesmc(v0, v0);
2719     __ BIND(L_rounds_44);
2720       __ aese(v0, v21); __ aesmc(v0, v0);
2721       __ aese(v0, v22); __ aesmc(v0, v0);
2722       __ aese(v0, v23); __ aesmc(v0, v0);
2723       __ aese(v0, v24); __ aesmc(v0, v0);
2724       __ aese(v0, v25); __ aesmc(v0, v0);
2725       __ aese(v0, v26); __ aesmc(v0, v0);
2726       __ aese(v0, v27); __ aesmc(v0, v0);
2727       __ aese(v0, v28); __ aesmc(v0, v0);
2728       __ aese(v0, v29); __ aesmc(v0, v0);
2729       __ aese(v0, v30);
2730       __ eor(v0, __ T16B, v0, v31);
2731 
2732       __ st1(v0, __ T16B, __ post(to, 16));
2733 
2734       __ subw(len_reg, len_reg, 16);
2735       __ cbnzw(len_reg, L_aes_loop);
2736 
2737       __ st1(v0, __ T16B, rvec);
2738 
2739       __ mov(r0, rscratch2);
2740 
2741       __ leave();
2742       __ ret(lr);
2743 
2744       return start;
2745   }
2746 
2747   // Arguments:
2748   //
2749   // Inputs:
2750   //   c_rarg0   - source byte array address
2751   //   c_rarg1   - destination byte array address
2752   //   c_rarg2   - K (key) in little endian int array
2753   //   c_rarg3   - r vector byte array address
2754   //   c_rarg4   - input length
2755   //
2756   // Output:
2757   //   r0        - input length
2758   //
2759   address generate_cipherBlockChaining_decryptAESCrypt() {
2760     assert(UseAES, "need AES cryptographic extension support");
2761     __ align(CodeEntryAlignment);
2762     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2763 
2764     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2765 
2766     const Register from        = c_rarg0;  // source array address
2767     const Register to          = c_rarg1;  // destination array address
2768     const Register key         = c_rarg2;  // key array address
2769     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2770                                            // and left with the results of the last encryption block
2771     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2772     const Register keylen      = rscratch1;
2773 
2774     address start = __ pc();
2775 
2776       __ enter();
2777 
2778       __ movw(rscratch2, len_reg);
2779 
2780       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2781 
2782       __ ld1(v2, __ T16B, rvec);
2783 
2784       __ ld1(v31, __ T16B, __ post(key, 16));
2785       __ rev32(v31, __ T16B, v31);
2786 
2787       __ cmpw(keylen, 52);
2788       __ br(Assembler::CC, L_loadkeys_44);
2789       __ br(Assembler::EQ, L_loadkeys_52);
2790 
2791       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2792       __ rev32(v17, __ T16B, v17);
2793       __ rev32(v18, __ T16B, v18);
2794     __ BIND(L_loadkeys_52);
2795       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2796       __ rev32(v19, __ T16B, v19);
2797       __ rev32(v20, __ T16B, v20);
2798     __ BIND(L_loadkeys_44);
2799       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2800       __ rev32(v21, __ T16B, v21);
2801       __ rev32(v22, __ T16B, v22);
2802       __ rev32(v23, __ T16B, v23);
2803       __ rev32(v24, __ T16B, v24);
2804       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2805       __ rev32(v25, __ T16B, v25);
2806       __ rev32(v26, __ T16B, v26);
2807       __ rev32(v27, __ T16B, v27);
2808       __ rev32(v28, __ T16B, v28);
2809       __ ld1(v29, v30, __ T16B, key);
2810       __ rev32(v29, __ T16B, v29);
2811       __ rev32(v30, __ T16B, v30);
2812 
2813     __ BIND(L_aes_loop);
2814       __ ld1(v0, __ T16B, __ post(from, 16));
2815       __ orr(v1, __ T16B, v0, v0);
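           // Keep a copy of the ciphertext block in v1: CBC decryption XORs the
           // decrypted block with the previous ciphertext (v2), and this saved
           // copy becomes the next chaining value (moved into v2 below).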
2816 
2817       __ br(Assembler::CC, L_rounds_44);
2818       __ br(Assembler::EQ, L_rounds_52);
2819 
2820       __ aesd(v0, v17); __ aesimc(v0, v0);
2821       __ aesd(v0, v18); __ aesimc(v0, v0);
2822     __ BIND(L_rounds_52);
2823       __ aesd(v0, v19); __ aesimc(v0, v0);
2824       __ aesd(v0, v20); __ aesimc(v0, v0);
2825     __ BIND(L_rounds_44);
2826       __ aesd(v0, v21); __ aesimc(v0, v0);
2827       __ aesd(v0, v22); __ aesimc(v0, v0);
2828       __ aesd(v0, v23); __ aesimc(v0, v0);
2829       __ aesd(v0, v24); __ aesimc(v0, v0);
2830       __ aesd(v0, v25); __ aesimc(v0, v0);
2831       __ aesd(v0, v26); __ aesimc(v0, v0);
2832       __ aesd(v0, v27); __ aesimc(v0, v0);
2833       __ aesd(v0, v28); __ aesimc(v0, v0);
2834       __ aesd(v0, v29); __ aesimc(v0, v0);
2835       __ aesd(v0, v30);
2836       __ eor(v0, __ T16B, v0, v31);
2837       __ eor(v0, __ T16B, v0, v2);
2838 
2839       __ st1(v0, __ T16B, __ post(to, 16));
2840       __ orr(v2, __ T16B, v1, v1);
2841 
2842       __ subw(len_reg, len_reg, 16);
2843       __ cbnzw(len_reg, L_aes_loop);
2844 
2845       __ st1(v2, __ T16B, rvec);
2846 
2847       __ mov(r0, rscratch2);
2848 
2849       __ leave();
2850       __ ret(lr);
2851 
2852     return start;
2853   }
2854 
2855   // CTR AES crypt.
2856   // Arguments:
2857   //
2858   // Inputs:
2859   //   c_rarg0   - source byte array address
2860   //   c_rarg1   - destination byte array address
2861   //   c_rarg2   - K (key) in little endian int array
2862   //   c_rarg3   - counter vector byte array address
2863   //   c_rarg4   - input length
2864   //   c_rarg5   - saved encryptedCounter start
2865   //   c_rarg6   - saved used length
2866   //
2867   // Output:
2868   //   r0       - input length
2869   //
2870   address generate_counterMode_AESCrypt() {
2871     const Register in = c_rarg0;
2872     const Register out = c_rarg1;
2873     const Register key = c_rarg2;
2874     const Register counter = c_rarg3;
2875     const Register saved_len = c_rarg4, len = r10;
2876     const Register saved_encrypted_ctr = c_rarg5;
2877     const Register used_ptr = c_rarg6, used = r12;
2878 
2879     const Register offset = r7;
2880     const Register keylen = r11;
2881 
2882     const unsigned char block_size = 16;
2883     const int bulk_width = 4;
2884     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2885     // performance with larger data sizes, but it also means that the
2886     // fast path isn't used until you have at least 8 blocks, and up
2887     // to 127 bytes of data will be processed on the slow path. For
2888     // that reason, and also so as not to blow away too much icache, 4
2889     // blocks seems like a sensible compromise.
2890 
2891     // Algorithm:
2892     //
2893     //    if (len == 0) {
2894     //        goto DONE;
2895     //    }
2896     //    int result = len;
2897     //    do {
2898     //        if (used >= blockSize) {
2899     //            if (len >= bulk_width * blockSize) {
2900     //                CTR_large_block();
2901     //                if (len == 0)
2902     //                    goto DONE;
2903     //            }
2904     //            for (;;) {
2905     //                16ByteVector v0 = counter;
2906     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2907     //                used = 0;
2908     //                if (len < blockSize)
2909     //                    break;    /* goto NEXT */
2910     //                16ByteVector v1 = load16Bytes(in, offset);
2911     //                v1 = v1 ^ encryptedCounter;
2912     //                store16Bytes(out, offset);
2913     //                used = blockSize;
2914     //                offset += blockSize;
2915     //                len -= blockSize;
2916     //                if (len == 0)
2917     //                    goto DONE;
2918     //            }
2919     //        }
2920     //      NEXT:
2921     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2922     //        len--;
2923     //    } while (len != 0);
2924     //  DONE:
2925     //    return result;
2926     //
2927     // CTR_large_block()
2928     //    Wide bulk encryption of whole blocks.
2929 
2930     __ align(CodeEntryAlignment);
2931     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2932     const address start = __ pc();
2933     __ enter();
2934 
2935     Label DONE, CTR_large_block, large_block_return;
2936     __ ldrw(used, Address(used_ptr));
2937     __ cbzw(saved_len, DONE);
2938 
2939     __ mov(len, saved_len);
2940     __ mov(offset, 0);
2941 
2942     // Compute #rounds for AES based on the length of the key array
2943     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2944 
2945     __ aesenc_loadkeys(key, keylen);
2946 
2947     {
2948       Label L_CTR_loop, NEXT;
2949 
2950       __ bind(L_CTR_loop);
2951 
2952       __ cmp(used, block_size);
2953       __ br(__ LO, NEXT);
2954 
2955       // Maybe we have a lot of data
2956       __ subsw(rscratch1, len, bulk_width * block_size);
2957       __ br(__ HS, CTR_large_block);
2958       __ BIND(large_block_return);
2959       __ cbzw(len, DONE);
2960 
2961       // Setup the counter
2962       __ movi(v4, __ T4S, 0);
2963       __ movi(v5, __ T4S, 1);
2964       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
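           // The 16-byte counter is big-endian: rev32 byte-swaps each 32-bit
           // lane so that adding { 0, 0, 0, 1 } increments the final word, and
           // a second rev32 restores the original byte order before the store.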
2965 
2966       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2967       __ rev32(v16, __ T16B, v0);
2968       __ addv(v16, __ T4S, v16, v4);
2969       __ rev32(v16, __ T16B, v16);
2970       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2971 
2972       {
2973         // We have fewer than bulk_width blocks of data left. Encrypt
2974         // them one by one until there is less than a full block
2975         // remaining, being careful to save both the encrypted counter
2976         // and the counter.
2977 
2978         Label inner_loop;
2979         __ bind(inner_loop);
2980         // Counter to encrypt is in v0
2981         __ aesecb_encrypt(noreg, noreg, keylen);
2982         __ st1(v0, __ T16B, saved_encrypted_ctr);
2983 
2984         // Do we have a remaining full block?
2985 
2986         __ mov(used, 0);
2987         __ cmp(len, block_size);
2988         __ br(__ LO, NEXT);
2989 
2990         // Yes, we have a full block
2991         __ ldrq(v1, Address(in, offset));
2992         __ eor(v1, __ T16B, v1, v0);
2993         __ strq(v1, Address(out, offset));
2994         __ mov(used, block_size);
2995         __ add(offset, offset, block_size);
2996 
2997         __ subw(len, len, block_size);
2998         __ cbzw(len, DONE);
2999 
3000         // Increment the counter, store it back
3001         __ orr(v0, __ T16B, v16, v16);
3002         __ rev32(v16, __ T16B, v16);
3003         __ addv(v16, __ T4S, v16, v4);
3004         __ rev32(v16, __ T16B, v16);
3005         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3006 
3007         __ b(inner_loop);
3008       }
3009 
3010       __ BIND(NEXT);
3011 
3012       // Encrypt a single byte, and loop.
3013       // We expect this to be a rare event.
3014       __ ldrb(rscratch1, Address(in, offset));
3015       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3016       __ eor(rscratch1, rscratch1, rscratch2);
3017       __ strb(rscratch1, Address(out, offset));
3018       __ add(offset, offset, 1);
3019       __ add(used, used, 1);
3020       __ subw(len, len, 1);
3021       __ cbnzw(len, L_CTR_loop);
3022     }
3023 
3024     __ bind(DONE);
3025     __ strw(used, Address(used_ptr));
3026     __ mov(r0, saved_len);
3027 
3028     __ leave(); // required for proper stackwalking of RuntimeStub frame
3029     __ ret(lr);
3030 
3031     // Bulk encryption
3032 
3033     __ BIND(CTR_large_block);
3034     assert(bulk_width == 4 || bulk_width == 8, "must be");
3035 
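         // v8..v15 must be preserved across the stub (AAPCS64 treats their low
         // 64 bits as callee-saved), and the bulk loop below uses v8..v11 (and
         // v12..v15 when bulk_width == 8), so spill the full 128-bit registers.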
3036     if (bulk_width == 8) {
3037       __ sub(sp, sp, 4 * 16);
3038       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3039     }
3040     __ sub(sp, sp, 4 * 16);
3041     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3042     RegSet saved_regs = (RegSet::of(in, out, offset)
3043                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3044     __ push(saved_regs, sp);
3045     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3046     __ add(in, in, offset);
3047     __ add(out, out, offset);
3048 
3049     // Keys should already be loaded into the correct registers
3050 
3051     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3052     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3053 
3054     // AES/CTR loop
3055     {
3056       Label L_CTR_loop;
3057       __ BIND(L_CTR_loop);
3058 
3059       // Setup the counters
3060       __ movi(v8, __ T4S, 0);
3061       __ movi(v9, __ T4S, 1);
3062       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
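           // Build bulk_width consecutive counter values in v0..v(bulk_width-1):
           // each iteration byte-swaps the running counter into the next
           // register and then bumps the running counter's last word by one.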
3063 
3064       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3065         __ rev32(f, __ T16B, v16);
3066         __ addv(v16, __ T4S, v16, v8);
3067       }
3068 
3069       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3070 
3071       // Encrypt the counters
3072       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3073 
3074       if (bulk_width == 8) {
3075         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3076       }
3077 
3078       // XOR the encrypted counters with the inputs
3079       for (int i = 0; i < bulk_width; i++) {
3080         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3081       }
3082 
3083       // Write the encrypted data
3084       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3085       if (bulk_width == 8) {
3086         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3087       }
3088 
3089       __ subw(len, len, 16 * bulk_width);
3090       __ cbnzw(len, L_CTR_loop);
3091     }
3092 
3093     // Save the counter back where it goes
3094     __ rev32(v16, __ T16B, v16);
3095     __ st1(v16, __ T16B, counter);
3096 
3097     __ pop(saved_regs, sp);
3098 
3099     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3100     if (bulk_width == 8) {
3101       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3102     }
3103 
3104     __ andr(rscratch1, len, -16 * bulk_width);
3105     __ sub(len, len, rscratch1);
3106     __ add(offset, offset, rscratch1);
3107     __ mov(used, 16);
3108     __ strw(used, Address(used_ptr));
3109     __ b(large_block_return);
3110 
3111     return start;
3112   }
3113 
3114   // Vector AES Galois Counter Mode implementation. Parameters:
3115   //
3116   // in = c_rarg0
3117   // len = c_rarg1
3118   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3119   // out = c_rarg3
3120   // key = c_rarg4
3121   // state = c_rarg5 - GHASH.state
3122   // subkeyHtbl = c_rarg6 - powers of H
3123   // subkeyHtbl_48_entries = c_rarg7 (not used)
3124   // counter = [sp, #0] pointer to 16 bytes of CTR
3125   // return - number of processed bytes
3126   address generate_galoisCounterMode_AESCrypt() {
3127     address ghash_polynomial = __ pc();
3128     __ emit_int64(0x87);  // The low-order bits of the field
3129                           // polynomial (i.e. p = z^7+z^2+z+1)
3130                           // repeated in the low and high parts of a
3131                           // 128-bit vector
3132     __ emit_int64(0x87);
3133 
3134     __ align(CodeEntryAlignment);
3135     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3136     address start = __ pc();
3137     __ enter();
3138 
3139     const Register in = c_rarg0;
3140     const Register len = c_rarg1;
3141     const Register ct = c_rarg2;
3142     const Register out = c_rarg3;
3143     // (the counter block, loaded below, is written back with the incremented counter on return)
3144 
3145     const Register key = c_rarg4;
3146     const Register state = c_rarg5;
3147 
3148     const Register subkeyHtbl = c_rarg6;
3149 
3150     // Pointer to CTR is passed on the stack before the (fp, lr) pair.
3151     const Address counter_mem(sp, 2 * wordSize);
3152     const Register counter = c_rarg7;
3153     __ ldr(counter, counter_mem);
3154 
3155     const Register keylen = r10;
3156     // Save state before entering routine
3157     __ sub(sp, sp, 4 * 16);
3158     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3159     __ sub(sp, sp, 4 * 16);
3160     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3161 
3163     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3164     __ str(len, __ pre(sp, -2 * wordSize));
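         // Only whole groups of 8 blocks (128 bytes) are processed here: len is
         // rounded down above and saved on the stack so it can be returned as
         // the number of bytes handled; any remaining tail is expected to be
         // processed by the caller.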
3165 
3166     Label DONE;
3167     __ cbz(len, DONE);
3168 
3169     // Compute #rounds for AES based on the length of the key array
3170     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3171 
3172     __ aesenc_loadkeys(key, keylen);
3173     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3174     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3175 
3176     // AES/CTR loop
3177     {
3178       Label L_CTR_loop;
3179       __ BIND(L_CTR_loop);
3180 
3181       // Setup the counters
3182       __ movi(v8, __ T4S, 0);
3183       __ movi(v9, __ T4S, 1);
3184       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3185       for (FloatRegister f = v0; f < v8; f++) {
3186         __ rev32(f, __ T16B, v16);
3187         __ addv(v16, __ T4S, v16, v8);
3188       }
3189 
3190       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3191 
3192       // Encrypt the counters
3193       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3194 
3195       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3196 
3197       // XOR the encrypted counters with the inputs
3198       for (int i = 0; i < 8; i++) {
3199         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3200       }
3201       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3202       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3203 
3204       __ subw(len, len, 16 * 8);
3205       __ cbnzw(len, L_CTR_loop);
3206     }
3207 
3208     __ rev32(v16, __ T16B, v16);
3209     __ st1(v16, __ T16B, counter);
3210 
3211     __ ldr(len, Address(sp));
3212     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3213 
3214     // GHASH/CTR loop
3215     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3216                                 len, /*unrolls*/4);
3217 
3218 #ifdef ASSERT
3219     { Label L;
3220       __ cmp(len, (unsigned char)0);
3221       __ br(Assembler::EQ, L);
3222       __ stop("stubGenerator: abort");
3223       __ bind(L);
3224     }
3225 #endif
3226 
3227     __ bind(DONE);
3228     // Return the number of bytes processed
3229     __ ldr(r0, __ post(sp, 2 * wordSize));
3230 
3231     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3232     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3233 
3234     __ leave(); // required for proper stackwalking of RuntimeStub frame
3235     __ ret(lr);
3236     return start;
3237   }
3238 
3239   // Arguments:
3240   //
3241   // Inputs:
3242   //   c_rarg0   - byte[]  source+offset
3243   //   c_rarg1   - int[]   SHA.state
3244   //   c_rarg2   - int     offset
3245   //   c_rarg3   - int     limit
3246   //
3247   address generate_sha1_implCompress(bool multi_block, const char *name) {
3248     __ align(CodeEntryAlignment);
3249     StubCodeMark mark(this, "StubRoutines", name);
3250     address start = __ pc();
3251 
3252     Register buf   = c_rarg0;
3253     Register state = c_rarg1;
3254     Register ofs   = c_rarg2;
3255     Register limit = c_rarg3;
3256 
3257     Label keys;
3258     Label sha1_loop;
3259 
3260     // load the keys into v0..v3
3261     __ adr(rscratch1, keys);
3262     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3263     // load the 5-word state (a..e) into v6, v7
3264     __ ldrq(v6, Address(state, 0));
3265     __ ldrs(v7, Address(state, 16));
3266 
3267 
3268     __ BIND(sha1_loop);
3269     // load 64 bytes of data into v16..v19
3270     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3271     __ rev32(v16, __ T16B, v16);
3272     __ rev32(v17, __ T16B, v17);
3273     __ rev32(v18, __ T16B, v18);
3274     __ rev32(v19, __ T16B, v19);
3275 
3276     // do the sha1
3277     __ addv(v4, __ T4S, v16, v0);
3278     __ orr(v20, __ T16B, v6, v6);
3279 
3280     FloatRegister d0 = v16;
3281     FloatRegister d1 = v17;
3282     FloatRegister d2 = v18;
3283     FloatRegister d3 = v19;
3284 
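         // Each of the 20 iterations below issues one SHA1C/SHA1P/SHA1M
         // instruction, and each of those performs four SHA-1 rounds, covering
         // all 80 rounds: SHA1C (Ch) for rounds 0-19, SHA1P (parity) for rounds
         // 20-39 and 60-79, and SHA1M (Maj) for rounds 40-59.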
3285     for (int round = 0; round < 20; round++) {
3286       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3287       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3288       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3289       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3290       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3291 
3292       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3293       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3294       __ sha1h(tmp2, __ T4S, v20);
3295       if (round < 5)
3296         __ sha1c(v20, __ T4S, tmp3, tmp4);
3297       else if (round < 10 || round >= 15)
3298         __ sha1p(v20, __ T4S, tmp3, tmp4);
3299       else
3300         __ sha1m(v20, __ T4S, tmp3, tmp4);
3301       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3302 
3303       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3304     }
3305 
3306     __ addv(v7, __ T2S, v7, v21);
3307     __ addv(v6, __ T4S, v6, v20);
3308 
3309     if (multi_block) {
3310       __ add(ofs, ofs, 64);
3311       __ cmp(ofs, limit);
3312       __ br(Assembler::LE, sha1_loop);
3313       __ mov(c_rarg0, ofs); // return ofs
3314     }
3315 
3316     __ strq(v6, Address(state, 0));
3317     __ strs(v7, Address(state, 16));
3318 
3319     __ ret(lr);
3320 
3321     __ bind(keys);
3322     __ emit_int32(0x5a827999);
3323     __ emit_int32(0x6ed9eba1);
3324     __ emit_int32(0x8f1bbcdc);
3325     __ emit_int32(0xca62c1d6);
3326 
3327     return start;
3328   }
3329 
3330 
3331   // Arguments:
3332   //
3333   // Inputs:
3334   //   c_rarg0   - byte[]  source+offset
3335   //   c_rarg1   - int[]   SHA.state
3336   //   c_rarg2   - int     offset
3337   //   c_rarg3   - int     limit
3338   //
3339   address generate_sha256_implCompress(bool multi_block, const char *name) {
3340     static const uint32_t round_consts[64] = {
3341       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3342       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3343       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3344       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3345       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3346       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3347       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3348       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3349       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3350       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3351       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3352       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3353       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3354       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3355       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3356       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3357     };
3358     __ align(CodeEntryAlignment);
3359     StubCodeMark mark(this, "StubRoutines", name);
3360     address start = __ pc();
3361 
3362     Register buf   = c_rarg0;
3363     Register state = c_rarg1;
3364     Register ofs   = c_rarg2;
3365     Register limit = c_rarg3;
3366 
3367     Label sha256_loop;
3368 
3369     __ stpd(v8, v9, __ pre(sp, -32));
3370     __ stpd(v10, v11, Address(sp, 16));
3371 
3372 // dga == v0
3373 // dgb == v1
3374 // dg0 == v2
3375 // dg1 == v3
3376 // dg2 == v4
3377 // t0 == v6
3378 // t1 == v7
3379 
3380     // load 16 keys to v16..v31
3381     __ lea(rscratch1, ExternalAddress((address)round_consts));
3382     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3383     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3384     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3385     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3386 
3387     // load 8 words (256 bits) state
3388     __ ldpq(v0, v1, state);
3389 
3390     __ BIND(sha256_loop);
3391     // load 64 bytes of data into v8..v11
3392     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3393     __ rev32(v8, __ T16B, v8);
3394     __ rev32(v9, __ T16B, v9);
3395     __ rev32(v10, __ T16B, v10);
3396     __ rev32(v11, __ T16B, v11);
3397 
3398     __ addv(v6, __ T4S, v8, v16);
3399     __ orr(v2, __ T16B, v0, v0);
3400     __ orr(v3, __ T16B, v1, v1);
3401 
3402     FloatRegister d0 = v8;
3403     FloatRegister d1 = v9;
3404     FloatRegister d2 = v10;
3405     FloatRegister d3 = v11;
3406 
3407 
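         // Each of the 16 iterations below issues a SHA256H/SHA256H2 pair, and
         // each pair performs four SHA-256 rounds, covering all 64 rounds with
         // the round constants preloaded in v16..v31.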
3408     for (int round = 0; round < 16; round++) {
3409       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3410       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3411       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3412       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3413 
3414       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3415        __ orr(v4, __ T16B, v2, v2);
3416       if (round < 15)
3417         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3418       __ sha256h(v2, __ T4S, v3, tmp2);
3419       __ sha256h2(v3, __ T4S, v4, tmp2);
3420       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3421 
3422       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3423     }
3424 
3425     __ addv(v0, __ T4S, v0, v2);
3426     __ addv(v1, __ T4S, v1, v3);
3427 
3428     if (multi_block) {
3429       __ add(ofs, ofs, 64);
3430       __ cmp(ofs, limit);
3431       __ br(Assembler::LE, sha256_loop);
3432       __ mov(c_rarg0, ofs); // return ofs
3433     }
3434 
3435     __ ldpd(v10, v11, Address(sp, 16));
3436     __ ldpd(v8, v9, __ post(sp, 32));
3437 
3438     __ stpq(v0, v1, state);
3439 
3440     __ ret(lr);
3441 
3442     return start;
3443   }
3444 
3445   // Arguments:
3446   //
3447   // Inputs:
3448   //   c_rarg0   - byte[]  source+offset
3449   //   c_rarg1   - int[]   SHA.state
3450   //   c_rarg2   - int     offset
3451   //   c_rarg3   - int     limit
3452   //
3453   address generate_sha512_implCompress(bool multi_block, const char *name) {
3454     static const uint64_t round_consts[80] = {
3455       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3456       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3457       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3458       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3459       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3460       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3461       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3462       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3463       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3464       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3465       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3466       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3467       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3468       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3469       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3470       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3471       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3472       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3473       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3474       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3475       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3476       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3477       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3478       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3479       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3480       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3481       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3482     };
3483 
3484     // Double rounds for sha512.
3485     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3486       if (dr < 36)                                                                   \
3487         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3488       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3489       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3490       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3491       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3492       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3493       if (dr < 32) {                                                                 \
3494         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3495         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3496       }                                                                              \
3497       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3498       if (dr < 32)                                                                   \
3499         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3500       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3501       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3502 
3503     __ align(CodeEntryAlignment);
3504     StubCodeMark mark(this, "StubRoutines", name);
3505     address start = __ pc();
3506 
3507     Register buf   = c_rarg0;
3508     Register state = c_rarg1;
3509     Register ofs   = c_rarg2;
3510     Register limit = c_rarg3;
3511 
3512     __ stpd(v8, v9, __ pre(sp, -64));
3513     __ stpd(v10, v11, Address(sp, 16));
3514     __ stpd(v12, v13, Address(sp, 32));
3515     __ stpd(v14, v15, Address(sp, 48));
3516 
3517     Label sha512_loop;
3518 
3519     // load state
3520     __ ld1(v8, v9, v10, v11, __ T2D, state);
3521 
3522     // load first 4 round constants
3523     __ lea(rscratch1, ExternalAddress((address)round_consts));
3524     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3525 
3526     __ BIND(sha512_loop);
3527     // load 128B of data into v12..v19
3528     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3529     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3530     __ rev64(v12, __ T16B, v12);
3531     __ rev64(v13, __ T16B, v13);
3532     __ rev64(v14, __ T16B, v14);
3533     __ rev64(v15, __ T16B, v15);
3534     __ rev64(v16, __ T16B, v16);
3535     __ rev64(v17, __ T16B, v17);
3536     __ rev64(v18, __ T16B, v18);
3537     __ rev64(v19, __ T16B, v19);
3538 
3539     __ mov(rscratch2, rscratch1);
3540 
3541     __ mov(v0, __ T16B, v8);
3542     __ mov(v1, __ T16B, v9);
3543     __ mov(v2, __ T16B, v10);
3544     __ mov(v3, __ T16B, v11);
3545 
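         // Each sha512_dround below performs two SHA-512 rounds (SHA512H plus
         // SHA512H2), so the 40 invocations cover all 80 rounds; double-rounds
         // 0-31 also extend the message schedule with SHA512SU0/SHA512SU1.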
3546     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3547     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3548     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3549     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3550     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3551     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3552     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3553     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3554     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3555     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3556     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3557     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3558     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3559     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3560     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3561     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3562     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3563     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3564     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3565     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3566     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3567     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3568     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3569     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3570     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3571     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3572     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3573     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3574     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3575     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3576     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3577     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3578     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3579     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3580     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3581     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3582     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3583     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3584     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3585     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3586 
3587     __ addv(v8, __ T2D, v8, v0);
3588     __ addv(v9, __ T2D, v9, v1);
3589     __ addv(v10, __ T2D, v10, v2);
3590     __ addv(v11, __ T2D, v11, v3);
3591 
3592     if (multi_block) {
3593       __ add(ofs, ofs, 128);
3594       __ cmp(ofs, limit);
3595       __ br(Assembler::LE, sha512_loop);
3596       __ mov(c_rarg0, ofs); // return ofs
3597     }
3598 
3599     __ st1(v8, v9, v10, v11, __ T2D, state);
3600 
3601     __ ldpd(v14, v15, Address(sp, 48));
3602     __ ldpd(v12, v13, Address(sp, 32));
3603     __ ldpd(v10, v11, Address(sp, 16));
3604     __ ldpd(v8, v9, __ post(sp, 64));
3605 
3606     __ ret(lr);
3607 
3608     return start;
3609   }
3610 
3611   // Arguments:
3612   //
3613   // Inputs:
3614   //   c_rarg0   - byte[]  source+offset
3615   //   c_rarg1   - byte[]   SHA.state
3616   //   c_rarg2   - int     digest_length
3617   //   c_rarg3   - int     offset
3618   //   c_rarg4   - int     limit
3619   //
3620   address generate_sha3_implCompress(bool multi_block, const char *name) {
3621     static const uint64_t round_consts[24] = {
3622       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3623       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3624       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3625       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3626       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3627       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3628       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3629       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3630     };
3631 
3632     __ align(CodeEntryAlignment);
3633     StubCodeMark mark(this, "StubRoutines", name);
3634     address start = __ pc();
3635 
3636     Register buf           = c_rarg0;
3637     Register state         = c_rarg1;
3638     Register digest_length = c_rarg2;
3639     Register ofs           = c_rarg3;
3640     Register limit         = c_rarg4;
3641 
3642     Label sha3_loop, rounds24_loop;
3643     Label sha3_512, sha3_384_or_224, sha3_256;
3644 
3645     __ stpd(v8, v9, __ pre(sp, -64));
3646     __ stpd(v10, v11, Address(sp, 16));
3647     __ stpd(v12, v13, Address(sp, 32));
3648     __ stpd(v14, v15, Address(sp, 48));
3649 
3650     // load state
3651     __ add(rscratch1, state, 32);
3652     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3653     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3654     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3655     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3656     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3657     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3658     __ ld1(v24, __ T1D, rscratch1);
3659 
3660     __ BIND(sha3_loop);
3661 
3662     // 24 keccak rounds
3663     __ movw(rscratch2, 24);
3664 
3665     // load round_constants base
3666     __ lea(rscratch1, ExternalAddress((address) round_consts));
3667 
3668     // load input
3669     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3670     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3671     __ eor(v0, __ T8B, v0, v25);
3672     __ eor(v1, __ T8B, v1, v26);
3673     __ eor(v2, __ T8B, v2, v27);
3674     __ eor(v3, __ T8B, v3, v28);
3675     __ eor(v4, __ T8B, v4, v29);
3676     __ eor(v5, __ T8B, v5, v30);
3677     __ eor(v6, __ T8B, v6, v31);
3678 
3679     // digest_length == 64, SHA3-512
3680     __ tbnz(digest_length, 6, sha3_512);
3681 
3682     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3683     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3684     __ eor(v7, __ T8B, v7, v25);
3685     __ eor(v8, __ T8B, v8, v26);
3686     __ eor(v9, __ T8B, v9, v27);
3687     __ eor(v10, __ T8B, v10, v28);
3688     __ eor(v11, __ T8B, v11, v29);
3689     __ eor(v12, __ T8B, v12, v30);
3690 
3691     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3692     __ tbnz(digest_length, 4, sha3_384_or_224);
3693 
3694     // SHA3-256
3695     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3696     __ eor(v13, __ T8B, v13, v25);
3697     __ eor(v14, __ T8B, v14, v26);
3698     __ eor(v15, __ T8B, v15, v27);
3699     __ eor(v16, __ T8B, v16, v28);
3700     __ b(rounds24_loop);
3701 
3702     __ BIND(sha3_384_or_224);
3703     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3704 
3705     // SHA3-224
3706     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3707     __ ld1(v29, __ T8B, __ post(buf, 8));
3708     __ eor(v13, __ T8B, v13, v25);
3709     __ eor(v14, __ T8B, v14, v26);
3710     __ eor(v15, __ T8B, v15, v27);
3711     __ eor(v16, __ T8B, v16, v28);
3712     __ eor(v17, __ T8B, v17, v29);
3713     __ b(rounds24_loop);
3714 
3715     __ BIND(sha3_512);
3716     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3717     __ eor(v7, __ T8B, v7, v25);
3718     __ eor(v8, __ T8B, v8, v26);
3719 
3720     __ BIND(rounds24_loop);
3721     __ subw(rscratch2, rscratch2, 1);
3722 
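         // One round of Keccak-f[1600] using the ARMv8.2 SHA3 instructions:
         // the eor3 group computes the column parities (theta), rax1 forms the
         // theta D values, xar applies the theta xor together with the rho
         // rotations and pi permutation, bcax implements chi (a ^ (~b & c)),
         // and the final eor with the broadcast round constant is iota.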
3723     __ eor3(v29, __ T16B, v4, v9, v14);
3724     __ eor3(v26, __ T16B, v1, v6, v11);
3725     __ eor3(v28, __ T16B, v3, v8, v13);
3726     __ eor3(v25, __ T16B, v0, v5, v10);
3727     __ eor3(v27, __ T16B, v2, v7, v12);
3728     __ eor3(v29, __ T16B, v29, v19, v24);
3729     __ eor3(v26, __ T16B, v26, v16, v21);
3730     __ eor3(v28, __ T16B, v28, v18, v23);
3731     __ eor3(v25, __ T16B, v25, v15, v20);
3732     __ eor3(v27, __ T16B, v27, v17, v22);
3733 
3734     __ rax1(v30, __ T2D, v29, v26);
3735     __ rax1(v26, __ T2D, v26, v28);
3736     __ rax1(v28, __ T2D, v28, v25);
3737     __ rax1(v25, __ T2D, v25, v27);
3738     __ rax1(v27, __ T2D, v27, v29);
3739 
3740     __ eor(v0, __ T16B, v0, v30);
3741     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3742     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3743     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3744     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3745     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3746     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3747     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3748     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3749     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3750     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3751     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3752     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3753     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3754     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3755     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3756     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3757     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3758     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3759     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3760     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3761     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3762     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3763     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3764     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3765 
3766     __ bcax(v20, __ T16B, v31, v22, v8);
3767     __ bcax(v21, __ T16B, v8,  v23, v22);
3768     __ bcax(v22, __ T16B, v22, v24, v23);
3769     __ bcax(v23, __ T16B, v23, v31, v24);
3770     __ bcax(v24, __ T16B, v24, v8,  v31);
3771 
3772     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3773 
3774     __ bcax(v17, __ T16B, v25, v19, v3);
3775     __ bcax(v18, __ T16B, v3,  v15, v19);
3776     __ bcax(v19, __ T16B, v19, v16, v15);
3777     __ bcax(v15, __ T16B, v15, v25, v16);
3778     __ bcax(v16, __ T16B, v16, v3,  v25);
3779 
3780     __ bcax(v10, __ T16B, v29, v12, v26);
3781     __ bcax(v11, __ T16B, v26, v13, v12);
3782     __ bcax(v12, __ T16B, v12, v14, v13);
3783     __ bcax(v13, __ T16B, v13, v29, v14);
3784     __ bcax(v14, __ T16B, v14, v26, v29);
3785 
3786     __ bcax(v7, __ T16B, v30, v9,  v4);
3787     __ bcax(v8, __ T16B, v4,  v5,  v9);
3788     __ bcax(v9, __ T16B, v9,  v6,  v5);
3789     __ bcax(v5, __ T16B, v5,  v30, v6);
3790     __ bcax(v6, __ T16B, v6,  v4,  v30);
3791 
3792     __ bcax(v3, __ T16B, v27, v0,  v28);
3793     __ bcax(v4, __ T16B, v28, v1,  v0);
3794     __ bcax(v0, __ T16B, v0,  v2,  v1);
3795     __ bcax(v1, __ T16B, v1,  v27, v2);
3796     __ bcax(v2, __ T16B, v2,  v28, v27);
3797 
3798     __ eor(v0, __ T16B, v0, v31);
3799 
3800     __ cbnzw(rscratch2, rounds24_loop);
3801 
3802     if (multi_block) {
3803       // block_size =  200 - 2 * digest_length, ofs += block_size
3804       __ add(ofs, ofs, 200);
3805       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3806 
3807       __ cmp(ofs, limit);
3808       __ br(Assembler::LE, sha3_loop);
3809       __ mov(c_rarg0, ofs); // return ofs
3810     }
3811 
3812     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3813     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3814     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3815     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3816     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3817     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3818     __ st1(v24, __ T1D, state);
3819 
3820     __ ldpd(v14, v15, Address(sp, 48));
3821     __ ldpd(v12, v13, Address(sp, 32));
3822     __ ldpd(v10, v11, Address(sp, 16));
3823     __ ldpd(v8, v9, __ post(sp, 64));
3824 
3825     __ ret(lr);
3826 
3827     return start;
3828   }
3829 
3830   // Safefetch stubs.
3831   void generate_safefetch(const char* name, int size, address* entry,
3832                           address* fault_pc, address* continuation_pc) {
3833     // safefetch signatures:
3834     //   int      SafeFetch32(int*      adr, int      errValue);
3835     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3836     //
3837     // arguments:
3838     //   c_rarg0 = adr
3839     //   c_rarg1 = errValue
3840     //
3841     // result:
3842     //   r0 = *adr or errValue
3843 
3844     StubCodeMark mark(this, "StubRoutines", name);
3845 
3846     // Entry point, pc or function descriptor.
3847     *entry = __ pc();
3848 
3849     // Load *adr into c_rarg1, may fault.
3850     *fault_pc = __ pc();
3851     switch (size) {
3852       case 4:
3853         // int32_t
3854         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3855         break;
3856       case 8:
3857         // int64_t
3858         __ ldr(c_rarg1, Address(c_rarg0, 0));
3859         break;
3860       default:
3861         ShouldNotReachHere();
3862     }
3863 
3864     // return errValue or *adr
3865     *continuation_pc = __ pc();
3866     __ mov(r0, c_rarg1);
3867     __ ret(lr);
3868   }
3869 
3870   /**
3871    *  Arguments:
3872    *
3873    * Inputs:
3874    *   c_rarg0   - int crc
3875    *   c_rarg1   - byte* buf
3876    *   c_rarg2   - int length
3877    *
3878    * Output:
3879    *       r0    - int crc result
3880    */
3881   address generate_updateBytesCRC32() {
3882     assert(UseCRC32Intrinsics, "what are we doing here?");
3883 
3884     __ align(CodeEntryAlignment);
3885     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3886 
3887     address start = __ pc();
3888 
3889     const Register crc   = c_rarg0;  // crc
3890     const Register buf   = c_rarg1;  // source java byte array address
3891     const Register len   = c_rarg2;  // length
3892     const Register table0 = c_rarg3; // crc_table address
3893     const Register table1 = c_rarg4;
3894     const Register table2 = c_rarg5;
3895     const Register table3 = c_rarg6;
3896     const Register tmp3 = c_rarg7;
3897 
3898     BLOCK_COMMENT("Entry:");
3899     __ enter(); // required for proper stackwalking of RuntimeStub frame
3900 
3901     __ kernel_crc32(crc, buf, len,
3902               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3903 
3904     __ leave(); // required for proper stackwalking of RuntimeStub frame
3905     __ ret(lr);
3906 
3907     return start;
3908   }
3909 
3910   /**
3911    *  Arguments:
3912    *
3913    * Inputs:
3914    *   c_rarg0   - int crc
3915    *   c_rarg1   - byte* buf
3916    *   c_rarg2   - int length
3917    *   c_rarg3   - int* table
3918    *
3919    * Output:
3920    *       r0   - int crc result
3921    */
3922   address generate_updateBytesCRC32C() {
3923     assert(UseCRC32CIntrinsics, "what are we doing here?");
3924 
3925     __ align(CodeEntryAlignment);
3926     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3927 
3928     address start = __ pc();
3929 
3930     const Register crc   = c_rarg0;  // crc
3931     const Register buf   = c_rarg1;  // source java byte array address
3932     const Register len   = c_rarg2;  // length
3933     const Register table0 = c_rarg3; // crc_table address
3934     const Register table1 = c_rarg4;
3935     const Register table2 = c_rarg5;
3936     const Register table3 = c_rarg6;
3937     const Register tmp3 = c_rarg7;
3938 
3939     BLOCK_COMMENT("Entry:");
3940     __ enter(); // required for proper stackwalking of RuntimeStub frame
3941 
3942     __ kernel_crc32c(crc, buf, len,
3943               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3944 
3945     __ leave(); // required for proper stackwalking of RuntimeStub frame
3946     __ ret(lr);
3947 
3948     return start;
3949   }
3950 
3951   /***
3952    *  Arguments:
3953    *
3954    *  Inputs:
3955    *   c_rarg0   - int   adler
3956    *   c_rarg1   - byte* buff
3957    *   c_rarg2   - int   len
3958    *
3959    * Output:
3960    *   c_rarg0   - int adler result
3961    */
3962   address generate_updateBytesAdler32() {
3963     __ align(CodeEntryAlignment);
3964     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3965     address start = __ pc();
3966 
3967     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3968 
3969     // Aliases
3970     Register adler  = c_rarg0;
3971     Register s1     = c_rarg0;
3972     Register s2     = c_rarg3;
3973     Register buff   = c_rarg1;
3974     Register len    = c_rarg2;
3975     Register nmax  = r4;
3976     Register base  = r5;
3977     Register count = r6;
3978     Register temp0 = rscratch1;
3979     Register temp1 = rscratch2;
3980     FloatRegister vbytes = v0;
3981     FloatRegister vs1acc = v1;
3982     FloatRegister vs2acc = v2;
3983     FloatRegister vtable = v3;
3984 
3985     // Max number of bytes we can process before having to take the mod
3986     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3987     uint64_t BASE = 0xfff1;
3988     uint64_t NMAX = 0x15B0;
3989 
3990     __ mov(base, BASE);
3991     __ mov(nmax, NMAX);
3992 
3993     // Load accumulation coefficients for the upper 16 bits
3994     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3995     __ ld1(vtable, __ T16B, Address(temp0));
3996 
3997     // s1 is initialized to the lower 16 bits of adler
3998     // s2 is initialized to the upper 16 bits of adler
3999     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4000     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4001 
4002     // The pipelined loop needs at least 16 elements per iteration; for
4003     // shorter inputs it is cheaper to branch straight to the cleanup loop.
4004     __ cmp(len, (u1)16);
4005     __ br(Assembler::HS, L_nmax);
4006     __ cbz(len, L_combine);
4007 
4008     __ bind(L_simple_by1_loop);
4009     __ ldrb(temp0, Address(__ post(buff, 1)));
4010     __ add(s1, s1, temp0);
4011     __ add(s2, s2, s1);
4012     __ subs(len, len, 1);
4013     __ br(Assembler::HI, L_simple_by1_loop);
4014 
4015     // s1 = s1 % BASE
4016     __ subs(temp0, s1, base);
4017     __ csel(s1, temp0, s1, Assembler::HS);
4018 
4019     // s2 = s2 % BASE
4020     __ lsr(temp0, s2, 16);
4021     __ lsl(temp1, temp0, 4);
4022     __ sub(temp1, temp1, temp0);
4023     __ add(s2, temp1, s2, ext::uxth);
4024 
4025     __ subs(temp0, s2, base);
4026     __ csel(s2, temp0, s2, Assembler::HS);
4027 
4028     __ b(L_combine);
4029 
4030     __ bind(L_nmax);
4031     __ subs(len, len, nmax);
4032     __ sub(count, nmax, 16);
4033     __ br(Assembler::LO, L_by16);
4034 
4035     __ bind(L_nmax_loop);
4036 
4037     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4038                                       vbytes, vs1acc, vs2acc, vtable);
4039 
4040     __ subs(count, count, 16);
4041     __ br(Assembler::HS, L_nmax_loop);
4042 
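         // Reduce modulo BASE (65521) without a division: 2^16 mod 65521 == 15,
         // so x is congruent to (x >> 16) * 15 + (x & 0xffff) (mod BASE). Two
         // applications bring the value below 2 * BASE, and the conditional
         // subtract completes the reduction.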
4043     // s1 = s1 % BASE
4044     __ lsr(temp0, s1, 16);
4045     __ lsl(temp1, temp0, 4);
4046     __ sub(temp1, temp1, temp0);
4047     __ add(temp1, temp1, s1, ext::uxth);
4048 
4049     __ lsr(temp0, temp1, 16);
4050     __ lsl(s1, temp0, 4);
4051     __ sub(s1, s1, temp0);
4052     __ add(s1, s1, temp1, ext::uxth);
4053 
4054     __ subs(temp0, s1, base);
4055     __ csel(s1, temp0, s1, Assembler::HS);
4056 
4057     // s2 = s2 % BASE
4058     __ lsr(temp0, s2, 16);
4059     __ lsl(temp1, temp0, 4);
4060     __ sub(temp1, temp1, temp0);
4061     __ add(temp1, temp1, s2, ext::uxth);
4062 
4063     __ lsr(temp0, temp1, 16);
4064     __ lsl(s2, temp0, 4);
4065     __ sub(s2, s2, temp0);
4066     __ add(s2, s2, temp1, ext::uxth);
4067 
4068     __ subs(temp0, s2, base);
4069     __ csel(s2, temp0, s2, Assembler::HS);
4070 
4071     __ subs(len, len, nmax);
4072     __ sub(count, nmax, 16);
4073     __ br(Assembler::HS, L_nmax_loop);
4074 
4075     __ bind(L_by16);
4076     __ adds(len, len, count);
4077     __ br(Assembler::LO, L_by1);
4078 
4079     __ bind(L_by16_loop);
4080 
4081     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4082                                       vbytes, vs1acc, vs2acc, vtable);
4083 
4084     __ subs(len, len, 16);
4085     __ br(Assembler::HS, L_by16_loop);
4086 
4087     __ bind(L_by1);
4088     __ adds(len, len, 15);
4089     __ br(Assembler::LO, L_do_mod);
4090 
4091     __ bind(L_by1_loop);
4092     __ ldrb(temp0, Address(__ post(buff, 1)));
4093     __ add(s1, temp0, s1);
4094     __ add(s2, s2, s1);
4095     __ subs(len, len, 1);
4096     __ br(Assembler::HS, L_by1_loop);
4097 
4098     __ bind(L_do_mod);
4099     // s1 = s1 % BASE
4100     __ lsr(temp0, s1, 16);
4101     __ lsl(temp1, temp0, 4);
4102     __ sub(temp1, temp1, temp0);
4103     __ add(temp1, temp1, s1, ext::uxth);
4104 
4105     __ lsr(temp0, temp1, 16);
4106     __ lsl(s1, temp0, 4);
4107     __ sub(s1, s1, temp0);
4108     __ add(s1, s1, temp1, ext::uxth);
4109 
4110     __ subs(temp0, s1, base);
4111     __ csel(s1, temp0, s1, Assembler::HS);
4112 
4113     // s2 = s2 % BASE
4114     __ lsr(temp0, s2, 16);
4115     __ lsl(temp1, temp0, 4);
4116     __ sub(temp1, temp1, temp0);
4117     __ add(temp1, temp1, s2, ext::uxth);
4118 
4119     __ lsr(temp0, temp1, 16);
4120     __ lsl(s2, temp0, 4);
4121     __ sub(s2, s2, temp0);
4122     __ add(s2, s2, temp1, ext::uxth);
4123 
4124     __ subs(temp0, s2, base);
4125     __ csel(s2, temp0, s2, Assembler::HS);
4126 
4127     // Combine lower bits and higher bits
4128     __ bind(L_combine);
4129     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4130 
4131     __ ret(lr);
4132 
4133     return start;
4134   }
4135 
4136   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4137           Register temp0, Register temp1, FloatRegister vbytes,
4138           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4139     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4140     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4141     // In non-vectorized code, we update s1 and s2 as:
4142     //   s1 <- s1 + b1
4143     //   s2 <- s2 + s1
4144     //   s1 <- s1 + b2
4145     //   s2 <- s2 + s1
4146     //   ...
4147     //   s1 <- s1 + b16
4148     //   s2 <- s2 + s1
4149     // Putting above assignments together, we have:
4150     //   s1_new = s1 + b1 + b2 + ... + b16
4151     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4152     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4153     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4154     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4155 
4156     // s2 = s2 + s1 * 16
4157     __ add(s2, s2, s1, Assembler::LSL, 4);
4158 
4159     // vs1acc = b1 + b2 + b3 + ... + b16
4160     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4161     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4162     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4163     __ uaddlv(vs1acc, __ T16B, vbytes);
4164     __ uaddlv(vs2acc, __ T8H, vs2acc);
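         // uaddlv horizontally sums the 16 byte lanes (resp. 8 halfword lanes)
         // into a single scalar held in the low element of vs1acc / vs2acc.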
4165 
4166     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4167     __ fmovd(temp0, vs1acc);
4168     __ fmovd(temp1, vs2acc);
4169     __ add(s1, s1, temp0);
4170     __ add(s2, s2, temp1);
4171   }
4172 
4173   /**
4174    *  Arguments:
4175    *
4176    *  Input:
4177    *    c_rarg0   - x address
4178    *    c_rarg1   - x length
4179    *    c_rarg2   - y address
4180    *    c_rarg3   - y length
4181    *    c_rarg4   - z address
4182    *    c_rarg5   - z length
4183    */
4184   address generate_multiplyToLen() {
4185     __ align(CodeEntryAlignment);
4186     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4187 
4188     address start = __ pc();
4189     const Register x     = r0;
4190     const Register xlen  = r1;
4191     const Register y     = r2;
4192     const Register ylen  = r3;
4193     const Register z     = r4;
4194     const Register zlen  = r5;
4195 
4196     const Register tmp1  = r10;
4197     const Register tmp2  = r11;
4198     const Register tmp3  = r12;
4199     const Register tmp4  = r13;
4200     const Register tmp5  = r14;
4201     const Register tmp6  = r15;
4202     const Register tmp7  = r16;
4203 
4204     BLOCK_COMMENT("Entry:");
4205     __ enter(); // required for proper stackwalking of RuntimeStub frame
4206     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4207     __ leave(); // required for proper stackwalking of RuntimeStub frame
4208     __ ret(lr);
4209 
4210     return start;
4211   }
4212 
4213   address generate_squareToLen() {
4214     // The squareToLen algorithm for sizes 1..127 described in Java code is
4215     // faster than multiply_to_len on some CPUs and slower on others, but
4216     // multiply_to_len shows slightly better results overall.
4217     __ align(CodeEntryAlignment);
4218     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4219     address start = __ pc();
4220 
4221     const Register x     = r0;
4222     const Register xlen  = r1;
4223     const Register z     = r2;
4224     const Register zlen  = r3;
4225     const Register y     = r4; // == x
4226     const Register ylen  = r5; // == xlen
4227 
4228     const Register tmp1  = r10;
4229     const Register tmp2  = r11;
4230     const Register tmp3  = r12;
4231     const Register tmp4  = r13;
4232     const Register tmp5  = r14;
4233     const Register tmp6  = r15;
4234     const Register tmp7  = r16;
4235 
4236     RegSet spilled_regs = RegSet::of(y, ylen);
4237     BLOCK_COMMENT("Entry:");
4238     __ enter();
4239     __ push(spilled_regs, sp);
4240     __ mov(y, x);
4241     __ mov(ylen, xlen);
4242     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4243     __ pop(spilled_regs, sp);
4244     __ leave();
4245     __ ret(lr);
4246     return start;
4247   }
4248 
4249   address generate_mulAdd() {
4250     __ align(CodeEntryAlignment);
4251     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4252 
4253     address start = __ pc();
4254 
4255     const Register out     = r0;
4256     const Register in      = r1;
4257     const Register offset  = r2;
4258     const Register len     = r3;
4259     const Register k       = r4;
4260 
4261     BLOCK_COMMENT("Entry:");
4262     __ enter();
4263     __ mul_add(out, in, offset, len, k);
4264     __ leave();
4265     __ ret(lr);
4266 
4267     return start;
4268   }
4269 
4270   // Arguments:
4271   //
4272   // Input:
4273   //   c_rarg0   - newArr address
4274   //   c_rarg1   - oldArr address
4275   //   c_rarg2   - newIdx
4276   //   c_rarg3   - shiftCount
4277   //   c_rarg4   - numIter
4278   //
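       //
       // A scalar sketch of what this stub computes (illustrative only; roughly the
       // Java worker loop the intrinsic replaces). Each destination word combines
       // bits from two adjacent source words:
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                        | (oldArr[i]     <<  (32 - shiftCount));
       //   }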
4279   address generate_bigIntegerRightShift() {
4280     __ align(CodeEntryAlignment);
4281     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4282     address start = __ pc();
4283 
4284     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4285 
4286     Register newArr        = c_rarg0;
4287     Register oldArr        = c_rarg1;
4288     Register newIdx        = c_rarg2;
4289     Register shiftCount    = c_rarg3;
4290     Register numIter       = c_rarg4;
4291     Register idx           = numIter;
4292 
4293     Register newArrCur     = rscratch1;
4294     Register shiftRevCount = rscratch2;
4295     Register oldArrCur     = r13;
4296     Register oldArrNext    = r14;
4297 
4298     FloatRegister oldElem0        = v0;
4299     FloatRegister oldElem1        = v1;
4300     FloatRegister newElem         = v2;
4301     FloatRegister shiftVCount     = v3;
4302     FloatRegister shiftVRevCount  = v4;
4303 
4304     __ cbz(idx, Exit);
4305 
4306     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4307 
4308     // left shift count
4309     __ movw(shiftRevCount, 32);
4310     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4311 
4312     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar code
4313     __ cmp(numIter, (u1)4);
4314     __ br(Assembler::LT, ShiftThree);
4315 
4316     __ dup(shiftVCount,    __ T4S, shiftCount);
4317     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4318     __ negr(shiftVCount,   __ T4S, shiftVCount);
4319 
4320     __ BIND(ShiftSIMDLoop);
4321 
4322     // Calculate the load addresses
4323     __ sub(idx, idx, 4);
4324     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4325     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4326     __ add(oldArrCur,  oldArrNext, 4);
4327 
4328     // Load 4 words and process
4329     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4330     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4331     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4332     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4333     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4334     __ st1(newElem,   __ T4S,  Address(newArrCur));
4335 
4336     __ cmp(idx, (u1)4);
4337     __ br(Assembler::LT, ShiftTwoLoop);
4338     __ b(ShiftSIMDLoop);
4339 
4340     __ BIND(ShiftTwoLoop);
4341     __ cbz(idx, Exit);
4342     __ cmp(idx, (u1)1);
4343     __ br(Assembler::EQ, ShiftOne);
4344 
4345     // Calculate the load addresses
4346     __ sub(idx, idx, 2);
4347     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4348     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4349     __ add(oldArrCur,  oldArrNext, 4);
4350 
4351     // Load 2 words and process
4352     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4353     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4354     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4355     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4356     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4357     __ st1(newElem,   __ T2S, Address(newArrCur));
4358     __ b(ShiftTwoLoop);
4359 
4360     __ BIND(ShiftThree);
4361     __ tbz(idx, 1, ShiftOne);
4362     __ tbz(idx, 0, ShiftTwo);
4363     __ ldrw(r10,  Address(oldArr, 12));
4364     __ ldrw(r11,  Address(oldArr, 8));
4365     __ lsrvw(r10, r10, shiftCount);
4366     __ lslvw(r11, r11, shiftRevCount);
4367     __ orrw(r12,  r10, r11);
4368     __ strw(r12,  Address(newArr, 8));
4369 
4370     __ BIND(ShiftTwo);
4371     __ ldrw(r10,  Address(oldArr, 8));
4372     __ ldrw(r11,  Address(oldArr, 4));
4373     __ lsrvw(r10, r10, shiftCount);
4374     __ lslvw(r11, r11, shiftRevCount);
4375     __ orrw(r12,  r10, r11);
4376     __ strw(r12,  Address(newArr, 4));
4377 
4378     __ BIND(ShiftOne);
4379     __ ldrw(r10,  Address(oldArr, 4));
4380     __ ldrw(r11,  Address(oldArr));
4381     __ lsrvw(r10, r10, shiftCount);
4382     __ lslvw(r11, r11, shiftRevCount);
4383     __ orrw(r12,  r10, r11);
4384     __ strw(r12,  Address(newArr));
4385 
4386     __ BIND(Exit);
4387     __ ret(lr);
4388 
4389     return start;
4390   }
4391 
4392   // Arguments:
4393   //
4394   // Input:
4395   //   c_rarg0   - newArr address
4396   //   c_rarg1   - oldArr address
4397   //   c_rarg2   - newIdx
4398   //   c_rarg3   - shiftCount
4399   //   c_rarg4   - numIter
4400   //
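       //
       // A scalar sketch of what this stub computes (illustrative only; roughly the
       // Java worker loop the intrinsic replaces):
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
       //                        | (oldArr[i + 1] >>> (32 - shiftCount));
       //   }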
4401   address generate_bigIntegerLeftShift() {
4402     __ align(CodeEntryAlignment);
4403     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4404     address start = __ pc();
4405 
4406     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4407 
4408     Register newArr        = c_rarg0;
4409     Register oldArr        = c_rarg1;
4410     Register newIdx        = c_rarg2;
4411     Register shiftCount    = c_rarg3;
4412     Register numIter       = c_rarg4;
4413 
4414     Register shiftRevCount = rscratch1;
4415     Register oldArrNext    = rscratch2;
4416 
4417     FloatRegister oldElem0        = v0;
4418     FloatRegister oldElem1        = v1;
4419     FloatRegister newElem         = v2;
4420     FloatRegister shiftVCount     = v3;
4421     FloatRegister shiftVRevCount  = v4;
4422 
4423     __ cbz(numIter, Exit);
4424 
4425     __ add(oldArrNext, oldArr, 4);
4426     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4427 
4428     // right shift count
4429     __ movw(shiftRevCount, 32);
4430     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4431 
4432     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar code
4433     __ cmp(numIter, (u1)4);
4434     __ br(Assembler::LT, ShiftThree);
4435 
4436     __ dup(shiftVCount,     __ T4S, shiftCount);
4437     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4438     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4439 
4440     __ BIND(ShiftSIMDLoop);
4441 
4442     // load 4 words and process
4443     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4444     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4445     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4446     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4447     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4448     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4449     __ sub(numIter,   numIter, 4);
4450 
4451     __ cmp(numIter, (u1)4);
4452     __ br(Assembler::LT, ShiftTwoLoop);
4453     __ b(ShiftSIMDLoop);
4454 
4455     __ BIND(ShiftTwoLoop);
4456     __ cbz(numIter, Exit);
4457     __ cmp(numIter, (u1)1);
4458     __ br(Assembler::EQ, ShiftOne);
4459 
4460     // load 2 words and process
4461     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4462     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4463     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4464     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4465     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4466     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4467     __ sub(numIter,   numIter, 2);
4468     __ b(ShiftTwoLoop);
4469 
4470     __ BIND(ShiftThree);
4471     __ ldrw(r10,  __ post(oldArr, 4));
4472     __ ldrw(r11,  __ post(oldArrNext, 4));
4473     __ lslvw(r10, r10, shiftCount);
4474     __ lsrvw(r11, r11, shiftRevCount);
4475     __ orrw(r12,  r10, r11);
4476     __ strw(r12,  __ post(newArr, 4));
4477     __ tbz(numIter, 1, Exit);
4478     __ tbz(numIter, 0, ShiftOne);
4479 
4480     __ BIND(ShiftTwo);
4481     __ ldrw(r10,  __ post(oldArr, 4));
4482     __ ldrw(r11,  __ post(oldArrNext, 4));
4483     __ lslvw(r10, r10, shiftCount);
4484     __ lsrvw(r11, r11, shiftRevCount);
4485     __ orrw(r12,  r10, r11);
4486     __ strw(r12,  __ post(newArr, 4));
4487 
4488     __ BIND(ShiftOne);
4489     __ ldrw(r10,  Address(oldArr));
4490     __ ldrw(r11,  Address(oldArrNext));
4491     __ lslvw(r10, r10, shiftCount);
4492     __ lsrvw(r11, r11, shiftRevCount);
4493     __ orrw(r12,  r10, r11);
4494     __ strw(r12,  Address(newArr));
4495 
4496     __ BIND(Exit);
4497     __ ret(lr);
4498 
4499     return start;
4500   }
4501 
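       // Roughly, this stub answers whether any byte of the array has its top bit
       // set, i.e. is negative when read as a signed byte. A sketch of the Java
       // loop it replaces (signature illustrative only):
       //
       //   static boolean hasNegatives(byte[] ary1, int len) {
       //     for (int i = 0; i < len; i++) {
       //       if (ary1[i] < 0) return true;
       //     }
       //     return false;
       //   }
       //
       // UPPER_BIT_MASK below tests the 0x80 bit of eight bytes at a time.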
4502   address generate_has_negatives(address &has_negatives_long) {
4503     const u1 large_loop_size = 64;
4504     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4505     int dcache_line = VM_Version::dcache_line_size();
4506 
4507     Register ary1 = r1, len = r2, result = r0;
4508 
4509     __ align(CodeEntryAlignment);
4510 
4511     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4512 
4513     address entry = __ pc();
4514 
4515     __ enter();
4516 
4517   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4518         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4519 
4520   __ cmp(len, (u1)15);
4521   __ br(Assembler::GT, LEN_OVER_15);
4522   // The only case where execution falls into this code is when the pointer is
4523   // near the end of a memory page and we have to avoid reading past it
4524   __ add(ary1, ary1, len);
4525   __ subs(len, len, 8);
4526   __ br(Assembler::GT, LEN_OVER_8);
4527   __ ldr(rscratch2, Address(ary1, -8));
4528   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4529   __ lsrv(rscratch2, rscratch2, rscratch1);
4530   __ tst(rscratch2, UPPER_BIT_MASK);
4531   __ cset(result, Assembler::NE);
4532   __ leave();
4533   __ ret(lr);
4534   __ bind(LEN_OVER_8);
4535   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4536   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
4537   __ tst(rscratch2, UPPER_BIT_MASK);
4538   __ br(Assembler::NE, RET_TRUE_NO_POP);
4539   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4540   __ lsrv(rscratch1, rscratch1, rscratch2);
4541   __ tst(rscratch1, UPPER_BIT_MASK);
4542   __ cset(result, Assembler::NE);
4543   __ leave();
4544   __ ret(lr);
4545 
4546   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4547   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4548 
4549   has_negatives_long = __ pc(); // 2nd entry point
4550 
4551   __ enter();
4552 
4553   __ bind(LEN_OVER_15);
4554     __ push(spilled_regs, sp);
4555     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4556     __ cbz(rscratch2, ALIGNED);
4557     __ ldp(tmp6, tmp1, Address(ary1));
4558     __ mov(tmp5, 16);
4559     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
4560     __ add(ary1, ary1, rscratch1);
4561     __ sub(len, len, rscratch1);
4562     __ orr(tmp6, tmp6, tmp1);
4563     __ tst(tmp6, UPPER_BIT_MASK);
4564     __ br(Assembler::NE, RET_TRUE);
4565 
4566   __ bind(ALIGNED);
4567     __ cmp(len, large_loop_size);
4568     __ br(Assembler::LT, CHECK_16);
4569     // Perform a 16-byte load as an early return in the pre-loop to handle the
4570     // situation where an initially aligned large array has negative values in its
4571     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
4572     // worst case, which is slower. Cases with negative bytes further ahead are
4573     // barely affected; in fact they get faster due to the early loads and the
4574     // fewer instructions and branches in LARGE_LOOP.
4575     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4576     __ sub(len, len, 16);
4577     __ orr(tmp6, tmp6, tmp1);
4578     __ tst(tmp6, UPPER_BIT_MASK);
4579     __ br(Assembler::NE, RET_TRUE);
4580     __ cmp(len, large_loop_size);
4581     __ br(Assembler::LT, CHECK_16);
4582 
4583     if (SoftwarePrefetchHintDistance >= 0
4584         && SoftwarePrefetchHintDistance >= dcache_line) {
4585       // initial prefetch
4586       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4587     }
4588   __ bind(LARGE_LOOP);
4589     if (SoftwarePrefetchHintDistance >= 0) {
4590       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4591     }
4592     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
4593     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
4594     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
4595     // 3 instructions and has fewer branches; the downside is that early return is
4596     // disabled, so all 64 bytes are loaded and checked every time.
4597     __ ldp(tmp2, tmp3, Address(ary1));
4598     __ ldp(tmp4, tmp5, Address(ary1, 16));
4599     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4600     __ ldp(tmp6, tmp1, Address(ary1, 48));
4601     __ add(ary1, ary1, large_loop_size);
4602     __ sub(len, len, large_loop_size);
4603     __ orr(tmp2, tmp2, tmp3);
4604     __ orr(tmp4, tmp4, tmp5);
4605     __ orr(rscratch1, rscratch1, rscratch2);
4606     __ orr(tmp6, tmp6, tmp1);
4607     __ orr(tmp2, tmp2, tmp4);
4608     __ orr(rscratch1, rscratch1, tmp6);
4609     __ orr(tmp2, tmp2, rscratch1);
4610     __ tst(tmp2, UPPER_BIT_MASK);
4611     __ br(Assembler::NE, RET_TRUE);
4612     __ cmp(len, large_loop_size);
4613     __ br(Assembler::GE, LARGE_LOOP);
4614 
4615   __ bind(CHECK_16); // small 16-byte load pre-loop
4616     __ cmp(len, (u1)16);
4617     __ br(Assembler::LT, POST_LOOP16);
4618 
4619   __ bind(LOOP16); // small 16-byte load loop
4620     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4621     __ sub(len, len, 16);
4622     __ orr(tmp2, tmp2, tmp3);
4623     __ tst(tmp2, UPPER_BIT_MASK);
4624     __ br(Assembler::NE, RET_TRUE);
4625     __ cmp(len, (u1)16);
4626     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4627 
4628   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4629     __ cmp(len, (u1)8);
4630     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4631     __ ldr(tmp3, Address(__ post(ary1, 8)));
4632     __ sub(len, len, 8);
4633     __ tst(tmp3, UPPER_BIT_MASK);
4634     __ br(Assembler::NE, RET_TRUE);
4635 
4636   __ bind(POST_LOOP16_LOAD_TAIL);
4637     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4638     __ ldr(tmp1, Address(ary1));
4639     __ mov(tmp2, 64);
4640     __ sub(tmp4, tmp2, len, __ LSL, 3);
4641     __ lslv(tmp1, tmp1, tmp4);
4642     __ tst(tmp1, UPPER_BIT_MASK);
4643     __ br(Assembler::NE, RET_TRUE);
4644     // Fallthrough
4645 
4646   __ bind(RET_FALSE);
4647     __ pop(spilled_regs, sp);
4648     __ leave();
4649     __ mov(result, zr);
4650     __ ret(lr);
4651 
4652   __ bind(RET_TRUE);
4653     __ pop(spilled_regs, sp);
4654   __ bind(RET_TRUE_NO_POP);
4655     __ leave();
4656     __ mov(result, 1);
4657     __ ret(lr);
4658 
4659   __ bind(DONE);
4660     __ pop(spilled_regs, sp);
4661     __ leave();
4662     __ ret(lr);
4663     return entry;
4664   }
4665 
4666   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4667         bool usePrefetch, Label &NOT_EQUAL) {
4668     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4669         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4670         tmp7 = r12, tmp8 = r13;
4671     Label LOOP;
4672 
4673     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4674     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4675     __ bind(LOOP);
4676     if (usePrefetch) {
4677       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4678       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4679     }
4680     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4681     __ eor(tmp1, tmp1, tmp2);
4682     __ eor(tmp3, tmp3, tmp4);
4683     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4684     __ orr(tmp1, tmp1, tmp3);
4685     __ cbnz(tmp1, NOT_EQUAL);
4686     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4687     __ eor(tmp5, tmp5, tmp6);
4688     __ eor(tmp7, tmp7, tmp8);
4689     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4690     __ orr(tmp5, tmp5, tmp7);
4691     __ cbnz(tmp5, NOT_EQUAL);
4692     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4693     __ eor(tmp1, tmp1, tmp2);
4694     __ eor(tmp3, tmp3, tmp4);
4695     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4696     __ orr(tmp1, tmp1, tmp3);
4697     __ cbnz(tmp1, NOT_EQUAL);
4698     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4699     __ eor(tmp5, tmp5, tmp6);
4700     __ sub(cnt1, cnt1, 8 * wordSize);
4701     __ eor(tmp7, tmp7, tmp8);
4702     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4703     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4704     // cmp) because subs allows an unlimited range of immediate operands.
4705     __ subs(tmp6, cnt1, loopThreshold);
4706     __ orr(tmp5, tmp5, tmp7);
4707     __ cbnz(tmp5, NOT_EQUAL);
4708     __ br(__ GE, LOOP);
4709     // post-loop
4710     __ eor(tmp1, tmp1, tmp2);
4711     __ eor(tmp3, tmp3, tmp4);
4712     __ orr(tmp1, tmp1, tmp3);
4713     __ sub(cnt1, cnt1, 2 * wordSize);
4714     __ cbnz(tmp1, NOT_EQUAL);
4715   }
4716 
4717   void generate_large_array_equals_loop_simd(int loopThreshold,
4718         bool usePrefetch, Label &NOT_EQUAL) {
4719     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4720         tmp2 = rscratch2;
4721     Label LOOP;
4722 
4723     __ bind(LOOP);
4724     if (usePrefetch) {
4725       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4726       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4727     }
4728     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4729     __ sub(cnt1, cnt1, 8 * wordSize);
4730     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4731     __ subs(tmp1, cnt1, loopThreshold);
4732     __ eor(v0, __ T16B, v0, v4);
4733     __ eor(v1, __ T16B, v1, v5);
4734     __ eor(v2, __ T16B, v2, v6);
4735     __ eor(v3, __ T16B, v3, v7);
4736     __ orr(v0, __ T16B, v0, v1);
4737     __ orr(v1, __ T16B, v2, v3);
4738     __ orr(v0, __ T16B, v0, v1);
4739     __ umov(tmp1, v0, __ D, 0);
4740     __ umov(tmp2, v0, __ D, 1);
4741     __ orr(tmp1, tmp1, tmp2);
4742     __ cbnz(tmp1, NOT_EQUAL);
4743     __ br(__ GE, LOOP);
4744   }
4745 
4746   // a1 = r1 - array1 address
4747   // a2 = r2 - array2 address
4748   // result = r0 - return value. Already contains "false"
4749   // cnt1 = r10 - number of elements left to check, reduced by wordSize
4750   // r3-r5 are reserved temporary registers
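       //
       // The comparison itself is word-at-a-time: XOR corresponding 8-byte chunks
       // and stop as soon as a non-zero result appears. A rough scalar sketch
       // (illustrative only; load8 stands in for an 8-byte load at a byte offset):
       //
       //   for (long i = 0; i < cnt; i += 8) {
       //     if ((load8(a1, i) ^ load8(a2, i)) != 0) return false;
       //   }
       //   return true;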
4751   address generate_large_array_equals() {
4752     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4753         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4754         tmp7 = r12, tmp8 = r13;
4755     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4756         SMALL_LOOP, POST_LOOP;
4757     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4758     // calculate if at least 32 prefetched bytes are used
4759     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4760     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4761     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4762     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4763         tmp5, tmp6, tmp7, tmp8);
4764 
4765     __ align(CodeEntryAlignment);
4766 
4767     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4768 
4769     address entry = __ pc();
4770     __ enter();
4771     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4772     // also advance pointers to use post-increment instead of pre-increment
4773     __ add(a1, a1, wordSize);
4774     __ add(a2, a2, wordSize);
4775     if (AvoidUnalignedAccesses) {
4776       // Both implementations (SIMD/non-SIMD) use relatively large load
4777       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
4778       // time) on some CPUs when the address is not at least 16-byte aligned.
4779       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
4780       // load, if needed, to make at least the first address 16-byte aligned.
4781       Label ALIGNED16;
4782       __ tbz(a1, 3, ALIGNED16);
4783       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4784       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4785       __ sub(cnt1, cnt1, wordSize);
4786       __ eor(tmp1, tmp1, tmp2);
4787       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4788       __ bind(ALIGNED16);
4789     }
4790     if (UseSIMDForArrayEquals) {
4791       if (SoftwarePrefetchHintDistance >= 0) {
4792         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4793         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4794         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4795             /* prfm = */ true, NOT_EQUAL);
4796         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4797         __ br(__ LT, TAIL);
4798       }
4799       __ bind(NO_PREFETCH_LARGE_LOOP);
4800       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4801           /* prfm = */ false, NOT_EQUAL);
4802     } else {
4803       __ push(spilled_regs, sp);
4804       if (SoftwarePrefetchHintDistance >= 0) {
4805         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4806         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4807         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4808             /* prfm = */ true, NOT_EQUAL);
4809         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4810         __ br(__ LT, TAIL);
4811       }
4812       __ bind(NO_PREFETCH_LARGE_LOOP);
4813       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4814           /* prfm = */ false, NOT_EQUAL);
4815     }
4816     __ bind(TAIL);
4817       __ cbz(cnt1, EQUAL);
4818       __ subs(cnt1, cnt1, wordSize);
4819       __ br(__ LE, POST_LOOP);
4820     __ bind(SMALL_LOOP);
4821       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4822       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4823       __ subs(cnt1, cnt1, wordSize);
4824       __ eor(tmp1, tmp1, tmp2);
4825       __ cbnz(tmp1, NOT_EQUAL);
4826       __ br(__ GT, SMALL_LOOP);
4827     __ bind(POST_LOOP);
4828       __ ldr(tmp1, Address(a1, cnt1));
4829       __ ldr(tmp2, Address(a2, cnt1));
4830       __ eor(tmp1, tmp1, tmp2);
4831       __ cbnz(tmp1, NOT_EQUAL);
4832     __ bind(EQUAL);
4833       __ mov(result, true);
4834     __ bind(NOT_EQUAL);
4835       if (!UseSIMDForArrayEquals) {
4836         __ pop(spilled_regs, sp);
4837       }
4838     __ bind(NOT_EQUAL_NO_POP);
4839     __ leave();
4840     __ ret(lr);
4841     return entry;
4842   }
4843 
4844   address generate_dsin_dcos(bool isCos) {
4845     __ align(CodeEntryAlignment);
4846     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4847     address start = __ pc();
4848     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4849         (address)StubRoutines::aarch64::_two_over_pi,
4850         (address)StubRoutines::aarch64::_pio2,
4851         (address)StubRoutines::aarch64::_dsin_coef,
4852         (address)StubRoutines::aarch64::_dcos_coef);
4853     return start;
4854   }
4855 
4856   address generate_dlog() {
4857     __ align(CodeEntryAlignment);
4858     StubCodeMark mark(this, "StubRoutines", "dlog");
4859     address entry = __ pc();
4860     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4861         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4862     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4863     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4864         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4865     return entry;
4866   }
4867 
4868   // code for comparing 16 bytes of strings with same encoding
4869   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4870     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4871     __ ldr(rscratch1, Address(__ post(str1, 8)));
4872     __ eor(rscratch2, tmp1, tmp2);
4873     __ ldr(cnt1, Address(__ post(str2, 8)));
4874     __ cbnz(rscratch2, DIFF1);
4875     __ ldr(tmp1, Address(__ post(str1, 8)));
4876     __ eor(rscratch2, rscratch1, cnt1);
4877     __ ldr(tmp2, Address(__ post(str2, 8)));
4878     __ cbnz(rscratch2, DIFF2);
4879   }
4880 
4881   // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
4882   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4883       Label &DIFF2) {
4884     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4885     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4886 
4887     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4888     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4889     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4890     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4891 
4892     __ fmovd(tmpL, vtmp3);
4893     __ eor(rscratch2, tmp3, tmpL);
4894     __ cbnz(rscratch2, DIFF2);
4895 
4896     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4897     __ umov(tmpL, vtmp3, __ D, 1);
4898     __ eor(rscratch2, tmpU, tmpL);
4899     __ cbnz(rscratch2, DIFF1);
4900 
4901     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4902     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4903     __ fmovd(tmpL, vtmp);
4904     __ eor(rscratch2, tmp3, tmpL);
4905     __ cbnz(rscratch2, DIFF2);
4906 
4907     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4908     __ umov(tmpL, vtmp, __ D, 1);
4909     __ eor(rscratch2, tmpU, tmpL);
4910     __ cbnz(rscratch2, DIFF1);
4911   }
4912 
4913   // r0  = result
4914   // r1  = str1
4915   // r2  = cnt1
4916   // r3  = str2
4917   // r4  = cnt2
4918   // r10 = tmp1
4919   // r11 = tmp2
4920   address generate_compare_long_string_different_encoding(bool isLU) {
4921     __ align(CodeEntryAlignment);
4922     StubCodeMark mark(this, "StubRoutines", isLU
4923         ? "compare_long_string_different_encoding LU"
4924         : "compare_long_string_different_encoding UL");
4925     address entry = __ pc();
4926     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4927         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4928         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4929     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4930         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4931     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4932     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4933 
4934     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4935 
4936     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4937     // cnt2 == number of characters left to compare
4938     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4939     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4940     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4941     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4942     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4943     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
4944     __ eor(rscratch2, tmp1, tmp2);
4945     __ mov(rscratch1, tmp2);
4946     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4947     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4948              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4949     __ push(spilled_regs, sp);
4950     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4951     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4952 
4953     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4954 
4955     if (SoftwarePrefetchHintDistance >= 0) {
4956       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4957       __ br(__ LT, NO_PREFETCH);
4958       __ bind(LARGE_LOOP_PREFETCH);
4959         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4960         __ mov(tmp4, 2);
4961         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4962         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4963           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4964           __ subs(tmp4, tmp4, 1);
4965           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4966           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4967           __ mov(tmp4, 2);
4968         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4969           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4970           __ subs(tmp4, tmp4, 1);
4971           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4972           __ sub(cnt2, cnt2, 64);
4973           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4974           __ br(__ GE, LARGE_LOOP_PREFETCH);
4975     }
4976     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4977     __ bind(NO_PREFETCH);
4978     __ subs(cnt2, cnt2, 16);
4979     __ br(__ LT, TAIL);
4980     __ align(OptoLoopAlignment);
4981     __ bind(SMALL_LOOP); // smaller loop
4982       __ subs(cnt2, cnt2, 16);
4983       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4984       __ br(__ GE, SMALL_LOOP);
4985       __ cmn(cnt2, (u1)16);
4986       __ br(__ EQ, LOAD_LAST);
4987     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4988       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4989       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4990       __ ldr(tmp3, Address(cnt1, -8));
4991       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4992       __ b(LOAD_LAST);
4993     __ bind(DIFF2);
4994       __ mov(tmpU, tmp3);
4995     __ bind(DIFF1);
4996       __ pop(spilled_regs, sp);
4997       __ b(CALCULATE_DIFFERENCE);
4998     __ bind(LOAD_LAST);
4999       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5000       // No need to load them again
5001       __ mov(tmpU, tmp3);
5002       __ pop(spilled_regs, sp);
5003 
5004       // tmp2 points to the address of the last 4 Latin1 characters right now
5005       __ ldrs(vtmp, Address(tmp2));
5006       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5007       __ fmovd(tmpL, vtmp);
5008 
5009       __ eor(rscratch2, tmpU, tmpL);
5010       __ cbz(rscratch2, DONE);
5011 
5012     // Find the first different characters in the longwords and
5013     // compute their difference.
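         // For example, if the first differing character is char 2 of the four
         // 16-bit characters in each word, the XOR in rscratch2 first becomes
         // non-zero in bits 32..47; rev moves those bytes into the most significant
         // non-zero position, so clz returns a value in [32, 48), andr(..., -16)
         // rounds it down to 32, and shifting both words right by 32 before uxthw
         // isolates the two differing characters for the final subw.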
5014     __ bind(CALCULATE_DIFFERENCE);
5015       __ rev(rscratch2, rscratch2);
5016       __ clz(rscratch2, rscratch2);
5017       __ andr(rscratch2, rscratch2, -16);
5018       __ lsrv(tmp1, tmp1, rscratch2);
5019       __ uxthw(tmp1, tmp1);
5020       __ lsrv(rscratch1, rscratch1, rscratch2);
5021       __ uxthw(rscratch1, rscratch1);
5022       __ subw(result, tmp1, rscratch1);
5023     __ bind(DONE);
5024       __ ret(lr);
5025     return entry;
5026   }
5027 
5028   address generate_method_entry_barrier() {
5029     __ align(CodeEntryAlignment);
5030     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5031 
5032     Label deoptimize_label;
5033 
5034     address start = __ pc();
5035 
5036     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5037 
5038     __ enter();
5039     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5040 
5041     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5042 
5043     __ push_call_clobbered_registers();
5044 
5045     __ mov(c_rarg0, rscratch2);
5046     __ call_VM_leaf
5047          (CAST_FROM_FN_PTR
5048           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5049 
5050     __ reset_last_Java_frame(true);
5051 
5052     __ mov(rscratch1, r0);
5053 
5054     __ pop_call_clobbered_registers();
5055 
5056     __ cbnz(rscratch1, deoptimize_label);
5057 
5058     __ leave();
5059     __ ret(lr);
5060 
5061     __ BIND(deoptimize_label);
5062 
5063     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5064     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5065 
5066     __ mov(sp, rscratch1);
5067     __ br(rscratch2);
5068 
5069     return start;
5070   }
5071 
5072   // r0  = result
5073   // r1  = str1
5074   // r2  = cnt1
5075   // r3  = str2
5076   // r4  = cnt2
5077   // r10 = tmp1
5078   // r11 = tmp2
5079   address generate_compare_long_string_same_encoding(bool isLL) {
5080     __ align(CodeEntryAlignment);
5081     StubCodeMark mark(this, "StubRoutines", isLL
5082         ? "compare_long_string_same_encoding LL"
5083         : "compare_long_string_same_encoding UU");
5084     address entry = __ pc();
5085     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5086         tmp1 = r10, tmp2 = r11;
5087     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
5088         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
5089         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
5090     // exit from the large loop when fewer than 64 bytes are left to read or we're
5091     // about to prefetch memory beyond the array border
5092     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5093     // cnt1/cnt2 contain the number of characters to compare. cnt1 can be re-used
5094     // adjust the cnt2 counter for the 8 bytes already loaded
5095     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5096     // update pointers, because of previous read
5097     __ add(str1, str1, wordSize);
5098     __ add(str2, str2, wordSize);
5099     if (SoftwarePrefetchHintDistance >= 0) {
5100       __ bind(LARGE_LOOP_PREFETCH);
5101         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5102         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5103         compare_string_16_bytes_same(DIFF, DIFF2);
5104         compare_string_16_bytes_same(DIFF, DIFF2);
5105         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5106         compare_string_16_bytes_same(DIFF, DIFF2);
5107         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5108         compare_string_16_bytes_same(DIFF, DIFF2);
5109         __ br(__ GT, LARGE_LOOP_PREFETCH);
5110         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
5111     }
5112     // less than 16 bytes left?
5113     __ subs(cnt2, cnt2, isLL ? 16 : 8);
5114     __ br(__ LT, TAIL);
5115     __ align(OptoLoopAlignment);
5116     __ bind(SMALL_LOOP);
5117       compare_string_16_bytes_same(DIFF, DIFF2);
5118       __ subs(cnt2, cnt2, isLL ? 16 : 8);
5119       __ br(__ GE, SMALL_LOOP);
5120     __ bind(TAIL);
5121       __ adds(cnt2, cnt2, isLL ? 16 : 8);
5122       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
5123       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5124       __ br(__ LE, CHECK_LAST);
5125       __ eor(rscratch2, tmp1, tmp2);
5126       __ cbnz(rscratch2, DIFF);
5127       __ ldr(tmp1, Address(__ post(str1, 8)));
5128       __ ldr(tmp2, Address(__ post(str2, 8)));
5129       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5130     __ bind(CHECK_LAST);
5131       if (!isLL) {
5132         __ add(cnt2, cnt2, cnt2); // now in bytes
5133       }
5134       __ eor(rscratch2, tmp1, tmp2);
5135       __ cbnz(rscratch2, DIFF);
5136       __ ldr(rscratch1, Address(str1, cnt2));
5137       __ ldr(cnt1, Address(str2, cnt2));
5138       __ eor(rscratch2, rscratch1, cnt1);
5139       __ cbz(rscratch2, LENGTH_DIFF);
5140       // Find the first different characters in the longwords and
5141       // compute their difference.
5142     __ bind(DIFF2);
5143       __ rev(rscratch2, rscratch2);
5144       __ clz(rscratch2, rscratch2);
5145       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5146       __ lsrv(rscratch1, rscratch1, rscratch2);
5147       if (isLL) {
5148         __ lsrv(cnt1, cnt1, rscratch2);
5149         __ uxtbw(rscratch1, rscratch1);
5150         __ uxtbw(cnt1, cnt1);
5151       } else {
5152         __ lsrv(cnt1, cnt1, rscratch2);
5153         __ uxthw(rscratch1, rscratch1);
5154         __ uxthw(cnt1, cnt1);
5155       }
5156       __ subw(result, rscratch1, cnt1);
5157       __ b(LENGTH_DIFF);
5158     __ bind(DIFF);
5159       __ rev(rscratch2, rscratch2);
5160       __ clz(rscratch2, rscratch2);
5161       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5162       __ lsrv(tmp1, tmp1, rscratch2);
5163       if (isLL) {
5164         __ lsrv(tmp2, tmp2, rscratch2);
5165         __ uxtbw(tmp1, tmp1);
5166         __ uxtbw(tmp2, tmp2);
5167       } else {
5168         __ lsrv(tmp2, tmp2, rscratch2);
5169         __ uxthw(tmp1, tmp1);
5170         __ uxthw(tmp2, tmp2);
5171       }
5172       __ subw(result, tmp1, tmp2);
5173       __ b(LENGTH_DIFF);
5174     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
5175       __ eor(rscratch2, tmp1, tmp2);
5176       __ cbnz(rscratch2, DIFF);
5177     __ bind(LENGTH_DIFF);
5178       __ ret(lr);
5179     return entry;
5180   }
5181 
5182   void generate_compare_long_strings() {
5183       StubRoutines::aarch64::_compare_long_string_LL
5184           = generate_compare_long_string_same_encoding(true);
5185       StubRoutines::aarch64::_compare_long_string_UU
5186           = generate_compare_long_string_same_encoding(false);
5187       StubRoutines::aarch64::_compare_long_string_LU
5188           = generate_compare_long_string_different_encoding(true);
5189       StubRoutines::aarch64::_compare_long_string_UL
5190           = generate_compare_long_string_different_encoding(false);
5191   }
5192 
5193   // R0 = result
5194   // R1 = str2
5195   // R2 = cnt1
5196   // R3 = str1
5197   // R4 = cnt2
5198   // This generic linear code uses a few additional ideas that make it faster:
5199   // 1) we can safely keep at least the 1st register of the pattern (since
5200   // length >= 8) and skip the initial load (helps on systems with 1 ld pipeline)
5201   // 2) we can use a "fast" algorithm for finding occurrences of the pattern's
5202   // 1st character, with fewer branches (1 branch per loaded register instead
5203   // of one per symbol); this is where constants like 0x0101...01,
5204   // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the
5205   // sketch right after this comment)
5206   // 3) after loading and analyzing the 1st register of the source string, it
5207   // can be used to search for every occurrence of the 1st character, saving a
5208   // few loads compared with a "simpler-but-slower" implementation
5209   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5210   // re-initializes and compresses register values; this makes the code larger and
5211   // less readable, but most extra operations issue during loads or branches, so the penalty is minimal
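       //
       // A sketch of that "fast" first-character search (Latin1 case; the UTF-16
       // case uses the 16-bit constants 0x0001...0001 / 0x7fff...7fff). The names
       // first_char and text_word are illustrative only:
       //
       //   long pattern = first_char * 0x0101010101010101L;  // replicate the byte
       //   long x = text_word ^ pattern;                      // zero byte <=> match
       //   long hit = (x - 0x0101010101010101L) & ~(x | 0x7f7f7f7f7f7f7f7fL);
       //   // hit is non-zero iff some byte of text_word equals first_char; the
       //   // position of its lowest set bit locates the first candidate match.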
5212   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5213     const char* stubName = str1_isL
5214         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5215         : "indexof_linear_uu";
5216     __ align(CodeEntryAlignment);
5217     StubCodeMark mark(this, "StubRoutines", stubName);
5218     address entry = __ pc();
5219 
5220     int str1_chr_size = str1_isL ? 1 : 2;
5221     int str2_chr_size = str2_isL ? 1 : 2;
5222     int str1_chr_shift = str1_isL ? 0 : 1;
5223     int str2_chr_shift = str2_isL ? 0 : 1;
5224     bool isL = str1_isL && str2_isL;
5225     // parameters
5226     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5227     // temporary registers
5228     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5229     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5230     // redefinitions
5231     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5232 
5233     __ push(spilled_regs, sp);
5234     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5235         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5236         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5237         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5238         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5239         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5240     // Read whole register from str1. It is safe, because length >=8 here
5241     __ ldr(ch1, Address(str1));
5242     // Read whole register from str2. It is safe, because length >=8 here
5243     __ ldr(ch2, Address(str2));
5244     __ sub(cnt2, cnt2, cnt1);
5245     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5246     if (str1_isL != str2_isL) {
5247       __ eor(v0, __ T16B, v0, v0);
5248     }
5249     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5250     __ mul(first, first, tmp1);
5251     // check if we have less than 1 register to check
5252     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5253     if (str1_isL != str2_isL) {
5254       __ fmovd(v1, ch1);
5255     }
5256     __ br(__ LE, L_SMALL);
5257     __ eor(ch2, first, ch2);
5258     if (str1_isL != str2_isL) {
5259       __ zip1(v1, __ T16B, v1, v0);
5260     }
5261     __ sub(tmp2, ch2, tmp1);
5262     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5263     __ bics(tmp2, tmp2, ch2);
5264     if (str1_isL != str2_isL) {
5265       __ fmovd(ch1, v1);
5266     }
5267     __ br(__ NE, L_HAS_ZERO);
5268     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5269     __ add(result, result, wordSize/str2_chr_size);
5270     __ add(str2, str2, wordSize);
5271     __ br(__ LT, L_POST_LOOP);
5272     __ BIND(L_LOOP);
5273       __ ldr(ch2, Address(str2));
5274       __ eor(ch2, first, ch2);
5275       __ sub(tmp2, ch2, tmp1);
5276       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5277       __ bics(tmp2, tmp2, ch2);
5278       __ br(__ NE, L_HAS_ZERO);
5279     __ BIND(L_LOOP_PROCEED);
5280       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5281       __ add(str2, str2, wordSize);
5282       __ add(result, result, wordSize/str2_chr_size);
5283       __ br(__ GE, L_LOOP);
5284     __ BIND(L_POST_LOOP);
5285       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5286       __ br(__ LE, NOMATCH);
5287       __ ldr(ch2, Address(str2));
5288       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5289       __ eor(ch2, first, ch2);
5290       __ sub(tmp2, ch2, tmp1);
5291       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5292       __ mov(tmp4, -1); // all bits set
5293       __ b(L_SMALL_PROCEED);
5294     __ align(OptoLoopAlignment);
5295     __ BIND(L_SMALL);
5296       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5297       __ eor(ch2, first, ch2);
5298       if (str1_isL != str2_isL) {
5299         __ zip1(v1, __ T16B, v1, v0);
5300       }
5301       __ sub(tmp2, ch2, tmp1);
5302       __ mov(tmp4, -1); // all bits set
5303       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5304       if (str1_isL != str2_isL) {
5305         __ fmovd(ch1, v1); // move converted 4 symbols
5306       }
5307     __ BIND(L_SMALL_PROCEED);
5308       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5309       __ bic(tmp2, tmp2, ch2);
5310       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5311       __ rbit(tmp2, tmp2);
5312       __ br(__ EQ, NOMATCH);
5313     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5314       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5315       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5316       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5317       if (str2_isL) { // LL
5318         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5319         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5320         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5321         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5322         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5323       } else {
5324         __ mov(ch2, 0xE); // all bits in byte set except last one
5325         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5326         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5327         __ lslv(tmp2, tmp2, tmp4);
5328         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5329         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5330         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5331         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5332       }
5333       __ cmp(ch1, ch2);
5334       __ mov(tmp4, wordSize/str2_chr_size);
5335       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5336     __ BIND(L_SMALL_CMP_LOOP);
5337       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5338                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5339       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5340                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5341       __ add(tmp4, tmp4, 1);
5342       __ cmp(tmp4, cnt1);
5343       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5344       __ cmp(first, ch2);
5345       __ br(__ EQ, L_SMALL_CMP_LOOP);
5346     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5347       __ cbz(tmp2, NOMATCH); // no more matches. exit
5348       __ clz(tmp4, tmp2);
5349       __ add(result, result, 1); // advance index
5350       __ add(str2, str2, str2_chr_size); // advance pointer
5351       __ b(L_SMALL_HAS_ZERO_LOOP);
5352     __ align(OptoLoopAlignment);
5353     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5354       __ cmp(first, ch2);
5355       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5356       __ b(DONE);
5357     __ align(OptoLoopAlignment);
5358     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5359       if (str2_isL) { // LL
5360         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5361         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5362         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5363         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5364         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5365       } else {
5366         __ mov(ch2, 0xE); // all bits in byte set except last one
5367         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5368         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5369         __ lslv(tmp2, tmp2, tmp4);
5370         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5371         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5372         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5373         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5374       }
5375       __ cmp(ch1, ch2);
5376       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5377       __ b(DONE);
5378     __ align(OptoLoopAlignment);
5379     __ BIND(L_HAS_ZERO);
5380       __ rbit(tmp2, tmp2);
5381       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5382       // Now compress the counters (cnt2 and cnt1) into one register.
5383       // This is fine because both counters are 32-bit and are not changed in this
5384       // loop; they are simply restored on exit. So cnt1 can be re-used in this loop.
5385       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5386       __ sub(result, result, 1);
5387     __ BIND(L_HAS_ZERO_LOOP);
5388       __ mov(cnt1, wordSize/str2_chr_size);
5389       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5390       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5391       if (str2_isL) {
5392         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5393         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5394         __ lslv(tmp2, tmp2, tmp4);
5395         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5396         __ add(tmp4, tmp4, 1);
5397         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5398         __ lsl(tmp2, tmp2, 1);
5399         __ mov(tmp4, wordSize/str2_chr_size);
5400       } else {
5401         __ mov(ch2, 0xE);
5402         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5403         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5404         __ lslv(tmp2, tmp2, tmp4);
5405         __ add(tmp4, tmp4, 1);
5406         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5407         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5408         __ lsl(tmp2, tmp2, 1);
5409         __ mov(tmp4, wordSize/str2_chr_size);
5410         __ sub(str2, str2, str2_chr_size);
5411       }
5412       __ cmp(ch1, ch2);
5413       __ mov(tmp4, wordSize/str2_chr_size);
5414       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5415     __ BIND(L_CMP_LOOP);
5416       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5417                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5418       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5419                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5420       __ add(tmp4, tmp4, 1);
5421       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5422       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5423       __ cmp(cnt1, ch2);
5424       __ br(__ EQ, L_CMP_LOOP);
5425     __ BIND(L_CMP_LOOP_NOMATCH);
5426       // at this point we have a mismatch
5427       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5428       __ clz(tmp4, tmp2);
5429       __ add(str2, str2, str2_chr_size); // advance pointer
5430       __ b(L_HAS_ZERO_LOOP);
5431     __ align(OptoLoopAlignment);
5432     __ BIND(L_CMP_LOOP_LAST_CMP);
5433       __ cmp(cnt1, ch2);
5434       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5435       __ b(DONE);
5436     __ align(OptoLoopAlignment);
5437     __ BIND(L_CMP_LOOP_LAST_CMP2);
5438       if (str2_isL) {
5439         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5440         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5441         __ lslv(tmp2, tmp2, tmp4);
5442         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5443         __ add(tmp4, tmp4, 1);
5444         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5445         __ lsl(tmp2, tmp2, 1);
5446       } else {
5447         __ mov(ch2, 0xE);
5448         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5449         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5450         __ lslv(tmp2, tmp2, tmp4);
5451         __ add(tmp4, tmp4, 1);
5452         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5453         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5454         __ lsl(tmp2, tmp2, 1);
5455         __ sub(str2, str2, str2_chr_size);
5456       }
5457       __ cmp(ch1, ch2);
5458       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5459       __ b(DONE);
5460     __ align(OptoLoopAlignment);
5461     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5462       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
5463       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5464       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5465       // respective higher bits were not changed. L_LOOP_PROCEED will increase
5466       // result by the number of analyzed characters, so we can just reset the
5467       // lower bits of result here: clear the 2 lower bits for UU/UL and 3 for LL.
5468       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5469       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
5470       // of the last analyzed substring inside the current octet, so str2 is at the
5471       // respective start address and needs to be advanced to the next octet.
5472       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5473       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5474       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5475       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5476       __ movw(cnt2, cnt2);
5477       __ b(L_LOOP_PROCEED);
5478     __ align(OptoLoopAlignment);
5479     __ BIND(NOMATCH);
5480       __ mov(result, -1);
5481     __ BIND(DONE);
5482       __ pop(spilled_regs, sp);
5483       __ ret(lr);
5484     return entry;
5485   }
5486 
5487   void generate_string_indexof_stubs() {
5488     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5489     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5490     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5491   }
5492 
5493   void inflate_and_store_2_fp_registers(bool generatePrfm,
5494       FloatRegister src1, FloatRegister src2) {
5495     Register dst = r1;
5496     __ zip1(v1, __ T16B, src1, v0);
5497     __ zip2(v2, __ T16B, src1, v0);
5498     if (generatePrfm) {
5499       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5500     }
5501     __ zip1(v3, __ T16B, src2, v0);
5502     __ zip2(v4, __ T16B, src2, v0);
5503     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5504   }
5505 
5506   // R0 = src
5507   // R1 = dst
5508   // R2 = len
5509   // R3 = len >> 3
5510   // V0 = 0
5511   // v1 = loaded 8 bytes
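       //
       // Inflation widens each Latin1 byte into a 16-bit char by interleaving the
       // source bytes with zero bytes (zip1/zip2 against the zeroed v0). A scalar
       // sketch of the effect (illustrative only):
       //
       //   for (int i = 0; i < len; i++) {
       //     dst[i] = (char) (src[i] & 0xff);
       //   }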
5512   address generate_large_byte_array_inflate() {
5513     __ align(CodeEntryAlignment);
5514     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5515     address entry = __ pc();
5516     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5517     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5518     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5519 
5520     // Do one more 8-byte read so that the address is 16-byte aligned in most
5521     // cases, and so that a single store instruction can be used.
5522     __ ldrd(v2, __ post(src, 8));
5523     __ sub(octetCounter, octetCounter, 2);
5524     __ zip1(v1, __ T16B, v1, v0);
5525     __ zip1(v2, __ T16B, v2, v0);
5526     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5527     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5528     __ subs(rscratch1, octetCounter, large_loop_threshold);
5529     __ br(__ LE, LOOP_START);
5530     __ b(LOOP_PRFM_START);
5531     __ bind(LOOP_PRFM);
5532       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5533     __ bind(LOOP_PRFM_START);
5534       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5535       __ sub(octetCounter, octetCounter, 8);
5536       __ subs(rscratch1, octetCounter, large_loop_threshold);
5537       inflate_and_store_2_fp_registers(true, v3, v4);
5538       inflate_and_store_2_fp_registers(true, v5, v6);
5539       __ br(__ GT, LOOP_PRFM);
5540       __ cmp(octetCounter, (u1)8);
5541       __ br(__ LT, DONE);
5542     __ bind(LOOP);
5543       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5544       __ bind(LOOP_START);
5545       __ sub(octetCounter, octetCounter, 8);
5546       __ cmp(octetCounter, (u1)8);
5547       inflate_and_store_2_fp_registers(false, v3, v4);
5548       inflate_and_store_2_fp_registers(false, v5, v6);
5549       __ br(__ GE, LOOP);
5550     __ bind(DONE);
5551       __ ret(lr);
5552     return entry;
5553   }
5554 
5555   /**
5556    *  Arguments:
5557    *
5558    *  Input:
5559    *  c_rarg0   - current state address
5560    *  c_rarg1   - H key address
5561    *  c_rarg2   - data address
5562    *  c_rarg3   - number of blocks
5563    *
5564    *  Output:
5565    *  Updated state at c_rarg0
5566    */
5567   address generate_ghash_processBlocks() {
5568     // Bafflingly, GCM uses little-endian for the byte order, but
5569     // big-endian for the bit order.  For example, the polynomial 1 is
5570     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5571     //
5572     // So, we must either reverse the bytes in each word and do
5573     // everything big-endian or reverse the bits in each byte and do
5574     // it little-endian.  On AArch64 it's more idiomatic to reverse
5575     // the bits in each byte (we have an instruction, RBIT, to do
5576     // that) and keep the data in little-endian bit order throughout the
5577     // calculation, bit-reversing the inputs and outputs.
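         //
         // In arithmetic terms, each iteration of the loop below computes
         //
         //   state = (state ^ data_block) * H   in GF(2^128),
         //
         // with the product reduced modulo the GHASH polynomial
         // x^128 + x^7 + x^2 + x + 1 (whose low-order bits form the 0x87 constant
         // emitted just below).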
5578 
5579     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5580     __ align(wordSize * 2);
5581     address p = __ pc();
5582     __ emit_int64(0x87);  // The low-order bits of the field
5583                           // polynomial (i.e. p = z^7+z^2+z+1)
5584                           // repeated in the low and high parts of a
5585                           // 128-bit vector
5586     __ emit_int64(0x87);
5587 
5588     __ align(CodeEntryAlignment);
5589     address start = __ pc();
5590 
5591     Register state   = c_rarg0;
5592     Register subkeyH = c_rarg1;
5593     Register data    = c_rarg2;
5594     Register blocks  = c_rarg3;
5595 
5596     FloatRegister vzr = v30;
5597     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5598 
5599     __ ldrq(v24, p);    // The field polynomial
5600 
5601     __ ldrq(v0, Address(state));
5602     __ ldrq(v1, Address(subkeyH));
5603 
5604     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5605     __ rbit(v0, __ T16B, v0);
5606     __ rev64(v1, __ T16B, v1);
5607     __ rbit(v1, __ T16B, v1);
5608 
5609     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
5610     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5611 
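    // Main loop.  Each iteration performs the standard GHASH update
    //   state = (state ^ data[i]) * H   in GF(2^128),
    // reduced by the field polynomial x^128 + x^7 + x^2 + x + 1, except that
    // all values are kept in the bit-reflected representation described above.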
5612     {
5613       Label L_ghash_loop;
5614       __ bind(L_ghash_loop);
5615 
5616       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5617                                                  // reversing each byte
5618       __ rbit(v2, __ T16B, v2);
5619       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5620 
5621       // Multiply state in v2 by subkey in v1
5622       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5623                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5624                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5625       // Reduce v7:v5 by the field polynomial
5626       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5627 
5628       __ sub(blocks, blocks, 1);
5629       __ cbnz(blocks, L_ghash_loop);
5630     }
5631 
5632     // The bit-reversed result is at this point in v0
5633     __ rev64(v0, __ T16B, v0);
5634     __ rbit(v0, __ T16B, v0);
5635 
5636     __ st1(v0, __ T16B, state);
5637     __ ret(lr);
5638 
5639     return start;
5640   }
5641 
5642   address generate_ghash_processBlocks_wide() {
5643     address small = generate_ghash_processBlocks();
5644 
5645     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5646     __ align(wordSize * 2);
5647     address p = __ pc();
5648     __ emit_int64(0x87);  // The low-order bits of the field
5649                           // polynomial (i.e. p = z^7+z^2+z+1)
5650                           // repeated in the low and high parts of a
5651                           // 128-bit vector
5652     __ emit_int64(0x87);
5653 
5654     __ align(CodeEntryAlignment);
5655     address start = __ pc();
5656 
5657     Register state   = c_rarg0;
5658     Register subkeyH = c_rarg1;
5659     Register data    = c_rarg2;
5660     Register blocks  = c_rarg3;
5661 
5662     const int unroll = 4;
5663 
5664     __ cmp(blocks, (unsigned char)(unroll * 2));
5665     __ br(__ LT, small);
5666 
5667     if (unroll > 1) {
5668       // Save the callee-saved vector registers (v8..v15) before the wide routine uses them
5669       __ sub(sp, sp, 4 * 16);
5670       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5671       __ sub(sp, sp, 4 * 16);
5672       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5673     }
5674 
5675     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5676 
5677     if (unroll > 1) {
5678       // And restore them afterwards
5679       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5680       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5681     }
5682 
5683     __ cmp(blocks, (unsigned char)0);
5684     __ br(__ GT, small);
5685 
5686     __ ret(lr);
5687 
5688     return start;
5689   }
5690 
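  // One SIMD round of the Base64 encoder.  In C, approximately (a scalar
  // sketch of what the per-lane shifts and ORs below compute for each
  // 3-byte group; names are illustrative only):
  //
  //   uint32_t v = (in0 << 16) | (in1 << 8) | in2;  // 3 source bytes, 24 bits
  //   out0 = codec[(v >> 18) & 0x3f];
  //   out1 = codec[(v >> 12) & 0x3f];
  //   out2 = codec[(v >>  6) & 0x3f];
  //   out3 = codec[ v        & 0x3f];
  //
  // The tbl instructions perform the codec[] lookups 16 (or 8) lanes at a time.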
5691   void generate_base64_encode_simdround(Register src, Register dst,
5692         FloatRegister codec, u8 size) {
5693 
5694     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5695     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5696     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5697 
5698     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5699 
5700     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5701 
5702     __ ushr(ind0, arrangement, in0,  2);
5703 
5704     __ ushr(ind1, arrangement, in1,  2);
5705     __ shl(in0,   arrangement, in0,  6);
5706     __ orr(ind1,  arrangement, ind1, in0);
5707     __ ushr(ind1, arrangement, ind1, 2);
5708 
5709     __ ushr(ind2, arrangement, in2,  4);
5710     __ shl(in1,   arrangement, in1,  4);
5711     __ orr(ind2,  arrangement, in1,  ind2);
5712     __ ushr(ind2, arrangement, ind2, 2);
5713 
5714     __ shl(ind3,  arrangement, in2,  2);
5715     __ ushr(ind3, arrangement, ind3, 2);
5716 
5717     __ tbl(out0,  arrangement, codec,  4, ind0);
5718     __ tbl(out1,  arrangement, codec,  4, ind1);
5719     __ tbl(out2,  arrangement, codec,  4, ind2);
5720     __ tbl(out3,  arrangement, codec,  4, ind3);
5721 
5722     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5723   }
5724 
5725   /**
5726    *  Arguments:
5727    *
5728    *  Input:
5729    *  c_rarg0   - src_start
5730    *  c_rarg1   - src_offset
5731    *  c_rarg2   - src_length
5732    *  c_rarg3   - dest_start
5733    *  c_rarg4   - dest_offset
5734    *  c_rarg5   - isURL
5735    *
5736    */
5737   address generate_base64_encodeBlock() {
5738 
5739     static const char toBase64[64] = {
5740       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5741       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5742       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5743       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5744       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5745     };
5746 
5747     static const char toBase64URL[64] = {
5748       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5749       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5750       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5751       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5752       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5753     };
5754 
5755     __ align(CodeEntryAlignment);
5756     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5757     address start = __ pc();
5758 
5759     Register src   = c_rarg0;  // source array
5760     Register soff  = c_rarg1;  // source start offset
5761     Register send  = c_rarg2;  // source end offset
5762     Register dst   = c_rarg3;  // dest array
5763     Register doff  = c_rarg4;  // position for writing to dest array
5764     Register isURL = c_rarg5;  // Base64 or URL character set
5765 
5766     // c_rarg6 and c_rarg7 are free to use as temps
5767     Register codec  = c_rarg6;
5768     Register length = c_rarg7;
5769 
5770     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5771 
5772     __ add(src, src, soff);
5773     __ add(dst, dst, doff);
5774     __ sub(length, send, soff);
5775 
5776     // load the codec base address
5777     __ lea(codec, ExternalAddress((address) toBase64));
5778     __ cbz(isURL, ProcessData);
5779     __ lea(codec, ExternalAddress((address) toBase64URL));
5780 
5781     __ BIND(ProcessData);
5782 
5783     // too short to form a SIMD loop, fall back to the scalar 3-byte loop
5784     __ cmp(length, (u1)24);
5785     __ br(Assembler::LT, Process3B);
5786 
5787     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5788 
5789     __ BIND(Process48B);
5790     __ cmp(length, (u1)48);
5791     __ br(Assembler::LT, Process24B);
5792     generate_base64_encode_simdround(src, dst, v0, 16);
5793     __ sub(length, length, 48);
5794     __ b(Process48B);
5795 
5796     __ BIND(Process24B);
5797     __ cmp(length, (u1)24);
5798     __ br(Assembler::LT, SIMDExit);
5799     generate_base64_encode_simdround(src, dst, v0, 8);
5800     __ sub(length, length, 24);
5801 
5802     __ BIND(SIMDExit);
5803     __ cbz(length, Exit);
5804 
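    // Scalar tail.  Roughly (names illustrative): pack the three source bytes
    // into a 24-bit value v = (b0 << 16) | (b1 << 8) | b2 and emit
    // codec[v >> 18], codec[(v >> 12) & 63], codec[(v >> 6) & 63], codec[v & 63].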
5805     __ BIND(Process3B);
5806     //  3 src bytes, 24 bits
5807     __ ldrb(r10, __ post(src, 1));
5808     __ ldrb(r11, __ post(src, 1));
5809     __ ldrb(r12, __ post(src, 1));
5810     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5811     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5812     // codec index
5813     __ ubfmw(r15, r12, 18, 23);
5814     __ ubfmw(r14, r12, 12, 17);
5815     __ ubfmw(r13, r12, 6,  11);
5816     __ andw(r12,  r12, 63);
5817     // get the code based on the codec
5818     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5819     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5820     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5821     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5822     __ strb(r15, __ post(dst, 1));
5823     __ strb(r14, __ post(dst, 1));
5824     __ strb(r13, __ post(dst, 1));
5825     __ strb(r12, __ post(dst, 1));
5826     __ sub(length, length, 3);
5827     __ cbnz(length, Process3B);
5828 
5829     __ BIND(Exit);
5830     __ ret(lr);
5831 
5832     return start;
5833   }
5834 
5835   void generate_base64_decode_simdround(Register src, Register dst,
5836         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5837 
5838     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5839     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5840 
5841     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5842     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5843 
5844     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5845 
5846     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5847 
5848     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5849 
5850     // we need an unsigned saturating subtract, to make sure all input values
5851     // in the range [0, 63] produce index 0 for the higher-half lookup
5852     __ uqsubv(decH0, __ T16B, in0, v27);
5853     __ uqsubv(decH1, __ T16B, in1, v27);
5854     __ uqsubv(decH2, __ T16B, in2, v27);
5855     __ uqsubv(decH3, __ T16B, in3, v27);
5856 
5857     // lower half lookup
5858     __ tbl(decL0, arrangement, codecL, 4, in0);
5859     __ tbl(decL1, arrangement, codecL, 4, in1);
5860     __ tbl(decL2, arrangement, codecL, 4, in2);
5861     __ tbl(decL3, arrangement, codecL, 4, in3);
5862 
5863     // higher half lookup
5864     __ tbx(decH0, arrangement, codecH, 4, decH0);
5865     __ tbx(decH1, arrangement, codecH, 4, decH1);
5866     __ tbx(decH2, arrangement, codecH, 4, decH2);
5867     __ tbx(decH3, arrangement, codecH, 4, decH3);
5868 
5869     // combine lower and higher
5870     __ orr(decL0, arrangement, decL0, decH0);
5871     __ orr(decL1, arrangement, decL1, decH1);
5872     __ orr(decL2, arrangement, decL2, decH2);
5873     __ orr(decL3, arrangement, decL3, decH3);
5874 
5875     // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
5876     __ cmhi(decH0, arrangement, decL0, v27);
5877     __ cmhi(decH1, arrangement, decL1, v27);
5878     __ cmhi(decH2, arrangement, decL2, v27);
5879     __ cmhi(decH3, arrangement, decL3, v27);
5880     __ orr(in0, arrangement, decH0, decH1);
5881     __ orr(in1, arrangement, decH2, decH3);
5882     __ orr(in2, arrangement, in0,   in1);
5883     __ umaxv(in3, arrangement, in2);
5884     __ umov(rscratch2, in3, __ B, 0);
5885 
5886     // get the data to output
5887     __ shl(out0,  arrangement, decL0, 2);
5888     __ ushr(out1, arrangement, decL1, 4);
5889     __ orr(out0,  arrangement, out0,  out1);
5890     __ shl(out1,  arrangement, decL1, 4);
5891     __ ushr(out2, arrangement, decL2, 2);
5892     __ orr(out1,  arrangement, out1,  out2);
5893     __ shl(out2,  arrangement, decL2, 6);
5894     __ orr(out2,  arrangement, out2,  decL3);
5895 
5896     __ cbz(rscratch2, NoIllegalData);
5897 
5898     // handle illegal input
5899     __ umov(r10, in2, __ D, 0);
5900     if (size == 16) {
5901       __ cbnz(r10, ErrorInLowerHalf);
5902 
5903       // illegal input is in higher half, store the lower half now.
5904       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5905 
5906       __ umov(r10, in2,  __ D, 1);
5907       __ umov(r11, out0, __ D, 1);
5908       __ umov(r12, out1, __ D, 1);
5909       __ umov(r13, out2, __ D, 1);
5910       __ b(StoreLegalData);
5911 
5912       __ BIND(ErrorInLowerHalf);
5913     }
5914     __ umov(r11, out0, __ D, 0);
5915     __ umov(r12, out1, __ D, 0);
5916     __ umov(r13, out2, __ D, 0);
5917 
5918     __ BIND(StoreLegalData);
5919     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5920     __ strb(r11, __ post(dst, 1));
5921     __ strb(r12, __ post(dst, 1));
5922     __ strb(r13, __ post(dst, 1));
5923     __ lsr(r10, r10, 8);
5924     __ lsr(r11, r11, 8);
5925     __ lsr(r12, r12, 8);
5926     __ lsr(r13, r13, 8);
5927     __ b(StoreLegalData);
5928 
5929     __ BIND(NoIllegalData);
5930     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5931   }
5932 
5933 
5934   /**
5935    *  Arguments:
5936    *
5937    *  Input:
5938    *  c_rarg0   - src_start
5939    *  c_rarg1   - src_offset
5940    *  c_rarg2   - src_length
5941    *  c_rarg3   - dest_start
5942    *  c_rarg4   - dest_offset
5943    *  c_rarg5   - isURL
5944    *  c_rarg6   - isMIME
5945    *
5946    */
5947   address generate_base64_decodeBlock() {
5948 
5949     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
5950     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
5951     // titled "Base64 decoding".
5952 
5953     // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
5954     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
5955     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
5956     static const uint8_t fromBase64ForNoSIMD[256] = {
5957       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5958       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5959       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5960        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5961       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5962        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5963       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5964        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5965       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5966       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5967       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5968       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5969       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5970       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5971       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5972       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5973     };
5974 
5975     static const uint8_t fromBase64URLForNoSIMD[256] = {
5976       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5977       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5978       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5979        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5980       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5981        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5982       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5983        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5984       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5985       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5986       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5987       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5988       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5989       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5990       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5991       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5992     };
5993 
5994     // A legal Base64 code value is in the range [0, 127].  We need two lookups
5995     // with tbl/tbx and combine them to get the decoded data.  The 1st table vector
5996     // lookup uses tbl: out-of-range indices are set to 0 in the destination.  The 2nd
5997     // table vector lookup uses tbx: out-of-range indices leave the destination
5998     // unchanged.  Input [64..126] is mapped to table index [65, 127] in the second lookup.
5999     // The value at index 64 is set to 0, so inputs already decoded by the 1st lookup
6000     // pick up 0 here and the final orr leaves them unchanged.
6001     static const uint8_t fromBase64ForSIMD[128] = {
6002       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6003       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6004       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6005        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6006         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6007        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6008       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6009        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6010     };
6011 
6012     static const uint8_t fromBase64URLForSIMD[128] = {
6013       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6014       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6015       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6016        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6017         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6018        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6019        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6020        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6021     };
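
    // Per input byte b, the two-table decode sketched in the comment above
    // amounts to roughly (illustrative names; codecL/codecH are the low and
    // high 64-byte halves of the 128-byte table):
    //
    //   lo  = codecL[b];                  // tbl: yields 0 when b > 63
    //   hi  = codecH[sat_sub_u8(b, 63)];  // tbx: covers b in [64, 127]
    //   dec = lo | hi;                    // any result > 63 is flagged illegal below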
6022 
6023     __ align(CodeEntryAlignment);
6024     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6025     address start = __ pc();
6026 
6027     Register src    = c_rarg0;  // source array
6028     Register soff   = c_rarg1;  // source start offset
6029     Register send   = c_rarg2;  // source end offset
6030     Register dst    = c_rarg3;  // dest array
6031     Register doff   = c_rarg4;  // position for writing to dest array
6032     Register isURL  = c_rarg5;  // Base64 or URL character set
6033     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6034 
6035     Register length = send;    // reuse send as length of source data to process
6036 
6037     Register simd_codec   = c_rarg6;
6038     Register nosimd_codec = c_rarg7;
6039 
6040     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6041 
6042     __ enter();
6043 
6044     __ add(src, src, soff);
6045     __ add(dst, dst, doff);
6046 
6047     __ mov(doff, dst);
6048 
6049     __ sub(length, send, soff);
6050     __ bfm(length, zr, 0, 1);  // clear the low two bits: round length down to a multiple of 4
6051 
6052     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6053     __ cbz(isURL, ProcessData);
6054     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6055 
6056     __ BIND(ProcessData);
6057     __ mov(rscratch1, length);
6058     __ cmp(length, (u1)144); // 144 = 80 + 64
6059     __ br(Assembler::LT, Process4B);
6060 
6061     // In the MIME case, the line length cannot be more than 76
6062     // bytes (see RFC 2045). This is too short a block for SIMD
6063     // to be worthwhile, so we use non-SIMD here.
6064     __ movw(rscratch1, 79);
6065 
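    // Scalar loop: decode four 6-bit values b0..b3 (looked up below) into
    // three output bytes.  Roughly:
    //   v = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;   // 24 bits
    //   out[0] = v >> 16;  out[1] = (v >> 8) & 0xff;  out[2] = v & 0xff;
    // The bfi/bfm/rev16 sequence assembles out[0..1] for a single strh and
    // out[2] for the strb.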
6066     __ BIND(Process4B);
6067     __ ldrw(r14, __ post(src, 4));
6068     __ ubfxw(r10, r14, 0,  8);
6069     __ ubfxw(r11, r14, 8,  8);
6070     __ ubfxw(r12, r14, 16, 8);
6071     __ ubfxw(r13, r14, 24, 8);
6072     // get the decoded values
6073     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6074     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6075     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6076     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6077     // error detection, 255u indicates an illegal input
6078     __ orrw(r14, r10, r11);
6079     __ orrw(r15, r12, r13);
6080     __ orrw(r14, r14, r15);
6081     __ tbnz(r14, 7, Exit);
6082     // recover the data
6083     __ lslw(r14, r10, 10);
6084     __ bfiw(r14, r11, 4, 6);
6085     __ bfmw(r14, r12, 2, 5);
6086     __ rev16w(r14, r14);
6087     __ bfiw(r13, r12, 6, 2);
6088     __ strh(r14, __ post(dst, 2));
6089     __ strb(r13, __ post(dst, 1));
6090     // non-simd loop
6091     __ subsw(rscratch1, rscratch1, 4);
6092     __ br(Assembler::GT, Process4B);
6093 
6094     // if we came through the 80-byte pre-processing path above, rscratch1 == -1;
6095     // otherwise (scalar-only path), rscratch1 == 0.
6096     __ cbzw(rscratch1, Exit);
6097     __ sub(length, length, 80);
6098 
6099     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6100     __ cbz(isURL, SIMDEnter);
6101     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6102 
6103     __ BIND(SIMDEnter);
6104     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6105     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6106     __ mov(rscratch1, 63);
6107     __ dup(v27, __ T16B, rscratch1);
6108 
6109     __ BIND(Process64B);
6110     __ cmp(length, (u1)64);
6111     __ br(Assembler::LT, Process32B);
6112     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6113     __ sub(length, length, 64);
6114     __ b(Process64B);
6115 
6116     __ BIND(Process32B);
6117     __ cmp(length, (u1)32);
6118     __ br(Assembler::LT, SIMDExit);
6119     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6120     __ sub(length, length, 32);
6121     __ b(Process32B);
6122 
6123     __ BIND(SIMDExit);
6124     __ cbz(length, Exit);
6125     __ movw(rscratch1, length);
6126     __ b(Process4B);
6127 
6128     __ BIND(Exit);
6129     __ sub(c_rarg0, dst, doff);
6130 
6131     __ leave();
6132     __ ret(lr);
6133 
6134     return start;
6135   }
6136 
6137 #ifdef LINUX
6138 
6139   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6140   //
6141   // If LSE is in use, generate LSE versions of all the stubs. The
6142   // non-LSE versions are in atomic_aarch64.S.
6143 
6144   // class AtomicStubMark records the entry point of a stub and the
6145   // stub pointer which will point to it. The stub pointer is set to
6146   // the entry point when ~AtomicStubMark() is called, which must be
6147   // after ICache::invalidate_range. This ensures safe publication of
6148   // the generated code.
6149   class AtomicStubMark {
6150     address _entry_point;
6151     aarch64_atomic_stub_t *_stub;
6152     MacroAssembler *_masm;
6153   public:
6154     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6155       _masm = masm;
6156       __ align(32);
6157       _entry_point = __ pc();
6158       _stub = stub;
6159     }
6160     ~AtomicStubMark() {
6161       *_stub = (aarch64_atomic_stub_t)_entry_point;
6162     }
6163   };
6164 
6165   // NB: For memory_order_conservative we need a trailing membar after
6166   // LSE atomic operations but not a leading membar.
6167   //
6168   // We don't need a leading membar because a clause in the Arm ARM
6169   // says:
6170   //
6171   //   Barrier-ordered-before
6172   //
6173   //   Barrier instructions order prior Memory effects before subsequent
6174   //   Memory effects generated by the same Observer. A read or a write
6175   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6176   //   Observer if and only if RW1 appears in program order before RW2
6177   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6178   //   instruction with both Acquire and Release semantics.
6179   //
6180   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6181   // and Release semantics, therefore we don't need a leading
6182   // barrier. However, there is no corresponding Barrier-ordered-after
6183   // relationship, therefore we need a trailing membar to prevent a
6184   // later store or load from being reordered with the store in an
6185   // atomic instruction.
6186   //
6187   // This was checked by using the herd7 consistency model simulator
6188   // (http://diy.inria.fr/) with this test case:
6189   //
6190   // AArch64 LseCas
6191   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6192   // P0 | P1;
6193   // LDR W4, [X2] | MOV W3, #0;
6194   // DMB LD       | MOV W4, #1;
6195   // LDR W3, [X1] | CASAL W3, W4, [X1];
6196   //              | DMB ISH;
6197   //              | STR W4, [X2];
6198   // exists
6199   // (0:X3=0 /\ 0:X4=1)
6200   //
6201   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6202   // with the store to x in P1. Without the DMB in P1 this may happen.
6203   //
6204   // At the time of writing we don't know of any AArch64 hardware that
6205   // reorders stores in this way, but the Reference Manual permits it.
6206 
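  // The CAS stubs below implement, in effect (a C-like sketch, names
  // illustrative; the atomicity comes from the single LSE CAS instruction):
  //
  //   T cmpxchg(T* ptr, T compare_val, T exchange_val) {
  //     T prev = *ptr;                                   // done atomically by CAS/CASAL
  //     if (prev == compare_val) *ptr = exchange_val;
  //     if (order == memory_order_conservative) trailing_membar();
  //     return prev;
  //   }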
6207   void gen_cas_entry(Assembler::operand_size size,
6208                      atomic_memory_order order) {
6209     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6210       exchange_val = c_rarg2;
6211     bool acquire, release;
6212     switch (order) {
6213       case memory_order_relaxed:
6214         acquire = false;
6215         release = false;
6216         break;
6217       case memory_order_release:
6218         acquire = false;
6219         release = true;
6220         break;
6221       default:
6222         acquire = true;
6223         release = true;
6224         break;
6225     }
6226     __ mov(prev, compare_val);
6227     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6228     if (order == memory_order_conservative) {
6229       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6230     }
6231     if (size == Assembler::xword) {
6232       __ mov(r0, prev);
6233     } else {
6234       __ movw(r0, prev);
6235     }
6236     __ ret(lr);
6237   }
6238 
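  // The fetch-add and exchange stubs have the same shape: one LSE instruction
  // followed by the trailing membar discussed above (both are only used with
  // memory_order_conservative).  Roughly:
  //
  //   T fetch_add(T* ptr, T incr) { T prev = *ptr; *ptr += incr; membar(); return prev; }
  //   T xchg(T* ptr, T newv)      { T prev = *ptr; *ptr = newv;  membar(); return prev; }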
6239   void gen_ldaddal_entry(Assembler::operand_size size) {
6240     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6241     __ ldaddal(size, incr, prev, addr);
6242     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6243     if (size == Assembler::xword) {
6244       __ mov(r0, prev);
6245     } else {
6246       __ movw(r0, prev);
6247     }
6248     __ ret(lr);
6249   }
6250 
6251   void gen_swpal_entry(Assembler::operand_size size) {
6252     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6253     __ swpal(size, incr, prev, addr);
6254     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6255     if (size == Assembler::xword) {
6256       __ mov(r0, prev);
6257     } else {
6258       __ movw(r0, prev);
6259     }
6260     __ ret(lr);
6261   }
6262 
6263   void generate_atomic_entry_points() {
6264     if (! UseLSE) {
6265       return;
6266     }
6267 
6268     __ align(CodeEntryAlignment);
6269     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6270     address first_entry = __ pc();
6271 
6272     // All memory_order_conservative
6273     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6274     gen_ldaddal_entry(Assembler::word);
6275     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6276     gen_ldaddal_entry(Assembler::xword);
6277 
6278     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6279     gen_swpal_entry(Assembler::word);
6280     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6281     gen_swpal_entry(Assembler::xword);
6282 
6283     // CAS, memory_order_conservative
6284     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6285     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6286     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6287     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6288     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6289     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6290 
6291     // CAS, memory_order_relaxed
6292     AtomicStubMark mark_cmpxchg_1_relaxed
6293       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6294     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6295     AtomicStubMark mark_cmpxchg_4_relaxed
6296       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6297     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6298     AtomicStubMark mark_cmpxchg_8_relaxed
6299       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6300     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6301 
6302     AtomicStubMark mark_cmpxchg_4_release
6303       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6304     gen_cas_entry(MacroAssembler::word, memory_order_release);
6305     AtomicStubMark mark_cmpxchg_8_release
6306       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6307     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6308 
6309     AtomicStubMark mark_cmpxchg_4_seq_cst
6310       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6311     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6312     AtomicStubMark mark_cmpxchg_8_seq_cst
6313       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6314     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6315 
6316     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6317   }
6318 #endif // LINUX
6319 
6320   // Continuation point for throwing of implicit exceptions that are
6321   // not handled in the current activation. Fabricates an exception
6322   // oop and initiates normal exception dispatching in this
6323   // frame. Since we need to preserve callee-saved values (currently
6324   // only for C2, but done for C1 as well) we need a callee-saved oop
6325   // map and therefore have to make these stubs into RuntimeStubs
6326   // rather than BufferBlobs.  If the compiler needs all registers to
6327   // be preserved between the fault point and the exception handler
6328   // then it must assume responsibility for that in
6329   // AbstractCompiler::continuation_for_implicit_null_exception or
6330   // continuation_for_implicit_division_by_zero_exception. All other
6331   // implicit exceptions (e.g., NullPointerException or
6332   // AbstractMethodError on entry) are either at call sites or
6333   // otherwise assume that stack unwinding will be initiated, so
6334   // caller saved registers were assumed volatile in the compiler.
6335 
6336 #undef __
6337 #define __ masm->
6338 
6339   address generate_throw_exception(const char* name,
6340                                    address runtime_entry,
6341                                    Register arg1 = noreg,
6342                                    Register arg2 = noreg) {
6343     // Information about frame layout at time of blocking runtime call.
6344     // Note that we only have to preserve callee-saved registers since
6345     // the compilers are responsible for supplying a continuation point
6346     // if they expect all registers to be preserved.
6347     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6348     enum layout {
6349       rfp_off = 0,
6350       rfp_off2,
6351       return_off,
6352       return_off2,
6353       framesize // inclusive of return address
6354     };
6355 
6356     int insts_size = 512;
6357     int locs_size  = 64;
6358 
6359     CodeBuffer code(name, insts_size, locs_size);
6360     OopMapSet* oop_maps  = new OopMapSet();
6361     MacroAssembler* masm = new MacroAssembler(&code);
6362 
6363     address start = __ pc();
6364 
6365     // This is an inlined and slightly modified version of call_VM
6366     // which has the ability to fetch the return PC out of
6367     // thread-local storage and also sets up last_Java_sp slightly
6368     // differently than the real call_VM
6369 
6370     __ enter(); // Save FP and LR before call
6371 
6372     assert(is_even(framesize/2), "sp not 16-byte aligned");
6373 
6374     // lr and fp are already in place
6375     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6376 
6377     int frame_complete = __ pc() - start;
6378 
6379     // Set up last_Java_sp and last_Java_fp
6380     address the_pc = __ pc();
6381     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6382 
6383     // Call runtime
6384     if (arg1 != noreg) {
6385       assert(arg2 != c_rarg1, "clobbered");
6386       __ mov(c_rarg1, arg1);
6387     }
6388     if (arg2 != noreg) {
6389       __ mov(c_rarg2, arg2);
6390     }
6391     __ mov(c_rarg0, rthread);
6392     BLOCK_COMMENT("call runtime_entry");
6393     __ mov(rscratch1, runtime_entry);
6394     __ blr(rscratch1);
6395 
6396     // Generate oop map
6397     OopMap* map = new OopMap(framesize, 0);
6398 
6399     oop_maps->add_gc_map(the_pc - start, map);
6400 
6401     __ reset_last_Java_frame(true);
6402 
6403     // Reinitialize the ptrue predicate register, in case the external runtime
6404     // call clobbers ptrue reg, as we may return to SVE compiled code.
6405     __ reinitialize_ptrue();
6406 
6407     __ leave();
6408 
6409     // check for pending exceptions
6410 #ifdef ASSERT
6411     Label L;
6412     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6413     __ cbnz(rscratch1, L);
6414     __ should_not_reach_here();
6415     __ bind(L);
6416 #endif // ASSERT
6417     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6418 
6419 
6420     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6421     RuntimeStub* stub =
6422       RuntimeStub::new_runtime_stub(name,
6423                                     &code,
6424                                     frame_complete,
6425                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6426                                     oop_maps, false);
6427     return stub->entry_point();
6428   }
6429 
6430   class MontgomeryMultiplyGenerator : public MacroAssembler {
6431 
6432     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6433       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6434 
6435     RegSet _toSave;
6436     bool _squaring;
6437 
6438   public:
6439     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6440       : MacroAssembler(as->code()), _squaring(squaring) {
6441 
6442       // Register allocation
6443 
6444       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6445       Pa_base = *regs;       // Argument registers
6446       if (squaring)
6447         Pb_base = Pa_base;
6448       else
6449         Pb_base = *++regs;
6450       Pn_base = *++regs;
6451       Rlen = *++regs;
6452       inv = *++regs;
6453       Pm_base = *++regs;
6454 
6455                           // Working registers:
6456       Ra =  *++regs;        // The current digit of a, b, n, and m.
6457       Rb =  *++regs;
6458       Rm =  *++regs;
6459       Rn =  *++regs;
6460 
6461       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6462       Pb =  *++regs;
6463       Pm =  *++regs;
6464       Pn =  *++regs;
6465 
6466       t0 =  *++regs;        // Three registers which form a
6467       t1 =  *++regs;        // triple-precision accumulator.
6468       t2 =  *++regs;
6469 
6470       Ri =  *++regs;        // Inner and outer loop indexes.
6471       Rj =  *++regs;
6472 
6473       Rhi_ab = *++regs;     // Product registers: low and high parts
6474       Rlo_ab = *++regs;     // of a*b and m*n.
6475       Rhi_mn = *++regs;
6476       Rlo_mn = *++regs;
6477 
6478       // r19 and up are callee-saved.
6479       _toSave = RegSet::range(r19, *regs) + Pm_base;
6480     }
6481 
6482   private:
6483     void save_regs() {
6484       push(_toSave, sp);
6485     }
6486 
6487     void restore_regs() {
6488       pop(_toSave, sp);
6489     }
6490 
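    // unroll_2 executes `block` exactly `count` times, two calls per loop
    // iteration; an odd count is handled by branching straight to the second
    // call on the first pass.  Roughly: while (count-- > 0) block();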
6491     template <typename T>
6492     void unroll_2(Register count, T block) {
6493       Label loop, end, odd;
6494       tbnz(count, 0, odd);
6495       cbz(count, end);
6496       align(16);
6497       bind(loop);
6498       (this->*block)();
6499       bind(odd);
6500       (this->*block)();
6501       subs(count, count, 2);
6502       br(Assembler::GT, loop);
6503       bind(end);
6504     }
6505 
6506     template <typename T>
6507     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6508       Label loop, end, odd;
6509       tbnz(count, 0, odd);
6510       cbz(count, end);
6511       align(16);
6512       bind(loop);
6513       (this->*block)(d, s, tmp);
6514       bind(odd);
6515       (this->*block)(d, s, tmp);
6516       subs(count, count, 2);
6517       br(Assembler::GT, loop);
6518       bind(end);
6519     }
6520 
6521     void pre1(RegisterOrConstant i) {
6522       block_comment("pre1");
6523       // Pa = Pa_base;
6524       // Pb = Pb_base + i;
6525       // Pm = Pm_base;
6526       // Pn = Pn_base + i;
6527       // Ra = *Pa;
6528       // Rb = *Pb;
6529       // Rm = *Pm;
6530       // Rn = *Pn;
6531       ldr(Ra, Address(Pa_base));
6532       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6533       ldr(Rm, Address(Pm_base));
6534       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6535       lea(Pa, Address(Pa_base));
6536       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6537       lea(Pm, Address(Pm_base));
6538       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6539 
6540       // Zero the m*n result.
6541       mov(Rhi_mn, zr);
6542       mov(Rlo_mn, zr);
6543     }
6544 
6545     // The core multiply-accumulate step of a Montgomery
6546     // multiplication.  The idea is to schedule operations as a
6547     // pipeline so that instructions with long latencies (loads and
6548     // multiplies) have time to complete before their results are
6549     // used.  This benefits in-order implementations of the architecture
6550     // the most, but out-of-order ones also benefit.
6551     void step() {
6552       block_comment("step");
6553       // MACC(Ra, Rb, t0, t1, t2);
6554       // Ra = *++Pa;
6555       // Rb = *--Pb;
6556       umulh(Rhi_ab, Ra, Rb);
6557       mul(Rlo_ab, Ra, Rb);
6558       ldr(Ra, pre(Pa, wordSize));
6559       ldr(Rb, pre(Pb, -wordSize));
6560       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6561                                        // previous iteration.
6562       // MACC(Rm, Rn, t0, t1, t2);
6563       // Rm = *++Pm;
6564       // Rn = *--Pn;
6565       umulh(Rhi_mn, Rm, Rn);
6566       mul(Rlo_mn, Rm, Rn);
6567       ldr(Rm, pre(Pm, wordSize));
6568       ldr(Rn, pre(Pn, -wordSize));
6569       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6570     }
6571 
6572     void post1() {
6573       block_comment("post1");
6574 
6575       // MACC(Ra, Rb, t0, t1, t2);
6576       // Ra = *++Pa;
6577       // Rb = *--Pb;
6578       umulh(Rhi_ab, Ra, Rb);
6579       mul(Rlo_ab, Ra, Rb);
6580       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6581       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6582 
6583       // *Pm = Rm = t0 * inv;
6584       mul(Rm, t0, inv);
6585       str(Rm, Address(Pm));
6586 
6587       // MACC(Rm, Rn, t0, t1, t2);
6588       // t0 = t1; t1 = t2; t2 = 0;
6589       umulh(Rhi_mn, Rm, Rn);
6590 
6591 #ifndef PRODUCT
6592       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6593       {
6594         mul(Rlo_mn, Rm, Rn);
6595         add(Rlo_mn, t0, Rlo_mn);
6596         Label ok;
6597         cbz(Rlo_mn, ok); {
6598           stop("broken Montgomery multiply");
6599         } bind(ok);
6600       }
6601 #endif
6602       // We have very carefully set things up so that
6603       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6604       // the lower half of Rm * Rn because we know the result already:
6605       // it must be -t0.  t0 + (-t0) must generate a carry iff
6606       // t0 != 0.  So, rather than do a mul and an adds we just set
6607       // the carry flag iff t0 is nonzero.
6608       //
6609       // mul(Rlo_mn, Rm, Rn);
6610       // adds(zr, t0, Rlo_mn);
6611       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6612       adcs(t0, t1, Rhi_mn);
6613       adc(t1, t2, zr);
6614       mov(t2, zr);
6615     }
6616 
6617     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6618       block_comment("pre2");
6619       // Pa = Pa_base + i-len;
6620       // Pb = Pb_base + len;
6621       // Pm = Pm_base + i-len;
6622       // Pn = Pn_base + len;
6623 
6624       if (i.is_register()) {
6625         sub(Rj, i.as_register(), len);
6626       } else {
6627         mov(Rj, i.as_constant());
6628         sub(Rj, Rj, len);
6629       }
6630       // Rj == i-len
6631 
6632       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6633       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6634       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6635       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6636 
6637       // Ra = *++Pa;
6638       // Rb = *--Pb;
6639       // Rm = *++Pm;
6640       // Rn = *--Pn;
6641       ldr(Ra, pre(Pa, wordSize));
6642       ldr(Rb, pre(Pb, -wordSize));
6643       ldr(Rm, pre(Pm, wordSize));
6644       ldr(Rn, pre(Pn, -wordSize));
6645 
6646       mov(Rhi_mn, zr);
6647       mov(Rlo_mn, zr);
6648     }
6649 
6650     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6651       block_comment("post2");
6652       if (i.is_constant()) {
6653         mov(Rj, i.as_constant()-len.as_constant());
6654       } else {
6655         sub(Rj, i.as_register(), len);
6656       }
6657 
6658       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6659 
6660       // As soon as we know the least significant digit of our result,
6661       // store it.
6662       // Pm_base[i-len] = t0;
6663       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6664 
6665       // t0 = t1; t1 = t2; t2 = 0;
6666       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6667       adc(t1, t2, zr);
6668       mov(t2, zr);
6669     }
6670 
6671     // A carry in t0 after Montgomery multiplication means that we
6672     // should subtract multiples of n from our result in m.  We'll
6673     // keep doing that until there is no carry.
6674     void normalize(RegisterOrConstant len) {
6675       block_comment("normalize");
6676       // while (t0)
6677       //   t0 = sub(Pm_base, Pn_base, t0, len);
6678       Label loop, post, again;
6679       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6680       cbz(t0, post); {
6681         bind(again); {
6682           mov(i, zr);
6683           mov(cnt, len);
6684           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6685           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6686           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6687           align(16);
6688           bind(loop); {
6689             sbcs(Rm, Rm, Rn);
6690             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6691             add(i, i, 1);
6692             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6693             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6694             sub(cnt, cnt, 1);
6695           } cbnz(cnt, loop);
6696           sbc(t0, t0, zr);
6697         } cbnz(t0, again);
6698       } bind(post);
6699     }
6700 
6701     // Move memory at s to d, reversing words.
6702     //    Increments d to end of copied memory
6703     //    Destroys tmp1, tmp2
6704     //    Preserves len
6705     //    Leaves s pointing to the address which was in d at start
6706     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6707       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6708 
6709       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6710       mov(tmp1, len);
6711       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6712       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6713     }
6714     // where reverse1 swaps the two 32-bit halves of one word:
6715     void reverse1(Register d, Register s, Register tmp) {
6716       ldr(tmp, pre(s, -wordSize));
6717       ror(tmp, tmp, 32);
6718       str(tmp, post(d, wordSize));
6719     }
6720 
6721     void step_squaring() {
6722       // An extra ACC
6723       step();
6724       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6725     }
6726 
6727     void last_squaring(RegisterOrConstant i) {
6728       Label dont;
6729       // if ((i & 1) == 0) {
6730       tbnz(i.as_register(), 0, dont); {
6731         // MACC(Ra, Rb, t0, t1, t2);
6732         // Ra = *++Pa;
6733         // Rb = *--Pb;
6734         umulh(Rhi_ab, Ra, Rb);
6735         mul(Rlo_ab, Ra, Rb);
6736         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6737       } bind(dont);
6738     }
6739 
6740     void extra_step_squaring() {
6741       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6742 
6743       // MACC(Rm, Rn, t0, t1, t2);
6744       // Rm = *++Pm;
6745       // Rn = *--Pn;
6746       umulh(Rhi_mn, Rm, Rn);
6747       mul(Rlo_mn, Rm, Rn);
6748       ldr(Rm, pre(Pm, wordSize));
6749       ldr(Rn, pre(Pn, -wordSize));
6750     }
6751 
6752     void post1_squaring() {
6753       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6754 
6755       // *Pm = Rm = t0 * inv;
6756       mul(Rm, t0, inv);
6757       str(Rm, Address(Pm));
6758 
6759       // MACC(Rm, Rn, t0, t1, t2);
6760       // t0 = t1; t1 = t2; t2 = 0;
6761       umulh(Rhi_mn, Rm, Rn);
6762 
6763 #ifndef PRODUCT
6764       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6765       {
6766         mul(Rlo_mn, Rm, Rn);
6767         add(Rlo_mn, t0, Rlo_mn);
6768         Label ok;
6769         cbz(Rlo_mn, ok); {
6770           stop("broken Montgomery multiply");
6771         } bind(ok);
6772       }
6773 #endif
6774       // We have very carefully set things up so that
6775       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6776       // the lower half of Rm * Rn because we know the result already:
6777       // it must be -t0.  t0 + (-t0) must generate a carry iff
6778       // t0 != 0.  So, rather than do a mul and an adds we just set
6779       // the carry flag iff t0 is nonzero.
6780       //
6781       // mul(Rlo_mn, Rm, Rn);
6782       // adds(zr, t0, Rlo_mn);
6783       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6784       adcs(t0, t1, Rhi_mn);
6785       adc(t1, t2, zr);
6786       mov(t2, zr);
6787     }
6788 
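    // acc() plus the preceding umulh/mul pair implements the MACC used in the
    // pseudocode comments: add the 128-bit product into the triple-precision
    // accumulator t2:t1:t0.  Roughly:
    //
    //   unsigned __int128 p = (unsigned __int128)A * B;
    //   t0 += (julong)p;                  // adds
    //   t1 += (julong)(p >> 64) + carry;  // adcs
    //   t2 += carry;                      // adc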
6789     void acc(Register Rhi, Register Rlo,
6790              Register t0, Register t1, Register t2) {
6791       adds(t0, t0, Rlo);
6792       adcs(t1, t1, Rhi);
6793       adc(t2, t2, zr);
6794     }
6795 
6796   public:
6797     /**
6798      * Fast Montgomery multiplication.  The derivation of the
6799      * algorithm is in A Cryptographic Library for the Motorola
6800      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6801      *
6802      * Arguments:
6803      *
6804      * Inputs for multiplication:
6805      *   c_rarg0   - int array elements a
6806      *   c_rarg1   - int array elements b
6807      *   c_rarg2   - int array elements n (the modulus)
6808      *   c_rarg3   - int length
6809      *   c_rarg4   - int inv
6810      *   c_rarg5   - int array elements m (the result)
6811      *
6812      * Inputs for squaring:
6813      *   c_rarg0   - int array elements a
6814      *   c_rarg1   - int array elements n (the modulus)
6815      *   c_rarg2   - int length
6816      *   c_rarg3   - int inv
6817      *   c_rarg4   - int array elements m (the result)
6818      *
6819      */
6820     address generate_multiply() {
6821       Label argh, nothing;
6822       bind(argh);
6823       stop("MontgomeryMultiply total_allocation must be <= 8192");
6824 
6825       align(CodeEntryAlignment);
6826       address entry = pc();
6827 
6828       cbzw(Rlen, nothing);
6829 
6830       enter();
6831 
6832       // Make room.
6833       cmpw(Rlen, 512);
6834       br(Assembler::HI, argh);
6835       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6836       andr(sp, Ra, -2 * wordSize);
6837 
6838       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6839 
6840       {
6841         // Copy input args, reversing as we go.  We use Ra as a
6842         // temporary variable.
6843         reverse(Ra, Pa_base, Rlen, t0, t1);
6844         if (!_squaring)
6845           reverse(Ra, Pb_base, Rlen, t0, t1);
6846         reverse(Ra, Pn_base, Rlen, t0, t1);
6847       }
6848 
6849       // Push all callee-saved registers and also Pm_base which we'll need
6850       // at the end.
6851       save_regs();
6852 
6853 #ifndef PRODUCT
6854       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
6855       {
6856         ldr(Rn, Address(Pn_base, 0));
6857         mul(Rlo_mn, Rn, inv);
6858         subs(zr, Rlo_mn, -1);
6859         Label ok;
6860         br(EQ, ok); {
6861           stop("broken inverse in Montgomery multiply");
6862         } bind(ok);
6863       }
6864 #endif
6865 
6866       mov(Pm_base, Ra);
6867 
6868       mov(t0, zr);
6869       mov(t1, zr);
6870       mov(t2, zr);
6871 
6872       block_comment("for (int i = 0; i < len; i++) {");
6873       mov(Ri, zr); {
6874         Label loop, end;
6875         cmpw(Ri, Rlen);
6876         br(Assembler::GE, end);
6877 
6878         bind(loop);
6879         pre1(Ri);
6880 
6881         block_comment("  for (j = i; j; j--) {"); {
6882           movw(Rj, Ri);
6883           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6884         } block_comment("  } // j");
6885 
6886         post1();
6887         addw(Ri, Ri, 1);
6888         cmpw(Ri, Rlen);
6889         br(Assembler::LT, loop);
6890         bind(end);
6891         block_comment("} // i");
6892       }
6893 
6894       block_comment("for (int i = len; i < 2*len; i++) {");
6895       mov(Ri, Rlen); {
6896         Label loop, end;
6897         cmpw(Ri, Rlen, Assembler::LSL, 1);
6898         br(Assembler::GE, end);
6899 
6900         bind(loop);
6901         pre2(Ri, Rlen);
6902 
6903         block_comment("  for (j = len*2-i-1; j; j--) {"); {
6904           lslw(Rj, Rlen, 1);
6905           subw(Rj, Rj, Ri);
6906           subw(Rj, Rj, 1);
6907           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6908         } block_comment("  } // j");
6909 
6910         post2(Ri, Rlen);
6911         addw(Ri, Ri, 1);
6912         cmpw(Ri, Rlen, Assembler::LSL, 1);
6913         br(Assembler::LT, loop);
6914         bind(end);
6915       }
6916       block_comment("} // i");
6917 
6918       normalize(Rlen);
6919 
6920       mov(Ra, Pm_base);  // Save Pm_base in Ra
6921       restore_regs();  // Restore caller's Pm_base
6922 
6923       // Copy our result into caller's Pm_base
6924       reverse(Pm_base, Ra, Rlen, t0, t1);
6925 
6926       leave();
6927       bind(nothing);
6928       ret(lr);
6929 
6930       return entry;
6931     }
6932     // In C, approximately:
6933 
6934     // void
6935     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
6936     //                     julong Pn_base[], julong Pm_base[],
6937     //                     julong inv, int len) {
6938     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
6939     //   julong *Pa, *Pb, *Pn, *Pm;
6940     //   julong Ra, Rb, Rn, Rm;
6941 
6942     //   int i;
6943 
6944     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
6945 
6946     //   for (i = 0; i < len; i++) {
6947     //     int j;
6948 
6949     //     Pa = Pa_base;
6950     //     Pb = Pb_base + i;
6951     //     Pm = Pm_base;
6952     //     Pn = Pn_base + i;
6953 
6954     //     Ra = *Pa;
6955     //     Rb = *Pb;
6956     //     Rm = *Pm;
6957     //     Rn = *Pn;
6958 
6959     //     int iters = i;
6960     //     for (j = 0; iters--; j++) {
6961     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6962     //       MACC(Ra, Rb, t0, t1, t2);
6963     //       Ra = *++Pa;
6964     //       Rb = *--Pb;
6965     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6966     //       MACC(Rm, Rn, t0, t1, t2);
6967     //       Rm = *++Pm;
6968     //       Rn = *--Pn;
6969     //     }
6970 
6971     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
6972     //     MACC(Ra, Rb, t0, t1, t2);
6973     //     *Pm = Rm = t0 * inv;
6974     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
6975     //     MACC(Rm, Rn, t0, t1, t2);
6976 
6977     //     assert(t0 == 0, "broken Montgomery multiply");
6978 
6979     //     t0 = t1; t1 = t2; t2 = 0;
6980     //   }
6981 
6982     //   for (i = len; i < 2*len; i++) {
6983     //     int j;
6984 
6985     //     Pa = Pa_base + i-len;
6986     //     Pb = Pb_base + len;
6987     //     Pm = Pm_base + i-len;
6988     //     Pn = Pn_base + len;
6989 
6990     //     Ra = *++Pa;
6991     //     Rb = *--Pb;
6992     //     Rm = *++Pm;
6993     //     Rn = *--Pn;
6994 
6995     //     int iters = len*2-i-1;
6996     //     for (j = i-len+1; iters--; j++) {
6997     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6998     //       MACC(Ra, Rb, t0, t1, t2);
6999     //       Ra = *++Pa;
7000     //       Rb = *--Pb;
7001     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7002     //       MACC(Rm, Rn, t0, t1, t2);
7003     //       Rm = *++Pm;
7004     //       Rn = *--Pn;
7005     //     }
7006 
7007     //     Pm_base[i-len] = t0;
7008     //     t0 = t1; t1 = t2; t2 = 0;
7009     //   }
7010 
7011     //   while (t0)
7012     //     t0 = sub(Pm_base, Pn_base, t0, len);
7013     // }
7014 
7015     /**
7016      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7017      * multiplies than Montgomery multiplication so it should be up to
7018      * 25% faster.  However, its loop control is more complex and it
7019      * may actually run slower on some machines.
7020      *
7021      * Arguments:
7022      *
7023      * Inputs:
7024      *   c_rarg0   - int array elements a
7025      *   c_rarg1   - int array elements n (the modulus)
7026      *   c_rarg2   - int length
7027      *   c_rarg3   - int inv
7028      *   c_rarg4   - int array elements m (the result)
7029      *
7030      */
7031     address generate_square() {
7032       Label argh;
7033       bind(argh);
7034       stop("MontgomeryMultiply total_allocation must be <= 8192");
7035 
7036       align(CodeEntryAlignment);
7037       address entry = pc();
7038 
7039       enter();
7040 
7041       // Make room.
7042       cmpw(Rlen, 512);
7043       br(Assembler::HI, argh);
7044       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7045       andr(sp, Ra, -2 * wordSize);
7046 
7047       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7048 
7049       {
7050         // Copy input args, reversing as we go.  We use Ra as a
7051         // temporary variable.
7052         reverse(Ra, Pa_base, Rlen, t0, t1);
7053         reverse(Ra, Pn_base, Rlen, t0, t1);
7054       }
7055 
7056       // Push all call-saved registers and also Pm_base which we'll need
7057       // at the end.
7058       save_regs();
7059 
7060       mov(Pm_base, Ra);
7061 
7062       mov(t0, zr);
7063       mov(t1, zr);
7064       mov(t2, zr);
7065 
7066       block_comment("for (int i = 0; i < len; i++) {");
7067       mov(Ri, zr); {
7068         Label loop, end;
7069         bind(loop);
7070         cmp(Ri, Rlen);
7071         br(Assembler::GE, end);
7072 
7073         pre1(Ri);
7074 
7075         block_comment("for (j = (i+1)/2; j; j--) {"); {
7076           add(Rj, Ri, 1);
7077           lsr(Rj, Rj, 1);
7078           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7079         } block_comment("  } // j");
7080 
7081         last_squaring(Ri);
7082 
7083         block_comment("  for (j = i/2; j; j--) {"); {
7084           lsr(Rj, Ri, 1);
7085           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7086         } block_comment("  } // j");
7087 
7088         post1_squaring();
7089         add(Ri, Ri, 1);
7090         cmp(Ri, Rlen);
7091         br(Assembler::LT, loop);
7092 
7093         bind(end);
7094         block_comment("} // i");
7095       }
7096 
7097       block_comment("for (int i = len; i < 2*len; i++) {");
7098       mov(Ri, Rlen); {
7099         Label loop, end;
7100         bind(loop);
7101         cmp(Ri, Rlen, Assembler::LSL, 1);
7102         br(Assembler::GE, end);
7103 
7104         pre2(Ri, Rlen);
7105 
7106         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7107           lsl(Rj, Rlen, 1);
7108           sub(Rj, Rj, Ri);
7109           sub(Rj, Rj, 1);
7110           lsr(Rj, Rj, 1);
7111           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7112         } block_comment("  } // j");
7113 
7114         last_squaring(Ri);
7115 
7116         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7117           lsl(Rj, Rlen, 1);
7118           sub(Rj, Rj, Ri);
7119           lsr(Rj, Rj, 1);
7120           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7121         } block_comment("  } // j");
7122 
7123         post2(Ri, Rlen);
7124         add(Ri, Ri, 1);
7125         cmp(Ri, Rlen, Assembler::LSL, 1);
7126 
7127         br(Assembler::LT, loop);
7128         bind(end);
7129         block_comment("} // i");
7130       }
7131 
7132       normalize(Rlen);
7133 
7134       mov(Ra, Pm_base);  // Save Pm_base in Ra
7135       restore_regs();  // Restore caller's Pm_base
7136 
7137       // Copy our result into caller's Pm_base
7138       reverse(Pm_base, Ra, Rlen, t0, t1);
7139 
7140       leave();
7141       ret(lr);
7142 
7143       return entry;
7144     }
7145     // In C, approximately:
7146 
7147     // void
7148     // montgomery_square(julong Pa_base[], julong Pn_base[],
7149     //                   julong Pm_base[], julong inv, int len) {
7150     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7151     //   julong *Pa, *Pb, *Pn, *Pm;
7152     //   julong Ra, Rb, Rn, Rm;
7153 
7154     //   int i;
7155 
7156     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7157 
7158     //   for (i = 0; i < len; i++) {
7159     //     int j;
7160 
7161     //     Pa = Pa_base;
7162     //     Pb = Pa_base + i;
7163     //     Pm = Pm_base;
7164     //     Pn = Pn_base + i;
7165 
7166     //     Ra = *Pa;
7167     //     Rb = *Pb;
7168     //     Rm = *Pm;
7169     //     Rn = *Pn;
7170 
7171     //     int iters = (i+1)/2;
7172     //     for (j = 0; iters--; j++) {
7173     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7174     //       MACC2(Ra, Rb, t0, t1, t2);
7175     //       Ra = *++Pa;
7176     //       Rb = *--Pb;
7177     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7178     //       MACC(Rm, Rn, t0, t1, t2);
7179     //       Rm = *++Pm;
7180     //       Rn = *--Pn;
7181     //     }
7182     //     if ((i & 1) == 0) {
7183     //       assert(Ra == Pa_base[j], "must be");
7184     //       MACC(Ra, Ra, t0, t1, t2);
7185     //     }
7186     //     iters = i/2;
7187     //     assert(iters == i-j, "must be");
7188     //     for (; iters--; j++) {
7189     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7190     //       MACC(Rm, Rn, t0, t1, t2);
7191     //       Rm = *++Pm;
7192     //       Rn = *--Pn;
7193     //     }
7194 
7195     //     *Pm = Rm = t0 * inv;
7196     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7197     //     MACC(Rm, Rn, t0, t1, t2);
7198 
7199     //     assert(t0 == 0, "broken Montgomery multiply");
7200 
7201     //     t0 = t1; t1 = t2; t2 = 0;
7202     //   }
7203 
7204     //   for (i = len; i < 2*len; i++) {
7205     //     int start = i-len+1;
7206     //     int end = start + (len - start)/2;
7207     //     int j;
7208 
7209     //     Pa = Pa_base + i-len;
7210     //     Pb = Pa_base + len;
7211     //     Pm = Pm_base + i-len;
7212     //     Pn = Pn_base + len;
7213 
7214     //     Ra = *++Pa;
7215     //     Rb = *--Pb;
7216     //     Rm = *++Pm;
7217     //     Rn = *--Pn;
7218 
7219     //     int iters = (2*len-i-1)/2;
7220     //     assert(iters == end-start, "must be");
7221     //     for (j = start; iters--; j++) {
7222     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7223     //       MACC2(Ra, Rb, t0, t1, t2);
7224     //       Ra = *++Pa;
7225     //       Rb = *--Pb;
7226     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7227     //       MACC(Rm, Rn, t0, t1, t2);
7228     //       Rm = *++Pm;
7229     //       Rn = *--Pn;
7230     //     }
7231     //     if ((i & 1) == 0) {
7232     //       assert(Ra == Pa_base[j], "must be");
7233     //       MACC(Ra, Ra, t0, t1, t2);
7234     //     }
7235     //     iters =  (2*len-i)/2;
7236     //     assert(iters == len-j, "must be");
7237     //     for (; iters--; j++) {
7238     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7239     //       MACC(Rm, Rn, t0, t1, t2);
7240     //       Rm = *++Pm;
7241     //       Rn = *--Pn;
7242     //     }
7243     //     Pm_base[i-len] = t0;
7244     //     t0 = t1; t1 = t2; t2 = 0;
7245     //   }
7246 
7247     //   while (t0)
7248     //     t0 = sub(Pm_base, Pn_base, t0, len);
7249     // }
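    //   A sketch (again not the generated code) of the sub() helper both
    //   pseudocode fragments end with: subtract the modulus n from m across
    //   len longwords, propagating the borrow, and fold the final borrow into
    //   the carry word t that was passed in.
    //
    //   static julong sub(julong Pm_base[], julong Pn_base[], julong t, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong a = Pm_base[i], b = Pn_base[i];
    //       julong d  = a - b;
    //       julong b1 = (d > a);      // borrow out of a - b
    //       julong r  = d - borrow;
    //       julong b2 = (r > d);      // borrow out of subtracting the old borrow
    //       Pm_base[i] = r;
    //       borrow = b1 | b2;
    //     }
    //     return t - borrow;          // the while loops above repeat until this is 0
    //   }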
7250   };
7251 
7252 
7253   // Called from the interpreter or from compiled code either to load the
7254   // multiple returned values of an inline type instance into registers,
7255   // or to store the returned register values into a newly allocated
7256   // inline type instance.
7257   address generate_return_value_stub(address destination, const char* name, bool has_res) {
7258     // We need to save all registers the calling convention may use so
7259     // that the runtime call can read or update those registers. This needs
7260     // to be in sync with SharedRuntime::java_return_convention().
7261     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7262     enum layout {
7263       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
7264       j_rarg6_off, j_rarg6_2,
7265       j_rarg5_off, j_rarg5_2,
7266       j_rarg4_off, j_rarg4_2,
7267       j_rarg3_off, j_rarg3_2,
7268       j_rarg2_off, j_rarg2_2,
7269       j_rarg1_off, j_rarg1_2,
7270       j_rarg0_off, j_rarg0_2,
7271 
7272       j_farg7_off, j_farg7_2,
7273       j_farg6_off, j_farg6_2,
7274       j_farg5_off, j_farg5_2,
7275       j_farg4_off, j_farg4_2,
7276       j_farg3_off, j_farg3_2,
7277       j_farg2_off, j_farg2_2,
7278       j_farg1_off, j_farg1_2,
7279       j_farg0_off, j_farg0_2,
7280 
7281       rfp_off, rfp_off2,
7282       return_off, return_off2,
7283 
7284       framesize // inclusive of return address
7285     };
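    // Each *_off/*_2 pair above names the two 32-bit VMReg stack slots that
    // hold one saved 64-bit register, so 'framesize' counts 32-bit slots and
    // the frame size in bytes below is framesize * BytesPerInt, 16-byte aligned.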
7286 
7287     CodeBuffer code(name, 512, 64);
7288     MacroAssembler* masm = new MacroAssembler(&code);
7289 
7290     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7291     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7292     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7293     int frame_size_in_words = frame_size_in_bytes / wordSize;
7294 
7295     OopMapSet* oop_maps = new OopMapSet();
7296     OopMap* map = new OopMap(frame_size_in_slots, 0);
7297 
7298     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
7299     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
7300     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7301     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7302     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7303     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7304     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7305     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7306 
7307     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7308     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7309     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7310     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7311     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7312     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7313     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7314     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7315 
7316     address start = __ pc();
7317 
7318     __ enter(); // Save FP and LR before call
7319 
7320     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
7321     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
7322     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
7323     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
7324 
7325     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
7326     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
7327     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
7328     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
7329 
7330     int frame_complete = __ offset();
7331 
7332     // Set up last_Java_sp and last_Java_fp
7333     address the_pc = __ pc();
7334     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7335 
7336     // Call runtime
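    // The target expects (current thread, Java return value).  r0 is moved
    // into c_rarg1 first because c_rarg0 is r0 itself and would otherwise be
    // clobbered by the thread pointer.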
7337     __ mov(c_rarg1, r0);
7338     __ mov(c_rarg0, rthread);
7339 
7340     __ mov(rscratch1, destination);
7341     __ blr(rscratch1);
7342 
7343     oop_maps->add_gc_map(the_pc - start, map);
7344 
7345     __ reset_last_Java_frame(false);
7346 
7347     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
7348     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
7349     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
7350     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
7351 
7352     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
7353     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
7354     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
7355     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
7356 
7357     __ leave();
7358 
7359     // check for pending exceptions
7360     Label pending;
7361     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
7362     __ cbnz(rscratch1, pending);
7363 
7364     if (has_res) {
7365       __ get_vm_result(r0, rthread);
7366     }
7367 
7368     __ ret(lr);
7369 
7370     __ bind(pending);
7371     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7372 
7373     // -------------
7374     // make sure all code is generated
7375     masm->flush();
7376 
7377     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
7378     return stub->entry_point();
7379   }
7380 
7381   // Initialization
7382   void generate_initial() {
7383     // Generate the initial stubs and initialize the entry points
7384 
7385     // Entry points that exist on all platforms. Note: this is code
7386     // that could be shared among different platforms - however the
7387     // benefit seems to be smaller than the disadvantage of having a
7388     // much more complicated generator structure. See also the comment in
7389     // stubRoutines.hpp.
7390 
7391     StubRoutines::_forward_exception_entry = generate_forward_exception();
7392 
7393     StubRoutines::_call_stub_entry =
7394       generate_call_stub(StubRoutines::_call_stub_return_address);
7395 
7396     // This entry is referenced by megamorphic calls.
7397     StubRoutines::_catch_exception_entry = generate_catch_exception();
7398 
7399     // Build this early so it's available for the interpreter.
7400     StubRoutines::_throw_StackOverflowError_entry =
7401       generate_throw_exception("StackOverflowError throw_exception",
7402                                CAST_FROM_FN_PTR(address,
7403                                                 SharedRuntime::throw_StackOverflowError));
7404     StubRoutines::_throw_delayed_StackOverflowError_entry =
7405       generate_throw_exception("delayed StackOverflowError throw_exception",
7406                                CAST_FROM_FN_PTR(address,
7407                                                 SharedRuntime::throw_delayed_StackOverflowError));
7408     if (UseCRC32Intrinsics) {
7409       // Set the table address before generating the stubs that use it.
7410       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7411       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7412     }
7413 
7414     if (UseCRC32CIntrinsics) {
7415       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7416     }
7417 
7418     // Disabled until JDK-8210858 is fixed
7419     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7420     //   StubRoutines::_dlog = generate_dlog();
7421     // }
7422 
7423     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7424       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7425     }
7426 
7427     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7428       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7429     }
7430 
7431     if (InlineTypeReturnedAsFields) {
7432       StubRoutines::_load_inline_type_fields_in_regs =
7433          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7434       StubRoutines::_store_inline_type_fields_to_buf =
7435          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7436     }
7437 
7438     // Safefetch stubs.
7439     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7440                                                        &StubRoutines::_safefetch32_fault_pc,
7441                                                        &StubRoutines::_safefetch32_continuation_pc);
7442     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7443                                                        &StubRoutines::_safefetchN_fault_pc,
7444                                                        &StubRoutines::_safefetchN_continuation_pc);
7445   }
7446 
7447   void generate_all() {
7448     // support for verify_oop (must happen after universe_init)
7449     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7450     StubRoutines::_throw_AbstractMethodError_entry =
7451       generate_throw_exception("AbstractMethodError throw_exception",
7452                                CAST_FROM_FN_PTR(address,
7453                                                 SharedRuntime::
7454                                                 throw_AbstractMethodError));
7455 
7456     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7457       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7458                                CAST_FROM_FN_PTR(address,
7459                                                 SharedRuntime::
7460                                                 throw_IncompatibleClassChangeError));
7461 
7462     StubRoutines::_throw_NullPointerException_at_call_entry =
7463       generate_throw_exception("NullPointerException at call throw_exception",
7464                                CAST_FROM_FN_PTR(address,
7465                                                 SharedRuntime::
7466                                                 throw_NullPointerException_at_call));
7467 
7468     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7469 
7470     // arraycopy stubs used by compilers
7471     generate_arraycopy_stubs();
7472 
7473     // has negatives stub for large arrays.
7474     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7475 
7476     // array equals stub for large arrays.
7477     if (!UseSimpleArrayEquals) {
7478       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7479     }
7480 
7481     generate_compare_long_strings();
7482 
7483     generate_string_indexof_stubs();
7484 
7485     // byte_array_inflate stub for large arrays.
7486     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7487 
7488     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7489     if (bs_nm != NULL) {
7490       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7491     }
7492 #ifdef COMPILER2
7493     if (UseMultiplyToLenIntrinsic) {
7494       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7495     }
7496 
7497     if (UseSquareToLenIntrinsic) {
7498       StubRoutines::_squareToLen = generate_squareToLen();
7499     }
7500 
7501     if (UseMulAddIntrinsic) {
7502       StubRoutines::_mulAdd = generate_mulAdd();
7503     }
7504 
7505     if (UseSIMDForBigIntegerShiftIntrinsics) {
7506       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7507       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7508     }
7509 
7510     if (UseMontgomeryMultiplyIntrinsic) {
7511       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7512       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7513       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7514     }
7515 
7516     if (UseMontgomerySquareIntrinsic) {
7517       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7518       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7519       // We use generate_multiply() rather than generate_square()
7520       // because it's faster for the sizes of modulus we care about.
7521       StubRoutines::_montgomerySquare = g.generate_multiply();
7522     }
7523 #endif // COMPILER2
7524 
7525     // generate GHASH intrinsics code
7526     if (UseGHASHIntrinsics) {
7527       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7528       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7529     }
7530 
7531     if (UseBASE64Intrinsics) {
7532         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7533         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7534     }
7535 
7536     // data cache line writeback
7537     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7538     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7539 
7540     if (UseAESIntrinsics) {
7541       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7542       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7543       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7544       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7545       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7546       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7547     }
7548 
7549     if (UseSHA1Intrinsics) {
7550       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7551       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7552     }
7553     if (UseSHA256Intrinsics) {
7554       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7555       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7556     }
7557     if (UseSHA512Intrinsics) {
7558       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7559       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7560     }
7561     if (UseSHA3Intrinsics) {
7562       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7563       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7564     }
7565 
7566     // generate Adler32 intrinsics code
7567     if (UseAdler32Intrinsics) {
7568       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7569     }
7570 
7571 #ifdef LINUX
7572 
7573     generate_atomic_entry_points();
7574 
7575 #endif // LINUX
7576 
7577     StubRoutines::aarch64::set_completed();
7578   }
7579 
7580  public:
7581   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7582     if (all) {
7583       generate_all();
7584     } else {
7585       generate_initial();
7586     }
7587   }
7588 }; // end class declaration
7589 
7590 #define UCM_TABLE_MAX_ENTRIES 8
7591 void StubGenerator_generate(CodeBuffer* code, bool all) {
7592   if (UnsafeCopyMemory::_table == NULL) {
7593     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7594   }
7595   StubGenerator g(code, all);
7596 }
7597 
7598 
7599 #ifdef LINUX
7600 
7601 // Define pointers to atomic stubs and initialize them to point to the
7602 // code in atomic_aarch64.S.
7603 
7604 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7605   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7606     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7607   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7608     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
7609 
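// For reference, one instance of the macro above - for example
// DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) - expands (approximately) to:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;
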
7610 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7611 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7612 DEFAULT_ATOMIC_OP(xchg, 4, )
7613 DEFAULT_ATOMIC_OP(xchg, 8, )
7614 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7615 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7616 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7617 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7618 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7619 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7620 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7621 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7622 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7623 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7624 
7625 #undef DEFAULT_ATOMIC_OP
7626 
7627 #endif // LINUX