1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
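// TIMES_OOP gives the index scaling for oop array elements: with compressed
// oops an element is 4 bytes (shift 2), otherwise 8 bytes (shift 3), and the
// 32-bit index is sign-extended (sxtw).  Illustrative use only (register
// names here are arbitrary):
//
//   __ ldr(r0, Address(base, index, TIMES_OOP));  // base + (index << 2 or 3)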
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
 108   //
 109   // we save r30 (lr) as the return PC at the base of the frame and
 110   // save r29 (fp) just below it, then copy sp (r31) into r29 so that
 111   // fp points at the saved fp/lr pair at the base of the new frame.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
 126   // we don't need to save r16-18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
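  // The enum above encodes the frame diagram in words relative to fp.  For
  // example, the saved thread pointer sits at word offset -1, i.e. rfp - 8,
  // and is rebuilt as an Address exactly as the code below does:
  //
  //   const Address thread(rfp, thread_off * wordSize);  // rfp - 8
  //   __ ldr(rscratch1, thread);   // reload the saved c_rarg7 for the
  //                                // "threads must correspond" assert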
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     address aarch64_entry = __ pc();
 227 
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (u1)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // we do this here because the notify will already have been done
 299     // if we get to the next instruction via an exception
 300     //
 301     // n.b. adding this instruction here affects the calculation of
 302     // whether or not a routine returns to the call stub (used when
 303     // doing stack walks) since the normal test is to check the return
 304     // pc against the address saved below. so we may need to allow for
 305     // this extra instruction in the check.
 306 
 307     // save current address for use by exception handling code
 308 
 309     return_address = __ pc();
 310 
 311     // store result depending on type (everything that is not
 312     // T_OBJECT, T_INLINE_TYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 313     // n.b. this assumes Java returns an integral result in r0
 314     // and a floating result in j_farg0
 315     // All of j_rargN may be used to return inline type fields so be careful
 316     // not to clobber those.
 317     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 318     // assignment of Rresult below.
 319     Register Rresult = r14, Rresult_type = r15;
 320     __ ldr(Rresult, result);
 321     Label is_long, is_float, is_double, is_value, exit;
 322     __ ldr(Rresult_type, result_type);
 323     __ cmp(Rresult_type, (u1)T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(Rresult_type, (u1)T_INLINE_TYPE);
 326     __ br(Assembler::EQ, is_value);
 327     __ cmp(Rresult_type, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(Rresult_type, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(Rresult_type, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(Rresult));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376     // leave frame and return to caller
 377     __ leave();
 378     __ ret(lr);
 379 
 380     // handle return types different from T_INT
 381     __ BIND(is_value);
 382     if (InlineTypeReturnedAsFields) {
 383       // Check for flattened return value
 384       __ tbz(r0, 0, is_long);
 385       // Load pack handler address
 386       __ andr(rscratch1, r0, -2);
 387       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 388       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 389       __ blr(rscratch1);
 390       __ b(exit);
 391     }
 392 
 393     __ BIND(is_long);
 394     __ str(r0, Address(Rresult, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     __ BIND(is_float);
 398     __ strs(j_farg0, Address(Rresult, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_double);
 402     __ strd(j_farg0, Address(Rresult, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     return start;
 406   }
 407 
 408   // Return point for a Java call if there's an exception thrown in
 409   // Java code.  The exception is caught and transformed into a
 410   // pending exception stored in JavaThread that can be tested from
 411   // within the VM.
 412   //
 413   // Note: Usually the parameters are removed by the callee. In case
 414   // of an exception crossing an activation frame boundary, that is
 415   // not the case if the callee is compiled code => need to setup the
 416   // rsp.
 417   //
 418   // r0: exception oop
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
 473   // so it just needs to be generated code with no prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // Reinitialize the ptrue predicate register, in case the external runtime
 515     // call clobbers ptrue reg, as we may return to SVE compiled code.
 516     __ reinitialize_ptrue();
 517 
 518     // we should not really care that lr is no longer the callee
 519     // address. we saved the value the handler needs in r19 so we can
 520     // just copy it to r3. however, the C2 handler will push its own
 521     // frame and then call into the VM, and the VM code asserts that
 522     // the PC for the frame above the handler belongs to a compiled
 523     // Java method. So, we restore lr here to satisfy that assert.
 524     __ mov(lr, r19);
 525     // setup r0 & r3 & clear pending exception
 526     __ mov(r3, r19);
 527     __ mov(r19, r0);
 528     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 529     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 530 
 531 #ifdef ASSERT
 532     // make sure exception is set
 533     {
 534       Label L;
 535       __ cbnz(r0, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (2)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // continue at exception handler
 542     // r0: exception
 543     // r3: throwing pc
 544     // r19: exception handler
 545     __ verify_oop(r0);
 546     __ br(r19);
 547 
 548     return start;
 549   }
 550 
 551   // Non-destructive plausibility checks for oops
 552   //
 553   // Arguments:
 554   //    r0: oop to verify
 555   //    rscratch1: error message
 556   //
 557   // Stack after saving c_rarg3:
 558   //    [tos + 0]: saved c_rarg3
 559   //    [tos + 1]: saved c_rarg2
 560   //    [tos + 2]: saved lr
 561   //    [tos + 3]: saved rscratch2
 562   //    [tos + 4]: saved r0
 563   //    [tos + 5]: saved rscratch1
 564   address generate_verify_oop() {
 565 
 566     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 567     address start = __ pc();
 568 
 569     Label exit, error;
 570 
 571     // save c_rarg2 and c_rarg3
 572     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 573 
 574     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 576     __ ldr(c_rarg3, Address(c_rarg2));
 577     __ add(c_rarg3, c_rarg3, 1);
 578     __ str(c_rarg3, Address(c_rarg2));
 579 
 580     // object is in r0
 581     // make sure object is 'reasonable'
 582     __ cbz(r0, exit); // if obj is NULL it is OK
 583 
 584 #if INCLUDE_ZGC
 585     if (UseZGC) {
 586       // Check if mask is good.
 587       // verifies that ZAddressBadMask & r0 == 0
 588       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 589       __ andr(c_rarg2, r0, c_rarg3);
 590       __ cbnz(c_rarg2, error);
 591     }
 592 #endif
 593 
 594     // Check if the oop is in the right area of memory
 595     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 596     __ andr(c_rarg2, r0, c_rarg3);
 597     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 598 
 599     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 600     // instruction here because the flags register is live.
 601     __ eor(c_rarg2, c_rarg2, c_rarg3);
 602     __ cbnz(c_rarg2, error);
 603 
 604     // make sure klass is 'reasonable', which is not zero.
 605     __ load_klass(r0, r0);  // get klass
 606     __ cbz(r0, error);      // if klass is NULL it is broken
 607 
 608     // return if everything seems ok
 609     __ bind(exit);
 610 
 611     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 612     __ ret(lr);
 613 
 614     // handle errors
 615     __ bind(error);
 616     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 617 
 618     __ push(RegSet::range(r0, r29), sp);
 619     // debug(char* msg, int64_t pc, int64_t regs[])
 620     __ mov(c_rarg0, rscratch1);      // pass address of error message
 621     __ mov(c_rarg1, lr);             // pass return address
 622     __ mov(c_rarg2, sp);             // pass address of regs on stack
 623 #ifndef PRODUCT
 624     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 625 #endif
 626     BLOCK_COMMENT("call MacroAssembler::debug");
 627     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 628     __ blr(rscratch1);
 629     __ hlt(0);
 630 
 631     return start;
 632   }
 633 
 634   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 635 
 636   // Generate indices for iota vector.
 637   address generate_iota_indices(const char *stub_name) {
 638     __ align(CodeEntryAlignment);
 639     StubCodeMark mark(this, "StubRoutines", stub_name);
 640     address start = __ pc();
 641     __ emit_data64(0x0706050403020100, relocInfo::none);
 642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
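    // The two little-endian 64-bit literals above lay down the byte
    // sequence 0x00, 0x01, ..., 0x0F, so when the 16 bytes are loaded into
    // a vector register each lane i holds its own index i ("iota").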
 643     return start;
 644   }
 645 
 646   // The inner part of zero_words().  This is the bulk operation,
 647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 648   // caller is responsible for zeroing the last few words.
 649   //
 650   // Inputs:
 651   // r10: the HeapWord-aligned base address of an array to zero.
 652   // r11: the count in HeapWords, r11 > 0.
 653   //
 654   // Returns r10 and r11, adjusted for the caller to clear.
 655   // r10: the base address of the tail of words left to clear.
 656   // r11: the number of words in the tail.
 657   //      r11 < MacroAssembler::zero_words_block_size.
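  // Worked example (assuming MacroAssembler::zero_words_block_size is 8,
  // with UseBlockZeroing disabled for simplicity): with r10 = p and
  // r11 = 13 the stp loop below zeroes 8 words, leaving r10 = p + 8 words
  // and r11 = 5 for the caller (zero_words) to finish.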
 658 
 659   address generate_zero_blocks() {
 660     Label done;
 661     Label base_aligned;
 662 
 663     Register base = r10, cnt = r11;
 664 
 665     __ align(CodeEntryAlignment);
 666     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 667     address start = __ pc();
 668 
 669     if (UseBlockZeroing) {
 670       int zva_length = VM_Version::zva_length();
 671 
 672       // Ensure ZVA length can be divided by 16. This is required by
 673       // the subsequent operations.
 674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 675 
 676       __ tbz(base, 3, base_aligned);
 677       __ str(zr, Address(__ post(base, 8)));
 678       __ sub(cnt, cnt, 1);
 679       __ bind(base_aligned);
 680 
 681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 682       // alignment.
 683       Label small;
 684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 685       __ subs(rscratch1, cnt, low_limit >> 3);
 686       __ br(Assembler::LT, small);
 687       __ zero_dcache_blocks(base, cnt);
 688       __ bind(small);
 689     }
 690 
 691     {
 692       // Number of stp instructions we'll unroll
 693       const int unroll =
 694         MacroAssembler::zero_words_block_size / 2;
 695       // Clear the remaining blocks.
 696       Label loop;
 697       __ subs(cnt, cnt, unroll * 2);
 698       __ br(Assembler::LT, done);
 699       __ bind(loop);
 700       for (int i = 0; i < unroll; i++)
 701         __ stp(zr, zr, __ post(base, 16));
 702       __ subs(cnt, cnt, unroll * 2);
 703       __ br(Assembler::GE, loop);
 704       __ bind(done);
 705       __ add(cnt, cnt, unroll * 2);
 706     }
 707 
 708     __ ret(lr);
 709 
 710     return start;
 711   }
 712 
 713 
 714   typedef enum {
 715     copy_forwards = 1,
 716     copy_backwards = -1
 717   } copy_direction;
 718 
 719   // Bulk copy of blocks of 8 words.
 720   //
 721   // count is a count of words.
 722   //
 723   // Precondition: count >= 8
 724   //
 725   // Postconditions:
 726   //
 727   // The least significant bit of count contains the remaining count
 728   // of words to copy.  The rest of count is trash.
 729   //
 730   // s and d are adjusted to point to the remaining words to copy
 731   //
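  // Worked example (illustrative): for count == 29 the main loop plus the
  // drain copy 24 words, the 4-word and 2-word sub-block tests (driven by
  // bits 2 and 1 of count; 29 & 7 == 5) copy 4 more, and bit 0 of count is
  // left set to tell the caller that one odd word remains.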
 732   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 733                            copy_direction direction) {
 734     int unit = wordSize * direction;
 735     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 736 
 737     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 738       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 739     const Register stride = r13;
 740 
 741     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 742     assert_different_registers(s, d, count, rscratch1);
 743 
 744     Label again, drain;
 745     const char *stub_name;
 746     if (direction == copy_forwards)
 747       stub_name = "forward_copy_longs";
 748     else
 749       stub_name = "backward_copy_longs";
 750 
 751     __ align(CodeEntryAlignment);
 752 
 753     StubCodeMark mark(this, "StubRoutines", stub_name);
 754 
 755     __ bind(start);
 756 
 757     Label unaligned_copy_long;
 758     if (AvoidUnalignedAccesses) {
 759       __ tbnz(d, 3, unaligned_copy_long);
 760     }
 761 
 762     if (direction == copy_forwards) {
 763       __ sub(s, s, bias);
 764       __ sub(d, d, bias);
 765     }
 766 
 767 #ifdef ASSERT
 768     // Make sure we are never given < 8 words
 769     {
 770       Label L;
 771       __ cmp(count, (u1)8);
 772       __ br(Assembler::GE, L);
 773       __ stop("generate_copy_longs called with < 8 words");
 774       __ bind(L);
 775     }
 776 #endif
 777 
 778     // Fill 8 registers
 779     if (UseSIMDForMemoryOps) {
 780       __ ldpq(v0, v1, Address(s, 4 * unit));
 781       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 782     } else {
 783       __ ldp(t0, t1, Address(s, 2 * unit));
 784       __ ldp(t2, t3, Address(s, 4 * unit));
 785       __ ldp(t4, t5, Address(s, 6 * unit));
 786       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 787     }
 788 
 789     __ subs(count, count, 16);
 790     __ br(Assembler::LO, drain);
 791 
 792     int prefetch = PrefetchCopyIntervalInBytes;
 793     bool use_stride = false;
 794     if (direction == copy_backwards) {
 795        use_stride = prefetch > 256;
 796        prefetch = -prefetch;
 797        if (use_stride) __ mov(stride, prefetch);
 798     }
 799 
 800     __ bind(again);
 801 
 802     if (PrefetchCopyIntervalInBytes > 0)
 803       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 804 
 805     if (UseSIMDForMemoryOps) {
 806       __ stpq(v0, v1, Address(d, 4 * unit));
 807       __ ldpq(v0, v1, Address(s, 4 * unit));
 808       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 809       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 810     } else {
 811       __ stp(t0, t1, Address(d, 2 * unit));
 812       __ ldp(t0, t1, Address(s, 2 * unit));
 813       __ stp(t2, t3, Address(d, 4 * unit));
 814       __ ldp(t2, t3, Address(s, 4 * unit));
 815       __ stp(t4, t5, Address(d, 6 * unit));
 816       __ ldp(t4, t5, Address(s, 6 * unit));
 817       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 818       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 819     }
 820 
 821     __ subs(count, count, 8);
 822     __ br(Assembler::HS, again);
 823 
 824     // Drain
 825     __ bind(drain);
 826     if (UseSIMDForMemoryOps) {
 827       __ stpq(v0, v1, Address(d, 4 * unit));
 828       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 829     } else {
 830       __ stp(t0, t1, Address(d, 2 * unit));
 831       __ stp(t2, t3, Address(d, 4 * unit));
 832       __ stp(t4, t5, Address(d, 6 * unit));
 833       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 834     }
 835 
 836     {
 837       Label L1, L2;
 838       __ tbz(count, exact_log2(4), L1);
 839       if (UseSIMDForMemoryOps) {
 840         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 841         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 842       } else {
 843         __ ldp(t0, t1, Address(s, 2 * unit));
 844         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 845         __ stp(t0, t1, Address(d, 2 * unit));
 846         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 847       }
 848       __ bind(L1);
 849 
 850       if (direction == copy_forwards) {
 851         __ add(s, s, bias);
 852         __ add(d, d, bias);
 853       }
 854 
 855       __ tbz(count, 1, L2);
 856       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 857       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 858       __ bind(L2);
 859     }
 860 
 861     __ ret(lr);
 862 
 863     if (AvoidUnalignedAccesses) {
 864       Label drain, again;
 865       // Register order for storing. Order is different for backward copy.
 866 
 867       __ bind(unaligned_copy_long);
 868 
 869       // source address is even aligned, target odd aligned
 870       //
 871       // when forward copying word pairs we read long pairs at offsets
 872       // {0, 2, 4, 6} (in long words). when backwards copying we read
 873       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 874       // address by -2 in the forwards case so we can compute the
 875       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 876       // or -1.
 877       //
 878       // when forward copying we need to store 1 word, 3 pairs and
 879       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 880       // zero offset we adjust the destination by -1, which means we
 881       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 882       //
 883       // When backwards copying we need to store 1 word, 3 pairs and
 884       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 885       // offsets {1, 3, 5, 7, 8} * unit.
 886 
 887       if (direction == copy_forwards) {
 888         __ sub(s, s, 16);
 889         __ sub(d, d, 8);
 890       }
 891 
 892       // Fill 8 registers
 893       //
 894       // for forwards copy s was offset by -16 from the original input
 895       // value of s so the register contents are at these offsets
 896       // relative to the 64 byte block addressed by that original input
 897       // and so on for each successive 64 byte block when s is updated
 898       //
 899       // t0 at offset 0,  t1 at offset 8
 900       // t2 at offset 16, t3 at offset 24
 901       // t4 at offset 32, t5 at offset 40
 902       // t6 at offset 48, t7 at offset 56
 903 
 904       // for backwards copy s was not offset so the register contents
 905       // are at these offsets into the preceding 64 byte block
 906       // relative to that original input and so on for each successive
 907       // preceding 64 byte block when s is updated. this explains the
 908       // slightly counter-intuitive looking pattern of register usage
 909       // in the stp instructions for backwards copy.
 910       //
 911       // t0 at offset -16, t1 at offset -8
 912       // t2 at offset -32, t3 at offset -24
 913       // t4 at offset -48, t5 at offset -40
 914       // t6 at offset -64, t7 at offset -56
 915 
 916       __ ldp(t0, t1, Address(s, 2 * unit));
 917       __ ldp(t2, t3, Address(s, 4 * unit));
 918       __ ldp(t4, t5, Address(s, 6 * unit));
 919       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 920 
 921       __ subs(count, count, 16);
 922       __ br(Assembler::LO, drain);
 923 
 924       int prefetch = PrefetchCopyIntervalInBytes;
 925       bool use_stride = false;
 926       if (direction == copy_backwards) {
 927          use_stride = prefetch > 256;
 928          prefetch = -prefetch;
 929          if (use_stride) __ mov(stride, prefetch);
 930       }
 931 
 932       __ bind(again);
 933 
 934       if (PrefetchCopyIntervalInBytes > 0)
 935         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 936 
 937       if (direction == copy_forwards) {
 938        // allowing for the offset of -8 the store instructions place
 939        // registers into the target 64 byte block at the following
 940        // offsets
 941        //
 942        // t0 at offset 0
 943        // t1 at offset 8,  t2 at offset 16
 944        // t3 at offset 24, t4 at offset 32
 945        // t5 at offset 40, t6 at offset 48
 946        // t7 at offset 56
 947 
 948         __ str(t0, Address(d, 1 * unit));
 949         __ stp(t1, t2, Address(d, 2 * unit));
 950         __ ldp(t0, t1, Address(s, 2 * unit));
 951         __ stp(t3, t4, Address(d, 4 * unit));
 952         __ ldp(t2, t3, Address(s, 4 * unit));
 953         __ stp(t5, t6, Address(d, 6 * unit));
 954         __ ldp(t4, t5, Address(s, 6 * unit));
 955         __ str(t7, Address(__ pre(d, 8 * unit)));
 956         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 957       } else {
 958        // d was not offset when we started so the registers are
 959        // written into the 64 byte block preceding d with the following
 960        // offsets
 961        //
 962        // t1 at offset -8
 963        // t3 at offset -24, t0 at offset -16
 964        // t5 at offset -40, t2 at offset -32
 965        // t7 at offset -56, t4 at offset -48
 966        //                   t6 at offset -64
 967        //
 968        // note that this matches the offsets previously noted for the
 969        // loads
 970 
 971         __ str(t1, Address(d, 1 * unit));
 972         __ stp(t3, t0, Address(d, 3 * unit));
 973         __ ldp(t0, t1, Address(s, 2 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ ldp(t2, t3, Address(s, 4 * unit));
 976         __ stp(t7, t4, Address(d, 7 * unit));
 977         __ ldp(t4, t5, Address(s, 6 * unit));
 978         __ str(t6, Address(__ pre(d, 8 * unit)));
 979         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 980       }
 981 
 982       __ subs(count, count, 8);
 983       __ br(Assembler::HS, again);
 984 
 985       // Drain
 986       //
 987       // this uses the same pattern of offsets and register arguments
 988       // as above
 989       __ bind(drain);
 990       if (direction == copy_forwards) {
 991         __ str(t0, Address(d, 1 * unit));
 992         __ stp(t1, t2, Address(d, 2 * unit));
 993         __ stp(t3, t4, Address(d, 4 * unit));
 994         __ stp(t5, t6, Address(d, 6 * unit));
 995         __ str(t7, Address(__ pre(d, 8 * unit)));
 996       } else {
 997         __ str(t1, Address(d, 1 * unit));
 998         __ stp(t3, t0, Address(d, 3 * unit));
 999         __ stp(t5, t2, Address(d, 5 * unit));
1000         __ stp(t7, t4, Address(d, 7 * unit));
1001         __ str(t6, Address(__ pre(d, 8 * unit)));
1002       }
1003       // now we need to copy any remaining part block which may
1004       // include a 4 word subblock and/or a 2 word subblock.
1005       // bits 2 and 1 in the count are the tell-tale for whether we
1006       // have each such subblock
1007       {
1008         Label L1, L2;
1009         __ tbz(count, exact_log2(4), L1);
1010        // this is the same as above but copying only 4 longs hence
1011        // with only one intervening stp between the str instructions
1012        // but note that the offsets and registers still follow the
1013        // same pattern
1014         __ ldp(t0, t1, Address(s, 2 * unit));
1015         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1016         if (direction == copy_forwards) {
1017           __ str(t0, Address(d, 1 * unit));
1018           __ stp(t1, t2, Address(d, 2 * unit));
1019           __ str(t3, Address(__ pre(d, 4 * unit)));
1020         } else {
1021           __ str(t1, Address(d, 1 * unit));
1022           __ stp(t3, t0, Address(d, 3 * unit));
1023           __ str(t2, Address(__ pre(d, 4 * unit)));
1024         }
1025         __ bind(L1);
1026 
1027         __ tbz(count, 1, L2);
1028        // this is the same as above but copying only 2 longs hence
1029        // there is no intervening stp between the str instructions
1030        // but note that the offset and register patterns are still
1031        // the same
1032         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1033         if (direction == copy_forwards) {
1034           __ str(t0, Address(d, 1 * unit));
1035           __ str(t1, Address(__ pre(d, 2 * unit)));
1036         } else {
1037           __ str(t1, Address(d, 1 * unit));
1038           __ str(t0, Address(__ pre(d, 2 * unit)));
1039         }
1040         __ bind(L2);
1041 
1042        // for forwards copy we need to re-adjust the offsets we
1043        // applied so that s and d follow the last words written
1044 
1045        if (direction == copy_forwards) {
1046          __ add(s, s, 16);
1047          __ add(d, d, 8);
1048        }
1049 
1050       }
1051 
1052       __ ret(lr);
1053       }
1054   }
1055 
1056   // Small copy: less than 16 bytes.
1057   //
1058   // NB: Ignores all of the bits of count which represent more than 15
1059   // bytes, so a caller doesn't have to mask them.
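  // For example, a byte copy (granularity 1) with count == 11 (binary 1011)
  // tests bits 3..0 in turn and moves 8 + 2 + 1 bytes, skipping the 4-byte
  // step because bit 2 is clear; an int copy (granularity 4) only tests
  // bits 1 and 0 and moves at most 3 elements.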
1060 
1061   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1062     bool is_backwards = step < 0;
1063     size_t granularity = uabs(step);
1064     int direction = is_backwards ? -1 : 1;
1065     int unit = wordSize * direction;
1066 
1067     Label Lword, Lint, Lshort, Lbyte;
1068 
1069     assert(granularity
1070            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1071 
1072     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1073 
1074     // ??? I don't know if this bit-test-and-branch is the right thing
1075     // to do.  It does a lot of jumping, resulting in several
1076     // mispredicted branches.  It might make more sense to do this
1077     // with something like Duff's device with a single computed branch.
1078 
1079     __ tbz(count, 3 - exact_log2(granularity), Lword);
1080     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1081     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1082     __ bind(Lword);
1083 
1084     if (granularity <= sizeof (jint)) {
1085       __ tbz(count, 2 - exact_log2(granularity), Lint);
1086       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1087       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1088       __ bind(Lint);
1089     }
1090 
1091     if (granularity <= sizeof (jshort)) {
1092       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1093       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1094       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1095       __ bind(Lshort);
1096     }
1097 
1098     if (granularity <= sizeof (jbyte)) {
1099       __ tbz(count, 0, Lbyte);
1100       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1101       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1102       __ bind(Lbyte);
1103     }
1104   }
1105 
1106   Label copy_f, copy_b;
1107 
1108   // All-singing all-dancing memory copy.
1109   //
1110   // Copy count units of memory from s to d.  The size of a unit is
1111   // step, which can be positive or negative depending on the direction
1112   // of copy.  If is_aligned is false, we align the source address.
1113   //
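  // As used by generate_disjoint_copy() and generate_conjoint_copy() below,
  // a forward element copy is requested with a positive step equal to the
  // element size and a backward copy with the negated size:
  //
  //   copy_memory(aligned, s, d, count, rscratch1, size);    // disjoint
  //   copy_memory(aligned, s, d, count, rscratch1, -size);   // conjoint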
1114 
1115   void copy_memory(bool is_aligned, Register s, Register d,
1116                    Register count, Register tmp, int step) {
1117     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1118     bool is_backwards = step < 0;
1119     unsigned int granularity = uabs(step);
1120     const Register t0 = r3, t1 = r4;
1121 
1122     // Copies of <= 80 bytes (or 96 when using SIMD) are done inline. The direction
1123     // doesn't matter because we always load all the data before writing anything.
1124     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1125     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1126     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1127     const Register send = r17, dend = r16;
1128 
1129     if (PrefetchCopyIntervalInBytes > 0)
1130       __ prfm(Address(s, 0), PLDL1KEEP);
1131     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1132     __ br(Assembler::HI, copy_big);
1133 
1134     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1135     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1136 
1137     __ cmp(count, u1(16/granularity));
1138     __ br(Assembler::LS, copy16);
1139 
1140     __ cmp(count, u1(64/granularity));
1141     __ br(Assembler::HI, copy80);
1142 
1143     __ cmp(count, u1(32/granularity));
1144     __ br(Assembler::LS, copy32);
1145 
1146     // 33..64 bytes
1147     if (UseSIMDForMemoryOps) {
1148       __ ldpq(v0, v1, Address(s, 0));
1149       __ ldpq(v2, v3, Address(send, -32));
1150       __ stpq(v0, v1, Address(d, 0));
1151       __ stpq(v2, v3, Address(dend, -32));
1152     } else {
1153       __ ldp(t0, t1, Address(s, 0));
1154       __ ldp(t2, t3, Address(s, 16));
1155       __ ldp(t4, t5, Address(send, -32));
1156       __ ldp(t6, t7, Address(send, -16));
1157 
1158       __ stp(t0, t1, Address(d, 0));
1159       __ stp(t2, t3, Address(d, 16));
1160       __ stp(t4, t5, Address(dend, -32));
1161       __ stp(t6, t7, Address(dend, -16));
1162     }
1163     __ b(finish);
1164 
1165     // 17..32 bytes
1166     __ bind(copy32);
1167     __ ldp(t0, t1, Address(s, 0));
1168     __ ldp(t2, t3, Address(send, -16));
1169     __ stp(t0, t1, Address(d, 0));
1170     __ stp(t2, t3, Address(dend, -16));
1171     __ b(finish);
1172 
1173     // 65..80/96 bytes
1174     // (96 bytes if SIMD because we do 32 bytes per instruction)
1175     __ bind(copy80);
1176     if (UseSIMDForMemoryOps) {
1177       __ ldpq(v0, v1, Address(s, 0));
1178       __ ldpq(v2, v3, Address(s, 32));
1179       // Unaligned pointers can be an issue for copying.
1180       // The issue has more chances to happen when granularity of data is
1181       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1182       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1183       // The most performance drop has been seen for the range 65-80 bytes.
1184       // For such cases using the pair of ldp/stp instead of the third pair of
1185       // ldpq/stpq fixes the performance issue.
1186       if (granularity < sizeof (jint)) {
1187         Label copy96;
1188         __ cmp(count, u1(80/granularity));
1189         __ br(Assembler::HI, copy96);
1190         __ ldp(t0, t1, Address(send, -16));
1191 
1192         __ stpq(v0, v1, Address(d, 0));
1193         __ stpq(v2, v3, Address(d, 32));
1194         __ stp(t0, t1, Address(dend, -16));
1195         __ b(finish);
1196 
1197         __ bind(copy96);
1198       }
1199       __ ldpq(v4, v5, Address(send, -32));
1200 
1201       __ stpq(v0, v1, Address(d, 0));
1202       __ stpq(v2, v3, Address(d, 32));
1203       __ stpq(v4, v5, Address(dend, -32));
1204     } else {
1205       __ ldp(t0, t1, Address(s, 0));
1206       __ ldp(t2, t3, Address(s, 16));
1207       __ ldp(t4, t5, Address(s, 32));
1208       __ ldp(t6, t7, Address(s, 48));
1209       __ ldp(t8, t9, Address(send, -16));
1210 
1211       __ stp(t0, t1, Address(d, 0));
1212       __ stp(t2, t3, Address(d, 16));
1213       __ stp(t4, t5, Address(d, 32));
1214       __ stp(t6, t7, Address(d, 48));
1215       __ stp(t8, t9, Address(dend, -16));
1216     }
1217     __ b(finish);
1218 
1219     // 0..16 bytes
1220     __ bind(copy16);
1221     __ cmp(count, u1(8/granularity));
1222     __ br(Assembler::LO, copy8);
1223 
1224     // 8..16 bytes
1225     __ ldr(t0, Address(s, 0));
1226     __ ldr(t1, Address(send, -8));
1227     __ str(t0, Address(d, 0));
1228     __ str(t1, Address(dend, -8));
1229     __ b(finish);
1230 
1231     if (granularity < 8) {
1232       // 4..7 bytes
1233       __ bind(copy8);
1234       __ tbz(count, 2 - exact_log2(granularity), copy4);
1235       __ ldrw(t0, Address(s, 0));
1236       __ ldrw(t1, Address(send, -4));
1237       __ strw(t0, Address(d, 0));
1238       __ strw(t1, Address(dend, -4));
1239       __ b(finish);
1240       if (granularity < 4) {
1241         // 0..3 bytes
1242         __ bind(copy4);
1243         __ cbz(count, finish); // get rid of 0 case
1244         if (granularity == 2) {
1245           __ ldrh(t0, Address(s, 0));
1246           __ strh(t0, Address(d, 0));
1247         } else { // granularity == 1
1248           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1249           // the first and last byte.
1250           // Handle the 3 byte case by loading and storing base + count/2
1251           // (count == 1: (s+0)->(d+0); count == 2,3: (s+1)->(d+1)).
1252           // This does mean that in the 1 byte case we load/store the same
1253           // byte 3 times.
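          // For instance: count == 3 copies (s+0), (s+2) and, via
          // base + count/2 == base + 1, the middle byte (s+1); count == 2
          // copies (s+0) and (s+1), storing (s+1) twice; count == 1 simply
          // loads and stores (s+0) three times.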
1254           __ lsr(count, count, 1);
1255           __ ldrb(t0, Address(s, 0));
1256           __ ldrb(t1, Address(send, -1));
1257           __ ldrb(t2, Address(s, count));
1258           __ strb(t0, Address(d, 0));
1259           __ strb(t1, Address(dend, -1));
1260           __ strb(t2, Address(d, count));
1261         }
1262         __ b(finish);
1263       }
1264     }
1265 
1266     __ bind(copy_big);
1267     if (is_backwards) {
1268       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1269       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1270     }
1271 
1272     // Now that we've got the small case out of the way, we can align
1273     // the source address on a 2-word boundary.
1274 
1275     Label aligned;
1276 
1277     if (is_aligned) {
1278       // We may have to adjust by 1 word to get s 2-word-aligned.
1279       __ tbz(s, exact_log2(wordSize), aligned);
1280       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1281       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1282       __ sub(count, count, wordSize/granularity);
1283     } else {
1284       if (is_backwards) {
1285         __ andr(rscratch2, s, 2 * wordSize - 1);
1286       } else {
1287         __ neg(rscratch2, s);
1288         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1289       }
1290       // rscratch2 is the byte adjustment needed to align s.
1291       __ cbz(rscratch2, aligned);
1292       int shift = exact_log2(granularity);
1293       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1294       __ sub(count, count, rscratch2);
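      // rscratch2 now holds the number of elements needed to bring s up
      // (or, for a backward copy, down) to a 16-byte boundary.  For example
      // (illustrative), copying ints forward with s % 16 == 4 gives
      // rscratch2 = 12 bytes = 3 elements, which copy_memory_small() below
      // moves before the bulk word copy.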
1295 
1296 #if 0
1297       // ?? This code is only correct for a disjoint copy.  It may or
1298       // may not make sense to use it in that case.
1299 
1300       // Copy the first pair; s and d may not be aligned.
1301       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1302       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1303 
1304       // Align s and d, adjust count
1305       if (is_backwards) {
1306         __ sub(s, s, rscratch2);
1307         __ sub(d, d, rscratch2);
1308       } else {
1309         __ add(s, s, rscratch2);
1310         __ add(d, d, rscratch2);
1311       }
1312 #else
1313       copy_memory_small(s, d, rscratch2, rscratch1, step);
1314 #endif
1315     }
1316 
1317     __ bind(aligned);
1318 
1319     // s is now 2-word-aligned.
1320 
1321     // We have a count of units and some trailing bytes.  Adjust the
1322     // count and do a bulk copy of words.
1323     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1324     if (direction == copy_forwards)
1325       __ bl(copy_f);
1326     else
1327       __ bl(copy_b);
1328 
1329     // And the tail.
1330     copy_memory_small(s, d, count, tmp, step);
1331 
1332     if (granularity >= 8) __ bind(copy8);
1333     if (granularity >= 4) __ bind(copy4);
1334     __ bind(finish);
1335   }
1336 
1337 
1338   void clobber_registers() {
1339 #ifdef ASSERT
1340     RegSet clobbered
1341       = MacroAssembler::call_clobbered_registers() - rscratch1;
1342     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1343     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1344     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1345       __ mov(*it, rscratch1);
1346     }
1347 #endif
1348 
1349   }
1350 
1351   // Scan over array at a for count oops, verifying each one.
1352   // Preserves a and count, clobbers rscratch1 and rscratch2.
1353   void verify_oop_array (int size, Register a, Register count, Register temp) {
1354     Label loop, end;
1355     __ mov(rscratch1, a);
1356     __ mov(rscratch2, zr);
1357     __ bind(loop);
1358     __ cmp(rscratch2, count);
1359     __ br(Assembler::HS, end);
1360     if (size == wordSize) {
1361       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1362       __ verify_oop(temp);
1363     } else {
1364       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1365       __ decode_heap_oop(temp); // calls verify_oop
1366     }
1367     __ add(rscratch2, rscratch2, 1);
1368     __ b(loop);
1369     __ bind(end);
1370   }
1371 
1372   // Arguments:
1373   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1374   //             ignored
1375   //   is_oop  - true => oop array, so generate store check code
1376   //   name    - stub name string
1377   //
1378   // Inputs:
1379   //   c_rarg0   - source array address
1380   //   c_rarg1   - destination array address
1381   //   c_rarg2   - element count, treated as ssize_t, can be zero
1382   //
1383   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1384   // the hardware handle it.  The two dwords within qwords that span
1385   // cache line boundaries will still be loaded and stored atomically.
1386   //
1387   // Side Effects:
1388   //   disjoint_int_copy_entry is set to the no-overlap entry point
1389   //   used by generate_conjoint_int_oop_copy().
1390   //
1391   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1392                                   const char *name, bool dest_uninitialized = false) {
1393     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1394     RegSet saved_reg = RegSet::of(s, d, count);
1395     __ align(CodeEntryAlignment);
1396     StubCodeMark mark(this, "StubRoutines", name);
1397     address start = __ pc();
1398     __ enter();
1399 
1400     if (entry != NULL) {
1401       *entry = __ pc();
1402       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1403       BLOCK_COMMENT("Entry:");
1404     }
1405 
1406     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1407     if (dest_uninitialized) {
1408       decorators |= IS_DEST_UNINITIALIZED;
1409     }
1410     if (aligned) {
1411       decorators |= ARRAYCOPY_ALIGNED;
1412     }
1413 
1414     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1415     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1416 
1417     if (is_oop) {
1418       // save regs before copy_memory
1419       __ push(RegSet::of(d, count), sp);
1420     }
1421     {
1422       // UnsafeCopyMemory page error: continue after ucm
1423       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1424       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1425       copy_memory(aligned, s, d, count, rscratch1, size);
1426     }
1427 
1428     if (is_oop) {
1429       __ pop(RegSet::of(d, count), sp);
1430       if (VerifyOops)
1431         verify_oop_array(size, d, count, r16);
1432     }
1433 
1434     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1435 
1436     __ leave();
1437     __ mov(r0, zr); // return 0
1438     __ ret(lr);
1439     return start;
1440   }
1441 
1442   // Arguments:
1443   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1444   //             ignored
1445   //   is_oop  - true => oop array, so generate store check code
1446   //   name    - stub name string
1447   //
1448   // Inputs:
1449   //   c_rarg0   - source array address
1450   //   c_rarg1   - destination array address
1451   //   c_rarg2   - element count, treated as ssize_t, can be zero
1452   //
1453   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1454   // the hardware handle it.  The two dwords within qwords that span
1455   // cache line boundaries will still be loaded and stored atomically.
1456   //
1457   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1458                                  address *entry, const char *name,
1459                                  bool dest_uninitialized = false) {
1460     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1461     RegSet saved_regs = RegSet::of(s, d, count);
1462     StubCodeMark mark(this, "StubRoutines", name);
1463     address start = __ pc();
1464     __ enter();
1465 
1466     if (entry != NULL) {
1467       *entry = __ pc();
1468       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1469       BLOCK_COMMENT("Entry:");
1470     }
1471 
1472     // use fwd copy when (d-s) above_equal (count*size)
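         // The pointer difference is compared unsigned: if d precedes s the
         // subtraction wraps to a huge value, so both "d is below s" and
         // "d starts at or beyond the end of the source" take the branch.
         // Equivalent C sketch:
         //   if ((uint64_t)(d - s) >= (uint64_t)count << exact_log2(size))
         //     goto nooverlap_target;   // forward copy is safe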
1473     __ sub(rscratch1, d, s);
1474     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1475     __ br(Assembler::HS, nooverlap_target);
1476 
1477     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1478     if (dest_uninitialized) {
1479       decorators |= IS_DEST_UNINITIALIZED;
1480     }
1481     if (aligned) {
1482       decorators |= ARRAYCOPY_ALIGNED;
1483     }
1484 
1485     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1486     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1487 
1488     if (is_oop) {
1489       // save regs before copy_memory
1490       __ push(RegSet::of(d, count), sp);
1491     }
1492     {
1493       // UnsafeCopyMemory page error: continue after ucm
1494       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1495       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1496       copy_memory(aligned, s, d, count, rscratch1, -size);
1497     }
1498     if (is_oop) {
1499       __ pop(RegSet::of(d, count), sp);
1500       if (VerifyOops)
1501         verify_oop_array(size, d, count, r16);
1502     }
1503     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1504     __ leave();
1505     __ mov(r0, zr); // return 0
1506     __ ret(lr);
1507     return start;
1508   }
1509 
1510   // Arguments:
1511   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1512   //             ignored
1513   //   name    - stub name string
1514   //
1515   // Inputs:
1516   //   c_rarg0   - source array address
1517   //   c_rarg1   - destination array address
1518   //   c_rarg2   - element count, treated as ssize_t, can be zero
1519   //
1520   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1521   // we let the hardware handle it.  The one to eight bytes within words,
1522   // dwords or qwords that span cache line boundaries will still be loaded
1523   // and stored atomically.
1524   //
1525   // Side Effects:
1533   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1534   //   used by generate_conjoint_byte_copy().
1535   //
1536   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1552   // we let the hardware handle it.  The one to eight bytes within words,
1553   // dwords or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1557                                       address* entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1560   }
1561 
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1573   // let the hardware handle it.  The two or four words within dwords
1574   // or qwords that span cache line boundaries will still be loaded
1575   // and stored atomically.
1576   //
1577   // Side Effects:
1578   //   disjoint_short_copy_entry is set to the no-overlap entry point
1579   //   used by generate_conjoint_short_copy().
1580   //
1581   address generate_disjoint_short_copy(bool aligned,
1582                                        address* entry, const char *name) {
1583     const bool not_oop = false;
1584     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1585   }
1586 
1587   // Arguments:
1588   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1589   //             ignored
1590   //   name    - stub name string
1591   //
1592   // Inputs:
1593   //   c_rarg0   - source array address
1594   //   c_rarg1   - destination array address
1595   //   c_rarg2   - element count, treated as ssize_t, can be zero
1596   //
1597   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1598   // let the hardware handle it.  The two or four words within dwords
1599   // or qwords that span cache line boundaries will still be loaded
1600   // and stored atomically.
1601   //
1602   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1603                                        address *entry, const char *name) {
1604     const bool not_oop = false;
1605     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1606   }
1607
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as ssize_t, can be zero
1617   //
1618   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1619   // the hardware handle it.  The two dwords within qwords that span
1620   // cache line boundaries will still be loaded and stored atomically.
1621   //
1622   // Side Effects:
1623   //   disjoint_int_copy_entry is set to the no-overlap entry point
1624   //   used by generate_conjoint_int_oop_copy().
1625   //
1626   address generate_disjoint_int_copy(bool aligned, address *entry,
1627                                          const char *name, bool dest_uninitialized = false) {
1628     const bool not_oop = false;
1629     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1630   }
1631 
1632   // Arguments:
1633   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1634   //             ignored
1635   //   name    - stub name string
1636   //
1637   // Inputs:
1638   //   c_rarg0   - source array address
1639   //   c_rarg1   - destination array address
1640   //   c_rarg2   - element count, treated as ssize_t, can be zero
1641   //
1642   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1643   // the hardware handle it.  The two dwords within qwords that span
1644   // cache line boundaries will still be loaded and stored atomically.
1645   //
1646   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1647                                      address *entry, const char *name,
1648                                      bool dest_uninitialized = false) {
1649     const bool not_oop = false;
1650     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1651   }
1652 
1653 
1654   // Arguments:
1655   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1656   //             ignored
1657   //   name    - stub name string
1658   //
1659   // Inputs:
1660   //   c_rarg0   - source array address
1661   //   c_rarg1   - destination array address
1662   //   c_rarg2   - element count, treated as size_t, can be zero
1663   //
1664   // Side Effects:
1665   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1666   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1667   //
1668   address generate_disjoint_long_copy(bool aligned, address *entry,
1669                                           const char *name, bool dest_uninitialized = false) {
1670     const bool not_oop = false;
1671     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1672   }
1673 
1674   // Arguments:
1675   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1676   //             ignored
1677   //   name    - stub name string
1678   //
1679   // Inputs:
1680   //   c_rarg0   - source array address
1681   //   c_rarg1   - destination array address
1682   //   c_rarg2   - element count, treated as size_t, can be zero
1683   //
1684   address generate_conjoint_long_copy(bool aligned,
1685                                       address nooverlap_target, address *entry,
1686                                       const char *name, bool dest_uninitialized = false) {
1687     const bool not_oop = false;
1688     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1689   }
1690 
1691   // Arguments:
1692   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1693   //             ignored
1694   //   name    - stub name string
1695   //
1696   // Inputs:
1697   //   c_rarg0   - source array address
1698   //   c_rarg1   - destination array address
1699   //   c_rarg2   - element count, treated as size_t, can be zero
1700   //
1701   // Side Effects:
1702   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1703   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1704   //
1705   address generate_disjoint_oop_copy(bool aligned, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1710   }
1711 
1712   // Arguments:
1713   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1714   //             ignored
1715   //   name    - stub name string
1716   //
1717   // Inputs:
1718   //   c_rarg0   - source array address
1719   //   c_rarg1   - destination array address
1720   //   c_rarg2   - element count, treated as size_t, can be zero
1721   //
1722   address generate_conjoint_oop_copy(bool aligned,
1723                                      address nooverlap_target, address *entry,
1724                                      const char *name, bool dest_uninitialized) {
1725     const bool is_oop = true;
1726     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1727     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1728                                   name, dest_uninitialized);
1729   }
1730 
1731 
1732   // Helper for generating a dynamic type check.
1733   // Smashes rscratch1, rscratch2.
1734   void generate_type_check(Register sub_klass,
1735                            Register super_check_offset,
1736                            Register super_klass,
1737                            Label& L_success) {
1738     assert_different_registers(sub_klass, super_check_offset, super_klass);
1739 
1740     BLOCK_COMMENT("type_check:");
1741 
1742     Label L_miss;
1743 
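         // The fast path consults the check slot at super_check_offset in
         // sub_klass; if that is inconclusive, the slow path scans the
         // secondary supers array.  Either helper branches to L_success on
         // a hit; a definite miss falls through to L_miss below.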
1744     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1745                                      super_check_offset);
1746     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1747 
1748     // Fall through on failure!
1749     __ BIND(L_miss);
1750   }
1751 
1752   //
1753   //  Generate checkcasting array copy stub
1754   //
1755   //  Input:
1756   //    c_rarg0   - source array address
1757   //    c_rarg1   - destination array address
1758   //    c_rarg2   - element count, treated as ssize_t, can be zero
1759   //    c_rarg3   - size_t ckoff (super_check_offset)
1760   //    c_rarg4   - oop ckval (super_klass)
1761   //
1762   //  Output:
1763   //    r0 ==  0  -  success
1764   //    r0 == -1^K - failure, where K is partial transfer count
1765   //
1766   address generate_checkcast_copy(const char *name, address *entry,
1767                                   bool dest_uninitialized = false) {
1768 
1769     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1770 
1771     // Input registers (after setup_arg_regs)
1772     const Register from        = c_rarg0;   // source array address
1773     const Register to          = c_rarg1;   // destination array address
1774     const Register count       = c_rarg2;   // elements count
1775     const Register ckoff       = c_rarg3;   // super_check_offset
1776     const Register ckval       = c_rarg4;   // super_klass
1777 
1778     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1779     RegSet wb_post_saved_regs = RegSet::of(count);
1780 
1781     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1782     const Register copied_oop  = r22;       // actual oop copied
1783     const Register count_save  = r21;       // orig elements count
1784     const Register start_to    = r20;       // destination array start address
1785     const Register r19_klass   = r19;       // oop._klass
1786 
1787     //---------------------------------------------------------------
1788     // Assembler stub will be used for this call to arraycopy
1789     // if the two arrays are subtypes of Object[] but the
1790     // destination array type is not equal to or a supertype
1791     // of the source type.  Each element must be separately
1792     // checked.
1793 
1794     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1795                                copied_oop, r19_klass, count_save);
1796 
1797     __ align(CodeEntryAlignment);
1798     StubCodeMark mark(this, "StubRoutines", name);
1799     address start = __ pc();
1800 
1801     __ enter(); // required for proper stackwalking of RuntimeStub frame
1802 
1803 #ifdef ASSERT
1804     // caller guarantees that the arrays really are different
1805     // otherwise, we would have to make conjoint checks
1806     { Label L;
1807       array_overlap_test(L, TIMES_OOP);
1808       __ stop("checkcast_copy within a single array");
1809       __ bind(L);
1810     }
1811 #endif //ASSERT
1812 
1813     // Caller of this entry point must set up the argument registers.
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816       BLOCK_COMMENT("Entry:");
1817     }
1818 
1819     // Empty array: nothing to do.
1820     __ cbz(count, L_done);
1821     __ push(RegSet::of(r19, r20, r21, r22), sp);
1822 
1823 #ifdef ASSERT
1824     BLOCK_COMMENT("assert consistent ckoff/ckval");
1825     // The ckoff and ckval must be mutually consistent,
1826     // even though caller generates both.
1827     { Label L;
1828       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1829       __ ldrw(start_to, Address(ckval, sco_offset));
1830       __ cmpw(ckoff, start_to);
1831       __ br(Assembler::EQ, L);
1832       __ stop("super_check_offset inconsistent");
1833       __ bind(L);
1834     }
1835 #endif //ASSERT
1836 
1837     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1838     bool is_oop = true;
1839     if (dest_uninitialized) {
1840       decorators |= IS_DEST_UNINITIALIZED;
1841     }
1842 
1843     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1844     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1845 
1846     // save the original count
1847     __ mov(count_save, count);
1848 
1849     // Copy from low to high addresses
1850     __ mov(start_to, to);              // Save destination array start address
1851     __ b(L_load_element);
1852 
1853     // ======== begin loop ========
1854     // (Loop is rotated; its entry is L_load_element.)
1855     // Loop control:
1856     //   for (; count != 0; count--) {
1857     //     copied_oop = load_heap_oop(from++);
1858     //     ... generate_type_check ...;
1859     //     store_heap_oop(to++, copied_oop);
1860     //   }
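         //
         // The loads and stores below use AS_RAW, so no per-element GC
         // barriers are emitted; the range-wide pre-barrier was issued by
         // arraycopy_prologue above and the covering post-barriers (card
         // marks) are emitted at L_do_card_marks.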
1861     __ align(OptoLoopAlignment);
1862 
1863     __ BIND(L_store_element);
1864     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1865     __ sub(count, count, 1);
1866     __ cbz(count, L_do_card_marks);
1867 
1868     // ======== loop entry is here ========
1869     __ BIND(L_load_element);
1870     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1871     __ cbz(copied_oop, L_store_element);
1872 
1873     __ load_klass(r19_klass, copied_oop);// query the object klass
1874     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1875     // ======== end loop ========
1876 
1877     // It was a real error; we must depend on the caller to finish the job.
1878     // Register count = remaining oops, count_orig = total oops.
1879     // Emit GC store barriers for the oops we have copied and report
1880     // their number to the caller.
1881 
1882     __ subs(count, count_save, count);     // K = partially copied oop count
1883     __ eon(count, count, zr);              // report (-1^K) to caller
1884     __ br(Assembler::EQ, L_done_pop);
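         // e.g. if 3 oops were stored before the failing element, K == 3 and
         // r0 is returned as ~3 == -4; the caller recovers K as ~r0.  K == 0
         // (nothing copied) skips the card marks and returns ~0 == -1.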
1885 
1886     __ BIND(L_do_card_marks);
1887     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1888 
1889     __ bind(L_done_pop);
1890     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1891     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1892 
1893     __ bind(L_done);
1894     __ mov(r0, count);
1895     __ leave();
1896     __ ret(lr);
1897 
1898     return start;
1899   }
1900 
1901   // Perform range checks on the proposed arraycopy.
1902   // Kills temp, but nothing else.
1903   // Also, clean the sign bits of src_pos and dst_pos.
1904   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1905                               Register src_pos, // source position (c_rarg1)
1906                               Register dst,     // destination array oop (c_rarg2)
1907                               Register dst_pos, // destination position (c_rarg3)
1908                               Register length,
1909                               Register temp,
1910                               Label& L_failed) {
1911     BLOCK_COMMENT("arraycopy_range_checks:");
1912 
1913     assert_different_registers(rscratch1, temp);
1914 
1915     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1916     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1917     __ addw(temp, length, src_pos);
1918     __ cmpw(temp, rscratch1);
1919     __ br(Assembler::HI, L_failed);
1920 
1921     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1922     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1923     __ addw(temp, length, dst_pos);
1924     __ cmpw(temp, rscratch1);
1925     __ br(Assembler::HI, L_failed);
1926 
1927     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1928     __ movw(src_pos, src_pos);
1929     __ movw(dst_pos, dst_pos);
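         // (a 32-bit register write zeroes the upper 32 bits of the full
         //  register, so "movw reg, reg" is an idiomatic zero-extension)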
1930 
1931     BLOCK_COMMENT("arraycopy_range_checks done");
1932   }
1933 
1934   // These stubs get called from some dumb test routine.
1935   // I'll write them properly when they're called from
1936   // something that's actually doing something.
1937   static void fake_arraycopy_stub(address src, address dst, int count) {
1938     assert(count == 0, "huh?");
1939   }
1940 
1941 
1942   //
1943   //  Generate 'unsafe' array copy stub
1944   //  Though just as safe as the other stubs, it takes an unscaled
1945   //  size_t argument instead of an element count.
1946   //
1947   //  Input:
1948   //    c_rarg0   - source array address
1949   //    c_rarg1   - destination array address
1950   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1951   //
1952   // Examines the alignment of the operands and dispatches
1953   // to a long, int, short, or byte copy loop.
1954   //
1955   address generate_unsafe_copy(const char *name,
1956                                address byte_copy_entry,
1957                                address short_copy_entry,
1958                                address int_copy_entry,
1959                                address long_copy_entry) {
1960     Label L_long_aligned, L_int_aligned, L_short_aligned;
1961     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1962 
1963     __ align(CodeEntryAlignment);
1964     StubCodeMark mark(this, "StubRoutines", name);
1965     address start = __ pc();
1966     __ enter(); // required for proper stackwalking of RuntimeStub frame
1967 
1968     // bump this on entry, not on exit:
1969     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1970 
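         // A value is 2^k-byte aligned iff its low k bits are all zero, so
         // OR-ing the source address, destination address and byte count
         // lets one mask test check the alignment of all three at once.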
1971     __ orr(rscratch1, s, d);
1972     __ orr(rscratch1, rscratch1, count);
1973 
1974     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1975     __ cbz(rscratch1, L_long_aligned);
1976     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1977     __ cbz(rscratch1, L_int_aligned);
1978     __ tbz(rscratch1, 0, L_short_aligned);
1979     __ b(RuntimeAddress(byte_copy_entry));
1980 
1981     __ BIND(L_short_aligned);
1982     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1983     __ b(RuntimeAddress(short_copy_entry));
1984     __ BIND(L_int_aligned);
1985     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1986     __ b(RuntimeAddress(int_copy_entry));
1987     __ BIND(L_long_aligned);
1988     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1989     __ b(RuntimeAddress(long_copy_entry));
1990 
1991     return start;
1992   }
1993 
1994   //
1995   //  Generate generic array copy stubs
1996   //
1997   //  Input:
1998   //    c_rarg0    -  src oop
1999   //    c_rarg1    -  src_pos (32-bits)
2000   //    c_rarg2    -  dst oop
2001   //    c_rarg3    -  dst_pos (32-bits)
2002   //    c_rarg4    -  element count (32-bits)
2003   //
2004   //  Output:
2005   //    r0 ==  0  -  success
2006   //    r0 == -1^K - failure, where K is partial transfer count
2007   //
2008   address generate_generic_copy(const char *name,
2009                                 address byte_copy_entry, address short_copy_entry,
2010                                 address int_copy_entry, address oop_copy_entry,
2011                                 address long_copy_entry, address checkcast_copy_entry) {
2012 
2013     Label L_failed, L_objArray;
2014     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2015 
2016     // Input registers
2017     const Register src        = c_rarg0;  // source array oop
2018     const Register src_pos    = c_rarg1;  // source position
2019     const Register dst        = c_rarg2;  // destination array oop
2020     const Register dst_pos    = c_rarg3;  // destination position
2021     const Register length     = c_rarg4;
2022 
2023 
2024     // Registers used as temps
2025     const Register dst_klass  = c_rarg5;
2026 
2027     __ align(CodeEntryAlignment);
2028 
2029     StubCodeMark mark(this, "StubRoutines", name);
2030 
2031     address start = __ pc();
2032 
2033     __ enter(); // required for proper stackwalking of RuntimeStub frame
2034 
2035     // bump this on entry, not on exit:
2036     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2037 
2038     //-----------------------------------------------------------------------
2039     // Assembler stub will be used for this call to arraycopy
2040     // if the following conditions are met:
2041     //
2042     // (1) src and dst must not be null.
2043     // (2) src_pos must not be negative.
2044     // (3) dst_pos must not be negative.
2045     // (4) length  must not be negative.
2046     // (5) src klass and dst klass should be the same and not NULL.
2047     // (6) src and dst should be arrays.
2048     // (7) src_pos + length must not exceed length of src.
2049     // (8) dst_pos + length must not exceed length of dst.
2050     //
2051 
2052     //  if (src == NULL) return -1;
2053     __ cbz(src, L_failed);
2054 
2055     //  if (src_pos < 0) return -1;
2056     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2057 
2058     //  if (dst == NULL) return -1;
2059     __ cbz(dst, L_failed);
2060 
2061     //  if (dst_pos < 0) return -1;
2062     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2063 
2064     // registers used as temp
2065     const Register scratch_length    = r16; // elements count to copy
2066     const Register scratch_src_klass = r17; // array klass
2067     const Register lh                = r15; // layout helper
2068 
2069     //  if (length < 0) return -1;
2070     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2071     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2072 
2073     __ load_klass(scratch_src_klass, src);
2074 #ifdef ASSERT
2075     //  assert(src->klass() != NULL);
2076     {
2077       BLOCK_COMMENT("assert klasses not null {");
2078       Label L1, L2;
2079       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2080       __ bind(L1);
2081       __ stop("broken null klass");
2082       __ bind(L2);
2083       __ load_klass(rscratch1, dst);
2084       __ cbz(rscratch1, L1);     // this would be broken also
2085       BLOCK_COMMENT("} assert klasses not null done");
2086     }
2087 #endif
2088 
2089     // Load layout helper (32-bits)
2090     //
2091     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2092     // 32        30    24            16              8     2                 0
2093     //
2094     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2095     //
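         //   Array layout helpers are negative (the tag bits 0x2/0x3 set the
         //   sign bit), which is what the sign-bit tests below rely on.  For
         //   example, a jint typeArray has array_tag 0x3 and log2_element_size 2.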
2096 
2097     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2098 
2099     // Handle objArrays completely differently...
2100     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2101     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2102     __ movw(rscratch1, objArray_lh);
2103     __ eorw(rscratch2, lh, rscratch1);
2104     __ cbzw(rscratch2, L_objArray);
2105 
2106     //  if (src->klass() != dst->klass()) return -1;
2107     __ load_klass(rscratch2, dst);
2108     __ eor(rscratch2, rscratch2, scratch_src_klass);
2109     __ cbnz(rscratch2, L_failed);
2110 
2111     // Check for flat inline type array -> return -1
2112     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2113     __ br(Assembler::NE, L_failed);
2114 
2115     // Check for null-free (non-flat) inline type array -> handle as object array
2116     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2117     __ br(Assembler::NE, L_failed);
2118 
2119     //  if (!src->is_Array()) return -1;
2120     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2121 
2122     // At this point, it is known to be a typeArray (array_tag 0x3).
2123 #ifdef ASSERT
2124     {
2125       BLOCK_COMMENT("assert primitive array {");
2126       Label L;
2127       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2128       __ cmpw(lh, rscratch2);
2129       __ br(Assembler::GE, L);
2130       __ stop("must be a primitive array");
2131       __ bind(L);
2132       BLOCK_COMMENT("} assert primitive array done");
2133     }
2134 #endif
2135 
2136     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2137                            rscratch2, L_failed);
2138 
2139     // TypeArrayKlass
2140     //
2141     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2142     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2143     //
2144 
2145     const Register rscratch1_offset = rscratch1;    // array offset
2146     const Register r15_elsize = lh; // element size
2147 
2148     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2149            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2150     __ add(src, src, rscratch1_offset);           // src array offset
2151     __ add(dst, dst, rscratch1_offset);           // dst array offset
2152     BLOCK_COMMENT("choose copy loop based on element size");
2153 
2154     // next registers should be set before the jump to corresponding stub
2155     const Register from     = c_rarg0;  // source array address
2156     const Register to       = c_rarg1;  // destination array address
2157     const Register count    = c_rarg2;  // elements count
2158 
2159     // 'from', 'to' and 'count' must be written in this order: they alias
2160     // 'src', 'src_pos' and 'dst', so each write clobbers only inputs already consumed.
2161 
2162     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2163 
2164     // The possible values of elsize are 0-3, i.e. exact_log2(element
2165     // size in bytes).  We do a simple bitwise binary search.
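         //   bit 1 clear, bit 0 clear -> bytes  (elsize == 0)
         //   bit 1 clear, bit 0 set   -> shorts (elsize == 1)
         //   bit 1 set,   bit 0 clear -> ints   (elsize == 2)
         //   bit 1 set,   bit 0 set   -> longs  (elsize == 3)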
2166   __ BIND(L_copy_bytes);
2167     __ tbnz(r15_elsize, 1, L_copy_ints);
2168     __ tbnz(r15_elsize, 0, L_copy_shorts);
2169     __ lea(from, Address(src, src_pos));// src_addr
2170     __ lea(to,   Address(dst, dst_pos));// dst_addr
2171     __ movw(count, scratch_length); // length
2172     __ b(RuntimeAddress(byte_copy_entry));
2173 
2174   __ BIND(L_copy_shorts);
2175     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(short_copy_entry));
2179 
2180   __ BIND(L_copy_ints);
2181     __ tbnz(r15_elsize, 0, L_copy_longs);
2182     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2183     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2184     __ movw(count, scratch_length); // length
2185     __ b(RuntimeAddress(int_copy_entry));
2186 
2187   __ BIND(L_copy_longs);
2188 #ifdef ASSERT
2189     {
2190       BLOCK_COMMENT("assert long copy {");
2191       Label L;
2192       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2193       __ cmpw(r15_elsize, LogBytesPerLong);
2194       __ br(Assembler::EQ, L);
2195       __ stop("must be long copy, but elsize is wrong");
2196       __ bind(L);
2197       BLOCK_COMMENT("} assert long copy done");
2198     }
2199 #endif
2200     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2201     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2202     __ movw(count, scratch_length); // length
2203     __ b(RuntimeAddress(long_copy_entry));
2204 
2205     // ObjArrayKlass
2206   __ BIND(L_objArray);
2207     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2208 
2209     Label L_plain_copy, L_checkcast_copy;
2210     //  test array classes for subtyping
2211     __ load_klass(r15, dst);
2212     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2213     __ br(Assembler::NE, L_checkcast_copy);
2214 
2215     // Identically typed arrays can be copied without element-wise checks.
2216     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2217                            rscratch2, L_failed);
2218 
2219     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2220     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2222     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2223     __ movw(count, scratch_length); // length
2224   __ BIND(L_plain_copy);
2225     __ b(RuntimeAddress(oop_copy_entry));
2226 
2227   __ BIND(L_checkcast_copy);
2228     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2229     {
2230       // Before looking at dst.length, make sure dst is also an objArray.
2231       __ ldrw(rscratch1, Address(r15, lh_offset));
2232       __ movw(rscratch2, objArray_lh);
2233       __ eorw(rscratch1, rscratch1, rscratch2);
2234       __ cbnzw(rscratch1, L_failed);
2235 
2236       // It is safe to examine both src.length and dst.length.
2237       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2238                              r15, L_failed);
2239 
2240       __ load_klass(dst_klass, dst); // reload
2241 
2242       // Marshal the base address arguments now, freeing registers.
2243       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2244       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2245       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2246       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2247       __ movw(count, length);           // length (reloaded)
2248       Register sco_temp = c_rarg3;      // this register is free now
2249       assert_different_registers(from, to, count, sco_temp,
2250                                  dst_klass, scratch_src_klass);
2251       // assert_clean_int(count, sco_temp);
2252 
2253       // Generate the type check.
2254       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2255       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2256 
2257       // Smashes rscratch1, rscratch2
2258       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2259 
2260       // Fetch destination element klass from the ObjArrayKlass header.
2261       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2262       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2263       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2264 
2265       // the checkcast_copy loop needs two extra arguments:
2266       assert(c_rarg3 == sco_temp, "#3 already in place");
2267       // Set up arguments for checkcast_copy_entry.
2268       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2269       __ b(RuntimeAddress(checkcast_copy_entry));
2270     }
2271 
2272   __ BIND(L_failed);
2273     __ mov(r0, -1);
2274     __ leave();   // required for proper stackwalking of RuntimeStub frame
2275     __ ret(lr);
2276 
2277     return start;
2278   }
2279 
2280   //
2281   // Generate stub for array fill. If "aligned" is true, the
2282   // "to" address is assumed to be heapword aligned.
2283   //
2284   // Arguments for generated stub:
2285   //   to:    c_rarg0
2286   //   value: c_rarg1
2287   //   count: c_rarg2 treated as signed
2288   //
2289   address generate_fill(BasicType t, bool aligned, const char *name) {
2290     __ align(CodeEntryAlignment);
2291     StubCodeMark mark(this, "StubRoutines", name);
2292     address start = __ pc();
2293 
2294     BLOCK_COMMENT("Entry:");
2295 
2296     const Register to        = c_rarg0;  // destination array address
2297     const Register value     = c_rarg1;  // value
2298     const Register count     = c_rarg2;  // elements count
2299 
2300     const Register bz_base = r10;        // base for block_zero routine
2301     const Register cnt_words = r11;      // temp register
2302 
2303     __ enter();
2304 
2305     Label L_fill_elements, L_exit1;
2306 
2307     int shift = -1;
2308     switch (t) {
2309       case T_BYTE:
2310         shift = 0;
2311         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2312         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2313         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2314         __ br(Assembler::LO, L_fill_elements);
2315         break;
2316       case T_SHORT:
2317         shift = 1;
2318         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2319         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2320         __ br(Assembler::LO, L_fill_elements);
2321         break;
2322       case T_INT:
2323         shift = 2;
2324         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2325         __ br(Assembler::LO, L_fill_elements);
2326         break;
2327       default: ShouldNotReachHere();
2328     }
2329 
2330     // Align source address at 8 bytes address boundary.
2331     Label L_skip_align1, L_skip_align2, L_skip_align4;
2332     if (!aligned) {
2333       switch (t) {
2334         case T_BYTE:
2335           // One byte misalignment happens only for byte arrays.
2336           __ tbz(to, 0, L_skip_align1);
2337           __ strb(value, Address(__ post(to, 1)));
2338           __ subw(count, count, 1);
2339           __ bind(L_skip_align1);
2340           // Fallthrough
2341         case T_SHORT:
2342           // Two bytes misalignment happens only for byte and short (char) arrays.
2343           __ tbz(to, 1, L_skip_align2);
2344           __ strh(value, Address(__ post(to, 2)));
2345           __ subw(count, count, 2 >> shift);
2346           __ bind(L_skip_align2);
2347           // Fallthrough
2348         case T_INT:
2349           // Align to 8 bytes, we know we are 4 byte aligned to start.
2350           __ tbz(to, 2, L_skip_align4);
2351           __ strw(value, Address(__ post(to, 4)));
2352           __ subw(count, count, 4 >> shift);
2353           __ bind(L_skip_align4);
2354           break;
2355         default: ShouldNotReachHere();
2356       }
2357     }
2358 
2359     //
2360     //  Fill large chunks
2361     //
2362     __ lsrw(cnt_words, count, 3 - shift); // number of words
2363     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2364     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
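         // cnt_words now holds the number of whole 8-byte words to fill and
         // 'count' the leftover elements for the tail code below; e.g. a
         // T_SHORT fill with count == 21 gives cnt_words == 5 (40 bytes)
         // and leaves count == 1.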
2365     if (UseBlockZeroing) {
2366       Label non_block_zeroing, rest;
2367       // If the fill value is zero we can use the fast zero_words().
2368       __ cbnz(value, non_block_zeroing);
2369       __ mov(bz_base, to);
2370       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2371       __ zero_words(bz_base, cnt_words);
2372       __ b(rest);
2373       __ bind(non_block_zeroing);
2374       __ fill_words(to, cnt_words, value);
2375       __ bind(rest);
2376     } else {
2377       __ fill_words(to, cnt_words, value);
2378     }
2379 
2380     // Remaining count is less than 8 bytes. Fill it by a single store.
2381     // Note that the total length is no less than 8 bytes.
2382     if (t == T_BYTE || t == T_SHORT) {
2383       Label L_exit1;
2384       __ cbzw(count, L_exit1);
2385       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2386       __ str(value, Address(to, -8));    // overwrite some elements
2387       __ bind(L_exit1);
2388       __ leave();
2389       __ ret(lr);
2390     }
2391 
2392     // Handle copies less than 8 bytes.
2393     Label L_fill_2, L_fill_4, L_exit2;
2394     __ bind(L_fill_elements);
2395     switch (t) {
2396       case T_BYTE:
2397         __ tbz(count, 0, L_fill_2);
2398         __ strb(value, Address(__ post(to, 1)));
2399         __ bind(L_fill_2);
2400         __ tbz(count, 1, L_fill_4);
2401         __ strh(value, Address(__ post(to, 2)));
2402         __ bind(L_fill_4);
2403         __ tbz(count, 2, L_exit2);
2404         __ strw(value, Address(to));
2405         break;
2406       case T_SHORT:
2407         __ tbz(count, 0, L_fill_4);
2408         __ strh(value, Address(__ post(to, 2)));
2409         __ bind(L_fill_4);
2410         __ tbz(count, 1, L_exit2);
2411         __ strw(value, Address(to));
2412         break;
2413       case T_INT:
2414         __ cbzw(count, L_exit2);
2415         __ strw(value, Address(to));
2416         break;
2417       default: ShouldNotReachHere();
2418     }
2419     __ bind(L_exit2);
2420     __ leave();
2421     __ ret(lr);
2422     return start;
2423   }
2424 
2425   address generate_data_cache_writeback() {
2426     const Register line        = c_rarg0;  // address of line to write back
2427 
2428     __ align(CodeEntryAlignment);
2429 
2430     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2431 
2432     address start = __ pc();
2433     __ enter();
2434     __ cache_wb(Address(line, 0));
2435     __ leave();
2436     __ ret(lr);
2437 
2438     return start;
2439   }
2440 
2441   address generate_data_cache_writeback_sync() {
2442     const Register is_pre     = c_rarg0;  // pre or post sync
2443 
2444     __ align(CodeEntryAlignment);
2445 
2446     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2447 
2448     // pre wbsync is a no-op
2449     // post wbsync emits a memory barrier (the x86 stub uses an sfence here)
2450 
2451     Label skip;
2452     address start = __ pc();
2453     __ enter();
2454     __ cbnz(is_pre, skip);
2455     __ cache_wbsync(false);
2456     __ bind(skip);
2457     __ leave();
2458     __ ret(lr);
2459 
2460     return start;
2461   }
2462 
2463   void generate_arraycopy_stubs() {
2464     address entry;
2465     address entry_jbyte_arraycopy;
2466     address entry_jshort_arraycopy;
2467     address entry_jint_arraycopy;
2468     address entry_oop_arraycopy;
2469     address entry_jlong_arraycopy;
2470     address entry_checkcast_arraycopy;
2471 
2472     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2473     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2474 
2475     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2476 
2477     //*** jbyte
2478     // Always need aligned and unaligned versions
2479     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2480                                                                                   "jbyte_disjoint_arraycopy");
2481     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2482                                                                                   &entry_jbyte_arraycopy,
2483                                                                                   "jbyte_arraycopy");
2484     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2485                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2486     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2487                                                                                   "arrayof_jbyte_arraycopy");
2488 
2489     //*** jshort
2490     // Always need aligned and unaligned versions
2491     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2492                                                                                     "jshort_disjoint_arraycopy");
2493     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2494                                                                                     &entry_jshort_arraycopy,
2495                                                                                     "jshort_arraycopy");
2496     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2497                                                                                     "arrayof_jshort_disjoint_arraycopy");
2498     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2499                                                                                     "arrayof_jshort_arraycopy");
2500 
2501     //*** jint
2502     // Aligned versions
2503     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2504                                                                                 "arrayof_jint_disjoint_arraycopy");
2505     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2506                                                                                 "arrayof_jint_arraycopy");
2507     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2508     // entry_jint_arraycopy always points to the unaligned version
2509     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2510                                                                                 "jint_disjoint_arraycopy");
2511     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2512                                                                                 &entry_jint_arraycopy,
2513                                                                                 "jint_arraycopy");
2514 
2515     //*** jlong
2516     // It is always aligned
2517     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2518                                                                                   "arrayof_jlong_disjoint_arraycopy");
2519     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2520                                                                                   "arrayof_jlong_arraycopy");
2521     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2522     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2523 
2524     //*** oops
2525     {
2526       // With compressed oops we need unaligned versions; notice that
2527       // we overwrite entry_oop_arraycopy.
2528       bool aligned = !UseCompressedOops;
2529 
2530       StubRoutines::_arrayof_oop_disjoint_arraycopy
2531         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2532                                      /*dest_uninitialized*/false);
2533       StubRoutines::_arrayof_oop_arraycopy
2534         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2535                                      /*dest_uninitialized*/false);
2536       // Aligned versions without pre-barriers
2537       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2538         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2539                                      /*dest_uninitialized*/true);
2540       StubRoutines::_arrayof_oop_arraycopy_uninit
2541         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2542                                      /*dest_uninitialized*/true);
2543     }
2544 
2545     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2546     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2547     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2548     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2549 
2550     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2551     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2552                                                                         /*dest_uninitialized*/true);
2553 
2554     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2555                                                               entry_jbyte_arraycopy,
2556                                                               entry_jshort_arraycopy,
2557                                                               entry_jint_arraycopy,
2558                                                               entry_jlong_arraycopy);
2559 
2560     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2561                                                                entry_jbyte_arraycopy,
2562                                                                entry_jshort_arraycopy,
2563                                                                entry_jint_arraycopy,
2564                                                                entry_oop_arraycopy,
2565                                                                entry_jlong_arraycopy,
2566                                                                entry_checkcast_arraycopy);
2567 
2568     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2569     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2570     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2571     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2572     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2573     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2574   }
2575 
2576   void generate_math_stubs() { Unimplemented(); }
2577 
2578   // Arguments:
2579   //
2580   // Inputs:
2581   //   c_rarg0   - source byte array address
2582   //   c_rarg1   - destination byte array address
2583   //   c_rarg2   - K (key) in little endian int array
2584   //
2585   address generate_aescrypt_encryptBlock() {
2586     __ align(CodeEntryAlignment);
2587     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2588 
2589     const Register from        = c_rarg0;  // source array address
2590     const Register to          = c_rarg1;  // destination array address
2591     const Register key         = c_rarg2;  // key array address
2592     const Register keylen      = rscratch1;
2593 
2594     address start = __ pc();
2595     __ enter();
2596 
2597     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2598 
2599     __ aesenc_loadkeys(key, keylen);
2600     __ aesecb_encrypt(from, to, keylen);
2601 
2602     __ mov(r0, 0);
2603 
2604     __ leave();
2605     __ ret(lr);
2606 
2607     return start;
2608   }
2609 
2610   // Arguments:
2611   //
2612   // Inputs:
2613   //   c_rarg0   - source byte array address
2614   //   c_rarg1   - destination byte array address
2615   //   c_rarg2   - K (key) in little endian int array
2616   //
2617   address generate_aescrypt_decryptBlock() {
2618     assert(UseAES, "need AES cryptographic extension support");
2619     __ align(CodeEntryAlignment);
2620     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2621     Label L_doLast;
2622 
2623     const Register from        = c_rarg0;  // source array address
2624     const Register to          = c_rarg1;  // destination array address
2625     const Register key         = c_rarg2;  // key array address
2626     const Register keylen      = rscratch1;
2627 
2628     address start = __ pc();
2629     __ enter(); // required for proper stackwalking of RuntimeStub frame
2630 
2631     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2632 
2633     __ aesecb_decrypt(from, to, key, keylen);
2634 
2635     __ mov(r0, 0);
2636 
2637     __ leave();
2638     __ ret(lr);
2639 
2640     return start;
2641   }
2642 
2643   // Arguments:
2644   //
2645   // Inputs:
2646   //   c_rarg0   - source byte array address
2647   //   c_rarg1   - destination byte array address
2648   //   c_rarg2   - K (key) in little endian int array
2649   //   c_rarg3   - r vector byte array address
2650   //   c_rarg4   - input length
2651   //
2652   // Output:
2653   //   r0        - input length
2654   //
2655   address generate_cipherBlockChaining_encryptAESCrypt() {
2656     assert(UseAES, "need AES cryptographic extension support");
2657     __ align(CodeEntryAlignment);
2658     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2659 
2660     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2661 
2662     const Register from        = c_rarg0;  // source array address
2663     const Register to          = c_rarg1;  // destination array address
2664     const Register key         = c_rarg2;  // key array address
2665     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2666                                            // and left with the results of the last encryption block
2667     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2668     const Register keylen      = rscratch1;
2669 
2670     address start = __ pc();
2671 
2672       __ enter();
2673 
2674       __ movw(rscratch2, len_reg);
2675 
2676       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2677 
2678       __ ld1(v0, __ T16B, rvec);
2679 
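           // keylen is the expanded key length in ints: 44 for AES-128, 52
           // for AES-192 and 60 for AES-256.  The flags set by this cmpw are
           // still valid at the CC/EQ branches inside L_aes_loop, since
           // nothing in the loop body writes the condition flags.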
2680       __ cmpw(keylen, 52);
2681       __ br(Assembler::CC, L_loadkeys_44);
2682       __ br(Assembler::EQ, L_loadkeys_52);
2683 
2684       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2685       __ rev32(v17, __ T16B, v17);
2686       __ rev32(v18, __ T16B, v18);
2687     __ BIND(L_loadkeys_52);
2688       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2689       __ rev32(v19, __ T16B, v19);
2690       __ rev32(v20, __ T16B, v20);
2691     __ BIND(L_loadkeys_44);
2692       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2693       __ rev32(v21, __ T16B, v21);
2694       __ rev32(v22, __ T16B, v22);
2695       __ rev32(v23, __ T16B, v23);
2696       __ rev32(v24, __ T16B, v24);
2697       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2698       __ rev32(v25, __ T16B, v25);
2699       __ rev32(v26, __ T16B, v26);
2700       __ rev32(v27, __ T16B, v27);
2701       __ rev32(v28, __ T16B, v28);
2702       __ ld1(v29, v30, v31, __ T16B, key);
2703       __ rev32(v29, __ T16B, v29);
2704       __ rev32(v30, __ T16B, v30);
2705       __ rev32(v31, __ T16B, v31);
2706 
2707     __ BIND(L_aes_loop);
2708       __ ld1(v1, __ T16B, __ post(from, 16));
2709       __ eor(v0, __ T16B, v0, v1);
2710 
2711       __ br(Assembler::CC, L_rounds_44);
2712       __ br(Assembler::EQ, L_rounds_52);
2713 
2714       __ aese(v0, v17); __ aesmc(v0, v0);
2715       __ aese(v0, v18); __ aesmc(v0, v0);
2716     __ BIND(L_rounds_52);
2717       __ aese(v0, v19); __ aesmc(v0, v0);
2718       __ aese(v0, v20); __ aesmc(v0, v0);
2719     __ BIND(L_rounds_44);
2720       __ aese(v0, v21); __ aesmc(v0, v0);
2721       __ aese(v0, v22); __ aesmc(v0, v0);
2722       __ aese(v0, v23); __ aesmc(v0, v0);
2723       __ aese(v0, v24); __ aesmc(v0, v0);
2724       __ aese(v0, v25); __ aesmc(v0, v0);
2725       __ aese(v0, v26); __ aesmc(v0, v0);
2726       __ aese(v0, v27); __ aesmc(v0, v0);
2727       __ aese(v0, v28); __ aesmc(v0, v0);
2728       __ aese(v0, v29); __ aesmc(v0, v0);
2729       __ aese(v0, v30);
2730       __ eor(v0, __ T16B, v0, v31);
2731 
2732       __ st1(v0, __ T16B, __ post(to, 16));
2733 
2734       __ subw(len_reg, len_reg, 16);
2735       __ cbnzw(len_reg, L_aes_loop);
2736 
2737       __ st1(v0, __ T16B, rvec);
2738 
2739       __ mov(r0, rscratch2);
2740 
2741       __ leave();
2742       __ ret(lr);
2743 
2744       return start;
2745   }
2746 
2747   // Arguments:
2748   //
2749   // Inputs:
2750   //   c_rarg0   - source byte array address
2751   //   c_rarg1   - destination byte array address
2752   //   c_rarg2   - K (key) in little endian int array
2753   //   c_rarg3   - r vector byte array address
2754   //   c_rarg4   - input length
2755   //
2756   // Output:
2757   //   r0        - input length
2758   //
2759   address generate_cipherBlockChaining_decryptAESCrypt() {
2760     assert(UseAES, "need AES cryptographic extension support");
2761     __ align(CodeEntryAlignment);
2762     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2763 
2764     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2765 
2766     const Register from        = c_rarg0;  // source array address
2767     const Register to          = c_rarg1;  // destination array address
2768     const Register key         = c_rarg2;  // key array address
2769     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2770                                            // and left with the last ciphertext block (the next r vector)
2771     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2772     const Register keylen      = rscratch1;
2773 
2774     address start = __ pc();
2775 
2776       __ enter();
2777 
2778       __ movw(rscratch2, len_reg);
2779 
2780       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2781 
2782       __ ld1(v2, __ T16B, rvec);
2783 
2784       __ ld1(v31, __ T16B, __ post(key, 16));
2785       __ rev32(v31, __ T16B, v31);
2786 
2787       __ cmpw(keylen, 52);
2788       __ br(Assembler::CC, L_loadkeys_44);
2789       __ br(Assembler::EQ, L_loadkeys_52);
2790 
2791       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2792       __ rev32(v17, __ T16B, v17);
2793       __ rev32(v18, __ T16B, v18);
2794     __ BIND(L_loadkeys_52);
2795       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2796       __ rev32(v19, __ T16B, v19);
2797       __ rev32(v20, __ T16B, v20);
2798     __ BIND(L_loadkeys_44);
2799       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2800       __ rev32(v21, __ T16B, v21);
2801       __ rev32(v22, __ T16B, v22);
2802       __ rev32(v23, __ T16B, v23);
2803       __ rev32(v24, __ T16B, v24);
2804       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2805       __ rev32(v25, __ T16B, v25);
2806       __ rev32(v26, __ T16B, v26);
2807       __ rev32(v27, __ T16B, v27);
2808       __ rev32(v28, __ T16B, v28);
2809       __ ld1(v29, v30, __ T16B, key);
2810       __ rev32(v29, __ T16B, v29);
2811       __ rev32(v30, __ T16B, v30);
2812 
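         // Main loop: v0 holds the current ciphertext block and v1 keeps an
         // unmodified copy of it (the next chaining value), while v2 holds
         // the previous ciphertext block (initially the r vector). The
         // condition flags set by cmpw(keylen, 52) above remain valid here,
         // since nothing in the loop body modifies them, so the branches
         // below select the number of rounds for the key size.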
2813     __ BIND(L_aes_loop);
2814       __ ld1(v0, __ T16B, __ post(from, 16));
2815       __ orr(v1, __ T16B, v0, v0);
2816 
2817       __ br(Assembler::CC, L_rounds_44);
2818       __ br(Assembler::EQ, L_rounds_52);
2819 
2820       __ aesd(v0, v17); __ aesimc(v0, v0);
2821       __ aesd(v0, v18); __ aesimc(v0, v0);
2822     __ BIND(L_rounds_52);
2823       __ aesd(v0, v19); __ aesimc(v0, v0);
2824       __ aesd(v0, v20); __ aesimc(v0, v0);
2825     __ BIND(L_rounds_44);
2826       __ aesd(v0, v21); __ aesimc(v0, v0);
2827       __ aesd(v0, v22); __ aesimc(v0, v0);
2828       __ aesd(v0, v23); __ aesimc(v0, v0);
2829       __ aesd(v0, v24); __ aesimc(v0, v0);
2830       __ aesd(v0, v25); __ aesimc(v0, v0);
2831       __ aesd(v0, v26); __ aesimc(v0, v0);
2832       __ aesd(v0, v27); __ aesimc(v0, v0);
2833       __ aesd(v0, v28); __ aesimc(v0, v0);
2834       __ aesd(v0, v29); __ aesimc(v0, v0);
2835       __ aesd(v0, v30);
2836       __ eor(v0, __ T16B, v0, v31);
2837       __ eor(v0, __ T16B, v0, v2);
2838 
2839       __ st1(v0, __ T16B, __ post(to, 16));
2840       __ orr(v2, __ T16B, v1, v1);
2841 
2842       __ subw(len_reg, len_reg, 16);
2843       __ cbnzw(len_reg, L_aes_loop);
2844 
2845       __ st1(v2, __ T16B, rvec);
2846 
2847       __ mov(r0, rscratch2);
2848 
2849       __ leave();
2850       __ ret(lr);
2851 
2852     return start;
2853   }
2854 
2855   // CTR AES crypt.
2856   // Arguments:
2857   //
2858   // Inputs:
2859   //   c_rarg0   - source byte array address
2860   //   c_rarg1   - destination byte array address
2861   //   c_rarg2   - K (key) in little endian int array
2862   //   c_rarg3   - counter vector byte array address
2863   //   c_rarg4   - input length
2864   //   c_rarg5   - saved encryptedCounter start
2865   //   c_rarg6   - saved used length
2866   //
2867   // Output:
2868   //   r0       - input length
2869   //
2870   address generate_counterMode_AESCrypt() {
2871     const Register in = c_rarg0;
2872     const Register out = c_rarg1;
2873     const Register key = c_rarg2;
2874     const Register counter = c_rarg3;
2875     const Register saved_len = c_rarg4, len = r10;
2876     const Register saved_encrypted_ctr = c_rarg5;
2877     const Register used_ptr = c_rarg6, used = r12;
2878 
2879     const Register offset = r7;
2880     const Register keylen = r11;
2881 
2882     const unsigned char block_size = 16;
2883     const int bulk_width = 4;
2884     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2885     // performance with larger data sizes, but it also means that the
2886     // fast path isn't used until there are at least 8 blocks, so up
2887     // to 127 bytes of data will be processed on the slow path. For
2888     // that reason, and also so as not to blow away too much icache, 4
2889     // blocks seems like a sensible compromise.
2890 
2891     // Algorithm:
2892     //
2893     //    if (len == 0) {
2894     //        goto DONE;
2895     //    }
2896     //    int result = len;
2897     //    do {
2898     //        if (used >= blockSize) {
2899     //            if (len >= bulk_width * blockSize) {
2900     //                CTR_large_block();
2901     //                if (len == 0)
2902     //                    goto DONE;
2903     //            }
2904     //            for (;;) {
2905     //                16ByteVector v0 = counter;
2906     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2907     //                used = 0;
2908     //                if (len < blockSize)
2909     //                    break;    /* goto NEXT */
2910     //                16ByteVector v1 = load16Bytes(in, offset);
2911     //                v1 = v1 ^ encryptedCounter;
2912     //                store16Bytes(v1, out, offset);
2913     //                used = blockSize;
2914     //                offset += blockSize;
2915     //                len -= blockSize;
2916     //                if (len == 0)
2917     //                    goto DONE;
2918     //            }
2919     //        }
2920     //      NEXT:
2921     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2922     //        len--;
2923     //    } while (len != 0);
2924     //  DONE:
2925     //    return result;
2926     //
2927     // CTR_large_block()
2928     //    Wide bulk encryption of whole blocks.
2929 
2930     __ align(CodeEntryAlignment);
2931     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2932     const address start = __ pc();
2933     __ enter();
2934 
2935     Label DONE, CTR_large_block, large_block_return;
2936     __ ldrw(used, Address(used_ptr));
2937     __ cbzw(saved_len, DONE);
2938 
2939     __ mov(len, saved_len);
2940     __ mov(offset, 0);
2941 
2942     // Compute #rounds for AES based on the length of the key array
2943     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2944 
2945     __ aesenc_loadkeys(key, keylen);
2946 
2947     {
2948       Label L_CTR_loop, NEXT;
2949 
2950       __ bind(L_CTR_loop);
2951 
2952       __ cmp(used, block_size);
2953       __ br(__ LO, NEXT);
2954 
2955       // Maybe we have a lot of data
2956       __ subsw(rscratch1, len, bulk_width * block_size);
2957       __ br(__ HS, CTR_large_block);
2958       __ BIND(large_block_return);
2959       __ cbzw(len, DONE);
2960 
2961       // Setup the counter
2962       __ movi(v4, __ T4S, 0);
2963       __ movi(v5, __ T4S, 1);
2964       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2965 
2966       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2967       __ rev32(v16, __ T16B, v0);
2968       __ addv(v16, __ T4S, v16, v4);
2969       __ rev32(v16, __ T16B, v16);
2970       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2971 
2972       {
2973         // We have fewer than bulk_width blocks of data left. Encrypt
2974         // them one by one until there is less than a full block
2975         // remaining, being careful to save both the encrypted counter
2976         // and the counter.
2977 
2978         Label inner_loop;
2979         __ bind(inner_loop);
2980         // Counter to encrypt is in v0
2981         __ aesecb_encrypt(noreg, noreg, keylen);
2982         __ st1(v0, __ T16B, saved_encrypted_ctr);
2983 
2984         // Do we have a remaining full block?
2985 
2986         __ mov(used, 0);
2987         __ cmp(len, block_size);
2988         __ br(__ LO, NEXT);
2989 
2990         // Yes, we have a full block
2991         __ ldrq(v1, Address(in, offset));
2992         __ eor(v1, __ T16B, v1, v0);
2993         __ strq(v1, Address(out, offset));
2994         __ mov(used, block_size);
2995         __ add(offset, offset, block_size);
2996 
2997         __ subw(len, len, block_size);
2998         __ cbzw(len, DONE);
2999 
3000         // Increment the counter, store it back
3001         __ orr(v0, __ T16B, v16, v16);
3002         __ rev32(v16, __ T16B, v16);
3003         __ addv(v16, __ T4S, v16, v4);
3004         __ rev32(v16, __ T16B, v16);
3005         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3006 
3007         __ b(inner_loop);
3008       }
3009 
3010       __ BIND(NEXT);
3011 
3012       // Encrypt a single byte, and loop.
3013       // We expect this to be a rare event.
3014       __ ldrb(rscratch1, Address(in, offset));
3015       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3016       __ eor(rscratch1, rscratch1, rscratch2);
3017       __ strb(rscratch1, Address(out, offset));
3018       __ add(offset, offset, 1);
3019       __ add(used, used, 1);
3020       __ subw(len, len, 1);
3021       __ cbnzw(len, L_CTR_loop);
3022     }
3023 
3024     __ bind(DONE);
3025     __ strw(used, Address(used_ptr));
3026     __ mov(r0, saved_len);
3027 
3028     __ leave(); // required for proper stackwalking of RuntimeStub frame
3029     __ ret(lr);
3030 
3031     // Bulk encryption
3032 
3033     __ BIND(CTR_large_block);
3034     assert(bulk_width == 4 || bulk_width == 8, "must be");
3035 
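         // Save the callee-saved SIMD registers used as scratch below. AAPCS64
         // only requires the low 64 bits of v8..v15 to be preserved, but we
         // save and restore the full 128-bit registers.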
3036     if (bulk_width == 8) {
3037       __ sub(sp, sp, 4 * 16);
3038       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3039     }
3040     __ sub(sp, sp, 4 * 16);
3041     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3042     RegSet saved_regs = (RegSet::of(in, out, offset)
3043                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3044     __ push(saved_regs, sp);
3045     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3046     __ add(in, in, offset);
3047     __ add(out, out, offset);
3048 
3049     // Keys should already be loaded into the correct registers
3050 
3051     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3052     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3053 
3054     // AES/CTR loop
3055     {
3056       Label L_CTR_loop;
3057       __ BIND(L_CTR_loop);
3058 
3059       // Setup the counters
3060       __ movi(v8, __ T4S, 0);
3061       __ movi(v9, __ T4S, 1);
3062       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3063 
3064       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3065         __ rev32(f, __ T16B, v16);
3066         __ addv(v16, __ T4S, v16, v8);
3067       }
3068 
3069       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3070 
3071       // Encrypt the counters
3072       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3073 
3074       if (bulk_width == 8) {
3075         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3076       }
3077 
3078       // XOR the encrypted counters with the inputs
3079       for (int i = 0; i < bulk_width; i++) {
3080         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3081       }
3082 
3083       // Write the encrypted data
3084       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3085       if (bulk_width == 8) {
3086         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3087       }
3088 
3089       __ subw(len, len, 16 * bulk_width);
3090       __ cbnzw(len, L_CTR_loop);
3091     }
3092 
3093     // Save the counter back where it goes
3094     __ rev32(v16, __ T16B, v16);
3095     __ st1(v16, __ T16B, counter);
3096 
3097     __ pop(saved_regs, sp);
3098 
3099     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3100     if (bulk_width == 8) {
3101       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3102     }
3103 
3104     __ andr(rscratch1, len, -16 * bulk_width);
3105     __ sub(len, len, rscratch1);
3106     __ add(offset, offset, rscratch1);
3107     __ mov(used, 16);
3108     __ strw(used, Address(used_ptr));
3109     __ b(large_block_return);
3110 
3111     return start;
3112   }
3113 
3114   // Vector AES Galois Counter Mode implementation. Parameters:
3115   //
3116   // in = c_rarg0
3117   // len = c_rarg1
3118   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3119   // out = c_rarg3
3120   // key = c_rarg4
3121   // state = c_rarg5 - GHASH.state
3122   // subkeyHtbl = c_rarg6 - powers of H
3123   // subkeyHtbl_48_entries = c_rarg7 (not used)
3124   // counter = [sp, #0] pointer to 16 bytes of CTR
3125   // return - number of processed bytes
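       //
       // The stub works on groups of eight 16-byte blocks: it first runs an
       // AES/CTR pass over the input (len is truncated to a multiple of 128
       // bytes), then runs GHASH over the ct buffer, and returns the number
       // of bytes actually processed; the caller is expected to handle any
       // remaining tail. The GHASH field polynomial constant (0x87 in both
       // halves of a 128-bit vector) is emitted just below and its address
       // is passed to ghash_processBlocks_wide.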
3126   address generate_galoisCounterMode_AESCrypt() {
3127     address ghash_polynomial = __ pc();
3128     __ emit_int64(0x87);  // The low-order bits of the field
3129                           // polynomial (i.e. p = z^7+z^2+z+1)
3130                           // repeated in the low and high parts of a
3131                           // 128-bit vector
3132     __ emit_int64(0x87);
3133 
3134     __ align(CodeEntryAlignment);
3135     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3136     address start = __ pc();
3137     __ enter();
3138 
3139     const Register in = c_rarg0;
3140     const Register len = c_rarg1;
3141     const Register ct = c_rarg2;
3142     const Register out = c_rarg3;
3143     // (the counter, loaded from the stack below, is stored back with its incremented value at the end)
3144 
3145     const Register key = c_rarg4;
3146     const Register state = c_rarg5;
3147 
3148     const Register subkeyHtbl = c_rarg6;
3149 
3150     // Pointer to CTR is passed on the stack before the (fp, lr) pair.
3151     const Address counter_mem(sp, 2 * wordSize);
3152     const Register counter = c_rarg7;
3153     __ ldr(counter, counter_mem);
3154 
3155     const Register keylen = r10;
3156     // Save state before entering routine
3157     __ sub(sp, sp, 4 * 16);
3158     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3159     __ sub(sp, sp, 4 * 16);
3160     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3161 
3163     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3164     __ str(len, __ pre(sp, -2 * wordSize));
3165 
3166     Label DONE;
3167     __ cbz(len, DONE);
3168 
3169     // Compute #rounds for AES based on the length of the key array
3170     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3171 
3172     __ aesenc_loadkeys(key, keylen);
3173     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3174     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3175 
3176     // AES/CTR loop
3177     {
3178       Label L_CTR_loop;
3179       __ BIND(L_CTR_loop);
3180 
3181       // Setup the counters
3182       __ movi(v8, __ T4S, 0);
3183       __ movi(v9, __ T4S, 1);
3184       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3185       for (FloatRegister f = v0; f < v8; f++) {
3186         __ rev32(f, __ T16B, v16);
3187         __ addv(v16, __ T4S, v16, v8);
3188       }
3189 
3190       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3191 
3192       // Encrypt the counters
3193       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3194 
3195       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3196 
3197       // XOR the encrypted counters with the inputs
3198       for (int i = 0; i < 8; i++) {
3199         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3200       }
3201       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3202       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3203 
3204       __ subw(len, len, 16 * 8);
3205       __ cbnzw(len, L_CTR_loop);
3206     }
3207 
3208     __ rev32(v16, __ T16B, v16);
3209     __ st1(v16, __ T16B, counter);
3210 
3211     __ ldr(len, Address(sp));
3212     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3213 
3214     // GHASH/CTR loop
3215     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3216                                 len, /*unrolls*/4);
3217 
3218 #ifdef ASSERT
3219     { Label L;
3220       __ cmp(len, (unsigned char)0);
3221       __ br(Assembler::EQ, L);
3222       __ stop("stubGenerator: abort");
3223       __ bind(L);
3224     }
3225 #endif
3226 
3227     __ bind(DONE);
3228     // Return the number of bytes processed
3229     __ ldr(r0, __ post(sp, 2 * wordSize));
3230 
3231     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3232     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3233 
3234     __ leave(); // required for proper stackwalking of RuntimeStub frame
3235     __ ret(lr);
3236     return start;
3237   }
3238 
3239   // Arguments:
3240   //
3241   // Inputs:
3242   //   c_rarg0   - byte[]  source+offset
3243   //   c_rarg1   - int[]   SHA.state
3244   //   c_rarg2   - int     offset
3245   //   c_rarg3   - int     limit
3246   //
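       // This implementation uses the ARMv8 SHA-1 instructions: the state
       // words a..d live in v6 and e in v7, the four round constants sit in
       // v0..v3, and each iteration of the loop below retires four of the
       // 80 rounds via sha1c/sha1p/sha1m, with the message schedule updated
       // by sha1su0/sha1su1.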
3247   address generate_sha1_implCompress(bool multi_block, const char *name) {
3248     __ align(CodeEntryAlignment);
3249     StubCodeMark mark(this, "StubRoutines", name);
3250     address start = __ pc();
3251 
3252     Register buf   = c_rarg0;
3253     Register state = c_rarg1;
3254     Register ofs   = c_rarg2;
3255     Register limit = c_rarg3;
3256 
3257     Label keys;
3258     Label sha1_loop;
3259 
3260     // load the keys into v0..v3
3261     __ adr(rscratch1, keys);
3262     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3263     // load the 5-word state into v6, v7
3264     __ ldrq(v6, Address(state, 0));
3265     __ ldrs(v7, Address(state, 16));
3266 
3267 
3268     __ BIND(sha1_loop);
3269     // load 64 bytes of data into v16..v19
3270     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3271     __ rev32(v16, __ T16B, v16);
3272     __ rev32(v17, __ T16B, v17);
3273     __ rev32(v18, __ T16B, v18);
3274     __ rev32(v19, __ T16B, v19);
3275 
3276     // do the sha1
3277     __ addv(v4, __ T4S, v16, v0);
3278     __ orr(v20, __ T16B, v6, v6);
3279 
3280     FloatRegister d0 = v16;
3281     FloatRegister d1 = v17;
3282     FloatRegister d2 = v18;
3283     FloatRegister d3 = v19;
3284 
3285     for (int round = 0; round < 20; round++) {
3286       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3287       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3288       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3289       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3290       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3291 
3292       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3293       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3294       __ sha1h(tmp2, __ T4S, v20);
3295       if (round < 5)
3296         __ sha1c(v20, __ T4S, tmp3, tmp4);
3297       else if (round < 10 || round >= 15)
3298         __ sha1p(v20, __ T4S, tmp3, tmp4);
3299       else
3300         __ sha1m(v20, __ T4S, tmp3, tmp4);
3301       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3302 
3303       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3304     }
3305 
3306     __ addv(v7, __ T2S, v7, v21);
3307     __ addv(v6, __ T4S, v6, v20);
3308 
3309     if (multi_block) {
3310       __ add(ofs, ofs, 64);
3311       __ cmp(ofs, limit);
3312       __ br(Assembler::LE, sha1_loop);
3313       __ mov(c_rarg0, ofs); // return ofs
3314     }
3315 
3316     __ strq(v6, Address(state, 0));
3317     __ strs(v7, Address(state, 16));
3318 
3319     __ ret(lr);
3320 
3321     __ bind(keys);
3322     __ emit_int32(0x5a827999);
3323     __ emit_int32(0x6ed9eba1);
3324     __ emit_int32(0x8f1bbcdc);
3325     __ emit_int32(0xca62c1d6);
3326 
3327     return start;
3328   }
3329 
3330 
3331   // Arguments:
3332   //
3333   // Inputs:
3334   //   c_rarg0   - byte[]  source+offset
3335   //   c_rarg1   - int[]   SHA.state
3336   //   c_rarg2   - int     offset
3337   //   c_rarg3   - int     limit
3338   //
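       // This implementation uses the ARMv8 SHA-256 instructions. The 64
       // round constants are preloaded into v16..v31, the state lives in
       // v0/v1 (with working copies in v2/v3), and each iteration of the
       // loop below retires four of the 64 rounds with sha256h/sha256h2,
       // updating the message schedule with sha256su0/sha256su1.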
3339   address generate_sha256_implCompress(bool multi_block, const char *name) {
3340     static const uint32_t round_consts[64] = {
3341       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3342       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3343       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3344       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3345       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3346       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3347       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3348       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3349       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3350       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3351       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3352       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3353       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3354       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3355       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3356       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3357     };
3358     __ align(CodeEntryAlignment);
3359     StubCodeMark mark(this, "StubRoutines", name);
3360     address start = __ pc();
3361 
3362     Register buf   = c_rarg0;
3363     Register state = c_rarg1;
3364     Register ofs   = c_rarg2;
3365     Register limit = c_rarg3;
3366 
3367     Label sha256_loop;
3368 
3369     __ stpd(v8, v9, __ pre(sp, -32));
3370     __ stpd(v10, v11, Address(sp, 16));
3371 
3372 // dga == v0
3373 // dgb == v1
3374 // dg0 == v2
3375 // dg1 == v3
3376 // dg2 == v4
3377 // t0 == v6
3378 // t1 == v7
3379 
3380     // load the 64 round constants into v16..v31 (sixteen 4-word vectors)
3381     __ lea(rscratch1, ExternalAddress((address)round_consts));
3382     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3383     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3384     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3385     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3386 
3387     // load the 8-word (256-bit) state
3388     __ ldpq(v0, v1, state);
3389 
3390     __ BIND(sha256_loop);
3391     // load 64 bytes of data into v8..v11
3392     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3393     __ rev32(v8, __ T16B, v8);
3394     __ rev32(v9, __ T16B, v9);
3395     __ rev32(v10, __ T16B, v10);
3396     __ rev32(v11, __ T16B, v11);
3397 
3398     __ addv(v6, __ T4S, v8, v16);
3399     __ orr(v2, __ T16B, v0, v0);
3400     __ orr(v3, __ T16B, v1, v1);
3401 
3402     FloatRegister d0 = v8;
3403     FloatRegister d1 = v9;
3404     FloatRegister d2 = v10;
3405     FloatRegister d3 = v11;
3406 
3407 
3408     for (int round = 0; round < 16; round++) {
3409       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3410       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3411       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3412       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3413 
3414       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3415        __ orr(v4, __ T16B, v2, v2);
3416       if (round < 15)
3417         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3418       __ sha256h(v2, __ T4S, v3, tmp2);
3419       __ sha256h2(v3, __ T4S, v4, tmp2);
3420       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3421 
3422       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3423     }
3424 
3425     __ addv(v0, __ T4S, v0, v2);
3426     __ addv(v1, __ T4S, v1, v3);
3427 
3428     if (multi_block) {
3429       __ add(ofs, ofs, 64);
3430       __ cmp(ofs, limit);
3431       __ br(Assembler::LE, sha256_loop);
3432       __ mov(c_rarg0, ofs); // return ofs
3433     }
3434 
3435     __ ldpd(v10, v11, Address(sp, 16));
3436     __ ldpd(v8, v9, __ post(sp, 32));
3437 
3438     __ stpq(v0, v1, state);
3439 
3440     __ ret(lr);
3441 
3442     return start;
3443   }
3444 
3445   // Arguments:
3446   //
3447   // Inputs:
3448   //   c_rarg0   - byte[]  source+offset
3449   //   c_rarg1   - int[]   SHA.state
3450   //   c_rarg2   - int     offset
3451   //   c_rarg3   - int     limit
3452   //
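       // This implementation uses the ARMv8 SHA-512 instructions (sha512h,
       // sha512h2, sha512su0, sha512su1). Each sha512_dround below retires
       // two of the 80 rounds, so one 128-byte block is consumed by the 40
       // double-rounds expanded after the loop header.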
3453   address generate_sha512_implCompress(bool multi_block, const char *name) {
3454     static const uint64_t round_consts[80] = {
3455       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3456       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3457       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3458       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3459       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3460       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3461       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3462       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3463       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3464       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3465       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3466       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3467       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3468       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3469       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3470       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3471       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3472       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3473       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3474       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3475       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3476       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3477       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3478       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3479       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3480       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3481       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3482     };
3483 
3484     // Double rounds for sha512.
3485     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3486       if (dr < 36)                                                                   \
3487         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3488       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3489       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3490       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3491       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3492       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3493       if (dr < 32) {                                                                 \
3494         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3495         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3496       }                                                                              \
3497       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3498       if (dr < 32)                                                                   \
3499         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3500       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3501       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3502 
3503     __ align(CodeEntryAlignment);
3504     StubCodeMark mark(this, "StubRoutines", name);
3505     address start = __ pc();
3506 
3507     Register buf   = c_rarg0;
3508     Register state = c_rarg1;
3509     Register ofs   = c_rarg2;
3510     Register limit = c_rarg3;
3511 
3512     __ stpd(v8, v9, __ pre(sp, -64));
3513     __ stpd(v10, v11, Address(sp, 16));
3514     __ stpd(v12, v13, Address(sp, 32));
3515     __ stpd(v14, v15, Address(sp, 48));
3516 
3517     Label sha512_loop;
3518 
3519     // load state
3520     __ ld1(v8, v9, v10, v11, __ T2D, state);
3521 
3522     // load first 4 round constants
3523     __ lea(rscratch1, ExternalAddress((address)round_consts));
3524     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3525 
3526     __ BIND(sha512_loop);
3527     // load 128B of data into v12..v19
3528     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3529     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3530     __ rev64(v12, __ T16B, v12);
3531     __ rev64(v13, __ T16B, v13);
3532     __ rev64(v14, __ T16B, v14);
3533     __ rev64(v15, __ T16B, v15);
3534     __ rev64(v16, __ T16B, v16);
3535     __ rev64(v17, __ T16B, v17);
3536     __ rev64(v18, __ T16B, v18);
3537     __ rev64(v19, __ T16B, v19);
3538 
3539     __ mov(rscratch2, rscratch1);
3540 
3541     __ mov(v0, __ T16B, v8);
3542     __ mov(v1, __ T16B, v9);
3543     __ mov(v2, __ T16B, v10);
3544     __ mov(v3, __ T16B, v11);
3545 
3546     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3547     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3548     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3549     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3550     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3551     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3552     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3553     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3554     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3555     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3556     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3557     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3558     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3559     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3560     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3561     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3562     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3563     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3564     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3565     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3566     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3567     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3568     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3569     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3570     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3571     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3572     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3573     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3574     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3575     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3576     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3577     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3578     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3579     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3580     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3581     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3582     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3583     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3584     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3585     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3586 
3587     __ addv(v8, __ T2D, v8, v0);
3588     __ addv(v9, __ T2D, v9, v1);
3589     __ addv(v10, __ T2D, v10, v2);
3590     __ addv(v11, __ T2D, v11, v3);
3591 
3592     if (multi_block) {
3593       __ add(ofs, ofs, 128);
3594       __ cmp(ofs, limit);
3595       __ br(Assembler::LE, sha512_loop);
3596       __ mov(c_rarg0, ofs); // return ofs
3597     }
3598 
3599     __ st1(v8, v9, v10, v11, __ T2D, state);
3600 
3601     __ ldpd(v14, v15, Address(sp, 48));
3602     __ ldpd(v12, v13, Address(sp, 32));
3603     __ ldpd(v10, v11, Address(sp, 16));
3604     __ ldpd(v8, v9, __ post(sp, 64));
3605 
3606     __ ret(lr);
3607 
3608     return start;
3609   }
3610 
3611   // Arguments:
3612   //
3613   // Inputs:
3614   //   c_rarg0   - byte[]  source+offset
3615   //   c_rarg1   - byte[]   SHA.state
3616   //   c_rarg2   - int     digest_length
3617   //   c_rarg3   - int     offset
3618   //   c_rarg4   - int     limit
3619   //
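       // The SHA3-* variants share the Keccak-f[1600] permutation and differ
       // only in how many bytes are absorbed per block (the rate):
       // block_size = 200 - 2 * digest_length. The bit tests on digest_length
       // below select how much of the input to XOR into the state before the
       // 24 rounds are run.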
3620   address generate_sha3_implCompress(bool multi_block, const char *name) {
3621     static const uint64_t round_consts[24] = {
3622       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3623       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3624       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3625       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3626       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3627       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3628       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3629       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3630     };
3631 
3632     __ align(CodeEntryAlignment);
3633     StubCodeMark mark(this, "StubRoutines", name);
3634     address start = __ pc();
3635 
3636     Register buf           = c_rarg0;
3637     Register state         = c_rarg1;
3638     Register digest_length = c_rarg2;
3639     Register ofs           = c_rarg3;
3640     Register limit         = c_rarg4;
3641 
3642     Label sha3_loop, rounds24_loop;
3643     Label sha3_512, sha3_384_or_224, sha3_256;
3644 
3645     __ stpd(v8, v9, __ pre(sp, -64));
3646     __ stpd(v10, v11, Address(sp, 16));
3647     __ stpd(v12, v13, Address(sp, 32));
3648     __ stpd(v14, v15, Address(sp, 48));
3649 
3650     // load state
3651     __ add(rscratch1, state, 32);
3652     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3653     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3654     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3655     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3656     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3657     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3658     __ ld1(v24, __ T1D, rscratch1);
3659 
3660     __ BIND(sha3_loop);
3661 
3662     // 24 keccak rounds
3663     __ movw(rscratch2, 24);
3664 
3665     // load round_constants base
3666     __ lea(rscratch1, ExternalAddress((address) round_consts));
3667 
3668     // load input
3669     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3670     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3671     __ eor(v0, __ T8B, v0, v25);
3672     __ eor(v1, __ T8B, v1, v26);
3673     __ eor(v2, __ T8B, v2, v27);
3674     __ eor(v3, __ T8B, v3, v28);
3675     __ eor(v4, __ T8B, v4, v29);
3676     __ eor(v5, __ T8B, v5, v30);
3677     __ eor(v6, __ T8B, v6, v31);
3678 
3679     // digest_length == 64, SHA3-512
3680     __ tbnz(digest_length, 6, sha3_512);
3681 
3682     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3683     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3684     __ eor(v7, __ T8B, v7, v25);
3685     __ eor(v8, __ T8B, v8, v26);
3686     __ eor(v9, __ T8B, v9, v27);
3687     __ eor(v10, __ T8B, v10, v28);
3688     __ eor(v11, __ T8B, v11, v29);
3689     __ eor(v12, __ T8B, v12, v30);
3690 
3691     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3692     __ tbnz(digest_length, 4, sha3_384_or_224);
3693 
3694     // SHA3-256
3695     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3696     __ eor(v13, __ T8B, v13, v25);
3697     __ eor(v14, __ T8B, v14, v26);
3698     __ eor(v15, __ T8B, v15, v27);
3699     __ eor(v16, __ T8B, v16, v28);
3700     __ b(rounds24_loop);
3701 
3702     __ BIND(sha3_384_or_224);
3703     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3704 
3705     // SHA3-224
3706     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3707     __ ld1(v29, __ T8B, __ post(buf, 8));
3708     __ eor(v13, __ T8B, v13, v25);
3709     __ eor(v14, __ T8B, v14, v26);
3710     __ eor(v15, __ T8B, v15, v27);
3711     __ eor(v16, __ T8B, v16, v28);
3712     __ eor(v17, __ T8B, v17, v29);
3713     __ b(rounds24_loop);
3714 
3715     __ BIND(sha3_512);
3716     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3717     __ eor(v7, __ T8B, v7, v25);
3718     __ eor(v8, __ T8B, v8, v26);
3719 
3720     __ BIND(rounds24_loop);
3721     __ subw(rscratch2, rscratch2, 1);
3722 
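         // One Keccak-f[1600] round per iteration: the eor3/rax1 sequence
         // computes the theta step, the xar instructions perform the combined
         // rho and pi rotations, bcax implements chi, and the final eor with
         // the round constant in v31 is iota.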
3723     __ eor3(v29, __ T16B, v4, v9, v14);
3724     __ eor3(v26, __ T16B, v1, v6, v11);
3725     __ eor3(v28, __ T16B, v3, v8, v13);
3726     __ eor3(v25, __ T16B, v0, v5, v10);
3727     __ eor3(v27, __ T16B, v2, v7, v12);
3728     __ eor3(v29, __ T16B, v29, v19, v24);
3729     __ eor3(v26, __ T16B, v26, v16, v21);
3730     __ eor3(v28, __ T16B, v28, v18, v23);
3731     __ eor3(v25, __ T16B, v25, v15, v20);
3732     __ eor3(v27, __ T16B, v27, v17, v22);
3733 
3734     __ rax1(v30, __ T2D, v29, v26);
3735     __ rax1(v26, __ T2D, v26, v28);
3736     __ rax1(v28, __ T2D, v28, v25);
3737     __ rax1(v25, __ T2D, v25, v27);
3738     __ rax1(v27, __ T2D, v27, v29);
3739 
3740     __ eor(v0, __ T16B, v0, v30);
3741     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3742     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3743     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3744     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3745     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3746     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3747     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3748     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3749     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3750     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3751     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3752     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3753     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3754     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3755     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3756     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3757     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3758     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3759     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3760     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3761     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3762     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3763     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3764     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3765 
3766     __ bcax(v20, __ T16B, v31, v22, v8);
3767     __ bcax(v21, __ T16B, v8,  v23, v22);
3768     __ bcax(v22, __ T16B, v22, v24, v23);
3769     __ bcax(v23, __ T16B, v23, v31, v24);
3770     __ bcax(v24, __ T16B, v24, v8,  v31);
3771 
3772     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3773 
3774     __ bcax(v17, __ T16B, v25, v19, v3);
3775     __ bcax(v18, __ T16B, v3,  v15, v19);
3776     __ bcax(v19, __ T16B, v19, v16, v15);
3777     __ bcax(v15, __ T16B, v15, v25, v16);
3778     __ bcax(v16, __ T16B, v16, v3,  v25);
3779 
3780     __ bcax(v10, __ T16B, v29, v12, v26);
3781     __ bcax(v11, __ T16B, v26, v13, v12);
3782     __ bcax(v12, __ T16B, v12, v14, v13);
3783     __ bcax(v13, __ T16B, v13, v29, v14);
3784     __ bcax(v14, __ T16B, v14, v26, v29);
3785 
3786     __ bcax(v7, __ T16B, v30, v9,  v4);
3787     __ bcax(v8, __ T16B, v4,  v5,  v9);
3788     __ bcax(v9, __ T16B, v9,  v6,  v5);
3789     __ bcax(v5, __ T16B, v5,  v30, v6);
3790     __ bcax(v6, __ T16B, v6,  v4,  v30);
3791 
3792     __ bcax(v3, __ T16B, v27, v0,  v28);
3793     __ bcax(v4, __ T16B, v28, v1,  v0);
3794     __ bcax(v0, __ T16B, v0,  v2,  v1);
3795     __ bcax(v1, __ T16B, v1,  v27, v2);
3796     __ bcax(v2, __ T16B, v2,  v28, v27);
3797 
3798     __ eor(v0, __ T16B, v0, v31);
3799 
3800     __ cbnzw(rscratch2, rounds24_loop);
3801 
3802     if (multi_block) {
3803       // block_size =  200 - 2 * digest_length, ofs += block_size
3804       __ add(ofs, ofs, 200);
3805       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3806 
3807       __ cmp(ofs, limit);
3808       __ br(Assembler::LE, sha3_loop);
3809       __ mov(c_rarg0, ofs); // return ofs
3810     }
3811 
3812     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3813     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3814     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3815     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3816     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3817     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3818     __ st1(v24, __ T1D, state);
3819 
3820     __ ldpd(v14, v15, Address(sp, 48));
3821     __ ldpd(v12, v13, Address(sp, 32));
3822     __ ldpd(v10, v11, Address(sp, 16));
3823     __ ldpd(v8, v9, __ post(sp, 64));
3824 
3825     __ ret(lr);
3826 
3827     return start;
3828   }
3829 
3830   // Safefetch stubs.
3831   void generate_safefetch(const char* name, int size, address* entry,
3832                           address* fault_pc, address* continuation_pc) {
3833     // safefetch signatures:
3834     //   int      SafeFetch32(int*      adr, int      errValue);
3835     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3836     //
3837     // arguments:
3838     //   c_rarg0 = adr
3839     //   c_rarg1 = errValue
3840     //
3841     // result:
3842     //   r0       = *adr or errValue
3843 
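         // If the load below faults, the signal handler resumes execution at
         // *continuation_pc, so errValue (still in c_rarg1) ends up being
         // returned.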
3844     StubCodeMark mark(this, "StubRoutines", name);
3845 
3846     // Entry point, pc or function descriptor.
3847     *entry = __ pc();
3848 
3849     // Load *adr into c_rarg1, may fault.
3850     *fault_pc = __ pc();
3851     switch (size) {
3852       case 4:
3853         // int32_t
3854         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3855         break;
3856       case 8:
3857         // int64_t
3858         __ ldr(c_rarg1, Address(c_rarg0, 0));
3859         break;
3860       default:
3861         ShouldNotReachHere();
3862     }
3863 
3864     // return errValue or *adr
3865     *continuation_pc = __ pc();
3866     __ mov(r0, c_rarg1);
3867     __ ret(lr);
3868   }
3869 
3870   /**
3871    *  Arguments:
3872    *
3873    * Inputs:
3874    *   c_rarg0   - int crc
3875    *   c_rarg1   - byte* buf
3876    *   c_rarg2   - int length
3877    *
3878    * Output:
3879    *       r0   - int crc result
3880    */
3881   address generate_updateBytesCRC32() {
3882     assert(UseCRC32Intrinsics, "what are we doing here?");
3883 
3884     __ align(CodeEntryAlignment);
3885     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3886 
3887     address start = __ pc();
3888 
3889     const Register crc   = c_rarg0;  // crc
3890     const Register buf   = c_rarg1;  // source java byte array address
3891     const Register len   = c_rarg2;  // length
3892     const Register table0 = c_rarg3; // crc_table address
3893     const Register table1 = c_rarg4;
3894     const Register table2 = c_rarg5;
3895     const Register table3 = c_rarg6;
3896     const Register tmp3 = c_rarg7;
3897 
3898     BLOCK_COMMENT("Entry:");
3899     __ enter(); // required for proper stackwalking of RuntimeStub frame
3900 
3901     __ kernel_crc32(crc, buf, len,
3902               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3903 
3904     __ leave(); // required for proper stackwalking of RuntimeStub frame
3905     __ ret(lr);
3906 
3907     return start;
3908   }
3909 
3910   /**
3911    *  Arguments:
3912    *
3913    * Inputs:
3914    *   c_rarg0   - int crc
3915    *   c_rarg1   - byte* buf
3916    *   c_rarg2   - int length
3917    *   c_rarg3   - int* table
3918    *
3919    * Output:
3920    *       r0   - int crc result
3921    */
3922   address generate_updateBytesCRC32C() {
3923     assert(UseCRC32CIntrinsics, "what are we doing here?");
3924 
3925     __ align(CodeEntryAlignment);
3926     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3927 
3928     address start = __ pc();
3929 
3930     const Register crc   = c_rarg0;  // crc
3931     const Register buf   = c_rarg1;  // source java byte array address
3932     const Register len   = c_rarg2;  // length
3933     const Register table0 = c_rarg3; // crc_table address
3934     const Register table1 = c_rarg4;
3935     const Register table2 = c_rarg5;
3936     const Register table3 = c_rarg6;
3937     const Register tmp3 = c_rarg7;
3938 
3939     BLOCK_COMMENT("Entry:");
3940     __ enter(); // required for proper stackwalking of RuntimeStub frame
3941 
3942     __ kernel_crc32c(crc, buf, len,
3943               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3944 
3945     __ leave(); // required for proper stackwalking of RuntimeStub frame
3946     __ ret(lr);
3947 
3948     return start;
3949   }
3950 
3951   /**
3952    *  Arguments:
3953    *
3954    *  Inputs:
3955    *   c_rarg0   - int   adler
3956    *   c_rarg1   - byte* buff
3957    *   c_rarg2   - int   len
3958    *
3959    * Output:
3960    *   c_rarg0   - int adler result
3961    */
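       // Adler-32 maintains two sums modulo BASE = 65521:
       //   s1 = 1 + b1 + b2 + ... + bn                (mod BASE)
       //   s2 = s1_1 + s1_2 + ... + s1_n              (mod BASE)
       // and the result is (s2 << 16) | s1. The code below accumulates 16
       // bytes at a time with NEON and defers the modulo reduction while the
       // sums cannot overflow 32 bits (at most NMAX bytes).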
3962   address generate_updateBytesAdler32() {
3963     __ align(CodeEntryAlignment);
3964     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3965     address start = __ pc();
3966 
3967     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3968 
3969     // Aliases
3970     Register adler  = c_rarg0;
3971     Register s1     = c_rarg0;
3972     Register s2     = c_rarg3;
3973     Register buff   = c_rarg1;
3974     Register len    = c_rarg2;
3975     Register nmax  = r4;
3976     Register base  = r5;
3977     Register count = r6;
3978     Register temp0 = rscratch1;
3979     Register temp1 = rscratch2;
3980     FloatRegister vbytes = v0;
3981     FloatRegister vs1acc = v1;
3982     FloatRegister vs2acc = v2;
3983     FloatRegister vtable = v3;
3984 
3985     // Max number of bytes we can process before having to take the mod
3986     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3987     uint64_t BASE = 0xfff1;
3988     uint64_t NMAX = 0x15B0;
3989 
3990     __ mov(base, BASE);
3991     __ mov(nmax, NMAX);
3992 
3993     // Load accumulation coefficients for the upper 16 bits
3994     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3995     __ ld1(vtable, __ T16B, Address(temp0));
3996 
3997     // s1 is initialized to the lower 16 bits of adler
3998     // s2 is initialized to the upper 16 bits of adler
3999     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4000     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4001 
4002     // The pipelined loop needs at least 16 bytes per iteration. It checks
4003     // this itself, but skipping straight to the cleanup loop is more efficient for short inputs.
4004     __ cmp(len, (u1)16);
4005     __ br(Assembler::HS, L_nmax);
4006     __ cbz(len, L_combine);
4007 
4008     __ bind(L_simple_by1_loop);
4009     __ ldrb(temp0, Address(__ post(buff, 1)));
4010     __ add(s1, s1, temp0);
4011     __ add(s2, s2, s1);
4012     __ subs(len, len, 1);
4013     __ br(Assembler::HI, L_simple_by1_loop);
4014 
4015     // s1 = s1 % BASE
4016     __ subs(temp0, s1, base);
4017     __ csel(s1, temp0, s1, Assembler::HS);
4018 
4019     // s2 = s2 % BASE
4020     __ lsr(temp0, s2, 16);
4021     __ lsl(temp1, temp0, 4);
4022     __ sub(temp1, temp1, temp0);
4023     __ add(s2, temp1, s2, ext::uxth);
4024 
4025     __ subs(temp0, s2, base);
4026     __ csel(s2, temp0, s2, Assembler::HS);
4027 
4028     __ b(L_combine);
4029 
4030     __ bind(L_nmax);
4031     __ subs(len, len, nmax);
4032     __ sub(count, nmax, 16);
4033     __ br(Assembler::LO, L_by16);
4034 
4035     __ bind(L_nmax_loop);
4036 
4037     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4038                                       vbytes, vs1acc, vs2acc, vtable);
4039 
4040     __ subs(count, count, 16);
4041     __ br(Assembler::HS, L_nmax_loop);
4042 
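         // Reduce s1 and s2 mod BASE without a division: 2^16 mod 65521 == 15,
         // so x mod BASE == ((x >> 16) * 15 + (x & 0xffff)) mod BASE. The fold
         // is applied twice and finished with one conditional subtraction.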
4043     // s1 = s1 % BASE
4044     __ lsr(temp0, s1, 16);
4045     __ lsl(temp1, temp0, 4);
4046     __ sub(temp1, temp1, temp0);
4047     __ add(temp1, temp1, s1, ext::uxth);
4048 
4049     __ lsr(temp0, temp1, 16);
4050     __ lsl(s1, temp0, 4);
4051     __ sub(s1, s1, temp0);
4052     __ add(s1, s1, temp1, ext::uxth);
4053 
4054     __ subs(temp0, s1, base);
4055     __ csel(s1, temp0, s1, Assembler::HS);
4056 
4057     // s2 = s2 % BASE
4058     __ lsr(temp0, s2, 16);
4059     __ lsl(temp1, temp0, 4);
4060     __ sub(temp1, temp1, temp0);
4061     __ add(temp1, temp1, s2, ext::uxth);
4062 
4063     __ lsr(temp0, temp1, 16);
4064     __ lsl(s2, temp0, 4);
4065     __ sub(s2, s2, temp0);
4066     __ add(s2, s2, temp1, ext::uxth);
4067 
4068     __ subs(temp0, s2, base);
4069     __ csel(s2, temp0, s2, Assembler::HS);
4070 
4071     __ subs(len, len, nmax);
4072     __ sub(count, nmax, 16);
4073     __ br(Assembler::HS, L_nmax_loop);
4074 
4075     __ bind(L_by16);
4076     __ adds(len, len, count);
4077     __ br(Assembler::LO, L_by1);
4078 
4079     __ bind(L_by16_loop);
4080 
4081     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4082                                       vbytes, vs1acc, vs2acc, vtable);
4083 
4084     __ subs(len, len, 16);
4085     __ br(Assembler::HS, L_by16_loop);
4086 
4087     __ bind(L_by1);
4088     __ adds(len, len, 15);
4089     __ br(Assembler::LO, L_do_mod);
4090 
4091     __ bind(L_by1_loop);
4092     __ ldrb(temp0, Address(__ post(buff, 1)));
4093     __ add(s1, temp0, s1);
4094     __ add(s2, s2, s1);
4095     __ subs(len, len, 1);
4096     __ br(Assembler::HS, L_by1_loop);
4097 
4098     __ bind(L_do_mod);
4099     // s1 = s1 % BASE
4100     __ lsr(temp0, s1, 16);
4101     __ lsl(temp1, temp0, 4);
4102     __ sub(temp1, temp1, temp0);
4103     __ add(temp1, temp1, s1, ext::uxth);
4104 
4105     __ lsr(temp0, temp1, 16);
4106     __ lsl(s1, temp0, 4);
4107     __ sub(s1, s1, temp0);
4108     __ add(s1, s1, temp1, ext::uxth);
4109 
4110     __ subs(temp0, s1, base);
4111     __ csel(s1, temp0, s1, Assembler::HS);
4112 
4113     // s2 = s2 % BASE
4114     __ lsr(temp0, s2, 16);
4115     __ lsl(temp1, temp0, 4);
4116     __ sub(temp1, temp1, temp0);
4117     __ add(temp1, temp1, s2, ext::uxth);
4118 
4119     __ lsr(temp0, temp1, 16);
4120     __ lsl(s2, temp0, 4);
4121     __ sub(s2, s2, temp0);
4122     __ add(s2, s2, temp1, ext::uxth);
4123 
4124     __ subs(temp0, s2, base);
4125     __ csel(s2, temp0, s2, Assembler::HS);
4126 
4127     // Combine lower bits and higher bits
4128     __ bind(L_combine);
4129     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4130 
4131     __ ret(lr);
4132 
4133     return start;
4134   }
4135 
4136   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4137           Register temp0, Register temp1, FloatRegister vbytes,
4138           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4139     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4140     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4141     // In non-vectorized code, we update s1 and s2 as:
4142     //   s1 <- s1 + b1
4143     //   s2 <- s2 + s1
4144     //   s1 <- s1 + b2
4145     //   s2 <- s2 + s1
4146     //   ...
4147     //   s1 <- s1 + b16
4148     //   s2 <- s2 + s1
4149     // Putting above assignments together, we have:
4150     //   s1_new = s1 + b1 + b2 + ... + b16
4151     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4152     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4153     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4154     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4155 
4156     // s2 = s2 + s1 * 16
4157     __ add(s2, s2, s1, Assembler::LSL, 4);
4158 
4159     // vs1acc = b1 + b2 + b3 + ... + b16
4160     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4161     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4162     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4163     __ uaddlv(vs1acc, __ T16B, vbytes);
4164     __ uaddlv(vs2acc, __ T8H, vs2acc);
4165 
4166     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4167     __ fmovd(temp0, vs1acc);
4168     __ fmovd(temp1, vs2acc);
4169     __ add(s1, s1, temp0);
4170     __ add(s2, s2, temp1);
4171   }
4172 
4173   /**
4174    *  Arguments:
4175    *
4176    *  Input:
4177    *    c_rarg0   - x address
4178    *    c_rarg1   - x length
4179    *    c_rarg2   - y address
4180    *    c_rarg3   - y length
4181    *    c_rarg4   - z address
4182    *    c_rarg5   - z length
4183    */
4184   address generate_multiplyToLen() {
4185     __ align(CodeEntryAlignment);
4186     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4187 
4188     address start = __ pc();
4189     const Register x     = r0;
4190     const Register xlen  = r1;
4191     const Register y     = r2;
4192     const Register ylen  = r3;
4193     const Register z     = r4;
4194     const Register zlen  = r5;
4195 
4196     const Register tmp1  = r10;
4197     const Register tmp2  = r11;
4198     const Register tmp3  = r12;
4199     const Register tmp4  = r13;
4200     const Register tmp5  = r14;
4201     const Register tmp6  = r15;
4202     const Register tmp7  = r16;
4203 
4204     BLOCK_COMMENT("Entry:");
4205     __ enter(); // required for proper stackwalking of RuntimeStub frame
4206     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4207     __ leave(); // required for proper stackwalking of RuntimeStub frame
4208     __ ret(lr);
4209 
4210     return start;
4211   }
4212 
4213   address generate_squareToLen() {
4214     // The squareToLen algorithm for sizes 1..127, as described in the Java
4215     // code, runs faster than multiply_to_len on some CPUs and slower on
4216     // others, but multiply_to_len shows slightly better results overall.
4217     __ align(CodeEntryAlignment);
4218     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4219     address start = __ pc();
4220 
4221     const Register x     = r0;
4222     const Register xlen  = r1;
4223     const Register z     = r2;
4224     const Register zlen  = r3;
4225     const Register y     = r4; // == x
4226     const Register ylen  = r5; // == xlen
4227 
4228     const Register tmp1  = r10;
4229     const Register tmp2  = r11;
4230     const Register tmp3  = r12;
4231     const Register tmp4  = r13;
4232     const Register tmp5  = r14;
4233     const Register tmp6  = r15;
4234     const Register tmp7  = r16;
4235 
4236     RegSet spilled_regs = RegSet::of(y, ylen);
4237     BLOCK_COMMENT("Entry:");
4238     __ enter();
4239     __ push(spilled_regs, sp);
4240     __ mov(y, x);
4241     __ mov(ylen, xlen);
4242     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4243     __ pop(spilled_regs, sp);
4244     __ leave();
4245     __ ret(lr);
4246     return start;
4247   }
4248 
4249   address generate_mulAdd() {
4250     __ align(CodeEntryAlignment);
4251     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4252 
4253     address start = __ pc();
4254 
4255     const Register out     = r0;
4256     const Register in      = r1;
4257     const Register offset  = r2;
4258     const Register len     = r3;
4259     const Register k       = r4;
4260 
4261     BLOCK_COMMENT("Entry:");
4262     __ enter();
4263     __ mul_add(out, in, offset, len, k);
4264     __ leave();
4265     __ ret(lr);
4266 
4267     return start;
4268   }
4269 
4270   // Arguments:
4271   //
4272   // Input:
4273   //   c_rarg0   - newArr address
4274   //   c_rarg1   - oldArr address
4275   //   c_rarg2   - newIdx
4276   //   c_rarg3   - shiftCount
4277   //   c_rarg4   - numIter
4278   //
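     // Each destination word combines two adjacent source words, roughly:
     //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
     // for i in [0, numIter), with the words processed from the highest index down.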
4279   address generate_bigIntegerRightShift() {
4280     __ align(CodeEntryAlignment);
4281     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4282     address start = __ pc();
4283 
4284     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4285 
4286     Register newArr        = c_rarg0;
4287     Register oldArr        = c_rarg1;
4288     Register newIdx        = c_rarg2;
4289     Register shiftCount    = c_rarg3;
4290     Register numIter       = c_rarg4;
4291     Register idx           = numIter;
4292 
4293     Register newArrCur     = rscratch1;
4294     Register shiftRevCount = rscratch2;
4295     Register oldArrCur     = r13;
4296     Register oldArrNext    = r14;
4297 
4298     FloatRegister oldElem0        = v0;
4299     FloatRegister oldElem1        = v1;
4300     FloatRegister newElem         = v2;
4301     FloatRegister shiftVCount     = v3;
4302     FloatRegister shiftVRevCount  = v4;
4303 
4304     __ cbz(idx, Exit);
4305 
4306     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4307 
4308     // left shift count
4309     __ movw(shiftRevCount, 32);
4310     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4311 
4312     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4313     __ cmp(numIter, (u1)4);
4314     __ br(Assembler::LT, ShiftThree);
4315 
4316     __ dup(shiftVCount,    __ T4S, shiftCount);
4317     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4318     __ negr(shiftVCount,   __ T4S, shiftVCount);
4319 
4320     __ BIND(ShiftSIMDLoop);
4321 
4322     // Calculate the load addresses
4323     __ sub(idx, idx, 4);
4324     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4325     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4326     __ add(oldArrCur,  oldArrNext, 4);
4327 
4328     // Load 4 words and process
4329     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4330     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4331     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4332     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4333     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4334     __ st1(newElem,   __ T4S,  Address(newArrCur));
4335 
4336     __ cmp(idx, (u1)4);
4337     __ br(Assembler::LT, ShiftTwoLoop);
4338     __ b(ShiftSIMDLoop);
4339 
4340     __ BIND(ShiftTwoLoop);
4341     __ cbz(idx, Exit);
4342     __ cmp(idx, (u1)1);
4343     __ br(Assembler::EQ, ShiftOne);
4344 
4345     // Calculate the load addresses
4346     __ sub(idx, idx, 2);
4347     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4348     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4349     __ add(oldArrCur,  oldArrNext, 4);
4350 
4351     // Load 2 words and process
4352     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4353     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4354     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4355     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4356     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4357     __ st1(newElem,   __ T2S, Address(newArrCur));
4358     __ b(ShiftTwoLoop);
4359 
4360     __ BIND(ShiftThree);
4361     __ tbz(idx, 1, ShiftOne);
4362     __ tbz(idx, 0, ShiftTwo);
4363     __ ldrw(r10,  Address(oldArr, 12));
4364     __ ldrw(r11,  Address(oldArr, 8));
4365     __ lsrvw(r10, r10, shiftCount);
4366     __ lslvw(r11, r11, shiftRevCount);
4367     __ orrw(r12,  r10, r11);
4368     __ strw(r12,  Address(newArr, 8));
4369 
4370     __ BIND(ShiftTwo);
4371     __ ldrw(r10,  Address(oldArr, 8));
4372     __ ldrw(r11,  Address(oldArr, 4));
4373     __ lsrvw(r10, r10, shiftCount);
4374     __ lslvw(r11, r11, shiftRevCount);
4375     __ orrw(r12,  r10, r11);
4376     __ strw(r12,  Address(newArr, 4));
4377 
4378     __ BIND(ShiftOne);
4379     __ ldrw(r10,  Address(oldArr, 4));
4380     __ ldrw(r11,  Address(oldArr));
4381     __ lsrvw(r10, r10, shiftCount);
4382     __ lslvw(r11, r11, shiftRevCount);
4383     __ orrw(r12,  r10, r11);
4384     __ strw(r12,  Address(newArr));
4385 
4386     __ BIND(Exit);
4387     __ ret(lr);
4388 
4389     return start;
4390   }
4391 
4392   // Arguments:
4393   //
4394   // Input:
4395   //   c_rarg0   - newArr address
4396   //   c_rarg1   - oldArr address
4397   //   c_rarg2   - newIdx
4398   //   c_rarg3   - shiftCount
4399   //   c_rarg4   - numIter
4400   //
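     // Each destination word combines two adjacent source words, roughly:
     //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
     // for i in [0, numIter), with the words processed from the lowest index up.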
4401   address generate_bigIntegerLeftShift() {
4402     __ align(CodeEntryAlignment);
4403     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4404     address start = __ pc();
4405 
4406     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4407 
4408     Register newArr        = c_rarg0;
4409     Register oldArr        = c_rarg1;
4410     Register newIdx        = c_rarg2;
4411     Register shiftCount    = c_rarg3;
4412     Register numIter       = c_rarg4;
4413 
4414     Register shiftRevCount = rscratch1;
4415     Register oldArrNext    = rscratch2;
4416 
4417     FloatRegister oldElem0        = v0;
4418     FloatRegister oldElem1        = v1;
4419     FloatRegister newElem         = v2;
4420     FloatRegister shiftVCount     = v3;
4421     FloatRegister shiftVRevCount  = v4;
4422 
4423     __ cbz(numIter, Exit);
4424 
4425     __ add(oldArrNext, oldArr, 4);
4426     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4427 
4428     // right shift count
4429     __ movw(shiftRevCount, 32);
4430     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4431 
4432     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4433     __ cmp(numIter, (u1)4);
4434     __ br(Assembler::LT, ShiftThree);
4435 
4436     __ dup(shiftVCount,     __ T4S, shiftCount);
4437     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4438     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4439 
4440     __ BIND(ShiftSIMDLoop);
4441 
4442     // load 4 words and process
4443     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4444     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4445     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4446     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4447     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4448     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4449     __ sub(numIter,   numIter, 4);
4450 
4451     __ cmp(numIter, (u1)4);
4452     __ br(Assembler::LT, ShiftTwoLoop);
4453     __ b(ShiftSIMDLoop);
4454 
4455     __ BIND(ShiftTwoLoop);
4456     __ cbz(numIter, Exit);
4457     __ cmp(numIter, (u1)1);
4458     __ br(Assembler::EQ, ShiftOne);
4459 
4460     // load 2 words and process
4461     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4462     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4463     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4464     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4465     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4466     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4467     __ sub(numIter,   numIter, 2);
4468     __ b(ShiftTwoLoop);
4469 
4470     __ BIND(ShiftThree);
4471     __ ldrw(r10,  __ post(oldArr, 4));
4472     __ ldrw(r11,  __ post(oldArrNext, 4));
4473     __ lslvw(r10, r10, shiftCount);
4474     __ lsrvw(r11, r11, shiftRevCount);
4475     __ orrw(r12,  r10, r11);
4476     __ strw(r12,  __ post(newArr, 4));
4477     __ tbz(numIter, 1, Exit);
4478     __ tbz(numIter, 0, ShiftOne);
4479 
4480     __ BIND(ShiftTwo);
4481     __ ldrw(r10,  __ post(oldArr, 4));
4482     __ ldrw(r11,  __ post(oldArrNext, 4));
4483     __ lslvw(r10, r10, shiftCount);
4484     __ lsrvw(r11, r11, shiftRevCount);
4485     __ orrw(r12,  r10, r11);
4486     __ strw(r12,  __ post(newArr, 4));
4487 
4488     __ BIND(ShiftOne);
4489     __ ldrw(r10,  Address(oldArr));
4490     __ ldrw(r11,  Address(oldArrNext));
4491     __ lslvw(r10, r10, shiftCount);
4492     __ lsrvw(r11, r11, shiftRevCount);
4493     __ orrw(r12,  r10, r11);
4494     __ strw(r12,  Address(newArr));
4495 
4496     __ BIND(Exit);
4497     __ ret(lr);
4498 
4499     return start;
4500   }
4501 
4502   address generate_has_negatives(address &has_negatives_long) {
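         // Returns 1 in r0 if any byte in ary1[0..len) has its sign bit set (i.e. is
         // negative as a signed byte) and 0 otherwise. A second entry point,
         // has_negatives_long, is exposed for the long (len > 15) path.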
4503     const u1 large_loop_size = 64;
4504     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4505     int dcache_line = VM_Version::dcache_line_size();
4506 
4507     Register ary1 = r1, len = r2, result = r0;
4508 
4509     __ align(CodeEntryAlignment);
4510 
4511     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4512 
4513     address entry = __ pc();
4514 
4515     __ enter();
4516 
4517   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4518         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4519 
4520   __ cmp(len, (u1)15);
4521   __ br(Assembler::GT, LEN_OVER_15);
4522   // The only case where execution falls into this code is when the pointer is
4523   // near the end of a memory page and we have to avoid reading past it
4524   __ add(ary1, ary1, len);
4525   __ subs(len, len, 8);
4526   __ br(Assembler::GT, LEN_OVER_8);
4527   __ ldr(rscratch2, Address(ary1, -8));
4528   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4529   __ lsrv(rscratch2, rscratch2, rscratch1);
4530   __ tst(rscratch2, UPPER_BIT_MASK);
4531   __ cset(result, Assembler::NE);
4532   __ leave();
4533   __ ret(lr);
4534   __ bind(LEN_OVER_8);
4535   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4536   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4537   __ tst(rscratch2, UPPER_BIT_MASK);
4538   __ br(Assembler::NE, RET_TRUE_NO_POP);
4539   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4540   __ lsrv(rscratch1, rscratch1, rscratch2);
4541   __ tst(rscratch1, UPPER_BIT_MASK);
4542   __ cset(result, Assembler::NE);
4543   __ leave();
4544   __ ret(lr);
4545 
4546   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4547   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4548 
4549   has_negatives_long = __ pc(); // 2nd entry point
4550 
4551   __ enter();
4552 
4553   __ bind(LEN_OVER_15);
4554     __ push(spilled_regs, sp);
4555     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4556     __ cbz(rscratch2, ALIGNED);
4557     __ ldp(tmp6, tmp1, Address(ary1));
4558     __ mov(tmp5, 16);
4559     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4560     __ add(ary1, ary1, rscratch1);
4561     __ sub(len, len, rscratch1);
4562     __ orr(tmp6, tmp6, tmp1);
4563     __ tst(tmp6, UPPER_BIT_MASK);
4564     __ br(Assembler::NE, RET_TRUE);
4565 
4566   __ bind(ALIGNED);
4567     __ cmp(len, large_loop_size);
4568     __ br(Assembler::LT, CHECK_16);
4569     // Perform a 16-byte load in the pre-loop as an early return, to handle the
4570     // case where an initially aligned large array has negative values in its
4571     // leading bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
4572     // worst case, which is slower. Cases with negative bytes further ahead are
4573     // barely affected; in fact they get faster due to the early loads and the
4574     // fewer instructions and branches in LARGE_LOOP.
4575     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4576     __ sub(len, len, 16);
4577     __ orr(tmp6, tmp6, tmp1);
4578     __ tst(tmp6, UPPER_BIT_MASK);
4579     __ br(Assembler::NE, RET_TRUE);
4580     __ cmp(len, large_loop_size);
4581     __ br(Assembler::LT, CHECK_16);
4582 
4583     if (SoftwarePrefetchHintDistance >= 0
4584         && SoftwarePrefetchHintDistance >= dcache_line) {
4585       // initial prefetch
4586       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4587     }
4588   __ bind(LARGE_LOOP);
4589     if (SoftwarePrefetchHintDistance >= 0) {
4590       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4591     }
4592     // Issue the load instructions first, since that can save a few CPU/memory
4593     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
4594     // per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses
4595     // fewer instructions and branches per iteration; the downside is that early
4596     // return is disabled, so all 64 bytes are loaded and checked every time.
4597     __ ldp(tmp2, tmp3, Address(ary1));
4598     __ ldp(tmp4, tmp5, Address(ary1, 16));
4599     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4600     __ ldp(tmp6, tmp1, Address(ary1, 48));
4601     __ add(ary1, ary1, large_loop_size);
4602     __ sub(len, len, large_loop_size);
4603     __ orr(tmp2, tmp2, tmp3);
4604     __ orr(tmp4, tmp4, tmp5);
4605     __ orr(rscratch1, rscratch1, rscratch2);
4606     __ orr(tmp6, tmp6, tmp1);
4607     __ orr(tmp2, tmp2, tmp4);
4608     __ orr(rscratch1, rscratch1, tmp6);
4609     __ orr(tmp2, tmp2, rscratch1);
4610     __ tst(tmp2, UPPER_BIT_MASK);
4611     __ br(Assembler::NE, RET_TRUE);
4612     __ cmp(len, large_loop_size);
4613     __ br(Assembler::GE, LARGE_LOOP);
4614 
4615   __ bind(CHECK_16); // small 16-byte load pre-loop
4616     __ cmp(len, (u1)16);
4617     __ br(Assembler::LT, POST_LOOP16);
4618 
4619   __ bind(LOOP16); // small 16-byte load loop
4620     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4621     __ sub(len, len, 16);
4622     __ orr(tmp2, tmp2, tmp3);
4623     __ tst(tmp2, UPPER_BIT_MASK);
4624     __ br(Assembler::NE, RET_TRUE);
4625     __ cmp(len, (u1)16);
4626     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4627 
4628   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4629     __ cmp(len, (u1)8);
4630     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4631     __ ldr(tmp3, Address(__ post(ary1, 8)));
4632     __ sub(len, len, 8);
4633     __ tst(tmp3, UPPER_BIT_MASK);
4634     __ br(Assembler::NE, RET_TRUE);
4635 
4636   __ bind(POST_LOOP16_LOAD_TAIL);
4637     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4638     __ ldr(tmp1, Address(ary1));
4639     __ mov(tmp2, 64);
4640     __ sub(tmp4, tmp2, len, __ LSL, 3);
4641     __ lslv(tmp1, tmp1, tmp4);
4642     __ tst(tmp1, UPPER_BIT_MASK);
4643     __ br(Assembler::NE, RET_TRUE);
4644     // Fallthrough
4645 
4646   __ bind(RET_FALSE);
4647     __ pop(spilled_regs, sp);
4648     __ leave();
4649     __ mov(result, zr);
4650     __ ret(lr);
4651 
4652   __ bind(RET_TRUE);
4653     __ pop(spilled_regs, sp);
4654   __ bind(RET_TRUE_NO_POP);
4655     __ leave();
4656     __ mov(result, 1);
4657     __ ret(lr);
4658 
4659   __ bind(DONE);
4660     __ pop(spilled_regs, sp);
4661     __ leave();
4662     __ ret(lr);
4663     return entry;
4664   }
4665 
4666   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4667         bool usePrefetch, Label &NOT_EQUAL) {
4668     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4669         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4670         tmp7 = r12, tmp8 = r13;
4671     Label LOOP;
4672 
4673     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4674     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4675     __ bind(LOOP);
4676     if (usePrefetch) {
4677       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4678       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4679     }
4680     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4681     __ eor(tmp1, tmp1, tmp2);
4682     __ eor(tmp3, tmp3, tmp4);
4683     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4684     __ orr(tmp1, tmp1, tmp3);
4685     __ cbnz(tmp1, NOT_EQUAL);
4686     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4687     __ eor(tmp5, tmp5, tmp6);
4688     __ eor(tmp7, tmp7, tmp8);
4689     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4690     __ orr(tmp5, tmp5, tmp7);
4691     __ cbnz(tmp5, NOT_EQUAL);
4692     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4693     __ eor(tmp1, tmp1, tmp2);
4694     __ eor(tmp3, tmp3, tmp4);
4695     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4696     __ orr(tmp1, tmp1, tmp3);
4697     __ cbnz(tmp1, NOT_EQUAL);
4698     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4699     __ eor(tmp5, tmp5, tmp6);
4700     __ sub(cnt1, cnt1, 8 * wordSize);
4701     __ eor(tmp7, tmp7, tmp8);
4702     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4703     // tmp6 is not used. MacroAssembler::subs is used here (rather than cmp)
4704     // because subs allows an unlimited range of immediate operands.
4705     __ subs(tmp6, cnt1, loopThreshold);
4706     __ orr(tmp5, tmp5, tmp7);
4707     __ cbnz(tmp5, NOT_EQUAL);
4708     __ br(__ GE, LOOP);
4709     // post-loop
4710     __ eor(tmp1, tmp1, tmp2);
4711     __ eor(tmp3, tmp3, tmp4);
4712     __ orr(tmp1, tmp1, tmp3);
4713     __ sub(cnt1, cnt1, 2 * wordSize);
4714     __ cbnz(tmp1, NOT_EQUAL);
4715   }
4716 
4717   void generate_large_array_equals_loop_simd(int loopThreshold,
4718         bool usePrefetch, Label &NOT_EQUAL) {
4719     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4720         tmp2 = rscratch2;
4721     Label LOOP;
4722 
4723     __ bind(LOOP);
4724     if (usePrefetch) {
4725       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4726       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4727     }
4728     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4729     __ sub(cnt1, cnt1, 8 * wordSize);
4730     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4731     __ subs(tmp1, cnt1, loopThreshold);
4732     __ eor(v0, __ T16B, v0, v4);
4733     __ eor(v1, __ T16B, v1, v5);
4734     __ eor(v2, __ T16B, v2, v6);
4735     __ eor(v3, __ T16B, v3, v7);
4736     __ orr(v0, __ T16B, v0, v1);
4737     __ orr(v1, __ T16B, v2, v3);
4738     __ orr(v0, __ T16B, v0, v1);
4739     __ umov(tmp1, v0, __ D, 0);
4740     __ umov(tmp2, v0, __ D, 1);
4741     __ orr(tmp1, tmp1, tmp2);
4742     __ cbnz(tmp1, NOT_EQUAL);
4743     __ br(__ GE, LOOP);
4744   }
4745 
4746   // a1 = r1 - array1 address
4747   // a2 = r2 - array2 address
4748   // result = r0 - return value. Already contains "false"
4749   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4750   // r3-r5 are reserved temporary registers
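     // The first 8 bytes of each array are loaded by the caller (hence the cnt1
     // adjustment below); result stays false on a mismatch and is set to true
     // once all remaining bytes of a1 and a2 compare equal.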
4751   address generate_large_array_equals() {
4752     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4753         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4754         tmp7 = r12, tmp8 = r13;
4755     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4756         SMALL_LOOP, POST_LOOP;
4757     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4758     // calculate if at least 32 prefetched bytes are used
4759     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4760     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4761     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4762     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4763         tmp5, tmp6, tmp7, tmp8);
4764 
4765     __ align(CodeEntryAlignment);
4766 
4767     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4768 
4769     address entry = __ pc();
4770     __ enter();
4771     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4772     // also advance pointers to use post-increment instead of pre-increment
4773     __ add(a1, a1, wordSize);
4774     __ add(a2, a2, wordSize);
4775     if (AvoidUnalignedAccesses) {
4776       // Both implementations (SIMD and non-SIMD) use relatively wide load
4777       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
4778       // time) on some CPUs when the address is not at least 16-byte aligned.
4779       // Arrays are currently 8-byte aligned, so do one extra 8-byte load (and
4780       // compare) if needed to make at least the first address 16-byte aligned.
4781       Label ALIGNED16;
4782       __ tbz(a1, 3, ALIGNED16);
4783       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4784       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4785       __ sub(cnt1, cnt1, wordSize);
4786       __ eor(tmp1, tmp1, tmp2);
4787       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4788       __ bind(ALIGNED16);
4789     }
4790     if (UseSIMDForArrayEquals) {
4791       if (SoftwarePrefetchHintDistance >= 0) {
4792         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4793         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4794         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4795             /* prfm = */ true, NOT_EQUAL);
4796         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4797         __ br(__ LT, TAIL);
4798       }
4799       __ bind(NO_PREFETCH_LARGE_LOOP);
4800       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4801           /* prfm = */ false, NOT_EQUAL);
4802     } else {
4803       __ push(spilled_regs, sp);
4804       if (SoftwarePrefetchHintDistance >= 0) {
4805         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4806         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4807         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4808             /* prfm = */ true, NOT_EQUAL);
4809         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4810         __ br(__ LT, TAIL);
4811       }
4812       __ bind(NO_PREFETCH_LARGE_LOOP);
4813       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4814           /* prfm = */ false, NOT_EQUAL);
4815     }
4816     __ bind(TAIL);
4817       __ cbz(cnt1, EQUAL);
4818       __ subs(cnt1, cnt1, wordSize);
4819       __ br(__ LE, POST_LOOP);
4820     __ bind(SMALL_LOOP);
4821       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4822       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4823       __ subs(cnt1, cnt1, wordSize);
4824       __ eor(tmp1, tmp1, tmp2);
4825       __ cbnz(tmp1, NOT_EQUAL);
4826       __ br(__ GT, SMALL_LOOP);
4827     __ bind(POST_LOOP);
4828       __ ldr(tmp1, Address(a1, cnt1));
4829       __ ldr(tmp2, Address(a2, cnt1));
4830       __ eor(tmp1, tmp1, tmp2);
4831       __ cbnz(tmp1, NOT_EQUAL);
4832     __ bind(EQUAL);
4833       __ mov(result, true);
4834     __ bind(NOT_EQUAL);
4835       if (!UseSIMDForArrayEquals) {
4836         __ pop(spilled_regs, sp);
4837       }
4838     __ bind(NOT_EQUAL_NO_POP);
4839     __ leave();
4840     __ ret(lr);
4841     return entry;
4842   }
4843 
4844   address generate_dsin_dcos(bool isCos) {
4845     __ align(CodeEntryAlignment);
4846     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4847     address start = __ pc();
4848     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4849         (address)StubRoutines::aarch64::_two_over_pi,
4850         (address)StubRoutines::aarch64::_pio2,
4851         (address)StubRoutines::aarch64::_dsin_coef,
4852         (address)StubRoutines::aarch64::_dcos_coef);
4853     return start;
4854   }
4855 
4856   address generate_dlog() {
4857     __ align(CodeEntryAlignment);
4858     StubCodeMark mark(this, "StubRoutines", "dlog");
4859     address entry = __ pc();
4860     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4861         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4862     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4863     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4864         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4865     return entry;
4866   }
4867 
4868 
4869   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
4870   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4871       Label &DIFF2) {
4872     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4873     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
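         // Caller conventions assumed here: tmp2 points into the Latin1 string, cnt1
         // points into the UTF-16 string, and vtmpZ is zero so that zip1/zip2 widen
         // Latin1 bytes to 16-bit characters.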
4874 
4875     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4876     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4877     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4878     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4879 
4880     __ fmovd(tmpL, vtmp3);
4881     __ eor(rscratch2, tmp3, tmpL);
4882     __ cbnz(rscratch2, DIFF2);
4883 
4884     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4885     __ umov(tmpL, vtmp3, __ D, 1);
4886     __ eor(rscratch2, tmpU, tmpL);
4887     __ cbnz(rscratch2, DIFF1);
4888 
4889     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4890     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4891     __ fmovd(tmpL, vtmp);
4892     __ eor(rscratch2, tmp3, tmpL);
4893     __ cbnz(rscratch2, DIFF2);
4894 
4895     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4896     __ umov(tmpL, vtmp, __ D, 1);
4897     __ eor(rscratch2, tmpU, tmpL);
4898     __ cbnz(rscratch2, DIFF1);
4899   }
4900 
4901   // r0  = result
4902   // r1  = str1
4903   // r2  = cnt1
4904   // r3  = str2
4905   // r4  = cnt2
4906   // r10 = tmp1
4907   // r11 = tmp2
4908   address generate_compare_long_string_different_encoding(bool isLU) {
4909     __ align(CodeEntryAlignment);
4910     StubCodeMark mark(this, "StubRoutines", isLU
4911         ? "compare_long_string_different_encoding LU"
4912         : "compare_long_string_different_encoding UL");
4913     address entry = __ pc();
4914     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4915         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4916         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4917     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4918         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4919     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4920     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4921 
4922     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4923 
4924     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4925     // cnt2 == amount of characters left to compare
4926     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4927     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4928     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4929     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4930     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4931     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
4932     __ eor(rscratch2, tmp1, tmp2);
4933     __ mov(rscratch1, tmp2);
4934     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4935     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4936              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4937     __ push(spilled_regs, sp);
4938     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4939     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4940 
4941     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4942 
4943     if (SoftwarePrefetchHintDistance >= 0) {
4944       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4945       __ br(__ LT, NO_PREFETCH);
4946       __ bind(LARGE_LOOP_PREFETCH);
4947         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4948         __ mov(tmp4, 2);
4949         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4950         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4951           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4952           __ subs(tmp4, tmp4, 1);
4953           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4954           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4955           __ mov(tmp4, 2);
4956         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4957           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4958           __ subs(tmp4, tmp4, 1);
4959           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4960           __ sub(cnt2, cnt2, 64);
4961           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4962           __ br(__ GE, LARGE_LOOP_PREFETCH);
4963     }
4964     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4965     __ bind(NO_PREFETCH);
4966     __ subs(cnt2, cnt2, 16);
4967     __ br(__ LT, TAIL);
4968     __ align(OptoLoopAlignment);
4969     __ bind(SMALL_LOOP); // smaller loop
4970       __ subs(cnt2, cnt2, 16);
4971       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4972       __ br(__ GE, SMALL_LOOP);
4973       __ cmn(cnt2, (u1)16);
4974       __ br(__ EQ, LOAD_LAST);
4975     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4976       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4977       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4978       __ ldr(tmp3, Address(cnt1, -8));
4979       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4980       __ b(LOAD_LAST);
4981     __ bind(DIFF2);
4982       __ mov(tmpU, tmp3);
4983     __ bind(DIFF1);
4984       __ pop(spilled_regs, sp);
4985       __ b(CALCULATE_DIFFERENCE);
4986     __ bind(LOAD_LAST);
4987       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
4988       // compare_string_16_x_LU, so there is no need to load them again.
4989       __ mov(tmpU, tmp3);
4990       __ pop(spilled_regs, sp);
4991 
4992       // tmp2 points to the address of the last 4 Latin1 characters right now
4993       __ ldrs(vtmp, Address(tmp2));
4994       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4995       __ fmovd(tmpL, vtmp);
4996 
4997       __ eor(rscratch2, tmpU, tmpL);
4998       __ cbz(rscratch2, DONE);
4999 
5000     // Find the first different characters in the longwords and
5001     // compute their difference.
5002     __ bind(CALCULATE_DIFFERENCE);
5003       __ rev(rscratch2, rscratch2);
5004       __ clz(rscratch2, rscratch2);
5005       __ andr(rscratch2, rscratch2, -16);
5006       __ lsrv(tmp1, tmp1, rscratch2);
5007       __ uxthw(tmp1, tmp1);
5008       __ lsrv(rscratch1, rscratch1, rscratch2);
5009       __ uxthw(rscratch1, rscratch1);
5010       __ subw(result, tmp1, rscratch1);
5011     __ bind(DONE);
5012       __ ret(lr);
5013     return entry;
5014   }
5015 
5016   address generate_method_entry_barrier() {
5017     __ align(CodeEntryAlignment);
5018     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5019 
5020     Label deoptimize_label;
5021 
5022     address start = __ pc();
5023 
5024     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5025 
5026     __ enter();
5027     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5028 
5029     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5030 
5031     __ push_call_clobbered_registers();
5032 
5033     __ mov(c_rarg0, rscratch2);
5034     __ call_VM_leaf
5035          (CAST_FROM_FN_PTR
5036           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5037 
5038     __ reset_last_Java_frame(true);
5039 
5040     __ mov(rscratch1, r0);
5041 
5042     __ pop_call_clobbered_registers();
5043 
5044     __ cbnz(rscratch1, deoptimize_label);
5045 
5046     __ leave();
5047     __ ret(lr);
5048 
5049     __ BIND(deoptimize_label);
5050 
5051     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5052     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5053 
5054     __ mov(sp, rscratch1);
5055     __ br(rscratch2);
5056 
5057     return start;
5058   }
5059 
5060   enum string_compare_mode {
5061     LL,
5062     LU,
5063     UL,
5064     UU,
5065   };
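     // The first letter names the encoding of str1 and the second that of str2
     // (L = Latin1, one byte per character; U = UTF-16, two bytes per character).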
5066 
5067   // The following registers are declared in aarch64.ad
5068   // r0  = result
5069   // r1  = str1
5070   // r2  = cnt1
5071   // r3  = str2
5072   // r4  = cnt2
5073   // r10 = tmp1
5074   // r11 = tmp2
5075   // z0  = ztmp1
5076   // z1  = ztmp2
5077   // p0  = pgtmp1
5078   // p1  = pgtmp2
5079   address generate_compare_long_string_sve(string_compare_mode mode) {
5080     __ align(CodeEntryAlignment);
5081     address entry = __ pc();
5082     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5083              tmp1 = r10, tmp2 = r11;
5084 
5085     Label LOOP, MATCH, DONE, NOMATCH;
5086     Register vec_len = tmp1;
5087     Register idx = tmp2;
5088     // The minimum of the string lengths has been stored in cnt2.
5089     Register cnt = cnt2;
5090     FloatRegister ztmp1 = z0, ztmp2 = z1;
5091     PRegister pgtmp1 = p0, pgtmp2 = p1;
5092 
5093     if (mode == LL) {
5094       __ sve_cntb(vec_len);
5095     } else {
5096       __ sve_cnth(vec_len);
5097     }
5098 
5099     __ mov(idx, 0);
5100     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5101 
5102     __ bind(LOOP);
5103       switch (mode) {
5104         case LL:
5105           __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));
5106           __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));
5107           break;
5108         case LU:
5109           __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));
5110           __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1)));
5111           break;
5112         case UL:
5113           __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1)));
5114           __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));
5115           break;
5116         case UU:
5117           __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1)));
5118           __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1)));
5119           break;
5120         default: ShouldNotReachHere();
5121       }
5122       __ add(idx, idx, vec_len);
5123 
5124       // Compare strings.
5125       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5126       __ br(__ NE, MATCH);
5127       __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5128       __ br(__ LT, LOOP);
5129 
5130       // The result has been computed in the caller prior to entering this stub.
5131       __ b(DONE);
5132 
5133     __ bind(MATCH);
5134 
5135       // Crop the vector to find its location.
5136       __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5137 
5138       // Extract the first different characters of each string.
5139       __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5140       __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5141 
5142       // Compute the difference of the first different characters.
5143       __ sub(result, rscratch1, rscratch2);
5144 
5145     __ bind(DONE);
5146       __ ret(lr);
5147 
5148     return entry;
5149   }
5150 
5151   // r0  = result
5152   // r1  = str1
5153   // r2  = cnt1
5154   // r3  = str2
5155   // r4  = cnt2
5156   // r10 = tmp1
5157   // r11 = tmp2
5158   address generate_compare_long_string_same_encoding(bool isLL) {
5159     __ align(CodeEntryAlignment);
5160     StubCodeMark mark(this, "StubRoutines", isLL
5161         ? "compare_long_string_same_encoding LL"
5162         : "compare_long_string_same_encoding UU");
5163     address entry = __ pc();
5164     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5165         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5166 
5167     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5168 
5169     // Exit the large loop when there are fewer than 64 bytes left to read or we
5170     // are about to prefetch memory past the end of the array
5171     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
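         // cnt2 is measured in characters (bytes for LL, 16-bit units for UU), so the
         // byte-based threshold above is halved for UTF-16.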
5172 
5173     // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly
5174     __ eor(rscratch2, tmp1, tmp2);
5175     __ cbnz(rscratch2, CAL_DIFFERENCE);
5176 
5177     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5178     // update pointers, because of previous read
5179     __ add(str1, str1, wordSize);
5180     __ add(str2, str2, wordSize);
5181     if (SoftwarePrefetchHintDistance >= 0) {
5182       __ bind(LARGE_LOOP_PREFETCH);
5183         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5184         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5185 
5186         __ align(OptoLoopAlignment);
5187         for (int i = 0; i < 4; i++) {
5188           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5189           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5190           __ cmp(tmp1, tmp2);
5191           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5192           __ br(Assembler::NE, DIFF);
5193         }
5194         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5195         __ add(str1, str1, 64);
5196         __ add(str2, str2, 64);
5197         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5198         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5199         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5200     }
5201 
5202     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5203     __ br(Assembler::LE, LESS16);
5204     __ align(OptoLoopAlignment);
5205     __ bind(LOOP_COMPARE16);
5206       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5207       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5208       __ cmp(tmp1, tmp2);
5209       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5210       __ br(Assembler::NE, DIFF);
5211       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5212       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5213       __ br(Assembler::LT, LESS16);
5214 
5215       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5216       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5217       __ cmp(tmp1, tmp2);
5218       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5219       __ br(Assembler::NE, DIFF);
5220       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5221       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5222       __ br(Assembler::GE, LOOP_COMPARE16);
5223       __ cbz(cnt2, LENGTH_DIFF);
5224 
5225     __ bind(LESS16);
5226       // compare one more 8-byte word if enough data remains
5227       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5228       __ br(Assembler::LE, LESS8);
5229       __ ldr(tmp1, Address(__ post(str1, 8)));
5230       __ ldr(tmp2, Address(__ post(str2, 8)));
5231       __ eor(rscratch2, tmp1, tmp2);
5232       __ cbnz(rscratch2, CAL_DIFFERENCE);
5233       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5234 
5235     __ bind(LESS8); // directly load last 8 bytes
5236       if (!isLL) {
5237         __ add(cnt2, cnt2, cnt2);
5238       }
5239       __ ldr(tmp1, Address(str1, cnt2));
5240       __ ldr(tmp2, Address(str2, cnt2));
5241       __ eor(rscratch2, tmp1, tmp2);
5242       __ cbz(rscratch2, LENGTH_DIFF);
5243       __ b(CAL_DIFFERENCE);
5244 
5245     __ bind(DIFF);
5246       __ cmp(tmp1, tmp2);
5247       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5248       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5249       // reuse rscratch2 register for the result of eor instruction
5250       __ eor(rscratch2, tmp1, tmp2);
5251 
5252     __ bind(CAL_DIFFERENCE);
5253       __ rev(rscratch2, rscratch2);
5254       __ clz(rscratch2, rscratch2);
5255       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5256       __ lsrv(tmp1, tmp1, rscratch2);
5257       __ lsrv(tmp2, tmp2, rscratch2);
5258       if (isLL) {
5259         __ uxtbw(tmp1, tmp1);
5260         __ uxtbw(tmp2, tmp2);
5261       } else {
5262         __ uxthw(tmp1, tmp1);
5263         __ uxthw(tmp2, tmp2);
5264       }
5265       __ subw(result, tmp1, tmp2);
5266 
5267     __ bind(LENGTH_DIFF);
5268       __ ret(lr);
5269     return entry;
5270   }
5271 
5272   void generate_compare_long_strings() {
5273     if (UseSVE == 0) {
5274       StubRoutines::aarch64::_compare_long_string_LL
5275           = generate_compare_long_string_same_encoding(true);
5276       StubRoutines::aarch64::_compare_long_string_UU
5277           = generate_compare_long_string_same_encoding(false);
5278       StubRoutines::aarch64::_compare_long_string_LU
5279           = generate_compare_long_string_different_encoding(true);
5280       StubRoutines::aarch64::_compare_long_string_UL
5281           = generate_compare_long_string_different_encoding(false);
5282     } else {
5283       StubRoutines::aarch64::_compare_long_string_LL
5284           = generate_compare_long_string_sve(LL);
5285       StubRoutines::aarch64::_compare_long_string_UU
5286           = generate_compare_long_string_sve(UU);
5287       StubRoutines::aarch64::_compare_long_string_LU
5288           = generate_compare_long_string_sve(LU);
5289       StubRoutines::aarch64::_compare_long_string_UL
5290           = generate_compare_long_string_sve(UL);
5291     }
5292   }
5293 
5294   // R0 = result
5295   // R1 = str2
5296   // R2 = cnt1
5297   // R3 = str1
5298   // R4 = cnt2
5299   // This generic linear code uses a few additional ideas that make it faster:
5300   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5301   // in order to skip the initial load (helps on systems with 1 load pipeline)
5302   // 2) we can use the "fast" algorithm for finding a single character to search for
5303   // the first symbol with fewer branches (1 branch per loaded register instead
5304   // of a branch per symbol), which is where constants like
5305   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5306   // 3) after loading and analyzing the 1st register of the source string, it can
5307   // be reused to search for every occurrence of the 1st character, saving a few
5308   // loads compared with a simpler-but-slower implementation
5309   // 4) in order to avoid lots of push/pop operations, the code below heavily
5310   // re-uses/re-initializes/compresses register values, which makes the code
5311   // larger and a bit less readable; however, most of the extra operations are
5312   // issued during loads or branches, so the penalty is minimal
5313   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5314     const char* stubName = str1_isL
5315         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5316         : "indexof_linear_uu";
5317     __ align(CodeEntryAlignment);
5318     StubCodeMark mark(this, "StubRoutines", stubName);
5319     address entry = __ pc();
5320 
5321     int str1_chr_size = str1_isL ? 1 : 2;
5322     int str2_chr_size = str2_isL ? 1 : 2;
5323     int str1_chr_shift = str1_isL ? 0 : 1;
5324     int str2_chr_shift = str2_isL ? 0 : 1;
5325     bool isL = str1_isL && str2_isL;
5326     // parameters
5327     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5328     // temporary registers
5329     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5330     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5331     // redefinitions
5332     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5333 
5334     __ push(spilled_regs, sp);
5335     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5336         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5337         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5338         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5339         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5340         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5341     // Read whole register from str1. It is safe, because length >=8 here
5342     __ ldr(ch1, Address(str1));
5343     // Read whole register from str2. It is safe, because length >=8 here
5344     __ ldr(ch2, Address(str2));
5345     __ sub(cnt2, cnt2, cnt1);
5346     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5347     if (str1_isL != str2_isL) {
5348       __ eor(v0, __ T16B, v0, v0);
5349     }
5350     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5351     __ mul(first, first, tmp1);
5352     // check if we have less than 1 register to check
5353     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5354     if (str1_isL != str2_isL) {
5355       __ fmovd(v1, ch1);
5356     }
5357     __ br(__ LE, L_SMALL);
5358     __ eor(ch2, first, ch2);
5359     if (str1_isL != str2_isL) {
5360       __ zip1(v1, __ T16B, v1, v0);
5361     }
5362     __ sub(tmp2, ch2, tmp1);
5363     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5364     __ bics(tmp2, tmp2, ch2);
5365     if (str1_isL != str2_isL) {
5366       __ fmovd(ch1, v1);
5367     }
5368     __ br(__ NE, L_HAS_ZERO);
5369     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5370     __ add(result, result, wordSize/str2_chr_size);
5371     __ add(str2, str2, wordSize);
5372     __ br(__ LT, L_POST_LOOP);
5373     __ BIND(L_LOOP);
5374       __ ldr(ch2, Address(str2));
5375       __ eor(ch2, first, ch2);
5376       __ sub(tmp2, ch2, tmp1);
5377       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5378       __ bics(tmp2, tmp2, ch2);
5379       __ br(__ NE, L_HAS_ZERO);
5380     __ BIND(L_LOOP_PROCEED);
5381       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5382       __ add(str2, str2, wordSize);
5383       __ add(result, result, wordSize/str2_chr_size);
5384       __ br(__ GE, L_LOOP);
5385     __ BIND(L_POST_LOOP);
5386       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5387       __ br(__ LE, NOMATCH);
5388       __ ldr(ch2, Address(str2));
5389       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5390       __ eor(ch2, first, ch2);
5391       __ sub(tmp2, ch2, tmp1);
5392       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5393       __ mov(tmp4, -1); // all bits set
5394       __ b(L_SMALL_PROCEED);
5395     __ align(OptoLoopAlignment);
5396     __ BIND(L_SMALL);
5397       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5398       __ eor(ch2, first, ch2);
5399       if (str1_isL != str2_isL) {
5400         __ zip1(v1, __ T16B, v1, v0);
5401       }
5402       __ sub(tmp2, ch2, tmp1);
5403       __ mov(tmp4, -1); // all bits set
5404       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5405       if (str1_isL != str2_isL) {
5406         __ fmovd(ch1, v1); // move converted 4 symbols
5407       }
5408     __ BIND(L_SMALL_PROCEED);
5409       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5410       __ bic(tmp2, tmp2, ch2);
5411       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5412       __ rbit(tmp2, tmp2);
5413       __ br(__ EQ, NOMATCH);
5414     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5415       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5416       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5417       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5418       if (str2_isL) { // LL
5419         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5420         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5421         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5422         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5423         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5424       } else {
5425         __ mov(ch2, 0xE); // all bits in byte set except last one
5426         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5427         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5428         __ lslv(tmp2, tmp2, tmp4);
5429         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5430         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5431         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5432         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5433       }
5434       __ cmp(ch1, ch2);
5435       __ mov(tmp4, wordSize/str2_chr_size);
5436       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5437     __ BIND(L_SMALL_CMP_LOOP);
5438       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5439                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5440       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5441                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5442       __ add(tmp4, tmp4, 1);
5443       __ cmp(tmp4, cnt1);
5444       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5445       __ cmp(first, ch2);
5446       __ br(__ EQ, L_SMALL_CMP_LOOP);
5447     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5448       __ cbz(tmp2, NOMATCH); // no more matches. exit
5449       __ clz(tmp4, tmp2);
5450       __ add(result, result, 1); // advance index
5451       __ add(str2, str2, str2_chr_size); // advance pointer
5452       __ b(L_SMALL_HAS_ZERO_LOOP);
5453     __ align(OptoLoopAlignment);
5454     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5455       __ cmp(first, ch2);
5456       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5457       __ b(DONE);
5458     __ align(OptoLoopAlignment);
5459     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5460       if (str2_isL) { // LL
5461         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5462         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5463         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5464         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5465         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5466       } else {
5467         __ mov(ch2, 0xE); // all bits in byte set except last one
5468         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5469         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5470         __ lslv(tmp2, tmp2, tmp4);
5471         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5472         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5473         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5474         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5475       }
5476       __ cmp(ch1, ch2);
5477       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5478       __ b(DONE);
5479     __ align(OptoLoopAlignment);
5480     __ BIND(L_HAS_ZERO);
5481       __ rbit(tmp2, tmp2);
5482       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
5483       // Now compress the counters (cnt2 and cnt1) into one register. This is fine
5484       // because both counters are 32-bit and are not changed in this loop; they
5485       // are just restored on exit, so cnt1 can be re-used in this loop.
5486       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5487       __ sub(result, result, 1);
5488     __ BIND(L_HAS_ZERO_LOOP);
5489       __ mov(cnt1, wordSize/str2_chr_size);
5490       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5491       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5492       if (str2_isL) {
5493         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5494         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5495         __ lslv(tmp2, tmp2, tmp4);
5496         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5497         __ add(tmp4, tmp4, 1);
5498         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5499         __ lsl(tmp2, tmp2, 1);
5500         __ mov(tmp4, wordSize/str2_chr_size);
5501       } else {
5502         __ mov(ch2, 0xE);
5503         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5504         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5505         __ lslv(tmp2, tmp2, tmp4);
5506         __ add(tmp4, tmp4, 1);
5507         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5508         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5509         __ lsl(tmp2, tmp2, 1);
5510         __ mov(tmp4, wordSize/str2_chr_size);
5511         __ sub(str2, str2, str2_chr_size);
5512       }
5513       __ cmp(ch1, ch2);
5514       __ mov(tmp4, wordSize/str2_chr_size);
5515       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5516     __ BIND(L_CMP_LOOP);
5517       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5518                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5519       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5520                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5521       __ add(tmp4, tmp4, 1);
5522       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5523       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5524       __ cmp(cnt1, ch2);
5525       __ br(__ EQ, L_CMP_LOOP);
5526     __ BIND(L_CMP_LOOP_NOMATCH);
5527       // here we're not matched
5528       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5529       __ clz(tmp4, tmp2);
5530       __ add(str2, str2, str2_chr_size); // advance pointer
5531       __ b(L_HAS_ZERO_LOOP);
5532     __ align(OptoLoopAlignment);
5533     __ BIND(L_CMP_LOOP_LAST_CMP);
5534       __ cmp(cnt1, ch2);
5535       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5536       __ b(DONE);
5537     __ align(OptoLoopAlignment);
5538     __ BIND(L_CMP_LOOP_LAST_CMP2);
5539       if (str2_isL) {
5540         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5541         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5542         __ lslv(tmp2, tmp2, tmp4);
5543         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5544         __ add(tmp4, tmp4, 1);
5545         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5546         __ lsl(tmp2, tmp2, 1);
5547       } else {
5548         __ mov(ch2, 0xE);
5549         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5550         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5551         __ lslv(tmp2, tmp2, tmp4);
5552         __ add(tmp4, tmp4, 1);
5553         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5554         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5555         __ lsl(tmp2, tmp2, 1);
5556         __ sub(str2, str2, str2_chr_size);
5557       }
5558       __ cmp(ch1, ch2);
5559       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5560       __ b(DONE);
5561     __ align(OptoLoopAlignment);
5562     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5563       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
5564       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5565       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5566       // respective higher bits were not changed. L_LOOP_PROCEED will increase
5567       // result by the number of analyzed characters, so we can just reset the
5568       // lower bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
5569       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5570       // 3) Advance str2 to the next str2 octet. result & 7 (or 3) is the index of
5571       // the last analyzed substring inside the current octet, so str2 is at the
5572       // respective start address; we need to advance it to the next octet.
5573       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5574       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5575       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5576       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5577       __ movw(cnt2, cnt2);
5578       __ b(L_LOOP_PROCEED);
5579     __ align(OptoLoopAlignment);
5580     __ BIND(NOMATCH);
5581       __ mov(result, -1);
5582     __ BIND(DONE);
5583       __ pop(spilled_regs, sp);
5584       __ ret(lr);
5585     return entry;
5586   }
5587 
5588   void generate_string_indexof_stubs() {
5589     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5590     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5591     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5592   }
5593 
5594   void inflate_and_store_2_fp_registers(bool generatePrfm,
5595       FloatRegister src1, FloatRegister src2) {
5596     Register dst = r1;
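         // v0 is expected to be zero (set up by the caller); interleaving each source
         // byte with a zero byte via zip1/zip2 inflates Latin-1 bytes into 16-bit
         // characters.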
5597     __ zip1(v1, __ T16B, src1, v0);
5598     __ zip2(v2, __ T16B, src1, v0);
5599     if (generatePrfm) {
5600       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5601     }
5602     __ zip1(v3, __ T16B, src2, v0);
5603     __ zip2(v4, __ T16B, src2, v0);
5604     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5605   }
5606 
5607   // R0 = src
5608   // R1 = dst
5609   // R2 = len
5610   // R3 = len >> 3
5611   // V0 = 0
5612   // v1 = loaded 8 bytes
5613   address generate_large_byte_array_inflate() {
5614     __ align(CodeEntryAlignment);
5615     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5616     address entry = __ pc();
5617     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5618     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5619     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5620 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can cover the first
    // 32 output bytes.
5623     __ ldrd(v2, __ post(src, 8));
5624     __ sub(octetCounter, octetCounter, 2);
5625     __ zip1(v1, __ T16B, v1, v0);
5626     __ zip1(v2, __ T16B, v2, v0);
5627     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5628     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5629     __ subs(rscratch1, octetCounter, large_loop_threshold);
5630     __ br(__ LE, LOOP_START);
5631     __ b(LOOP_PRFM_START);
5632     __ bind(LOOP_PRFM);
5633       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5634     __ bind(LOOP_PRFM_START);
5635       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5636       __ sub(octetCounter, octetCounter, 8);
5637       __ subs(rscratch1, octetCounter, large_loop_threshold);
5638       inflate_and_store_2_fp_registers(true, v3, v4);
5639       inflate_and_store_2_fp_registers(true, v5, v6);
5640       __ br(__ GT, LOOP_PRFM);
5641       __ cmp(octetCounter, (u1)8);
5642       __ br(__ LT, DONE);
5643     __ bind(LOOP);
5644       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5645       __ bind(LOOP_START);
5646       __ sub(octetCounter, octetCounter, 8);
5647       __ cmp(octetCounter, (u1)8);
5648       inflate_and_store_2_fp_registers(false, v3, v4);
5649       inflate_and_store_2_fp_registers(false, v5, v6);
5650       __ br(__ GE, LOOP);
5651     __ bind(DONE);
5652       __ ret(lr);
5653     return entry;
5654   }
5655 
5656   /**
5657    *  Arguments:
5658    *
5659    *  Input:
5660    *  c_rarg0   - current state address
5661    *  c_rarg1   - H key address
5662    *  c_rarg2   - data address
5663    *  c_rarg3   - number of blocks
5664    *
5665    *  Output:
5666    *  Updated state at c_rarg0
5667    */
5668   address generate_ghash_processBlocks() {
5669     // Bafflingly, GCM uses little-endian for the byte order, but
5670     // big-endian for the bit order.  For example, the polynomial 1 is
5671     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5672     //
5673     // So, we must either reverse the bytes in each word and do
5674     // everything big-endian or reverse the bits in each byte and do
5675     // it little-endian.  On AArch64 it's more idiomatic to reverse
5676     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
5678     // calculation, bit-reversing the inputs and outputs.
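    //
    // In C, approximately (a rough sketch of what one call computes; H is the
    // hash subkey, p the field polynomial, and "*" denotes carry-less
    // multiplication in GF(2^128) followed by reduction mod p):
    //
    //   for (int i = 0; i < blocks; i++) {
    //     state = (state ^ data[i]) * H;   // GHASH update per 16-byte block
    //   }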
5679 
5680     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5681     __ align(wordSize * 2);
5682     address p = __ pc();
5683     __ emit_int64(0x87);  // The low-order bits of the field
5684                           // polynomial (i.e. p = z^7+z^2+z+1)
5685                           // repeated in the low and high parts of a
5686                           // 128-bit vector
5687     __ emit_int64(0x87);
5688 
5689     __ align(CodeEntryAlignment);
5690     address start = __ pc();
5691 
5692     Register state   = c_rarg0;
5693     Register subkeyH = c_rarg1;
5694     Register data    = c_rarg2;
5695     Register blocks  = c_rarg3;
5696 
5697     FloatRegister vzr = v30;
5698     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5699 
5700     __ ldrq(v24, p);    // The field polynomial
5701 
5702     __ ldrq(v0, Address(state));
5703     __ ldrq(v1, Address(subkeyH));
5704 
5705     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5706     __ rbit(v0, __ T16B, v0);
5707     __ rev64(v1, __ T16B, v1);
5708     __ rbit(v1, __ T16B, v1);
5709 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
5711     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5712 
5713     {
5714       Label L_ghash_loop;
5715       __ bind(L_ghash_loop);
5716 
5717       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5718                                                  // reversing each byte
5719       __ rbit(v2, __ T16B, v2);
5720       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5721 
5722       // Multiply state in v2 by subkey in v1
5723       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5724                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5725                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5726       // Reduce v7:v5 by the field polynomial
5727       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5728 
5729       __ sub(blocks, blocks, 1);
5730       __ cbnz(blocks, L_ghash_loop);
5731     }
5732 
5733     // The bit-reversed result is at this point in v0
5734     __ rev64(v0, __ T16B, v0);
5735     __ rbit(v0, __ T16B, v0);
5736 
5737     __ st1(v0, __ T16B, state);
5738     __ ret(lr);
5739 
5740     return start;
5741   }
5742 
5743   address generate_ghash_processBlocks_wide() {
5744     address small = generate_ghash_processBlocks();
5745 
5746     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5747     __ align(wordSize * 2);
5748     address p = __ pc();
5749     __ emit_int64(0x87);  // The low-order bits of the field
5750                           // polynomial (i.e. p = z^7+z^2+z+1)
5751                           // repeated in the low and high parts of a
5752                           // 128-bit vector
5753     __ emit_int64(0x87);
5754 
5755     __ align(CodeEntryAlignment);
5756     address start = __ pc();
5757 
5758     Register state   = c_rarg0;
5759     Register subkeyH = c_rarg1;
5760     Register data    = c_rarg2;
5761     Register blocks  = c_rarg3;
5762 
5763     const int unroll = 4;
5764 
5765     __ cmp(blocks, (unsigned char)(unroll * 2));
5766     __ br(__ LT, small);
5767 
5768     if (unroll > 1) {
      // v8..v15 are callee-saved (in their low 64 bits) under the AAPCS64,
      // so save them before the wide routine clobbers them.
5770       __ sub(sp, sp, 4 * 16);
5771       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5772       __ sub(sp, sp, 4 * 16);
5773       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5774     }
5775 
5776     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5777 
5778     if (unroll > 1) {
5779       // And restore state
5780       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5781       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5782     }
5783 
5784     __ cmp(blocks, (unsigned char)0);
5785     __ br(__ GT, small);
5786 
5787     __ ret(lr);
5788 
5789     return start;
5790   }
5791 
5792   void generate_base64_encode_simdround(Register src, Register dst,
5793         FloatRegister codec, u8 size) {
5794 
5795     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5796     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5797     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5798 
5799     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5800 
5801     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5802 
5803     __ ushr(ind0, arrangement, in0,  2);
5804 
5805     __ ushr(ind1, arrangement, in1,  2);
5806     __ shl(in0,   arrangement, in0,  6);
5807     __ orr(ind1,  arrangement, ind1, in0);
5808     __ ushr(ind1, arrangement, ind1, 2);
5809 
5810     __ ushr(ind2, arrangement, in2,  4);
5811     __ shl(in1,   arrangement, in1,  4);
5812     __ orr(ind2,  arrangement, in1,  ind2);
5813     __ ushr(ind2, arrangement, ind2, 2);
5814 
5815     __ shl(ind3,  arrangement, in2,  2);
5816     __ ushr(ind3, arrangement, ind3, 2);
5817 
5818     __ tbl(out0,  arrangement, codec,  4, ind0);
5819     __ tbl(out1,  arrangement, codec,  4, ind1);
5820     __ tbl(out2,  arrangement, codec,  4, ind2);
5821     __ tbl(out3,  arrangement, codec,  4, ind3);
5822 
5823     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5824   }
5825 
5826    /**
5827    *  Arguments:
5828    *
5829    *  Input:
5830    *  c_rarg0   - src_start
5831    *  c_rarg1   - src_offset
5832    *  c_rarg2   - src_length
5833    *  c_rarg3   - dest_start
5834    *  c_rarg4   - dest_offset
5835    *  c_rarg5   - isURL
5836    *
5837    */
5838   address generate_base64_encodeBlock() {
5839 
5840     static const char toBase64[64] = {
5841       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5842       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5843       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5844       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5845       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5846     };
5847 
5848     static const char toBase64URL[64] = {
5849       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5850       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5851       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5852       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5853       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5854     };
5855 
5856     __ align(CodeEntryAlignment);
5857     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5858     address start = __ pc();
5859 
5860     Register src   = c_rarg0;  // source array
5861     Register soff  = c_rarg1;  // source start offset
5862     Register send  = c_rarg2;  // source end offset
5863     Register dst   = c_rarg3;  // dest array
5864     Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set
5866 
5867     // c_rarg6 and c_rarg7 are free to use as temps
5868     Register codec  = c_rarg6;
5869     Register length = c_rarg7;
5870 
5871     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5872 
5873     __ add(src, src, soff);
5874     __ add(dst, dst, doff);
5875     __ sub(length, send, soff);
5876 
5877     // load the codec base address
5878     __ lea(codec, ExternalAddress((address) toBase64));
5879     __ cbz(isURL, ProcessData);
5880     __ lea(codec, ExternalAddress((address) toBase64URL));
5881 
5882     __ BIND(ProcessData);
5883 
    // too short to form a SIMD loop; fall back to the 3-byte scalar loop
5885     __ cmp(length, (u1)24);
5886     __ br(Assembler::LT, Process3B);
5887 
5888     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5889 
5890     __ BIND(Process48B);
5891     __ cmp(length, (u1)48);
5892     __ br(Assembler::LT, Process24B);
5893     generate_base64_encode_simdround(src, dst, v0, 16);
5894     __ sub(length, length, 48);
5895     __ b(Process48B);
5896 
5897     __ BIND(Process24B);
5898     __ cmp(length, (u1)24);
5899     __ br(Assembler::LT, SIMDExit);
5900     generate_base64_encode_simdround(src, dst, v0, 8);
5901     __ sub(length, length, 24);
5902 
5903     __ BIND(SIMDExit);
5904     __ cbz(length, Exit);
5905 
5906     __ BIND(Process3B);
5907     //  3 src bytes, 24 bits
5908     __ ldrb(r10, __ post(src, 1));
5909     __ ldrb(r11, __ post(src, 1));
5910     __ ldrb(r12, __ post(src, 1));
5911     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5912     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5913     // codec index
5914     __ ubfmw(r15, r12, 18, 23);
5915     __ ubfmw(r14, r12, 12, 17);
5916     __ ubfmw(r13, r12, 6,  11);
5917     __ andw(r12,  r12, 63);
5918     // get the code based on the codec
5919     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5920     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5921     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5922     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5923     __ strb(r15, __ post(dst, 1));
5924     __ strb(r14, __ post(dst, 1));
5925     __ strb(r13, __ post(dst, 1));
5926     __ strb(r12, __ post(dst, 1));
5927     __ sub(length, length, 3);
5928     __ cbnz(length, Process3B);
5929 
5930     __ BIND(Exit);
5931     __ ret(lr);
5932 
5933     return start;
5934   }
5935 
5936   void generate_base64_decode_simdround(Register src, Register dst,
5937         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5938 
5939     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5940     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5941 
5942     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5943     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5944 
5945     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5946 
5947     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5948 
5949     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5950 
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] map to index 0 (whose value is 0) in the higher-half lookup.
5953     __ uqsubv(decH0, __ T16B, in0, v27);
5954     __ uqsubv(decH1, __ T16B, in1, v27);
5955     __ uqsubv(decH2, __ T16B, in2, v27);
5956     __ uqsubv(decH3, __ T16B, in3, v27);
5957 
5958     // lower half lookup
5959     __ tbl(decL0, arrangement, codecL, 4, in0);
5960     __ tbl(decL1, arrangement, codecL, 4, in1);
5961     __ tbl(decL2, arrangement, codecL, 4, in2);
5962     __ tbl(decL3, arrangement, codecL, 4, in3);
5963 
5964     // higher half lookup
5965     __ tbx(decH0, arrangement, codecH, 4, decH0);
5966     __ tbx(decH1, arrangement, codecH, 4, decH1);
5967     __ tbx(decH2, arrangement, codecH, 4, decH2);
5968     __ tbx(decH3, arrangement, codecH, 4, decH3);
5969 
5970     // combine lower and higher
5971     __ orr(decL0, arrangement, decL0, decH0);
5972     __ orr(decL1, arrangement, decL1, decH1);
5973     __ orr(decL2, arrangement, decL2, decH2);
5974     __ orr(decL3, arrangement, decL3, decH3);
5975 
5976     // check illegal inputs, value larger than 63 (maximum of 6 bits)
5977     __ cmhi(decH0, arrangement, decL0, v27);
5978     __ cmhi(decH1, arrangement, decL1, v27);
5979     __ cmhi(decH2, arrangement, decL2, v27);
5980     __ cmhi(decH3, arrangement, decL3, v27);
5981     __ orr(in0, arrangement, decH0, decH1);
5982     __ orr(in1, arrangement, decH2, decH3);
5983     __ orr(in2, arrangement, in0,   in1);
5984     __ umaxv(in3, arrangement, in2);
5985     __ umov(rscratch2, in3, __ B, 0);
5986 
5987     // get the data to output
5988     __ shl(out0,  arrangement, decL0, 2);
5989     __ ushr(out1, arrangement, decL1, 4);
5990     __ orr(out0,  arrangement, out0,  out1);
5991     __ shl(out1,  arrangement, decL1, 4);
5992     __ ushr(out2, arrangement, decL2, 2);
5993     __ orr(out1,  arrangement, out1,  out2);
5994     __ shl(out2,  arrangement, decL2, 6);
5995     __ orr(out2,  arrangement, out2,  decL3);
5996 
5997     __ cbz(rscratch2, NoIllegalData);
5998 
5999     // handle illegal input
6000     __ umov(r10, in2, __ D, 0);
6001     if (size == 16) {
6002       __ cbnz(r10, ErrorInLowerHalf);
6003 
6004       // illegal input is in higher half, store the lower half now.
6005       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6006 
6007       __ umov(r10, in2,  __ D, 1);
6008       __ umov(r11, out0, __ D, 1);
6009       __ umov(r12, out1, __ D, 1);
6010       __ umov(r13, out2, __ D, 1);
6011       __ b(StoreLegalData);
6012 
6013       __ BIND(ErrorInLowerHalf);
6014     }
6015     __ umov(r11, out0, __ D, 0);
6016     __ umov(r12, out1, __ D, 0);
6017     __ umov(r13, out2, __ D, 0);
6018 
6019     __ BIND(StoreLegalData);
6020     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6021     __ strb(r11, __ post(dst, 1));
6022     __ strb(r12, __ post(dst, 1));
6023     __ strb(r13, __ post(dst, 1));
6024     __ lsr(r10, r10, 8);
6025     __ lsr(r11, r11, 8);
6026     __ lsr(r12, r12, 8);
6027     __ lsr(r13, r13, 8);
6028     __ b(StoreLegalData);
6029 
6030     __ BIND(NoIllegalData);
6031     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6032   }
6033 
6034 
6035    /**
6036    *  Arguments:
6037    *
6038    *  Input:
6039    *  c_rarg0   - src_start
6040    *  c_rarg1   - src_offset
6041    *  c_rarg2   - src_length
6042    *  c_rarg3   - dest_start
6043    *  c_rarg4   - dest_offset
6044    *  c_rarg5   - isURL
6045    *  c_rarg6   - isMIME
6046    *
6047    */
6048   address generate_base64_decodeBlock() {
6049 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".
6053 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
6057     static const uint8_t fromBase64ForNoSIMD[256] = {
6058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6059       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6060       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6061        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6062       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6063        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6064       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6065        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6066       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6067       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6068       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6069       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6070       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6072       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6073       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6074     };
6075 
6076     static const uint8_t fromBase64URLForNoSIMD[256] = {
6077       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6078       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6079       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6080        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6081       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6082        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6083       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6084        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6085       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6088       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6089       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6090       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6091       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6092       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6093     };
6094 
    // A legal base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data. The
    // 1st table vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices
    // leave the destination unchanged. Input values [64..126] are mapped to table
    // indices [65, 127] in the second lookup. The entry at index 64 is set to 0,
    // so we know the decoded data already came from the 1st lookup.
6102     static const uint8_t fromBase64ForSIMD[128] = {
6103       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6106        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6107         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6108        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6109       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6110        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6111     };
6112 
6113     static const uint8_t fromBase64URLForSIMD[128] = {
6114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6117        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6118         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6119        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6120        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6121        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6122     };
6123 
6124     __ align(CodeEntryAlignment);
6125     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6126     address start = __ pc();
6127 
6128     Register src    = c_rarg0;  // source array
6129     Register soff   = c_rarg1;  // source start offset
6130     Register send   = c_rarg2;  // source end offset
6131     Register dst    = c_rarg3;  // dest array
6132     Register doff   = c_rarg4;  // position for writing to dest array
6133     Register isURL  = c_rarg5;  // Base64 or URL character set
6134     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6135 
6136     Register length = send;    // reuse send as length of source data to process
6137 
6138     Register simd_codec   = c_rarg6;
6139     Register nosimd_codec = c_rarg7;
6140 
6141     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6142 
6143     __ enter();
6144 
6145     __ add(src, src, soff);
6146     __ add(dst, dst, doff);
6147 
6148     __ mov(doff, dst);
6149 
6150     __ sub(length, send, soff);
6151     __ bfm(length, zr, 0, 1);
6152 
6153     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6154     __ cbz(isURL, ProcessData);
6155     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6156 
6157     __ BIND(ProcessData);
6158     __ mov(rscratch1, length);
6159     __ cmp(length, (u1)144); // 144 = 80 + 64
6160     __ br(Assembler::LT, Process4B);
6161 
6162     // In the MIME case, the line length cannot be more than 76
6163     // bytes (see RFC 2045). This is too short a block for SIMD
6164     // to be worthwhile, so we use non-SIMD here.
6165     __ movw(rscratch1, 79);
6166 
6167     __ BIND(Process4B);
6168     __ ldrw(r14, __ post(src, 4));
6169     __ ubfxw(r10, r14, 0,  8);
6170     __ ubfxw(r11, r14, 8,  8);
6171     __ ubfxw(r12, r14, 16, 8);
6172     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6174     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6175     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6176     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6177     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6178     // error detection, 255u indicates an illegal input
6179     __ orrw(r14, r10, r11);
6180     __ orrw(r15, r12, r13);
6181     __ orrw(r14, r14, r15);
6182     __ tbnz(r14, 7, Exit);
6183     // recover the data
6184     __ lslw(r14, r10, 10);
6185     __ bfiw(r14, r11, 4, 6);
6186     __ bfmw(r14, r12, 2, 5);
6187     __ rev16w(r14, r14);
6188     __ bfiw(r13, r12, 6, 2);
6189     __ strh(r14, __ post(dst, 2));
6190     __ strb(r13, __ post(dst, 1));
6191     // non-simd loop
6192     __ subsw(rscratch1, rscratch1, 4);
6193     __ br(Assembler::GT, Process4B);
6194 
    // rscratch1 == -1 if we exited from the 80-byte pre-processing above
    // (rscratch1 started at 79); otherwise rscratch1 == 0.
6197     __ cbzw(rscratch1, Exit);
6198     __ sub(length, length, 80);
6199 
6200     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6201     __ cbz(isURL, SIMDEnter);
6202     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6203 
6204     __ BIND(SIMDEnter);
6205     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6206     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6207     __ mov(rscratch1, 63);
6208     __ dup(v27, __ T16B, rscratch1);
6209 
6210     __ BIND(Process64B);
6211     __ cmp(length, (u1)64);
6212     __ br(Assembler::LT, Process32B);
6213     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6214     __ sub(length, length, 64);
6215     __ b(Process64B);
6216 
6217     __ BIND(Process32B);
6218     __ cmp(length, (u1)32);
6219     __ br(Assembler::LT, SIMDExit);
6220     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6221     __ sub(length, length, 32);
6222     __ b(Process32B);
6223 
6224     __ BIND(SIMDExit);
6225     __ cbz(length, Exit);
6226     __ movw(rscratch1, length);
6227     __ b(Process4B);
6228 
6229     __ BIND(Exit);
6230     __ sub(c_rarg0, dst, doff);
6231 
6232     __ leave();
6233     __ ret(lr);
6234 
6235     return start;
6236   }
6237 
6238 #ifdef LINUX
6239 
6240   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6241   //
6242   // If LSE is in use, generate LSE versions of all the stubs. The
6243   // non-LSE versions are in atomic_aarch64.S.
6244 
6245   // class AtomicStubMark records the entry point of a stub and the
6246   // stub pointer which will point to it. The stub pointer is set to
6247   // the entry point when ~AtomicStubMark() is called, which must be
6248   // after ICache::invalidate_range. This ensures safe publication of
6249   // the generated code.
6250   class AtomicStubMark {
6251     address _entry_point;
6252     aarch64_atomic_stub_t *_stub;
6253     MacroAssembler *_masm;
6254   public:
6255     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6256       _masm = masm;
6257       __ align(32);
6258       _entry_point = __ pc();
6259       _stub = stub;
6260     }
6261     ~AtomicStubMark() {
6262       *_stub = (aarch64_atomic_stub_t)_entry_point;
6263     }
6264   };
6265 
6266   // NB: For memory_order_conservative we need a trailing membar after
6267   // LSE atomic operations but not a leading membar.
6268   //
6269   // We don't need a leading membar because a clause in the Arm ARM
6270   // says:
6271   //
6272   //   Barrier-ordered-before
6273   //
6274   //   Barrier instructions order prior Memory effects before subsequent
6275   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6279   //   instruction with both Acquire and Release semantics.
6280   //
6281   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6282   // and Release semantics, therefore we don't need a leading
6283   // barrier. However, there is no corresponding Barrier-ordered-after
6284   // relationship, therefore we need a trailing membar to prevent a
6285   // later store or load from being reordered with the store in an
6286   // atomic instruction.
6287   //
6288   // This was checked by using the herd7 consistency model simulator
6289   // (http://diy.inria.fr/) with this test case:
6290   //
6291   // AArch64 LseCas
6292   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6293   // P0 | P1;
6294   // LDR W4, [X2] | MOV W3, #0;
6295   // DMB LD       | MOV W4, #1;
6296   // LDR W3, [X1] | CASAL W3, W4, [X1];
6297   //              | DMB ISH;
6298   //              | STR W4, [X2];
6299   // exists
6300   // (0:X3=0 /\ 0:X4=1)
6301   //
6302   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6303   // with the store to x in P1. Without the DMB in P1 this may happen.
6304   //
6305   // At the time of writing we don't know of any AArch64 hardware that
6306   // reorders stores in this way, but the Reference Manual permits it.
6307 
6308   void gen_cas_entry(Assembler::operand_size size,
6309                      atomic_memory_order order) {
6310     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6311       exchange_val = c_rarg2;
6312     bool acquire, release;
6313     switch (order) {
6314       case memory_order_relaxed:
6315         acquire = false;
6316         release = false;
6317         break;
6318       case memory_order_release:
6319         acquire = false;
6320         release = true;
6321         break;
6322       default:
6323         acquire = true;
6324         release = true;
6325         break;
6326     }
6327     __ mov(prev, compare_val);
6328     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6329     if (order == memory_order_conservative) {
6330       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6331     }
6332     if (size == Assembler::xword) {
6333       __ mov(r0, prev);
6334     } else {
6335       __ movw(r0, prev);
6336     }
6337     __ ret(lr);
6338   }
6339 
6340   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6341     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6342     // If not relaxed, then default to conservative.  Relaxed is the only
6343     // case we use enough to be worth specializing.
6344     if (order == memory_order_relaxed) {
6345       __ ldadd(size, incr, prev, addr);
6346     } else {
6347       __ ldaddal(size, incr, prev, addr);
6348       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6349     }
6350     if (size == Assembler::xword) {
6351       __ mov(r0, prev);
6352     } else {
6353       __ movw(r0, prev);
6354     }
6355     __ ret(lr);
6356   }
6357 
6358   void gen_swpal_entry(Assembler::operand_size size) {
6359     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6360     __ swpal(size, incr, prev, addr);
6361     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6362     if (size == Assembler::xword) {
6363       __ mov(r0, prev);
6364     } else {
6365       __ movw(r0, prev);
6366     }
6367     __ ret(lr);
6368   }
6369 
6370   void generate_atomic_entry_points() {
6371     if (! UseLSE) {
6372       return;
6373     }
6374 
6375     __ align(CodeEntryAlignment);
6376     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6377     address first_entry = __ pc();
6378 
6379     // ADD, memory_order_conservative
6380     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6381     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6382     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6383     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6384 
6385     // ADD, memory_order_relaxed
6386     AtomicStubMark mark_fetch_add_4_relaxed
6387       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6388     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6389     AtomicStubMark mark_fetch_add_8_relaxed
6390       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6391     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6392 
6393     // XCHG, memory_order_conservative
6394     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6395     gen_swpal_entry(Assembler::word);
6396     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6397     gen_swpal_entry(Assembler::xword);
6398 
6399     // CAS, memory_order_conservative
6400     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6401     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6402     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6403     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6404     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6405     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6406 
6407     // CAS, memory_order_relaxed
6408     AtomicStubMark mark_cmpxchg_1_relaxed
6409       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6410     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6411     AtomicStubMark mark_cmpxchg_4_relaxed
6412       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6413     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6414     AtomicStubMark mark_cmpxchg_8_relaxed
6415       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6416     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6417 
6418     AtomicStubMark mark_cmpxchg_4_release
6419       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6420     gen_cas_entry(MacroAssembler::word, memory_order_release);
6421     AtomicStubMark mark_cmpxchg_8_release
6422       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6423     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6424 
6425     AtomicStubMark mark_cmpxchg_4_seq_cst
6426       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6427     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6428     AtomicStubMark mark_cmpxchg_8_seq_cst
6429       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6430     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6431 
6432     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6433   }
6434 #endif // LINUX
6435 
6436   // Continuation point for throwing of implicit exceptions that are
6437   // not handled in the current activation. Fabricates an exception
6438   // oop and initiates normal exception dispatching in this
6439   // frame. Since we need to preserve callee-saved values (currently
6440   // only for C2, but done for C1 as well) we need a callee-saved oop
6441   // map and therefore have to make these stubs into RuntimeStubs
6442   // rather than BufferBlobs.  If the compiler needs all registers to
6443   // be preserved between the fault point and the exception handler
6444   // then it must assume responsibility for that in
6445   // AbstractCompiler::continuation_for_implicit_null_exception or
6446   // continuation_for_implicit_division_by_zero_exception. All other
6447   // implicit exceptions (e.g., NullPointerException or
6448   // AbstractMethodError on entry) are either at call sites or
6449   // otherwise assume that stack unwinding will be initiated, so
6450   // caller saved registers were assumed volatile in the compiler.
6451 
6452 #undef __
6453 #define __ masm->
6454 
6455   address generate_throw_exception(const char* name,
6456                                    address runtime_entry,
6457                                    Register arg1 = noreg,
6458                                    Register arg2 = noreg) {
6459     // Information about frame layout at time of blocking runtime call.
6460     // Note that we only have to preserve callee-saved registers since
6461     // the compilers are responsible for supplying a continuation point
6462     // if they expect all registers to be preserved.
6463     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6464     enum layout {
6465       rfp_off = 0,
6466       rfp_off2,
6467       return_off,
6468       return_off2,
6469       framesize // inclusive of return address
6470     };
6471 
6472     int insts_size = 512;
6473     int locs_size  = 64;
6474 
6475     CodeBuffer code(name, insts_size, locs_size);
6476     OopMapSet* oop_maps  = new OopMapSet();
6477     MacroAssembler* masm = new MacroAssembler(&code);
6478 
6479     address start = __ pc();
6480 
6481     // This is an inlined and slightly modified version of call_VM
6482     // which has the ability to fetch the return PC out of
6483     // thread-local storage and also sets up last_Java_sp slightly
6484     // differently than the real call_VM
6485 
6486     __ enter(); // Save FP and LR before call
6487 
6488     assert(is_even(framesize/2), "sp not 16-byte aligned");
6489 
6490     // lr and fp are already in place
6491     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6492 
6493     int frame_complete = __ pc() - start;
6494 
6495     // Set up last_Java_sp and last_Java_fp
6496     address the_pc = __ pc();
6497     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6498 
6499     // Call runtime
6500     if (arg1 != noreg) {
6501       assert(arg2 != c_rarg1, "clobbered");
6502       __ mov(c_rarg1, arg1);
6503     }
6504     if (arg2 != noreg) {
6505       __ mov(c_rarg2, arg2);
6506     }
6507     __ mov(c_rarg0, rthread);
6508     BLOCK_COMMENT("call runtime_entry");
6509     __ mov(rscratch1, runtime_entry);
6510     __ blr(rscratch1);
6511 
6512     // Generate oop map
6513     OopMap* map = new OopMap(framesize, 0);
6514 
6515     oop_maps->add_gc_map(the_pc - start, map);
6516 
6517     __ reset_last_Java_frame(true);
6518 
6519     // Reinitialize the ptrue predicate register, in case the external runtime
6520     // call clobbers ptrue reg, as we may return to SVE compiled code.
6521     __ reinitialize_ptrue();
6522 
6523     __ leave();
6524 
6525     // check for pending exceptions
6526 #ifdef ASSERT
6527     Label L;
6528     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6529     __ cbnz(rscratch1, L);
6530     __ should_not_reach_here();
6531     __ bind(L);
6532 #endif // ASSERT
6533     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6534 
6535 
6536     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6537     RuntimeStub* stub =
6538       RuntimeStub::new_runtime_stub(name,
6539                                     &code,
6540                                     frame_complete,
6541                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6542                                     oop_maps, false);
6543     return stub->entry_point();
6544   }
6545 
6546   class MontgomeryMultiplyGenerator : public MacroAssembler {
6547 
6548     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6549       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6550 
6551     RegSet _toSave;
6552     bool _squaring;
6553 
6554   public:
6555     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6556       : MacroAssembler(as->code()), _squaring(squaring) {
6557 
6558       // Register allocation
6559 
6560       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6561       Pa_base = *regs;       // Argument registers
6562       if (squaring)
6563         Pb_base = Pa_base;
6564       else
6565         Pb_base = *++regs;
6566       Pn_base = *++regs;
6567       Rlen= *++regs;
6568       inv = *++regs;
6569       Pm_base = *++regs;
6570 
6571                           // Working registers:
6572       Ra =  *++regs;        // The current digit of a, b, n, and m.
6573       Rb =  *++regs;
6574       Rm =  *++regs;
6575       Rn =  *++regs;
6576 
6577       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6578       Pb =  *++regs;
6579       Pm =  *++regs;
6580       Pn =  *++regs;
6581 
6582       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
6584       t2 =  *++regs;
6585 
6586       Ri =  *++regs;        // Inner and outer loop indexes.
6587       Rj =  *++regs;
6588 
6589       Rhi_ab = *++regs;     // Product registers: low and high parts
6590       Rlo_ab = *++regs;     // of a*b and m*n.
6591       Rhi_mn = *++regs;
6592       Rlo_mn = *++regs;
6593 
6594       // r19 and up are callee-saved.
6595       _toSave = RegSet::range(r19, *regs) + Pm_base;
6596     }
6597 
6598   private:
6599     void save_regs() {
6600       push(_toSave, sp);
6601     }
6602 
6603     void restore_regs() {
6604       pop(_toSave, sp);
6605     }
6606 
6607     template <typename T>
6608     void unroll_2(Register count, T block) {
6609       Label loop, end, odd;
6610       tbnz(count, 0, odd);
6611       cbz(count, end);
6612       align(16);
6613       bind(loop);
6614       (this->*block)();
6615       bind(odd);
6616       (this->*block)();
6617       subs(count, count, 2);
6618       br(Assembler::GT, loop);
6619       bind(end);
6620     }
6621 
6622     template <typename T>
6623     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6624       Label loop, end, odd;
6625       tbnz(count, 0, odd);
6626       cbz(count, end);
6627       align(16);
6628       bind(loop);
6629       (this->*block)(d, s, tmp);
6630       bind(odd);
6631       (this->*block)(d, s, tmp);
6632       subs(count, count, 2);
6633       br(Assembler::GT, loop);
6634       bind(end);
6635     }
6636 
6637     void pre1(RegisterOrConstant i) {
6638       block_comment("pre1");
6639       // Pa = Pa_base;
6640       // Pb = Pb_base + i;
6641       // Pm = Pm_base;
6642       // Pn = Pn_base + i;
6643       // Ra = *Pa;
6644       // Rb = *Pb;
6645       // Rm = *Pm;
6646       // Rn = *Pn;
6647       ldr(Ra, Address(Pa_base));
6648       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6649       ldr(Rm, Address(Pm_base));
6650       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6651       lea(Pa, Address(Pa_base));
6652       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6653       lea(Pm, Address(Pm_base));
6654       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6655 
6656       // Zero the m*n result.
6657       mov(Rhi_mn, zr);
6658       mov(Rlo_mn, zr);
6659     }
6660 
6661     // The core multiply-accumulate step of a Montgomery
6662     // multiplication.  The idea is to schedule operations as a
6663     // pipeline so that instructions with long latencies (loads and
6664     // multiplies) have time to complete before their results are
6665     // used.  This most benefits in-order implementations of the
6666     // architecture but out-of-order ones also benefit.
6667     void step() {
6668       block_comment("step");
6669       // MACC(Ra, Rb, t0, t1, t2);
6670       // Ra = *++Pa;
6671       // Rb = *--Pb;
6672       umulh(Rhi_ab, Ra, Rb);
6673       mul(Rlo_ab, Ra, Rb);
6674       ldr(Ra, pre(Pa, wordSize));
6675       ldr(Rb, pre(Pb, -wordSize));
6676       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6677                                        // previous iteration.
6678       // MACC(Rm, Rn, t0, t1, t2);
6679       // Rm = *++Pm;
6680       // Rn = *--Pn;
6681       umulh(Rhi_mn, Rm, Rn);
6682       mul(Rlo_mn, Rm, Rn);
6683       ldr(Rm, pre(Pm, wordSize));
6684       ldr(Rn, pre(Pn, -wordSize));
6685       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6686     }
6687 
6688     void post1() {
6689       block_comment("post1");
6690 
6691       // MACC(Ra, Rb, t0, t1, t2);
6692       // Ra = *++Pa;
6693       // Rb = *--Pb;
6694       umulh(Rhi_ab, Ra, Rb);
6695       mul(Rlo_ab, Ra, Rb);
6696       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6697       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6698 
6699       // *Pm = Rm = t0 * inv;
6700       mul(Rm, t0, inv);
6701       str(Rm, Address(Pm));
6702 
6703       // MACC(Rm, Rn, t0, t1, t2);
6704       // t0 = t1; t1 = t2; t2 = 0;
6705       umulh(Rhi_mn, Rm, Rn);
6706 
6707 #ifndef PRODUCT
6708       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6709       {
6710         mul(Rlo_mn, Rm, Rn);
6711         add(Rlo_mn, t0, Rlo_mn);
6712         Label ok;
6713         cbz(Rlo_mn, ok); {
6714           stop("broken Montgomery multiply");
6715         } bind(ok);
6716       }
6717 #endif
6718       // We have very carefully set things up so that
6719       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6720       // the lower half of Rm * Rn because we know the result already:
6721       // it must be -t0.  t0 + (-t0) must generate a carry iff
6722       // t0 != 0.  So, rather than do a mul and an adds we just set
6723       // the carry flag iff t0 is nonzero.
6724       //
6725       // mul(Rlo_mn, Rm, Rn);
6726       // adds(zr, t0, Rlo_mn);
6727       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6728       adcs(t0, t1, Rhi_mn);
6729       adc(t1, t2, zr);
6730       mov(t2, zr);
6731     }
6732 
6733     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6734       block_comment("pre2");
6735       // Pa = Pa_base + i-len;
6736       // Pb = Pb_base + len;
6737       // Pm = Pm_base + i-len;
6738       // Pn = Pn_base + len;
6739 
6740       if (i.is_register()) {
6741         sub(Rj, i.as_register(), len);
6742       } else {
6743         mov(Rj, i.as_constant());
6744         sub(Rj, Rj, len);
6745       }
6746       // Rj == i-len
6747 
6748       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6749       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6750       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6751       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6752 
6753       // Ra = *++Pa;
6754       // Rb = *--Pb;
6755       // Rm = *++Pm;
6756       // Rn = *--Pn;
6757       ldr(Ra, pre(Pa, wordSize));
6758       ldr(Rb, pre(Pb, -wordSize));
6759       ldr(Rm, pre(Pm, wordSize));
6760       ldr(Rn, pre(Pn, -wordSize));
6761 
6762       mov(Rhi_mn, zr);
6763       mov(Rlo_mn, zr);
6764     }
6765 
6766     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6767       block_comment("post2");
6768       if (i.is_constant()) {
6769         mov(Rj, i.as_constant()-len.as_constant());
6770       } else {
6771         sub(Rj, i.as_register(), len);
6772       }
6773 
6774       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6775 
6776       // As soon as we know the least significant digit of our result,
6777       // store it.
6778       // Pm_base[i-len] = t0;
6779       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6780 
6781       // t0 = t1; t1 = t2; t2 = 0;
6782       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6783       adc(t1, t2, zr);
6784       mov(t2, zr);
6785     }
6786 
6787     // A carry in t0 after Montgomery multiplication means that we
6788     // should subtract multiples of n from our result in m.  We'll
6789     // keep doing that until there is no carry.
6790     void normalize(RegisterOrConstant len) {
6791       block_comment("normalize");
6792       // while (t0)
6793       //   t0 = sub(Pm_base, Pn_base, t0, len);
6794       Label loop, post, again;
6795       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6796       cbz(t0, post); {
6797         bind(again); {
6798           mov(i, zr);
6799           mov(cnt, len);
6800           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6801           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6802           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6803           align(16);
6804           bind(loop); {
6805             sbcs(Rm, Rm, Rn);
6806             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6807             add(i, i, 1);
6808             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6809             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6810             sub(cnt, cnt, 1);
6811           } cbnz(cnt, loop);
6812           sbc(t0, t0, zr);
6813         } cbnz(t0, again);
6814       } bind(post);
6815     }
6816 
6817     // Move memory at s to d, reversing words.
6818     //    Increments d to end of copied memory
6819     //    Destroys tmp1, tmp2
6820     //    Preserves len
6821     //    Leaves s pointing to the address which was in d at start
6822     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6823       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6824 
6825       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6826       mov(tmp1, len);
6827       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6828       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6829     }
6830     // where
6831     void reverse1(Register d, Register s, Register tmp) {
6832       ldr(tmp, pre(s, -wordSize));
6833       ror(tmp, tmp, 32);
6834       str(tmp, post(d, wordSize));
6835     }
6836 
6837     void step_squaring() {
6838       // An extra ACC
6839       step();
6840       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6841     }
6842 
6843     void last_squaring(RegisterOrConstant i) {
6844       Label dont;
6845       // if ((i & 1) == 0) {
6846       tbnz(i.as_register(), 0, dont); {
6847         // MACC(Ra, Rb, t0, t1, t2);
6848         // Ra = *++Pa;
6849         // Rb = *--Pb;
6850         umulh(Rhi_ab, Ra, Rb);
6851         mul(Rlo_ab, Ra, Rb);
6852         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6853       } bind(dont);
6854     }
6855 
6856     void extra_step_squaring() {
6857       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6858 
6859       // MACC(Rm, Rn, t0, t1, t2);
6860       // Rm = *++Pm;
6861       // Rn = *--Pn;
6862       umulh(Rhi_mn, Rm, Rn);
6863       mul(Rlo_mn, Rm, Rn);
6864       ldr(Rm, pre(Pm, wordSize));
6865       ldr(Rn, pre(Pn, -wordSize));
6866     }
6867 
6868     void post1_squaring() {
6869       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6870 
6871       // *Pm = Rm = t0 * inv;
6872       mul(Rm, t0, inv);
6873       str(Rm, Address(Pm));
6874 
6875       // MACC(Rm, Rn, t0, t1, t2);
6876       // t0 = t1; t1 = t2; t2 = 0;
6877       umulh(Rhi_mn, Rm, Rn);
6878 
6879 #ifndef PRODUCT
6880       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6881       {
6882         mul(Rlo_mn, Rm, Rn);
6883         add(Rlo_mn, t0, Rlo_mn);
6884         Label ok;
6885         cbz(Rlo_mn, ok); {
6886           stop("broken Montgomery multiply");
6887         } bind(ok);
6888       }
6889 #endif
6890       // We have very carefully set things up so that
6891       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6892       // the lower half of Rm * Rn because we know the result already:
6893       // it must be -t0.  t0 + (-t0) must generate a carry iff
6894       // t0 != 0.  So, rather than do a mul and an adds we just set
6895       // the carry flag iff t0 is nonzero.
6896       //
6897       // mul(Rlo_mn, Rm, Rn);
6898       // adds(zr, t0, Rlo_mn);
6899       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6900       adcs(t0, t1, Rhi_mn);
6901       adc(t1, t2, zr);
6902       mov(t2, zr);
6903     }
6904 
6905     void acc(Register Rhi, Register Rlo,
6906              Register t0, Register t1, Register t2) {
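      // 192-bit accumulate: (t2:t1:t0) += (Rhi:Rlo)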
6907       adds(t0, t0, Rlo);
6908       adcs(t1, t1, Rhi);
6909       adc(t2, t2, zr);
6910     }
6911 
6912   public:
6913     /**
6914      * Fast Montgomery multiplication.  The derivation of the
6915      * algorithm is in A Cryptographic Library for the Motorola
6916      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6917      *
6918      * Arguments:
6919      *
6920      * Inputs for multiplication:
6921      *   c_rarg0   - int array elements a
6922      *   c_rarg1   - int array elements b
6923      *   c_rarg2   - int array elements n (the modulus)
6924      *   c_rarg3   - int length
6925      *   c_rarg4   - int inv
6926      *   c_rarg5   - int array elements m (the result)
6927      *
6928      * Inputs for squaring:
6929      *   c_rarg0   - int array elements a
6930      *   c_rarg1   - int array elements n (the modulus)
6931      *   c_rarg2   - int length
6932      *   c_rarg3   - int inv
6933      *   c_rarg4   - int array elements m (the result)
6934      *
6935      */
6936     address generate_multiply() {
6937       Label argh, nothing;
6938       bind(argh);
6939       stop("MontgomeryMultiply total_allocation must be <= 8192");
6940 
6941       align(CodeEntryAlignment);
6942       address entry = pc();
6943 
6944       cbzw(Rlen, nothing);
6945 
6946       enter();
6947 
6948       // Make room.
6949       cmpw(Rlen, 512);
6950       br(Assembler::HI, argh);
6951       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6952       andr(sp, Ra, -2 * wordSize);
6953 
6954       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6955 
6956       {
6957         // Copy input args, reversing as we go.  We use Ra as a
6958         // temporary variable.
6959         reverse(Ra, Pa_base, Rlen, t0, t1);
6960         if (!_squaring)
6961           reverse(Ra, Pb_base, Rlen, t0, t1);
6962         reverse(Ra, Pn_base, Rlen, t0, t1);
6963       }
6964 
6965       // Push all call-saved registers and also Pm_base which we'll need
6966       // at the end.
6967       save_regs();
6968 
6969 #ifndef PRODUCT
6970       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
6971       {
6972         ldr(Rn, Address(Pn_base, 0));
6973         mul(Rlo_mn, Rn, inv);
6974         subs(zr, Rlo_mn, -1);
6975         Label ok;
6976         br(EQ, ok); {
6977           stop("broken inverse in Montgomery multiply");
6978         } bind(ok);
6979       }
6980 #endif
6981 
6982       mov(Pm_base, Ra);
6983 
6984       mov(t0, zr);
6985       mov(t1, zr);
6986       mov(t2, zr);
6987 
6988       block_comment("for (int i = 0; i < len; i++) {");
6989       mov(Ri, zr); {
6990         Label loop, end;
6991         cmpw(Ri, Rlen);
6992         br(Assembler::GE, end);
6993 
6994         bind(loop);
6995         pre1(Ri);
6996 
6997         block_comment("  for (j = i; j; j--) {"); {
6998           movw(Rj, Ri);
6999           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7000         } block_comment("  } // j");
7001 
7002         post1();
7003         addw(Ri, Ri, 1);
7004         cmpw(Ri, Rlen);
7005         br(Assembler::LT, loop);
7006         bind(end);
7007         block_comment("} // i");
7008       }
7009 
7010       block_comment("for (int i = len; i < 2*len; i++) {");
7011       mov(Ri, Rlen); {
7012         Label loop, end;
7013         cmpw(Ri, Rlen, Assembler::LSL, 1);
7014         br(Assembler::GE, end);
7015 
7016         bind(loop);
7017         pre2(Ri, Rlen);
7018 
7019         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7020           lslw(Rj, Rlen, 1);
7021           subw(Rj, Rj, Ri);
7022           subw(Rj, Rj, 1);
7023           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7024         } block_comment("  } // j");
7025 
7026         post2(Ri, Rlen);
7027         addw(Ri, Ri, 1);
7028         cmpw(Ri, Rlen, Assembler::LSL, 1);
7029         br(Assembler::LT, loop);
7030         bind(end);
7031       }
7032       block_comment("} // i");
7033 
7034       normalize(Rlen);
7035 
7036       mov(Ra, Pm_base);  // Save Pm_base in Ra
7037       restore_regs();  // Restore caller's Pm_base
7038 
7039       // Copy our result into caller's Pm_base
7040       reverse(Pm_base, Ra, Rlen, t0, t1);
7041 
7042       leave();
7043       bind(nothing);
7044       ret(lr);
7045 
7046       return entry;
7047     }
7048     // In C, approximately:
7049 
7050     // void
7051     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7052     //                     julong Pn_base[], julong Pm_base[],
7053     //                     julong inv, int len) {
7054     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7055     //   julong *Pa, *Pb, *Pn, *Pm;
7056     //   julong Ra, Rb, Rn, Rm;
7057 
7058     //   int i;
7059 
7060     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7061 
7062     //   for (i = 0; i < len; i++) {
7063     //     int j;
7064 
7065     //     Pa = Pa_base;
7066     //     Pb = Pb_base + i;
7067     //     Pm = Pm_base;
7068     //     Pn = Pn_base + i;
7069 
7070     //     Ra = *Pa;
7071     //     Rb = *Pb;
7072     //     Rm = *Pm;
7073     //     Rn = *Pn;
7074 
7075     //     int iters = i;
7076     //     for (j = 0; iters--; j++) {
7077     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7078     //       MACC(Ra, Rb, t0, t1, t2);
7079     //       Ra = *++Pa;
7080     //       Rb = *--Pb;
7081     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7082     //       MACC(Rm, Rn, t0, t1, t2);
7083     //       Rm = *++Pm;
7084     //       Rn = *--Pn;
7085     //     }
7086 
7087     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7088     //     MACC(Ra, Rb, t0, t1, t2);
7089     //     *Pm = Rm = t0 * inv;
7090     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7091     //     MACC(Rm, Rn, t0, t1, t2);
7092 
7093     //     assert(t0 == 0, "broken Montgomery multiply");
7094 
7095     //     t0 = t1; t1 = t2; t2 = 0;
7096     //   }
7097 
7098     //   for (i = len; i < 2*len; i++) {
7099     //     int j;
7100 
7101     //     Pa = Pa_base + i-len;
7102     //     Pb = Pb_base + len;
7103     //     Pm = Pm_base + i-len;
7104     //     Pn = Pn_base + len;
7105 
7106     //     Ra = *++Pa;
7107     //     Rb = *--Pb;
7108     //     Rm = *++Pm;
7109     //     Rn = *--Pn;
7110 
7111     //     int iters = len*2-i-1;
7112     //     for (j = i-len+1; iters--; j++) {
7113     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7114     //       MACC(Ra, Rb, t0, t1, t2);
7115     //       Ra = *++Pa;
7116     //       Rb = *--Pb;
7117     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7118     //       MACC(Rm, Rn, t0, t1, t2);
7119     //       Rm = *++Pm;
7120     //       Rn = *--Pn;
7121     //     }
7122 
7123     //     Pm_base[i-len] = t0;
7124     //     t0 = t1; t1 = t2; t2 = 0;
7125     //   }
7126 
7127     //   while (t0)
7128     //     t0 = sub(Pm_base, Pn_base, t0, len);
7129     // }
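
         // A minimal sketch (not part of the generated stub, and assuming the
         // usual triple-word multiply-accumulate semantics) of the MACC helper
         // used by the pseudocode above: the 128-bit product of two 64-bit
         // words is added into the accumulator (t0, t1, t2), low word first,
         // with carries propagated.  Written with the GCC/Clang
         // unsigned __int128 extension purely for illustration:

         // static inline void MACC(julong a, julong b,
         //                         julong &t0, julong &t1, julong &t2) {
         //   unsigned __int128 p = (unsigned __int128)a * b;           // p = hi:lo
         //   unsigned __int128 s = (unsigned __int128)t0 + (julong)p;  // add low word
         //   t0 = (julong)s;
         //   s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64);
         //   t1 = (julong)s;                                           // add high word plus carry
         //   t2 += (julong)(s >> 64);                                  // final carry out
         // }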
7130 
7131     /**
7132      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7133      * multiplies than Montgomery multiplication so it should be up to
7134      * 25% faster.  However, its loop control is more complex and it
7135      * may actually run slower on some machines.
7136      *
7137      * Arguments:
7138      *
7139      * Inputs:
7140      *   c_rarg0   - int array elements a
7141      *   c_rarg1   - int array elements n (the modulus)
7142      *   c_rarg2   - int length
7143      *   c_rarg3   - int inv
7144      *   c_rarg4   - int array elements m (the result)
7145      *
7146      */
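         // Why roughly 25%: a full Montgomery multiply of len words performs
         // about len*len word multiplies for the a*b products plus another
         // len*len for the m*n reduction, i.e. ~2*len^2 in total.  Squaring
         // computes each cross product a[i]*a[j] == a[j]*a[i] only once and
         // accumulates it twice (MACC2 in the pseudocode below), so the a*a
         // part needs only ~len^2/2 multiplies, giving ~1.5*len^2 overall.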
7147     address generate_square() {
7148       Label argh;
7149       bind(argh);
7150       stop("MontgomeryMultiply total_allocation must be <= 8192");
7151 
7152       align(CodeEntryAlignment);
7153       address entry = pc();
7154 
7155       enter();
7156 
7157       // Make room.
7158       cmpw(Rlen, 512);
7159       br(Assembler::HI, argh);
7160       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7161       andr(sp, Ra, -2 * wordSize);
7162 
7163       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7164 
7165       {
7166         // Copy input args, reversing as we go.  We use Ra as a
7167         // temporary variable.
7168         reverse(Ra, Pa_base, Rlen, t0, t1);
7169         reverse(Ra, Pn_base, Rlen, t0, t1);
7170       }
7171 
7172       // Push all call-saved registers and also Pm_base which we'll need
7173       // at the end.
7174       save_regs();
7175 
7176       mov(Pm_base, Ra);
7177 
7178       mov(t0, zr);
7179       mov(t1, zr);
7180       mov(t2, zr);
7181 
7182       block_comment("for (int i = 0; i < len; i++) {");
7183       mov(Ri, zr); {
7184         Label loop, end;
7185         bind(loop);
7186         cmp(Ri, Rlen);
7187         br(Assembler::GE, end);
7188 
7189         pre1(Ri);
7190 
7191         block_comment("for (j = (i+1)/2; j; j--) {"); {
7192           add(Rj, Ri, 1);
7193           lsr(Rj, Rj, 1);
7194           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7195         } block_comment("  } // j");
7196 
7197         last_squaring(Ri);
7198 
7199         block_comment("  for (j = i/2; j; j--) {"); {
7200           lsr(Rj, Ri, 1);
7201           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7202         } block_comment("  } // j");
7203 
7204         post1_squaring();
7205         add(Ri, Ri, 1);
7206         cmp(Ri, Rlen);
7207         br(Assembler::LT, loop);
7208 
7209         bind(end);
7210         block_comment("} // i");
7211       }
7212 
7213       block_comment("for (int i = len; i < 2*len; i++) {");
7214       mov(Ri, Rlen); {
7215         Label loop, end;
7216         bind(loop);
7217         cmp(Ri, Rlen, Assembler::LSL, 1);
7218         br(Assembler::GE, end);
7219 
7220         pre2(Ri, Rlen);
7221 
7222         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7223           lsl(Rj, Rlen, 1);
7224           sub(Rj, Rj, Ri);
7225           sub(Rj, Rj, 1);
7226           lsr(Rj, Rj, 1);
7227           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7228         } block_comment("  } // j");
7229 
7230         last_squaring(Ri);
7231 
7232         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7233           lsl(Rj, Rlen, 1);
7234           sub(Rj, Rj, Ri);
7235           lsr(Rj, Rj, 1);
7236           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7237         } block_comment("  } // j");
7238 
7239         post2(Ri, Rlen);
7240         add(Ri, Ri, 1);
7241         cmp(Ri, Rlen, Assembler::LSL, 1);
7242 
7243         br(Assembler::LT, loop);
7244         bind(end);
7245         block_comment("} // i");
7246       }
7247 
7248       normalize(Rlen);
7249 
7250       mov(Ra, Pm_base);  // Save Pm_base in Ra
7251       restore_regs();  // Restore caller's Pm_base
7252 
7253       // Copy our result into caller's Pm_base
7254       reverse(Pm_base, Ra, Rlen, t0, t1);
7255 
7256       leave();
7257       ret(lr);
7258 
7259       return entry;
7260     }
7261     // In C, approximately:
7262 
7263     // void
7264     // montgomery_square(julong Pa_base[], julong Pn_base[],
7265     //                   julong Pm_base[], julong inv, int len) {
7266     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7267     //   julong *Pa, *Pb, *Pn, *Pm;
7268     //   julong Ra, Rb, Rn, Rm;
7269 
7270     //   int i;
7271 
7272     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7273 
7274     //   for (i = 0; i < len; i++) {
7275     //     int j;
7276 
7277     //     Pa = Pa_base;
7278     //     Pb = Pa_base + i;
7279     //     Pm = Pm_base;
7280     //     Pn = Pn_base + i;
7281 
7282     //     Ra = *Pa;
7283     //     Rb = *Pb;
7284     //     Rm = *Pm;
7285     //     Rn = *Pn;
7286 
7287     //     int iters = (i+1)/2;
7288     //     for (j = 0; iters--; j++) {
7289     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7290     //       MACC2(Ra, Rb, t0, t1, t2);
7291     //       Ra = *++Pa;
7292     //       Rb = *--Pb;
7293     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7294     //       MACC(Rm, Rn, t0, t1, t2);
7295     //       Rm = *++Pm;
7296     //       Rn = *--Pn;
7297     //     }
7298     //     if ((i & 1) == 0) {
7299     //       assert(Ra == Pa_base[j], "must be");
7300     //       MACC(Ra, Ra, t0, t1, t2);
7301     //     }
7302     //     iters = i/2;
7303     //     assert(iters == i-j, "must be");
7304     //     for (; iters--; j++) {
7305     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7306     //       MACC(Rm, Rn, t0, t1, t2);
7307     //       Rm = *++Pm;
7308     //       Rn = *--Pn;
7309     //     }
7310 
7311     //     *Pm = Rm = t0 * inv;
7312     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7313     //     MACC(Rm, Rn, t0, t1, t2);
7314 
7315     //     assert(t0 == 0, "broken Montgomery multiply");
7316 
7317     //     t0 = t1; t1 = t2; t2 = 0;
7318     //   }
7319 
7320     //   for (i = len; i < 2*len; i++) {
7321     //     int start = i-len+1;
7322     //     int end = start + (len - start)/2;
7323     //     int j;
7324 
7325     //     Pa = Pa_base + i-len;
7326     //     Pb = Pa_base + len;
7327     //     Pm = Pm_base + i-len;
7328     //     Pn = Pn_base + len;
7329 
7330     //     Ra = *++Pa;
7331     //     Rb = *--Pb;
7332     //     Rm = *++Pm;
7333     //     Rn = *--Pn;
7334 
7335     //     int iters = (2*len-i-1)/2;
7336     //     assert(iters == end-start, "must be");
7337     //     for (j = start; iters--; j++) {
7338     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7339     //       MACC2(Ra, Rb, t0, t1, t2);
7340     //       Ra = *++Pa;
7341     //       Rb = *--Pb;
7342     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7343     //       MACC(Rm, Rn, t0, t1, t2);
7344     //       Rm = *++Pm;
7345     //       Rn = *--Pn;
7346     //     }
7347     //     if ((i & 1) == 0) {
7348     //       assert(Ra == Pa_base[j], "must be");
7349     //       MACC(Ra, Ra, t0, t1, t2);
7350     //     }
7351     //     iters =  (2*len-i)/2;
7352     //     assert(iters == len-j, "must be");
7353     //     for (; iters--; j++) {
7354     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7355     //       MACC(Rm, Rn, t0, t1, t2);
7356     //       Rm = *++Pm;
7357     //       Rn = *--Pn;
7358     //     }
7359     //     Pm_base[i-len] = t0;
7360     //     t0 = t1; t1 = t2; t2 = 0;
7361     //   }
7362 
7363     //   while (t0)
7364     //     t0 = sub(Pm_base, Pn_base, t0, len);
7365     // }
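
         // MACC2, used in the squaring pseudocode above, accumulates the
         // doubled cross product 2*a*b (the generated code computes the
         // product once and adds it twice, which is where the savings come
         // from).  Semantically, under the same assumptions as the MACC
         // sketch given earlier, it is equivalent to:

         // static inline void MACC2(julong a, julong b,
         //                          julong &t0, julong &t1, julong &t2) {
         //   MACC(a, b, t0, t1, t2);
         //   MACC(a, b, t0, t1, t2);
         // }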
7366   };
7367 
7368 
7369   // Call here from the interpreter or compiled code either to load the
7370   // multiple returned values of the inline type instance being returned
7371   // into registers, or to store the returned values into a newly
7372   // allocated inline type instance.
7373   address generate_return_value_stub(address destination, const char* name, bool has_res) {
7374     // We need to save all registers the calling convention may use so
7375     // that the runtime call can read or update those registers. This
7376     // needs to be kept in sync with SharedRuntime::java_return_convention().
7377     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
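         // Note: each 64-bit register is described by a pair of 32-bit slots
         // (X_off, X_2) in the layout below, so framesize is counted in
         // 32-bit words (hence the BytesPerInt scaling when computing
         // frame_size_in_bytes).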
7378     enum layout {
7379       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
7380       j_rarg6_off, j_rarg6_2,
7381       j_rarg5_off, j_rarg5_2,
7382       j_rarg4_off, j_rarg4_2,
7383       j_rarg3_off, j_rarg3_2,
7384       j_rarg2_off, j_rarg2_2,
7385       j_rarg1_off, j_rarg1_2,
7386       j_rarg0_off, j_rarg0_2,
7387 
7388       j_farg7_off, j_farg7_2,
7389       j_farg6_off, j_farg6_2,
7390       j_farg5_off, j_farg5_2,
7391       j_farg4_off, j_farg4_2,
7392       j_farg3_off, j_farg3_2,
7393       j_farg2_off, j_farg2_2,
7394       j_farg1_off, j_farg1_2,
7395       j_farg0_off, j_farg0_2,
7396 
7397       rfp_off, rfp_off2,
7398       return_off, return_off2,
7399 
7400       framesize // inclusive of return address
7401     };
7402 
7403     CodeBuffer code(name, 512, 64);
7404     MacroAssembler* masm = new MacroAssembler(&code);
7405 
7406     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7407     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7408     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7409     int frame_size_in_words = frame_size_in_bytes / wordSize;
7410 
7411     OopMapSet* oop_maps = new OopMapSet();
7412     OopMap* map = new OopMap(frame_size_in_slots, 0);
7413 
7414     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
7415     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
7416     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7417     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7418     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7419     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7420     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7421     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7422 
7423     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7424     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7425     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7426     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7427     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7428     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7429     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7430     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7431 
7432     address start = __ pc();
7433 
7434     __ enter(); // Save FP and LR before call
7435 
7436     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
7437     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
7438     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
7439     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
7440 
7441     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
7442     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
7443     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
7444     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
7445 
7446     int frame_complete = __ offset();
7447 
7448     // Set up last_Java_sp and last_Java_fp
7449     address the_pc = __ pc();
7450     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7451 
7452     // Call runtime
7453     __ mov(c_rarg1, r0);
7454     __ mov(c_rarg0, rthread);
7455 
7456     __ mov(rscratch1, destination);
7457     __ blr(rscratch1);
7458 
7459     oop_maps->add_gc_map(the_pc - start, map);
7460 
7461     __ reset_last_Java_frame(false);
7462 
7463     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
7464     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
7465     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
7466     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
7467 
7468     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
7469     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
7470     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
7471     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
7472 
7473     __ leave();
7474 
7475     // check for pending exceptions
7476     Label pending;
7477     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
7478     __ cbnz(rscratch1, pending);
7479 
7480     if (has_res) {
7481       __ get_vm_result(r0, rthread);
7482     }
7483 
7484     __ ret(lr);
7485 
7486     __ bind(pending);
7487     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7488 
7489     // -------------
7490     // make sure all code is generated
7491     masm->flush();
7492 
7493     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
7494     return stub->entry_point();
7495   }
7496 
7497   // Initialization
7498   void generate_initial() {
7499     // Generate the initial stubs and initialize the entry points
7500 
7501     // Entry points that exist on all platforms. Note: this is code
7502     // that could be shared among different platforms; however, the
7503     // benefit seems to be smaller than the disadvantage of having a
7504     // much more complicated generator structure. See also the comment
7505     // in stubRoutines.hpp.
7506 
7507     StubRoutines::_forward_exception_entry = generate_forward_exception();
7508 
7509     StubRoutines::_call_stub_entry =
7510       generate_call_stub(StubRoutines::_call_stub_return_address);
7511 
7512     // is referenced by megamorphic call
7513     StubRoutines::_catch_exception_entry = generate_catch_exception();
7514 
7515     // Build this early so it's available for the interpreter.
7516     StubRoutines::_throw_StackOverflowError_entry =
7517       generate_throw_exception("StackOverflowError throw_exception",
7518                                CAST_FROM_FN_PTR(address,
7519                                                 SharedRuntime::throw_StackOverflowError));
7520     StubRoutines::_throw_delayed_StackOverflowError_entry =
7521       generate_throw_exception("delayed StackOverflowError throw_exception",
7522                                CAST_FROM_FN_PTR(address,
7523                                                 SharedRuntime::throw_delayed_StackOverflowError));
7524     if (UseCRC32Intrinsics) {
7525       // Set the table address before stub generation, which uses it
7526       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7527       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7528     }
7529 
7530     if (UseCRC32CIntrinsics) {
7531       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7532     }
7533 
7534     // Disabled until JDK-8210858 is fixed
7535     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7536     //   StubRoutines::_dlog = generate_dlog();
7537     // }
7538 
7539     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7540       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7541     }
7542 
7543     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7544       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7545     }
7546 
7547     if (InlineTypeReturnedAsFields) {
7548       StubRoutines::_load_inline_type_fields_in_regs =
7549          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7550       StubRoutines::_store_inline_type_fields_to_buf =
7551          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7552     }
7553 
7554     // Safefetch stubs.
7555     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7556                                                        &StubRoutines::_safefetch32_fault_pc,
7557                                                        &StubRoutines::_safefetch32_continuation_pc);
7558     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7559                                                        &StubRoutines::_safefetchN_fault_pc,
7560                                                        &StubRoutines::_safefetchN_continuation_pc);
7561   }
7562 
7563   void generate_all() {
7564     // support for verify_oop (must happen after universe_init)
7565     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7566     StubRoutines::_throw_AbstractMethodError_entry =
7567       generate_throw_exception("AbstractMethodError throw_exception",
7568                                CAST_FROM_FN_PTR(address,
7569                                                 SharedRuntime::
7570                                                 throw_AbstractMethodError));
7571 
7572     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7573       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7574                                CAST_FROM_FN_PTR(address,
7575                                                 SharedRuntime::
7576                                                 throw_IncompatibleClassChangeError));
7577 
7578     StubRoutines::_throw_NullPointerException_at_call_entry =
7579       generate_throw_exception("NullPointerException at call throw_exception",
7580                                CAST_FROM_FN_PTR(address,
7581                                                 SharedRuntime::
7582                                                 throw_NullPointerException_at_call));
7583 
7584     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7585 
7586     // arraycopy stubs used by compilers
7587     generate_arraycopy_stubs();
7588 
7589     // has negatives stub for large arrays.
7590     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7591 
7592     // array equals stub for large arrays.
7593     if (!UseSimpleArrayEquals) {
7594       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7595     }
7596 
7597     generate_compare_long_strings();
7598 
7599     generate_string_indexof_stubs();
7600 
7601     // byte_array_inflate stub for large arrays.
7602     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7603 
7604     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7605     if (bs_nm != NULL) {
7606       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7607     }
7608 #ifdef COMPILER2
7609     if (UseMultiplyToLenIntrinsic) {
7610       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7611     }
7612 
7613     if (UseSquareToLenIntrinsic) {
7614       StubRoutines::_squareToLen = generate_squareToLen();
7615     }
7616 
7617     if (UseMulAddIntrinsic) {
7618       StubRoutines::_mulAdd = generate_mulAdd();
7619     }
7620 
7621     if (UseSIMDForBigIntegerShiftIntrinsics) {
7622       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7623       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7624     }
7625 
7626     if (UseMontgomeryMultiplyIntrinsic) {
7627       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7628       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7629       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7630     }
7631 
7632     if (UseMontgomerySquareIntrinsic) {
7633       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7634       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7635       // We use generate_multiply() rather than generate_square()
7636       // because it's faster for the sizes of modulus we care about.
7637       StubRoutines::_montgomerySquare = g.generate_multiply();
7638     }
7639 #endif // COMPILER2
7640 
7641     if (UseBASE64Intrinsics) {
7642         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7643         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7644     }
7645 
7646     // data cache line writeback
7647     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7648     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7649 
7650     if (UseAESIntrinsics) {
7651       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7652       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7653       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7654       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7655       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7656     }
7657     if (UseGHASHIntrinsics) {
7658       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7659       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7660     }
7661     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7662       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7663     }
7664 
7665     if (UseSHA1Intrinsics) {
7666       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7667       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7668     }
7669     if (UseSHA256Intrinsics) {
7670       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7671       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7672     }
7673     if (UseSHA512Intrinsics) {
7674       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7675       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7676     }
7677     if (UseSHA3Intrinsics) {
7678       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7679       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7680     }
7681 
7682     // generate Adler32 intrinsics code
7683     if (UseAdler32Intrinsics) {
7684       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7685     }
7686 
7687 #ifdef LINUX
7688 
7689     generate_atomic_entry_points();
7690 
7691 #endif // LINUX
7692 
7693     StubRoutines::aarch64::set_completed();
7694   }
7695 
7696  public:
7697   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7698     if (all) {
7699       generate_all();
7700     } else {
7701       generate_initial();
7702     }
7703   }
7704 }; // end class declaration
7705 
7706 #define UCM_TABLE_MAX_ENTRIES 8
7707 void StubGenerator_generate(CodeBuffer* code, bool all) {
7708   if (UnsafeCopyMemory::_table == NULL) {
7709     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7710   }
7711   StubGenerator g(code, all);
7712 }
7713 
7714 
7715 #ifdef LINUX
7716 
7717 // Define pointers to atomic stubs and initialize them to point to the
7718 // code in atomic_aarch64.S.
7719 
7720 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7721   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7722     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7723   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7724     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
7725 
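     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;
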
7726 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7727 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7728 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7729 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7730 DEFAULT_ATOMIC_OP(xchg, 4, )
7731 DEFAULT_ATOMIC_OP(xchg, 8, )
7732 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7733 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7734 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7735 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7736 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7737 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7738 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7739 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7740 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7741 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7742 
7743 #undef DEFAULT_ATOMIC_OP
7744 
7745 #endif // LINUX