1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 
  50 #ifdef BUILTIN_SIM
  51 #include "../../../../../../simulator/simulator.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-r18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
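       //
       // For reference, the C++ side reaches this stub through a function
       // pointer whose shape matches the argument list above (cf. the
       // CallStub typedef in stubRoutines.hpp). Roughly -- this is a
       // sketch, not a verbatim copy of that declaration:
       //
       //   typedef void (*CallStub)(address   call_wrapper_address,
       //                            intptr_t* result,
       //                            BasicType result_type,
       //                            Method*   method,
       //                            address   entry_point,
       //                            intptr_t* parameters,
       //                            int       size_of_parameters,
       //                            Thread*   thread);
       //
       //   StubRoutines::call_stub()(wrapper, result, type, method,
       //                             entry, params, nparams, thread);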
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     // we need a C prolog to bootstrap the x86 caller into the sim
 222     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 223 
 224     address aarch64_entry = __ pc();
 225 
 226 #ifdef BUILTIN_SIM
 227     // Save sender's SP for stack traces.
 228     __ mov(rscratch1, sp);
 229     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 230 #endif
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (u1)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
 294     // call Java entry -- passing Method* and current sp
 295     //      rmethod: Method*
 296     //      r13: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r13, sp);
 299     __ blr(c_rarg4);
 300 
 301     // tell the simulator we have returned to the stub
 302 
 303     // we do this here because the notify will already have been done
 304     // if we get to the next instruction via an exception
 305     //
 306     // n.b. adding this instruction here affects the calculation of
 307     // whether or not a routine returns to the call stub (used when
 308     // doing stack walks) since the normal test is to check the return
 309     // pc against the address saved below. so we may need to allow for
 310     // this extra instruction in the check.
 311 
 312     if (NotifySimulator) {
 313       __ notify(Assembler::method_reentry);
 314     }
 315     // save current address for use by exception handling code
 316 
 317     return_address = __ pc();
 318 
 319     // store result depending on type (everything that is not
 320     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 321     // n.b. this assumes Java returns an integral result in r0
 322     // and a floating result in j_farg0
 323     __ ldr(j_rarg2, result);
 324     Label is_long, is_float, is_double, exit;
 325     __ ldr(j_rarg1, result_type);
 326     __ cmp(j_rarg1, (u1)T_OBJECT);
 327     __ br(Assembler::EQ, is_long);
 328     __ cmp(j_rarg1, (u1)T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, (u1)T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(j_rarg1, (u1)T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(j_rarg2));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     // restore callee-save registers
 360     __ ldpd(v15, v14,  d15_save);
 361     __ ldpd(v13, v12,  d13_save);
 362     __ ldpd(v11, v10,  d11_save);
 363     __ ldpd(v9,  v8,   d9_save);
 364 
 365     __ ldp(r28, r27,   r28_save);
 366     __ ldp(r26, r25,   r26_save);
 367     __ ldp(r24, r23,   r24_save);
 368     __ ldp(r22, r21,   r22_save);
 369     __ ldp(r20, r19,   r20_save);
 370 
 371     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 372     __ ldrw(c_rarg2, result_type);
 373     __ ldr(c_rarg3,  method);
 374     __ ldp(c_rarg4, c_rarg5,  entry_point);
 375     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 376 
 377 #ifndef PRODUCT
 378     // tell the simulator we are about to end Java execution
 379     if (NotifySimulator) {
 380       __ notify(Assembler::method_exit);
 381     }
 382 #endif
 383     // leave frame and return to caller
 384     __ leave();
 385     __ ret(lr);
 386 
 387     // handle return types different from T_INT
 388 
 389     __ BIND(is_long);
 390     __ str(r0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_float);
 394     __ strs(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     __ BIND(is_double);
 398     __ strd(j_farg0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     return start;
 402   }
 403 
 404   // Return point for a Java call if there's an exception thrown in
 405   // Java code.  The exception is caught and transformed into a
 406   // pending exception stored in JavaThread that can be tested from
 407   // within the VM.
 408   //
 409   // Note: Usually the parameters are removed by the callee. In case
 410   // of an exception crossing an activation frame boundary, that is
 411   // not the case if the callee is compiled code => need to setup the
 412   // rsp.
 413   //
 414   // r0: exception oop
 415 
 416   // NOTE: this is used as a target from the signal handler so it
 417   // needs an x86 prolog which returns into the current simulator
 418   // executing the generated catch_exception code. so the prolog
 419   // needs to install rax in a sim register and adjust the sim's
 420   // restart pc to enter the generated code at the start position
 421   // then return from native to simulated execution.
 422 
 423   address generate_catch_exception() {
 424     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 425     address start = __ pc();
 426 
 427     // same as in generate_call_stub():
 428     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 429     const Address thread        (rfp, thread_off         * wordSize);
 430 
 431 #ifdef ASSERT
 432     // verify that threads correspond
 433     {
 434       Label L, S;
 435       __ ldr(rscratch1, thread);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::NE, S);
 438       __ get_thread(rscratch1);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::EQ, L);
 441       __ bind(S);
 442       __ stop("StubRoutines::catch_exception: threads must correspond");
 443       __ bind(L);
 444     }
 445 #endif
 446 
 447     // set pending exception
 448     __ verify_oop(r0);
 449 
 450     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 451     __ mov(rscratch1, (address)__FILE__);
 452     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 453     __ movw(rscratch1, (int)__LINE__);
 454     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 455 
 456     // complete return to VM
 457     assert(StubRoutines::_call_stub_return_address != NULL,
 458            "_call_stub_return_address must have been generated before");
 459     __ b(StubRoutines::_call_stub_return_address);
 460 
 461     return start;
 462   }
 463 
 464   // Continuation point for runtime calls returning with a pending
 465   // exception.  The pending exception check happened in the runtime
 466   // or native call stub.  The pending exception in Thread is
 467   // converted into a Java-level exception.
 468   //
 469   // Contract with Java-level exception handlers:
 470   // r0: exception
 471   // r3: throwing pc
 472   //
 473   // NOTE: At entry of this stub, exception-pc must be in LR !!
 474 
 475   // NOTE: this is always used as a jump target within generated code
 476   // so it just needs to be generated code with no x86 prolog
 477 
 478   address generate_forward_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "forward exception");
 480     address start = __ pc();
 481 
 482     // Upon entry, LR points to the return address returning into
 483     // Java (interpreted or compiled) code; i.e., the return address
 484     // becomes the throwing pc.
 485     //
 486     // Arguments pushed before the runtime call are still on the stack
 487     // but the exception handler will reset the stack pointer ->
 488     // ignore them.  A potential result in registers can be ignored as
 489     // well.
 490 
 491 #ifdef ASSERT
 492     // make sure this code is only executed if there is a pending exception
 493     {
 494       Label L;
 495       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 496       __ cbnz(rscratch1, L);
 497       __ stop("StubRoutines::forward exception: no pending exception (1)");
 498       __ bind(L);
 499     }
 500 #endif
 501 
 502     // compute exception handler into r19
 503 
 504     // call the VM to find the handler address associated with the
 505     // caller address. pass thread in r0 and caller pc (ret address)
 506     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 507     // the stack.
 508     __ mov(c_rarg1, lr);
 509     // lr will be trashed by the VM call so we move it to R19
 510     // (callee-saved) because we also need to pass it to the handler
 511     // returned by this call.
 512     __ mov(r19, lr);
 513     BLOCK_COMMENT("call exception_handler_for_return_address");
 514     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 515                          SharedRuntime::exception_handler_for_return_address),
 516                     rthread, c_rarg1);
 517     // we should not really care that lr is no longer the callee
 518     // address. we saved the value the handler needs in r19 so we can
 519     // just copy it to r3. however, the C2 handler will push its own
 520     // frame and then calls into the VM and the VM code asserts that
 521     // the PC for the frame above the handler belongs to a compiled
 522     // Java method. So, we restore lr here to satisfy that assert.
 523     __ mov(lr, r19);
 524     // setup r0 & r3 & clear pending exception
 525     __ mov(r3, r19);
 526     __ mov(r19, r0);
 527     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 528     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 529 
 530 #ifdef ASSERT
 531     // make sure exception is set
 532     {
 533       Label L;
 534       __ cbnz(r0, L);
 535       __ stop("StubRoutines::forward exception: no pending exception (2)");
 536       __ bind(L);
 537     }
 538 #endif
 539 
 540     // continue at exception handler
 541     // r0: exception
 542     // r3: throwing pc
 543     // r19: exception handler
 544     __ verify_oop(r0);
 545     __ br(r19);
 546 
 547     return start;
 548   }
 549 
 550   // Non-destructive plausibility checks for oops
 551   //
 552   // Arguments:
 553   //    r0: oop to verify
 554   //    rscratch1: error message
 555   //
 556   // Stack after saving c_rarg3:
 557   //    [tos + 0]: saved c_rarg3
 558   //    [tos + 1]: saved c_rarg2
 559   //    [tos + 2]: saved lr
 560   //    [tos + 3]: saved rscratch2
 561   //    [tos + 4]: saved r0
 562   //    [tos + 5]: saved rscratch1
 563   address generate_verify_oop() {
 564 
 565     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 566     address start = __ pc();
 567 
 568     Label exit, error;
 569 
 570     // save c_rarg2 and c_rarg3
 571     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 572 
 573     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 575     __ ldr(c_rarg3, Address(c_rarg2));
 576     __ add(c_rarg3, c_rarg3, 1);
 577     __ str(c_rarg3, Address(c_rarg2));
 578 
 579     // object is in r0
 580     // make sure object is 'reasonable'
 581     __ cbz(r0, exit); // if obj is NULL it is OK
 582 
 583     // Check if the oop is in the right area of memory
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 585     __ andr(c_rarg2, r0, c_rarg3);
 586     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 587 
 588     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 589     // instruction here because the flags register is live.
 590     __ eor(c_rarg2, c_rarg2, c_rarg3);
 591     __ cbnz(c_rarg2, error);
 592 
 593     // make sure klass is 'reasonable', which is not zero.
 594     __ load_klass(r0, r0);  // get klass
 595     __ cbz(r0, error);      // if klass is NULL it is broken
 596 
 597     // return if everything seems ok
 598     __ bind(exit);
 599 
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601     __ ret(lr);
 602 
 603     // handle errors
 604     __ bind(error);
 605     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 606 
 607     __ push(RegSet::range(r0, r29), sp);
 608     // debug(char* msg, int64_t pc, int64_t regs[])
 609     __ mov(c_rarg0, rscratch1);      // pass address of error message
 610     __ mov(c_rarg1, lr);             // pass return address
 611     __ mov(c_rarg2, sp);             // pass address of regs on stack
 612 #ifndef PRODUCT
 613     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 614 #endif
 615     BLOCK_COMMENT("call MacroAssembler::debug");
 616     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 617     __ blrt(rscratch1, 3, 0, 1);
 618 
 619     return start;
 620   }
 621 
 622   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 623 
 624   // The inner part of zero_words().  This is the bulk operation,
 625   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 626   // caller is responsible for zeroing the last few words.
 627   //
 628   // Inputs:
 629   // r10: the HeapWord-aligned base address of an array to zero.
 630   // r11: the count in HeapWords, r11 > 0.
 631   //
 632   // Returns r10 and r11, adjusted for the caller to clear.
 633   // r10: the base address of the tail of words left to clear.
 634   // r11: the number of words in the tail.
 635   //      r11 < MacroAssembler::zero_words_block_size.
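       //
       // In rough C terms the split of work between this stub and its
       // caller (MacroAssembler::zero_words) looks like this -- a sketch
       // only, not the actual HotSpot code:
       //
       //   // done here, in blocks, possibly via DC ZVA:
       //   while (cnt >= MacroAssembler::zero_words_block_size) {
       //     *base++ = 0; cnt--;
       //   }
       //   // left for the caller, cnt is now small:
       //   while (cnt-- > 0) *base++ = 0;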
 636 
 637   address generate_zero_blocks() {
 638     Label done;
 639     Label base_aligned;
 640 
 641     Register base = r10, cnt = r11;
 642 
 643     __ align(CodeEntryAlignment);
 644     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 645     address start = __ pc();
 646 
 647     if (UseBlockZeroing) {
 648       int zva_length = VM_Version::zva_length();
 649 
 650       // Ensure ZVA length can be divided by 16. This is required by
 651       // the subsequent operations.
 652       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 653 
 654       __ tbz(base, 3, base_aligned);
 655       __ str(zr, Address(__ post(base, 8)));
 656       __ sub(cnt, cnt, 1);
 657       __ bind(base_aligned);
 658 
 659       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 660       // alignment.
 661       Label small;
 662       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 663       __ subs(rscratch1, cnt, low_limit >> 3);
 664       __ br(Assembler::LT, small);
 665       __ zero_dcache_blocks(base, cnt);
 666       __ bind(small);
 667     }
 668 
 669     {
 670       // Number of stp instructions we'll unroll
 671       const int unroll =
 672         MacroAssembler::zero_words_block_size / 2;
 673       // Clear the remaining blocks.
 674       Label loop;
 675       __ subs(cnt, cnt, unroll * 2);
 676       __ br(Assembler::LT, done);
 677       __ bind(loop);
 678       for (int i = 0; i < unroll; i++)
 679         __ stp(zr, zr, __ post(base, 16));
 680       __ subs(cnt, cnt, unroll * 2);
 681       __ br(Assembler::GE, loop);
 682       __ bind(done);
 683       __ add(cnt, cnt, unroll * 2);
 684     }
 685 
 686     __ ret(lr);
 687 
 688     return start;
 689   }
 690 
 691 
 692   typedef enum {
 693     copy_forwards = 1,
 694     copy_backwards = -1
 695   } copy_direction;
 696 
 697   // Bulk copy of blocks of 8 words.
 698   //
 699   // count is a count of words.
 700   //
 701   // Precondition: count >= 8
 702   //
 703   // Postconditions:
 704   //
 705   // The least significant bit of count contains the remaining count
 706   // of words to copy.  The rest of count is trash.
 707   //
 708   // s and d are adjusted to point to the remaining words to copy
 709   //
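       // In rough C terms the contract is (a sketch, not the generated
       // code):
       //
       //   while (count >= 8) { copy 8 words from s to d; count -= 8; }
       //   if (count & 4)     { copy 4 words from s to d; }
       //   if (count & 2)     { copy 2 words from s to d; }
       //   // bit 0 of count (a possible final odd word) is left to the caller
       //
       // with s and d stepping forwards or backwards according to
       // direction.
       //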
 710   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 711                            copy_direction direction) {
 712     int unit = wordSize * direction;
 713     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 714 
 715     int offset;
 716     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 717       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 718     const Register stride = r13;
 719 
 720     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 721     assert_different_registers(s, d, count, rscratch1);
 722 
 723     Label again, drain;
 724     const char *stub_name;
 725     if (direction == copy_forwards)
 726       stub_name = "forward_copy_longs";
 727     else
 728       stub_name = "backward_copy_longs";
 729 
 730     __ align(CodeEntryAlignment);
 731 
 732     StubCodeMark mark(this, "StubRoutines", stub_name);
 733 
 734     __ bind(start);
 735 
 736     Label unaligned_copy_long;
 737     if (AvoidUnalignedAccesses) {
 738       __ tbnz(d, 3, unaligned_copy_long);
 739     }
 740 
 741     if (direction == copy_forwards) {
 742       __ sub(s, s, bias);
 743       __ sub(d, d, bias);
 744     }
 745 
 746 #ifdef ASSERT
 747     // Make sure we are never given < 8 words
 748     {
 749       Label L;
 750       __ cmp(count, (u1)8);
 751       __ br(Assembler::GE, L);
 752       __ stop("generate_copy_longs called with < 8 words");
 753       __ bind(L);
 754     }
 755 #endif
 756 
 757     // Fill 8 registers
 758     if (UseSIMDForMemoryOps) {
 759       __ ldpq(v0, v1, Address(s, 4 * unit));
 760       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 761     } else {
 762       __ ldp(t0, t1, Address(s, 2 * unit));
 763       __ ldp(t2, t3, Address(s, 4 * unit));
 764       __ ldp(t4, t5, Address(s, 6 * unit));
 765       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 766     }
 767 
 768     __ subs(count, count, 16);
 769     __ br(Assembler::LO, drain);
 770 
 771     int prefetch = PrefetchCopyIntervalInBytes;
 772     bool use_stride = false;
 773     if (direction == copy_backwards) {
 774        use_stride = prefetch > 256;
 775        prefetch = -prefetch;
 776        if (use_stride) __ mov(stride, prefetch);
 777     }
 778 
 779     __ bind(again);
 780 
 781     if (PrefetchCopyIntervalInBytes > 0)
 782       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 783 
 784     if (UseSIMDForMemoryOps) {
 785       __ stpq(v0, v1, Address(d, 4 * unit));
 786       __ ldpq(v0, v1, Address(s, 4 * unit));
 787       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 788       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 789     } else {
 790       __ stp(t0, t1, Address(d, 2 * unit));
 791       __ ldp(t0, t1, Address(s, 2 * unit));
 792       __ stp(t2, t3, Address(d, 4 * unit));
 793       __ ldp(t2, t3, Address(s, 4 * unit));
 794       __ stp(t4, t5, Address(d, 6 * unit));
 795       __ ldp(t4, t5, Address(s, 6 * unit));
 796       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 797       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 798     }
 799 
 800     __ subs(count, count, 8);
 801     __ br(Assembler::HS, again);
 802 
 803     // Drain
 804     __ bind(drain);
 805     if (UseSIMDForMemoryOps) {
 806       __ stpq(v0, v1, Address(d, 4 * unit));
 807       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 808     } else {
 809       __ stp(t0, t1, Address(d, 2 * unit));
 810       __ stp(t2, t3, Address(d, 4 * unit));
 811       __ stp(t4, t5, Address(d, 6 * unit));
 812       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 813     }
 814 
 815     {
 816       Label L1, L2;
 817       __ tbz(count, exact_log2(4), L1);
 818       if (UseSIMDForMemoryOps) {
 819         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 820         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 821       } else {
 822         __ ldp(t0, t1, Address(s, 2 * unit));
 823         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 824         __ stp(t0, t1, Address(d, 2 * unit));
 825         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 826       }
 827       __ bind(L1);
 828 
 829       if (direction == copy_forwards) {
 830         __ add(s, s, bias);
 831         __ add(d, d, bias);
 832       }
 833 
 834       __ tbz(count, 1, L2);
 835       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 836       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 837       __ bind(L2);
 838     }
 839 
 840     __ ret(lr);
 841 
 842     if (AvoidUnalignedAccesses) {
 843       Label drain, again;
 844       // Register order for storing. Order is different for backward copy.
 845 
 846       __ bind(unaligned_copy_long);
 847 
 848       // source address is even aligned, target odd aligned
 849       //
 850       // when forward copying word pairs we read long pairs at offsets
 851       // {0, 2, 4, 6} (in long words). when backwards copying we read
 852       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 853       // address by -2 in the forwards case so we can compute the
 854       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 855       // or -1.
 856       //
 857       // when forward copying we need to store 1 word, 3 pairs and
 858       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 859       // zero offset we adjust the destination by -1 which means we
 860       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 861       //
 862       // When backwards copying we need to store 1 word, 3 pairs and
 863       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 864       // offsets {1, 3, 5, 7, 8} * unit.
 865 
 866       if (direction == copy_forwards) {
 867         __ sub(s, s, 16);
 868         __ sub(d, d, 8);
 869       }
 870 
 871       // Fill 8 registers
 872       //
 873       // for forwards copy s was offset by -16 from the original input
 874       // value of s so the register contents are at these offsets
 875       // relative to the 64 bit block addressed by that original input
 876       // and so on for each successive 64 byte block when s is updated
 877       //
 878       // t0 at offset 0,  t1 at offset 8
 879       // t2 at offset 16, t3 at offset 24
 880       // t4 at offset 32, t5 at offset 40
 881       // t6 at offset 48, t7 at offset 56
 882 
 883       // for backwards copy s was not offset so the register contents
 884       // are at these offsets into the preceding 64 byte block
 885       // relative to that original input and so on for each successive
 886       // preceding 64 byte block when s is updated. this explains the
 887       // slightly counter-intuitive looking pattern of register usage
 888       // in the stp instructions for backwards copy.
 889       //
 890       // t0 at offset -16, t1 at offset -8
 891       // t2 at offset -32, t3 at offset -24
 892       // t4 at offset -48, t5 at offset -40
 893       // t6 at offset -64, t7 at offset -56
 894 
 895       __ ldp(t0, t1, Address(s, 2 * unit));
 896       __ ldp(t2, t3, Address(s, 4 * unit));
 897       __ ldp(t4, t5, Address(s, 6 * unit));
 898       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 899 
 900       __ subs(count, count, 16);
 901       __ br(Assembler::LO, drain);
 902 
 903       int prefetch = PrefetchCopyIntervalInBytes;
 904       bool use_stride = false;
 905       if (direction == copy_backwards) {
 906          use_stride = prefetch > 256;
 907          prefetch = -prefetch;
 908          if (use_stride) __ mov(stride, prefetch);
 909       }
 910 
 911       __ bind(again);
 912 
 913       if (PrefetchCopyIntervalInBytes > 0)
 914         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 915 
 916       if (direction == copy_forwards) {
 917        // allowing for the offset of -8 the store instructions place
 918        // registers into the target 64 bit block at the following
 919        // offsets
 920        //
 921        // t0 at offset 0
 922        // t1 at offset 8,  t2 at offset 16
 923        // t3 at offset 24, t4 at offset 32
 924        // t5 at offset 40, t6 at offset 48
 925        // t7 at offset 56
 926 
 927         __ str(t0, Address(d, 1 * unit));
 928         __ stp(t1, t2, Address(d, 2 * unit));
 929         __ ldp(t0, t1, Address(s, 2 * unit));
 930         __ stp(t3, t4, Address(d, 4 * unit));
 931         __ ldp(t2, t3, Address(s, 4 * unit));
 932         __ stp(t5, t6, Address(d, 6 * unit));
 933         __ ldp(t4, t5, Address(s, 6 * unit));
 934         __ str(t7, Address(__ pre(d, 8 * unit)));
 935         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 936       } else {
 937        // d was not offset when we started so the registers are
 938        // written into the 64 bit block preceding d with the following
 939        // offsets
 940        //
 941        // t1 at offset -8
 942        // t3 at offset -24, t0 at offset -16
 943        // t5 at offset -40, t2 at offset -32
 944        // t7 at offset -56, t4 at offset -48
 945        //                   t6 at offset -64
 946        //
 947        // note that this matches the offsets previously noted for the
 948        // loads
 949 
 950         __ str(t1, Address(d, 1 * unit));
 951         __ stp(t3, t0, Address(d, 3 * unit));
 952         __ ldp(t0, t1, Address(s, 2 * unit));
 953         __ stp(t5, t2, Address(d, 5 * unit));
 954         __ ldp(t2, t3, Address(s, 4 * unit));
 955         __ stp(t7, t4, Address(d, 7 * unit));
 956         __ ldp(t4, t5, Address(s, 6 * unit));
 957         __ str(t6, Address(__ pre(d, 8 * unit)));
 958         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 959       }
 960 
 961       __ subs(count, count, 8);
 962       __ br(Assembler::HS, again);
 963 
 964       // Drain
 965       //
 966       // this uses the same pattern of offsets and register arguments
 967       // as above
 968       __ bind(drain);
 969       if (direction == copy_forwards) {
 970         __ str(t0, Address(d, 1 * unit));
 971         __ stp(t1, t2, Address(d, 2 * unit));
 972         __ stp(t3, t4, Address(d, 4 * unit));
 973         __ stp(t5, t6, Address(d, 6 * unit));
 974         __ str(t7, Address(__ pre(d, 8 * unit)));
 975       } else {
 976         __ str(t1, Address(d, 1 * unit));
 977         __ stp(t3, t0, Address(d, 3 * unit));
 978         __ stp(t5, t2, Address(d, 5 * unit));
 979         __ stp(t7, t4, Address(d, 7 * unit));
 980         __ str(t6, Address(__ pre(d, 8 * unit)));
 981       }
 982       // now we need to copy any remaining part block which may
 983       // include a 4 word subblock and/or a 2 word subblock.
 984       // bits 2 and 1 in the count are the tell-tale for whether we
 985       // have each such subblock
 986       {
 987         Label L1, L2;
 988         __ tbz(count, exact_log2(4), L1);
 989        // this is the same as above but copying only 4 longs hence
 990        // with only one intervening stp between the str instructions
 991        // but note that the offsets and registers still follow the
 992        // same pattern
 993         __ ldp(t0, t1, Address(s, 2 * unit));
 994         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 995         if (direction == copy_forwards) {
 996           __ str(t0, Address(d, 1 * unit));
 997           __ stp(t1, t2, Address(d, 2 * unit));
 998           __ str(t3, Address(__ pre(d, 4 * unit)));
 999         } else {
1000           __ str(t1, Address(d, 1 * unit));
1001           __ stp(t3, t0, Address(d, 3 * unit));
1002           __ str(t2, Address(__ pre(d, 4 * unit)));
1003         }
1004         __ bind(L1);
1005 
1006         __ tbz(count, 1, L2);
1007        // this is the same as above but copying only 2 longs hence
1008        // there is no intervening stp between the str instructions
1009        // but note that the offset and register patterns are still
1010        // the same
1011         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1012         if (direction == copy_forwards) {
1013           __ str(t0, Address(d, 1 * unit));
1014           __ str(t1, Address(__ pre(d, 2 * unit)));
1015         } else {
1016           __ str(t1, Address(d, 1 * unit));
1017           __ str(t0, Address(__ pre(d, 2 * unit)));
1018         }
1019         __ bind(L2);
1020 
1021        // for forwards copy we need to re-adjust the offsets we
1022        // applied so that s and d follow the last words written
1023 
1024        if (direction == copy_forwards) {
1025          __ add(s, s, 16);
1026          __ add(d, d, 8);
1027        }
1028 
1029       }
1030 
1031       __ ret(lr);
1032       }
1033   }
1034 
1035   // Small copy: less than 16 bytes.
1036   //
1037   // NB: Ignores all of the bits of count which represent more than 15
1038   // bytes, so a caller doesn't have to mask them.
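       //
       // The copy is driven by the low bits of count, largest chunk first.
       // For example, for a byte copy (granularity 1) a count of 13
       // (binary 1101) moves 8 + 4 + 1 bytes: bit 3 -> one 8-byte move,
       // bit 2 -> one 4-byte move, bit 1 clear, bit 0 -> one 1-byte move.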
1039 
1040   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1041     bool is_backwards = step < 0;
1042     size_t granularity = uabs(step);
1043     int direction = is_backwards ? -1 : 1;
1044     int unit = wordSize * direction;
1045 
1046     Label Lword, Lint, Lshort, Lbyte;
1047 
1048     assert(granularity
1049            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1050 
1051     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1052 
1053     // ??? I don't know if this bit-test-and-branch is the right thing
1054     // to do.  It does a lot of jumping, resulting in several
1055     // mispredicted branches.  It might make more sense to do this
1056     // with something like Duff's device with a single computed branch.
1057 
1058     __ tbz(count, 3 - exact_log2(granularity), Lword);
1059     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1060     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1061     __ bind(Lword);
1062 
1063     if (granularity <= sizeof (jint)) {
1064       __ tbz(count, 2 - exact_log2(granularity), Lint);
1065       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1066       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1067       __ bind(Lint);
1068     }
1069 
1070     if (granularity <= sizeof (jshort)) {
1071       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1072       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1073       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1074       __ bind(Lshort);
1075     }
1076 
1077     if (granularity <= sizeof (jbyte)) {
1078       __ tbz(count, 0, Lbyte);
1079       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1080       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1081       __ bind(Lbyte);
1082     }
1083   }
1084 
1085   Label copy_f, copy_b;
1086 
1087   // All-singing all-dancing memory copy.
1088   //
1089   // Copy count units of memory from s to d.  The size of a unit is
1090   // step, which can be positive or negative depending on the direction
1091   // of copy.  If is_aligned is false, we align the source address.
1092   //
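       // The overall strategy, roughly (a sketch, not the generated code):
       //
       //   if (count * granularity <= 80 /* 96 with SIMD */) {
       //     copy everything with a few possibly-overlapping load/store pairs;
       //   } else {
       //     align s on a 2-word boundary, copying the odd leading elements;
       //     bulk-copy 16-byte blocks via the copy_f / copy_b stubs;
       //     copy the remaining tail with copy_memory_small();
       //   }
       //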
1093 
1094   void copy_memory(bool is_aligned, Register s, Register d,
1095                    Register count, Register tmp, int step) {
1096     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1097     bool is_backwards = step < 0;
1098     int granularity = uabs(step);
1099     const Register t0 = r3, t1 = r4;
1100 
1101     // Copies of <= 96 bytes are done inline. Direction doesn't matter
1102     // because we always load all the data before writing anything.
1103     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1104     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1105     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1106     const Register send = r17, dend = r18;
1107 
1108     if (PrefetchCopyIntervalInBytes > 0)
1109       __ prfm(Address(s, 0), PLDL1KEEP);
1110     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1111     __ br(Assembler::HI, copy_big);
1112 
1113     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1114     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1115 
1116     __ cmp(count, u1(16/granularity));
1117     __ br(Assembler::LS, copy16);
1118 
1119     __ cmp(count, u1(64/granularity));
1120     __ br(Assembler::HI, copy80);
1121 
1122     __ cmp(count, u1(32/granularity));
1123     __ br(Assembler::LS, copy32);
1124 
1125     // 33..64 bytes
1126     if (UseSIMDForMemoryOps) {
1127       __ ldpq(v0, v1, Address(s, 0));
1128       __ ldpq(v2, v3, Address(send, -32));
1129       __ stpq(v0, v1, Address(d, 0));
1130       __ stpq(v2, v3, Address(dend, -32));
1131     } else {
1132       __ ldp(t0, t1, Address(s, 0));
1133       __ ldp(t2, t3, Address(s, 16));
1134       __ ldp(t4, t5, Address(send, -32));
1135       __ ldp(t6, t7, Address(send, -16));
1136 
1137       __ stp(t0, t1, Address(d, 0));
1138       __ stp(t2, t3, Address(d, 16));
1139       __ stp(t4, t5, Address(dend, -32));
1140       __ stp(t6, t7, Address(dend, -16));
1141     }
1142     __ b(finish);
1143 
1144     // 17..32 bytes
1145     __ bind(copy32);
1146     __ ldp(t0, t1, Address(s, 0));
1147     __ ldp(t2, t3, Address(send, -16));
1148     __ stp(t0, t1, Address(d, 0));
1149     __ stp(t2, t3, Address(dend, -16));
1150     __ b(finish);
1151 
1152     // 65..80/96 bytes
1153     // (96 bytes if SIMD because we do 32 bytes per instruction)
1154     __ bind(copy80);
1155     if (UseSIMDForMemoryOps) {
1156       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1157       __ ldpq(v4, v5, Address(send, -32));
1158       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1159       __ stpq(v4, v5, Address(dend, -32));
1160     } else {
1161       __ ldp(t0, t1, Address(s, 0));
1162       __ ldp(t2, t3, Address(s, 16));
1163       __ ldp(t4, t5, Address(s, 32));
1164       __ ldp(t6, t7, Address(s, 48));
1165       __ ldp(t8, t9, Address(send, -16));
1166 
1167       __ stp(t0, t1, Address(d, 0));
1168       __ stp(t2, t3, Address(d, 16));
1169       __ stp(t4, t5, Address(d, 32));
1170       __ stp(t6, t7, Address(d, 48));
1171       __ stp(t8, t9, Address(dend, -16));
1172     }
1173     __ b(finish);
1174 
1175     // 0..16 bytes
1176     __ bind(copy16);
1177     __ cmp(count, u1(8/granularity));
1178     __ br(Assembler::LO, copy8);
1179 
1180     // 8..16 bytes
1181     __ ldr(t0, Address(s, 0));
1182     __ ldr(t1, Address(send, -8));
1183     __ str(t0, Address(d, 0));
1184     __ str(t1, Address(dend, -8));
1185     __ b(finish);
1186 
1187     if (granularity < 8) {
1188       // 4..7 bytes
1189       __ bind(copy8);
1190       __ tbz(count, 2 - exact_log2(granularity), copy4);
1191       __ ldrw(t0, Address(s, 0));
1192       __ ldrw(t1, Address(send, -4));
1193       __ strw(t0, Address(d, 0));
1194       __ strw(t1, Address(dend, -4));
1195       __ b(finish);
1196       if (granularity < 4) {
1197         // 0..3 bytes
1198         __ bind(copy4);
1199         __ cbz(count, finish); // get rid of 0 case
1200         if (granularity == 2) {
1201           __ ldrh(t0, Address(s, 0));
1202           __ strh(t0, Address(d, 0));
1203         } else { // granularity == 1
1204           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1205           // the first and last byte.
1206           // Handle the 3 byte case by loading and storing base + count/2
1207           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1208           // This does mean in the 1 byte case we load/store the same
1209           // byte 3 times.
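               // For example, with count == 3 this copies (s+0)->(d+0),
               // (s+2)->(d+2) via send/dend, and (s+1)->(d+1) via the
               // count/2 trick; with count == 2 the count/2 copy simply
               // duplicates the last byte.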
1210           __ lsr(count, count, 1);
1211           __ ldrb(t0, Address(s, 0));
1212           __ ldrb(t1, Address(send, -1));
1213           __ ldrb(t2, Address(s, count));
1214           __ strb(t0, Address(d, 0));
1215           __ strb(t1, Address(dend, -1));
1216           __ strb(t2, Address(d, count));
1217         }
1218         __ b(finish);
1219       }
1220     }
1221 
1222     __ bind(copy_big);
1223     if (is_backwards) {
1224       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1225       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1226     }
1227 
1228     // Now that we've got the small case out of the way, we can align
1229     // the source address to a 2-word boundary.
1230 
1231     Label aligned;
1232 
1233     if (is_aligned) {
1234       // We may have to adjust by 1 word to get s 2-word-aligned.
1235       __ tbz(s, exact_log2(wordSize), aligned);
1236       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1237       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1238       __ sub(count, count, wordSize/granularity);
1239     } else {
1240       if (is_backwards) {
1241         __ andr(rscratch2, s, 2 * wordSize - 1);
1242       } else {
1243         __ neg(rscratch2, s);
1244         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1245       }
1246       // rscratch2 is the byte adjustment needed to align s.
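           // For example, in a forward byte copy with s % 16 == 5,
           // rscratch2 is (-s) & 15 == 11: copy_memory_small below moves
           // those 11 leading bytes, after which s is 16-byte aligned.
           // (In a backward copy rscratch2 is s & 15, the bytes to peel
           // off the top end, since s points past the end here.)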
1247       __ cbz(rscratch2, aligned);
1248       int shift = exact_log2(granularity);
1249       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1250       __ sub(count, count, rscratch2);
1251 
1252 #if 0
1253       // ?? This code is only correct for a disjoint copy.  It may or
1254       // may not make sense to use it in that case.
1255 
1256       // Copy the first pair; s and d may not be aligned.
1257       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1258       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1259 
1260       // Align s and d, adjust count
1261       if (is_backwards) {
1262         __ sub(s, s, rscratch2);
1263         __ sub(d, d, rscratch2);
1264       } else {
1265         __ add(s, s, rscratch2);
1266         __ add(d, d, rscratch2);
1267       }
1268 #else
1269       copy_memory_small(s, d, rscratch2, rscratch1, step);
1270 #endif
1271     }
1272 
1273     __ bind(aligned);
1274 
1275     // s is now 2-word-aligned.
1276 
1277     // We have a count of units and some trailing bytes.  Adjust the
1278     // count and do a bulk copy of words.
1279     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1280     if (direction == copy_forwards)
1281       __ bl(copy_f);
1282     else
1283       __ bl(copy_b);
1284 
1285     // And the tail.
1286     copy_memory_small(s, d, count, tmp, step);
1287 
1288     if (granularity >= 8) __ bind(copy8);
1289     if (granularity >= 4) __ bind(copy4);
1290     __ bind(finish);
1291   }
1292 
1293 
1294   void clobber_registers() {
1295 #ifdef ASSERT
1296     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1297     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1298     for (Register r = r3; r <= r18; r++)
1299       if (r != rscratch1) __ mov(r, rscratch1);
1300 #endif
1301   }
1302 
1303   // Scan over array at a for count oops, verifying each one.
1304   // Preserves a and count, clobbers rscratch1 and rscratch2.
1305   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1306     Label loop, end;
1307     __ mov(rscratch1, a);
1308     __ mov(rscratch2, zr);
1309     __ bind(loop);
1310     __ cmp(rscratch2, count);
1311     __ br(Assembler::HS, end);
1312     if (size == (size_t)wordSize) {
1313       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1314       __ verify_oop(temp);
1315     } else {
1316       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1317       __ decode_heap_oop(temp); // calls verify_oop
1318     }
1319     __ add(rscratch2, rscratch2, size);
1320     __ b(loop);
1321     __ bind(end);
1322   }
1323 
1324   // Arguments:
1325   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1326   //             ignored
1327   //   is_oop  - true => oop array, so generate store check code
1328   //   name    - stub name string
1329   //
1330   // Inputs:
1331   //   c_rarg0   - source array address
1332   //   c_rarg1   - destination array address
1333   //   c_rarg2   - element count, treated as ssize_t, can be zero
1334   //
1335   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1336   // the hardware handle it.  The two dwords within qwords that span
1337   // cache line boundaries will still be loaded and stored atomically.
1338   //
1339   // Side Effects:
1340   //   disjoint_int_copy_entry is set to the no-overlap entry point
1341   //   used by generate_conjoint_int_oop_copy().
1342   //
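       // From C++ the finished stub is reached through an entry point that
       // behaves roughly like (a sketch of the calling convention, not a
       // real declaration):
       //
       //   void disjoint_copy(const void* from, void* to, size_t count);  // count in elements
       //
       // and is installed in the corresponding StubRoutines arraycopy slot.
       //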
1343   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1344                                   const char *name, bool dest_uninitialized = false) {
1345     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1346     RegSet saved_reg = RegSet::of(s, d, count);
1347     __ align(CodeEntryAlignment);
1348     StubCodeMark mark(this, "StubRoutines", name);
1349     address start = __ pc();
1350     __ enter();
1351 
1352     if (entry != NULL) {
1353       *entry = __ pc();
1354       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1355       BLOCK_COMMENT("Entry:");
1356     }
1357 
1358     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1359     if (dest_uninitialized) {
1360       decorators |= IS_DEST_UNINITIALIZED;
1361     }
1362     if (aligned) {
1363       decorators |= ARRAYCOPY_ALIGNED;
1364     }
1365 
1366     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1367     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1368 
1369     if (is_oop) {
1370       // save regs before copy_memory
1371       __ push(RegSet::of(d, count), sp);
1372     }
1373     copy_memory(aligned, s, d, count, rscratch1, size);
1374 
1375     if (is_oop) {
1376       __ pop(RegSet::of(d, count), sp);
1377       if (VerifyOops)
1378         verify_oop_array(size, d, count, r16);
1379     }
1380 
1381     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1382 
1383     __ leave();
1384     __ mov(r0, zr); // return 0
1385     __ ret(lr);
1386 #ifdef BUILTIN_SIM
1387     {
1388       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1389       sim->notifyCompile(const_cast<char*>(name), start);
1390     }
1391 #endif
1392     return start;
1393   }
1394 
1395   // Arguments:
1396   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1397   //             ignored
1398   //   is_oop  - true => oop array, so generate store check code
1399   //   name    - stub name string
1400   //
1401   // Inputs:
1402   //   c_rarg0   - source array address
1403   //   c_rarg1   - destination array address
1404   //   c_rarg2   - element count, treated as ssize_t, can be zero
1405   //
1406   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1407   // the hardware handle it.  The two dwords within qwords that span
1408   // cache line boundaries will still be loaded and stored atomically.
1409   //
1410   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1411                                  address *entry, const char *name,
1412                                  bool dest_uninitialized = false) {
1413     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1414     RegSet saved_regs = RegSet::of(s, d, count);
1415     StubCodeMark mark(this, "StubRoutines", name);
1416     address start = __ pc();
1417     __ enter();
1418 
1419     if (entry != NULL) {
1420       *entry = __ pc();
1421       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1422       BLOCK_COMMENT("Entry:");
1423     }
1424 
1425     // use fwd copy when (d-s) above_equal (count*size)
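         // The comparison below is unsigned, so if d is below s the
         // subtraction wraps to a large value and we also take the forward
         // path, which is safe because the destination starts below the
         // source.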
1426     __ sub(rscratch1, d, s);
1427     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1428     __ br(Assembler::HS, nooverlap_target);
1429 
1430     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1431     if (dest_uninitialized) {
1432       decorators |= IS_DEST_UNINITIALIZED;
1433     }
1434     if (aligned) {
1435       decorators |= ARRAYCOPY_ALIGNED;
1436     }
1437 
1438     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1439     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1440 
1441     if (is_oop) {
1442       // save regs before copy_memory
1443       __ push(RegSet::of(d, count), sp);
1444     }
1445     copy_memory(aligned, s, d, count, rscratch1, -size);
1446     if (is_oop) {
1447       __ pop(RegSet::of(d, count), sp);
1448       if (VerifyOops)
1449         verify_oop_array(size, d, count, r16);
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
1462 }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
1486   // Side Effects:
1487   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1488   //   used by generate_conjoint_byte_copy().
1489   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1560   }
1561 
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
1574   // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_oop_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
1598   // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1620   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1657   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1, rscratch2.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
1728     const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
1736     const Register count_save  = r21;       // orig elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
1773      // Empty array:  Nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= IS_DEST_UNINITIALIZED;
1796     }
1797 
1798     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1826     __ cbz(copied_oop, L_store_element);
1827 
1828     __ load_klass(r19_klass, copied_oop);// query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_save = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
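    // eon with zr is a bitwise NOT, so count now holds ~K == -1-K.  The EQ
    // branch below still tests the flags set by the subs above (eon does not
    // set flags): K == 0 means no element was copied, so the barrier epilogue
    // can be skipped.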
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1843 
1844     __ bind(L_done_pop);
1845     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1846     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1847 
1848     __ bind(L_done);
1849     __ mov(r0, count);
1850     __ leave();
1851     __ ret(lr);
1852 
1853     return start;
1854   }
1855 
1856   // Perform range checks on the proposed arraycopy.
1857   // Kills temp, but nothing else.
1858   // Also, clean the sign bits of src_pos and dst_pos.
1859   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1860                               Register src_pos, // source position (c_rarg1)
1861                               Register dst,     // destination array oop (c_rarg2)
1862                               Register dst_pos, // destination position (c_rarg3)
1863                               Register length,
1864                               Register temp,
1865                               Label& L_failed) {
1866     BLOCK_COMMENT("arraycopy_range_checks:");
1867 
1868     assert_different_registers(rscratch1, temp);
1869 
1870     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1871     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1872     __ addw(temp, length, src_pos);
1873     __ cmpw(temp, rscratch1);
1874     __ br(Assembler::HI, L_failed);
1875 
1876     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1877     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1878     __ addw(temp, length, dst_pos);
1879     __ cmpw(temp, rscratch1);
1880     __ br(Assembler::HI, L_failed);
1881 
1882     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1883     __ movw(src_pos, src_pos);
1884     __ movw(dst_pos, dst_pos);
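    // (a 32-bit mov of a register to itself zero-extends, clearing bits 63:32)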
1885 
1886     BLOCK_COMMENT("arraycopy_range_checks done");
1887   }
1888 
1889   // These stubs get called from some dumb test routine.
1890   // I'll write them properly when they're called from
1891   // something that's actually doing something.
1892   static void fake_arraycopy_stub(address src, address dst, int count) {
1893     assert(count == 0, "huh?");
1894   }
1895 
1896 
1897   //
1898   //  Generate 'unsafe' array copy stub
1899   //  Though just as safe as the other stubs, it takes an unscaled
1900   //  size_t argument instead of an element count.
1901   //
1902   //  Input:
1903   //    c_rarg0   - source array address
1904   //    c_rarg1   - destination array address
1905   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1906   //
1907   // Examines the alignment of the operands and dispatches
1908   // to a long, int, short, or byte copy loop.
1909   //
1910   address generate_unsafe_copy(const char *name,
1911                                address byte_copy_entry,
1912                                address short_copy_entry,
1913                                address int_copy_entry,
1914                                address long_copy_entry) {
1915     Label L_long_aligned, L_int_aligned, L_short_aligned;
1916     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1917 
1918     __ align(CodeEntryAlignment);
1919     StubCodeMark mark(this, "StubRoutines", name);
1920     address start = __ pc();
1921     __ enter(); // required for proper stackwalking of RuntimeStub frame
1922 
1923     // bump this on entry, not on exit:
1924     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1925 
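    // The OR of the source address, destination address and byte count has a
    // low bit set wherever any of the three is misaligned, so masking the low
    // bits of the result picks the widest element size all three can support.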
1926     __ orr(rscratch1, s, d);
1927     __ orr(rscratch1, rscratch1, count);
1928 
1929     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1930     __ cbz(rscratch1, L_long_aligned);
1931     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1932     __ cbz(rscratch1, L_int_aligned);
1933     __ tbz(rscratch1, 0, L_short_aligned);
1934     __ b(RuntimeAddress(byte_copy_entry));
1935 
1936     __ BIND(L_short_aligned);
1937     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1938     __ b(RuntimeAddress(short_copy_entry));
1939     __ BIND(L_int_aligned);
1940     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1941     __ b(RuntimeAddress(int_copy_entry));
1942     __ BIND(L_long_aligned);
1943     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1944     __ b(RuntimeAddress(long_copy_entry));
1945 
1946     return start;
1947   }
1948 
1949   //
1950   //  Generate generic array copy stubs
1951   //
1952   //  Input:
1953   //    c_rarg0    -  src oop
1954   //    c_rarg1    -  src_pos (32-bits)
1955   //    c_rarg2    -  dst oop
1956   //    c_rarg3    -  dst_pos (32-bits)
1957   //    c_rarg4    -  element count (32-bits)
1958   //
1959   //  Output:
1960   //    r0 ==  0  -  success
1961   //    r0 == -1^K - failure, where K is partial transfer count
1962   //
1963   address generate_generic_copy(const char *name,
1964                                 address byte_copy_entry, address short_copy_entry,
1965                                 address int_copy_entry, address oop_copy_entry,
1966                                 address long_copy_entry, address checkcast_copy_entry) {
1967 
1968     Label L_failed, L_objArray;
1969     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1970 
1971     // Input registers
1972     const Register src        = c_rarg0;  // source array oop
1973     const Register src_pos    = c_rarg1;  // source position
1974     const Register dst        = c_rarg2;  // destination array oop
1975     const Register dst_pos    = c_rarg3;  // destination position
1976     const Register length     = c_rarg4;
1977 
1978 
1979     // Registers used as temps
1980     const Register dst_klass  = c_rarg5;
1981 
1982     __ align(CodeEntryAlignment);
1983 
1984     StubCodeMark mark(this, "StubRoutines", name);
1985 
1986     address start = __ pc();
1987 
1988     __ enter(); // required for proper stackwalking of RuntimeStub frame
1989 
1990     // bump this on entry, not on exit:
1991     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1992 
1993     //-----------------------------------------------------------------------
1994     // Assembler stub will be used for this call to arraycopy
1995     // if the following conditions are met:
1996     //
1997     // (1) src and dst must not be null.
1998     // (2) src_pos must not be negative.
1999     // (3) dst_pos must not be negative.
2000     // (4) length  must not be negative.
2001     // (5) src klass and dst klass should be the same and not NULL.
2002     // (6) src and dst should be arrays.
2003     // (7) src_pos + length must not exceed length of src.
2004     // (8) dst_pos + length must not exceed length of dst.
2005     //
2006 
2007     //  if (src == NULL) return -1;
2008     __ cbz(src, L_failed);
2009 
2010     //  if (src_pos < 0) return -1;
2011     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2012 
2013     //  if (dst == NULL) return -1;
2014     __ cbz(dst, L_failed);
2015 
2016     //  if (dst_pos < 0) return -1;
2017     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2018 
2019     // registers used as temp
2020     const Register scratch_length    = r16; // elements count to copy
2021     const Register scratch_src_klass = r17; // array klass
2022     const Register lh                = r18; // layout helper
2023 
2024     //  if (length < 0) return -1;
2025     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2026     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2027 
2028     __ load_klass(scratch_src_klass, src);
2029 #ifdef ASSERT
2030     //  assert(src->klass() != NULL);
2031     {
2032       BLOCK_COMMENT("assert klasses not null {");
2033       Label L1, L2;
2034       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2035       __ bind(L1);
2036       __ stop("broken null klass");
2037       __ bind(L2);
2038       __ load_klass(rscratch1, dst);
2039       __ cbz(rscratch1, L1);     // this would be broken also
2040       BLOCK_COMMENT("} assert klasses not null done");
2041     }
2042 #endif
2043 
2044     // Load layout helper (32-bits)
2045     //
2046     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2047     // 32        30    24            16              8     2                 0
2048     //
2049     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2050     //
2051 
2052     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2053 
2054     // Handle objArrays completely differently...
2055     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2056     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2057     __ movw(rscratch1, objArray_lh);
2058     __ eorw(rscratch2, lh, rscratch1);
2059     __ cbzw(rscratch2, L_objArray);
2060 
2061     //  if (src->klass() != dst->klass()) return -1;
2062     __ load_klass(rscratch2, dst);
2063     __ eor(rscratch2, rscratch2, scratch_src_klass);
2064     __ cbnz(rscratch2, L_failed);
2065 
2066     //  if (!src->is_Array()) return -1;
2067     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
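    // (array layout helpers carry the array_tag in their top bits and are
    // therefore negative, so a clear sign bit means src is not an array)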
2068 
2069     // At this point, it is known to be a typeArray (array_tag 0x3).
2070 #ifdef ASSERT
2071     {
2072       BLOCK_COMMENT("assert primitive array {");
2073       Label L;
2074       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2075       __ cmpw(lh, rscratch2);
2076       __ br(Assembler::GE, L);
2077       __ stop("must be a primitive array");
2078       __ bind(L);
2079       BLOCK_COMMENT("} assert primitive array done");
2080     }
2081 #endif
2082 
2083     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2084                            rscratch2, L_failed);
2085 
2086     // TypeArrayKlass
2087     //
2088     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2089     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2090     //
2091 
2092     const Register rscratch1_offset = rscratch1;    // array offset
2093     const Register r18_elsize = lh; // element size
2094 
2095     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2096            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2097     __ add(src, src, rscratch1_offset);           // src array offset
2098     __ add(dst, dst, rscratch1_offset);           // dst array offset
2099     BLOCK_COMMENT("choose copy loop based on element size");
2100 
2101     // next registers should be set before the jump to corresponding stub
2102     const Register from     = c_rarg0;  // source array address
2103     const Register to       = c_rarg1;  // destination array address
2104     const Register count    = c_rarg2;  // elements count
2105 
2106     // 'from', 'to', 'count' registers should be set in such order
2107     // since they are the same as 'src', 'src_pos', 'dst'.
2108 
2109     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2110 
2111     // The possible values of elsize are 0-3, i.e. exact_log2(element
2112     // size in bytes).  We do a simple bitwise binary search.
2113   __ BIND(L_copy_bytes);
2114     __ tbnz(r18_elsize, 1, L_copy_ints);
2115     __ tbnz(r18_elsize, 0, L_copy_shorts);
2116     __ lea(from, Address(src, src_pos));// src_addr
2117     __ lea(to,   Address(dst, dst_pos));// dst_addr
2118     __ movw(count, scratch_length); // length
2119     __ b(RuntimeAddress(byte_copy_entry));
2120 
2121   __ BIND(L_copy_shorts);
2122     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2123     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2124     __ movw(count, scratch_length); // length
2125     __ b(RuntimeAddress(short_copy_entry));
2126 
2127   __ BIND(L_copy_ints);
2128     __ tbnz(r18_elsize, 0, L_copy_longs);
2129     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2130     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2131     __ movw(count, scratch_length); // length
2132     __ b(RuntimeAddress(int_copy_entry));
2133 
2134   __ BIND(L_copy_longs);
2135 #ifdef ASSERT
2136     {
2137       BLOCK_COMMENT("assert long copy {");
2138       Label L;
2139       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2140       __ cmpw(r18_elsize, LogBytesPerLong);
2141       __ br(Assembler::EQ, L);
2142       __ stop("must be long copy, but elsize is wrong");
2143       __ bind(L);
2144       BLOCK_COMMENT("} assert long copy done");
2145     }
2146 #endif
2147     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2148     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2149     __ movw(count, scratch_length); // length
2150     __ b(RuntimeAddress(long_copy_entry));
2151 
2152     // ObjArrayKlass
2153   __ BIND(L_objArray);
2154     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2155 
2156     Label L_plain_copy, L_checkcast_copy;
2157     //  test array classes for subtyping
2158     __ load_klass(r18, dst);
2159     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2160     __ br(Assembler::NE, L_checkcast_copy);
2161 
2162     // Identically typed arrays can be copied without element-wise checks.
2163     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2164                            rscratch2, L_failed);
2165 
2166     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2167     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2168     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2169     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2170     __ movw(count, scratch_length); // length
2171   __ BIND(L_plain_copy);
2172     __ b(RuntimeAddress(oop_copy_entry));
2173 
2174   __ BIND(L_checkcast_copy);
2175     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2176     {
2177       // Before looking at dst.length, make sure dst is also an objArray.
2178       __ ldrw(rscratch1, Address(r18, lh_offset));
2179       __ movw(rscratch2, objArray_lh);
2180       __ eorw(rscratch1, rscratch1, rscratch2);
2181       __ cbnzw(rscratch1, L_failed);
2182 
2183       // It is safe to examine both src.length and dst.length.
2184       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2185                              r18, L_failed);
2186 
2187       __ load_klass(dst_klass, dst); // reload
2188 
2189       // Marshal the base address arguments now, freeing registers.
2190       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2191       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2192       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2193       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2194       __ movw(count, length);           // length (reloaded)
2195       Register sco_temp = c_rarg3;      // this register is free now
2196       assert_different_registers(from, to, count, sco_temp,
2197                                  dst_klass, scratch_src_klass);
2198       // assert_clean_int(count, sco_temp);
2199 
2200       // Generate the type check.
2201       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2202       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2203 
2204       // Smashes rscratch1, rscratch2
2205       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2206 
2207       // Fetch destination element klass from the ObjArrayKlass header.
2208       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2209       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2210       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2211 
2212       // the checkcast_copy loop needs two extra arguments:
2213       assert(c_rarg3 == sco_temp, "#3 already in place");
2214       // Set up arguments for checkcast_copy_entry.
2215       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2216       __ b(RuntimeAddress(checkcast_copy_entry));
2217     }
2218 
2219   __ BIND(L_failed);
2220     __ mov(r0, -1);
2221     __ leave();   // required for proper stackwalking of RuntimeStub frame
2222     __ ret(lr);
2223 
2224     return start;
2225   }
2226 
2227   //
2228   // Generate stub for array fill. If "aligned" is true, the
2229   // "to" address is assumed to be heapword aligned.
2230   //
2231   // Arguments for generated stub:
2232   //   to:    c_rarg0
2233   //   value: c_rarg1
2234   //   count: c_rarg2 treated as signed
2235   //
2236   address generate_fill(BasicType t, bool aligned, const char *name) {
2237     __ align(CodeEntryAlignment);
2238     StubCodeMark mark(this, "StubRoutines", name);
2239     address start = __ pc();
2240 
2241     BLOCK_COMMENT("Entry:");
2242 
2243     const Register to        = c_rarg0;  // destination array address
2244     const Register value     = c_rarg1;  // value
2245     const Register count     = c_rarg2;  // elements count
2246 
2247     const Register bz_base = r10;        // base for block_zero routine
2248     const Register cnt_words = r11;      // temp register
2249 
2250     __ enter();
2251 
2252     Label L_fill_elements, L_exit1;
2253 
2254     int shift = -1;
2255     switch (t) {
2256       case T_BYTE:
2257         shift = 0;
2258         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2259         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2260         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2261         __ br(Assembler::LO, L_fill_elements);
2262         break;
2263       case T_SHORT:
2264         shift = 1;
2265         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2266         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2267         __ br(Assembler::LO, L_fill_elements);
2268         break;
2269       case T_INT:
2270         shift = 2;
2271         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2272         __ br(Assembler::LO, L_fill_elements);
2273         break;
2274       default: ShouldNotReachHere();
2275     }
2276 
2277     // Align the destination address to an 8-byte boundary.
2278     Label L_skip_align1, L_skip_align2, L_skip_align4;
2279     if (!aligned) {
2280       switch (t) {
2281         case T_BYTE:
2282           // A one-byte misalignment happens only for byte arrays.
2283           __ tbz(to, 0, L_skip_align1);
2284           __ strb(value, Address(__ post(to, 1)));
2285           __ subw(count, count, 1);
2286           __ bind(L_skip_align1);
2287           // Fallthrough
2288         case T_SHORT:
2289           // A two-byte misalignment happens only for byte and short (char) arrays.
2290           __ tbz(to, 1, L_skip_align2);
2291           __ strh(value, Address(__ post(to, 2)));
2292           __ subw(count, count, 2 >> shift);
2293           __ bind(L_skip_align2);
2294           // Fallthrough
2295         case T_INT:
2296           // Align to 8 bytes, we know we are 4 byte aligned to start.
2297           __ tbz(to, 2, L_skip_align4);
2298           __ strw(value, Address(__ post(to, 4)));
2299           __ subw(count, count, 4 >> shift);
2300           __ bind(L_skip_align4);
2301           break;
2302         default: ShouldNotReachHere();
2303       }
2304     }
2305 
2306     //
2307     //  Fill large chunks
2308     //
2309     __ lsrw(cnt_words, count, 3 - shift); // number of words
2310     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2311     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
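    // cnt_words now holds the number of whole 8-byte words to fill and count
    // holds the leftover elements (fewer than 8 bytes' worth).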
2312     if (UseBlockZeroing) {
2313       Label non_block_zeroing, rest;
2314       // If the fill value is zero we can use the fast zero_words().
2315       __ cbnz(value, non_block_zeroing);
2316       __ mov(bz_base, to);
2317       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2318       __ zero_words(bz_base, cnt_words);
2319       __ b(rest);
2320       __ bind(non_block_zeroing);
2321       __ fill_words(to, cnt_words, value);
2322       __ bind(rest);
2323     } else {
2324       __ fill_words(to, cnt_words, value);
2325     }
2326 
2327     // Remaining count is less than 8 bytes. Fill it by a single store.
2328     // Note that the total length is no less than 8 bytes.
2329     if (t == T_BYTE || t == T_SHORT) {
2330       Label L_exit1;
2331       __ cbzw(count, L_exit1);
2332       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2333       __ str(value, Address(to, -8));    // overwrite some elements
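      // The total fill length is at least 8 bytes here, so one unaligned
      // 8-byte store ending at the last element covers the remaining tail,
      // harmlessly rewriting a few bytes that were already filled.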
2334       __ bind(L_exit1);
2335       __ leave();
2336       __ ret(lr);
2337     }
2338 
2339     // Handle copies less than 8 bytes.
2340     Label L_fill_2, L_fill_4, L_exit2;
2341     __ bind(L_fill_elements);
2342     switch (t) {
2343       case T_BYTE:
2344         __ tbz(count, 0, L_fill_2);
2345         __ strb(value, Address(__ post(to, 1)));
2346         __ bind(L_fill_2);
2347         __ tbz(count, 1, L_fill_4);
2348         __ strh(value, Address(__ post(to, 2)));
2349         __ bind(L_fill_4);
2350         __ tbz(count, 2, L_exit2);
2351         __ strw(value, Address(to));
2352         break;
2353       case T_SHORT:
2354         __ tbz(count, 0, L_fill_4);
2355         __ strh(value, Address(__ post(to, 2)));
2356         __ bind(L_fill_4);
2357         __ tbz(count, 1, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       case T_INT:
2361         __ cbzw(count, L_exit2);
2362         __ strw(value, Address(to));
2363         break;
2364       default: ShouldNotReachHere();
2365     }
2366     __ bind(L_exit2);
2367     __ leave();
2368     __ ret(lr);
2369     return start;
2370   }
2371 
2372   void generate_arraycopy_stubs() {
2373     address entry;
2374     address entry_jbyte_arraycopy;
2375     address entry_jshort_arraycopy;
2376     address entry_jint_arraycopy;
2377     address entry_oop_arraycopy;
2378     address entry_jlong_arraycopy;
2379     address entry_checkcast_arraycopy;
2380 
2381     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2382     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2383 
2384     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2385 
2386     //*** jbyte
2387     // Always need aligned and unaligned versions
2388     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2389                                                                                   "jbyte_disjoint_arraycopy");
2390     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2391                                                                                   &entry_jbyte_arraycopy,
2392                                                                                   "jbyte_arraycopy");
2393     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2394                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2395     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2396                                                                                   "arrayof_jbyte_arraycopy");
2397 
2398     //*** jshort
2399     // Always need aligned and unaligned versions
2400     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2401                                                                                     "jshort_disjoint_arraycopy");
2402     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2403                                                                                     &entry_jshort_arraycopy,
2404                                                                                     "jshort_arraycopy");
2405     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2406                                                                                     "arrayof_jshort_disjoint_arraycopy");
2407     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2408                                                                                     "arrayof_jshort_arraycopy");
2409 
2410     //*** jint
2411     // Aligned versions
2412     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2413                                                                                 "arrayof_jint_disjoint_arraycopy");
2414     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2415                                                                                 "arrayof_jint_arraycopy");
2416     // In 64-bit mode we need both aligned and unaligned versions of jint arraycopy;
2417     // entry_jint_arraycopy always points to the unaligned version.
2418     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2419                                                                                 "jint_disjoint_arraycopy");
2420     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2421                                                                                 &entry_jint_arraycopy,
2422                                                                                 "jint_arraycopy");
2423 
2424     //*** jlong
2425     // It is always aligned
2426     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2427                                                                                   "arrayof_jlong_disjoint_arraycopy");
2428     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2429                                                                                   "arrayof_jlong_arraycopy");
2430     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2431     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2432 
2433     //*** oops
2434     {
2435       // With compressed oops we need unaligned versions; notice that
2436       // we overwrite entry_oop_arraycopy.
2437       bool aligned = !UseCompressedOops;
2438 
2439       StubRoutines::_arrayof_oop_disjoint_arraycopy
2440         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2441                                      /*dest_uninitialized*/false);
2442       StubRoutines::_arrayof_oop_arraycopy
2443         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2444                                      /*dest_uninitialized*/false);
2445       // Aligned versions without pre-barriers
2446       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2447         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2448                                      /*dest_uninitialized*/true);
2449       StubRoutines::_arrayof_oop_arraycopy_uninit
2450         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2451                                      /*dest_uninitialized*/true);
2452     }
2453 
2454     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2455     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2456     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2457     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2458 
2459     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2460     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2461                                                                         /*dest_uninitialized*/true);
2462 
2463     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2464                                                               entry_jbyte_arraycopy,
2465                                                               entry_jshort_arraycopy,
2466                                                               entry_jint_arraycopy,
2467                                                               entry_jlong_arraycopy);
2468 
2469     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2470                                                                entry_jbyte_arraycopy,
2471                                                                entry_jshort_arraycopy,
2472                                                                entry_jint_arraycopy,
2473                                                                entry_oop_arraycopy,
2474                                                                entry_jlong_arraycopy,
2475                                                                entry_checkcast_arraycopy);
2476 
2477     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2478     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2479     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2480     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2481     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2482     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2483   }
2484 
2485   void generate_math_stubs() { Unimplemented(); }
2486 
2487   // Arguments:
2488   //
2489   // Inputs:
2490   //   c_rarg0   - source byte array address
2491   //   c_rarg1   - destination byte array address
2492   //   c_rarg2   - K (key) in little endian int array
2493   //
2494   address generate_aescrypt_encryptBlock() {
2495     __ align(CodeEntryAlignment);
2496     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2497 
2498     Label L_doLast;
2499 
2500     const Register from        = c_rarg0;  // source array address
2501     const Register to          = c_rarg1;  // destination array address
2502     const Register key         = c_rarg2;  // key array address
2503     const Register keylen      = rscratch1;
2504 
2505     address start = __ pc();
2506     __ enter();
2507 
2508     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
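    // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
    // 128-, 192- and 256-bit AES keys respectively.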
2509 
2510     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2511 
2512     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2513     __ rev32(v1, __ T16B, v1);
2514     __ rev32(v2, __ T16B, v2);
2515     __ rev32(v3, __ T16B, v3);
2516     __ rev32(v4, __ T16B, v4);
2517     __ aese(v0, v1);
2518     __ aesmc(v0, v0);
2519     __ aese(v0, v2);
2520     __ aesmc(v0, v0);
2521     __ aese(v0, v3);
2522     __ aesmc(v0, v0);
2523     __ aese(v0, v4);
2524     __ aesmc(v0, v0);
2525 
2526     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2527     __ rev32(v1, __ T16B, v1);
2528     __ rev32(v2, __ T16B, v2);
2529     __ rev32(v3, __ T16B, v3);
2530     __ rev32(v4, __ T16B, v4);
2531     __ aese(v0, v1);
2532     __ aesmc(v0, v0);
2533     __ aese(v0, v2);
2534     __ aesmc(v0, v0);
2535     __ aese(v0, v3);
2536     __ aesmc(v0, v0);
2537     __ aese(v0, v4);
2538     __ aesmc(v0, v0);
2539 
2540     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2541     __ rev32(v1, __ T16B, v1);
2542     __ rev32(v2, __ T16B, v2);
2543 
2544     __ cmpw(keylen, 44);
2545     __ br(Assembler::EQ, L_doLast);
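    // 44 words => AES-128: the two round keys just loaded feed the last two
    // rounds, so skip the extra rounds needed for longer keys.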
2546 
2547     __ aese(v0, v1);
2548     __ aesmc(v0, v0);
2549     __ aese(v0, v2);
2550     __ aesmc(v0, v0);
2551 
2552     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2553     __ rev32(v1, __ T16B, v1);
2554     __ rev32(v2, __ T16B, v2);
2555 
2556     __ cmpw(keylen, 52);
2557     __ br(Assembler::EQ, L_doLast);
2558 
2559     __ aese(v0, v1);
2560     __ aesmc(v0, v0);
2561     __ aese(v0, v2);
2562     __ aesmc(v0, v0);
2563 
2564     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2565     __ rev32(v1, __ T16B, v1);
2566     __ rev32(v2, __ T16B, v2);
2567 
2568     __ BIND(L_doLast);
2569 
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573 
2574     __ ld1(v1, __ T16B, key);
2575     __ rev32(v1, __ T16B, v1);
2576     __ eor(v0, __ T16B, v0, v1);
2577 
2578     __ st1(v0, __ T16B, to);
2579 
2580     __ mov(r0, 0);
2581 
2582     __ leave();
2583     __ ret(lr);
2584 
2585     return start;
2586   }
2587 
2588   // Arguments:
2589   //
2590   // Inputs:
2591   //   c_rarg0   - source byte array address
2592   //   c_rarg1   - destination byte array address
2593   //   c_rarg2   - K (key) in little endian int array
2594   //
2595   address generate_aescrypt_decryptBlock() {
2596     assert(UseAES, "need AES cryptographic extension support");
2597     __ align(CodeEntryAlignment);
2598     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2599     Label L_doLast;
2600 
2601     const Register from        = c_rarg0;  // source array address
2602     const Register to          = c_rarg1;  // destination array address
2603     const Register key         = c_rarg2;  // key array address
2604     const Register keylen      = rscratch1;
2605 
2606     address start = __ pc();
2607     __ enter(); // required for proper stackwalking of RuntimeStub frame
2608 
2609     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2610 
2611     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2612 
2613     __ ld1(v5, __ T16B, __ post(key, 16));
2614     __ rev32(v5, __ T16B, v5);
2615 
2616     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2617     __ rev32(v1, __ T16B, v1);
2618     __ rev32(v2, __ T16B, v2);
2619     __ rev32(v3, __ T16B, v3);
2620     __ rev32(v4, __ T16B, v4);
2621     __ aesd(v0, v1);
2622     __ aesimc(v0, v0);
2623     __ aesd(v0, v2);
2624     __ aesimc(v0, v0);
2625     __ aesd(v0, v3);
2626     __ aesimc(v0, v0);
2627     __ aesd(v0, v4);
2628     __ aesimc(v0, v0);
2629 
2630     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2631     __ rev32(v1, __ T16B, v1);
2632     __ rev32(v2, __ T16B, v2);
2633     __ rev32(v3, __ T16B, v3);
2634     __ rev32(v4, __ T16B, v4);
2635     __ aesd(v0, v1);
2636     __ aesimc(v0, v0);
2637     __ aesd(v0, v2);
2638     __ aesimc(v0, v0);
2639     __ aesd(v0, v3);
2640     __ aesimc(v0, v0);
2641     __ aesd(v0, v4);
2642     __ aesimc(v0, v0);
2643 
2644     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2645     __ rev32(v1, __ T16B, v1);
2646     __ rev32(v2, __ T16B, v2);
2647 
2648     __ cmpw(keylen, 44);
2649     __ br(Assembler::EQ, L_doLast);
2650 
2651     __ aesd(v0, v1);
2652     __ aesimc(v0, v0);
2653     __ aesd(v0, v2);
2654     __ aesimc(v0, v0);
2655 
2656     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2657     __ rev32(v1, __ T16B, v1);
2658     __ rev32(v2, __ T16B, v2);
2659 
2660     __ cmpw(keylen, 52);
2661     __ br(Assembler::EQ, L_doLast);
2662 
2663     __ aesd(v0, v1);
2664     __ aesimc(v0, v0);
2665     __ aesd(v0, v2);
2666     __ aesimc(v0, v0);
2667 
2668     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671 
2672     __ BIND(L_doLast);
2673 
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677 
2678     __ eor(v0, __ T16B, v0, v5);
2679 
2680     __ st1(v0, __ T16B, to);
2681 
2682     __ mov(r0, 0);
2683 
2684     __ leave();
2685     __ ret(lr);
2686 
2687     return start;
2688   }
2689 
2690   // Arguments:
2691   //
2692   // Inputs:
2693   //   c_rarg0   - source byte array address
2694   //   c_rarg1   - destination byte array address
2695   //   c_rarg2   - K (key) in little endian int array
2696   //   c_rarg3   - r vector byte array address
2697   //   c_rarg4   - input length
2698   //
2699   // Output:
2700   //   r0        - input length
2701   //
2702   address generate_cipherBlockChaining_encryptAESCrypt() {
2703     assert(UseAES, "need AES cryptographic extension support");
2704     __ align(CodeEntryAlignment);
2705     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2706 
2707     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2708 
2709     const Register from        = c_rarg0;  // source array address
2710     const Register to          = c_rarg1;  // destination array address
2711     const Register key         = c_rarg2;  // key array address
2712     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2713                                            // and left with the results of the last encryption block
2714     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2715     const Register keylen      = rscratch1;
2716 
2717     address start = __ pc();
2718 
2719       __ enter();
2720 
2721       __ movw(rscratch2, len_reg);
2722 
2723       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2724 
2725       __ ld1(v0, __ T16B, rvec);
2726 
2727       __ cmpw(keylen, 52);
2728       __ br(Assembler::CC, L_loadkeys_44);
2729       __ br(Assembler::EQ, L_loadkeys_52);
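      // The flags set by the cmpw above stay live all the way into L_aes_loop
      // (nothing below modifies them), so each iteration can re-dispatch on
      // the key length without comparing again: CC => 44-word (AES-128) key,
      // EQ => 52-word (AES-192), otherwise AES-256.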
2730 
2731       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2732       __ rev32(v17, __ T16B, v17);
2733       __ rev32(v18, __ T16B, v18);
2734     __ BIND(L_loadkeys_52);
2735       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2736       __ rev32(v19, __ T16B, v19);
2737       __ rev32(v20, __ T16B, v20);
2738     __ BIND(L_loadkeys_44);
2739       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2740       __ rev32(v21, __ T16B, v21);
2741       __ rev32(v22, __ T16B, v22);
2742       __ rev32(v23, __ T16B, v23);
2743       __ rev32(v24, __ T16B, v24);
2744       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2745       __ rev32(v25, __ T16B, v25);
2746       __ rev32(v26, __ T16B, v26);
2747       __ rev32(v27, __ T16B, v27);
2748       __ rev32(v28, __ T16B, v28);
2749       __ ld1(v29, v30, v31, __ T16B, key);
2750       __ rev32(v29, __ T16B, v29);
2751       __ rev32(v30, __ T16B, v30);
2752       __ rev32(v31, __ T16B, v31);
2753 
2754     __ BIND(L_aes_loop);
2755       __ ld1(v1, __ T16B, __ post(from, 16));
2756       __ eor(v0, __ T16B, v0, v1);
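      // CBC chaining: XOR the plaintext block into v0, which holds the IV on
      // the first iteration and the previous ciphertext block afterwards.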
2757 
2758       __ br(Assembler::CC, L_rounds_44);
2759       __ br(Assembler::EQ, L_rounds_52);
2760 
2761       __ aese(v0, v17); __ aesmc(v0, v0);
2762       __ aese(v0, v18); __ aesmc(v0, v0);
2763     __ BIND(L_rounds_52);
2764       __ aese(v0, v19); __ aesmc(v0, v0);
2765       __ aese(v0, v20); __ aesmc(v0, v0);
2766     __ BIND(L_rounds_44);
2767       __ aese(v0, v21); __ aesmc(v0, v0);
2768       __ aese(v0, v22); __ aesmc(v0, v0);
2769       __ aese(v0, v23); __ aesmc(v0, v0);
2770       __ aese(v0, v24); __ aesmc(v0, v0);
2771       __ aese(v0, v25); __ aesmc(v0, v0);
2772       __ aese(v0, v26); __ aesmc(v0, v0);
2773       __ aese(v0, v27); __ aesmc(v0, v0);
2774       __ aese(v0, v28); __ aesmc(v0, v0);
2775       __ aese(v0, v29); __ aesmc(v0, v0);
2776       __ aese(v0, v30);
2777       __ eor(v0, __ T16B, v0, v31);
2778 
2779       __ st1(v0, __ T16B, __ post(to, 16));
2780 
2781       __ subw(len_reg, len_reg, 16);
2782       __ cbnzw(len_reg, L_aes_loop);
2783 
2784       __ st1(v0, __ T16B, rvec);
2785 
2786       __ mov(r0, rscratch2);
2787 
2788       __ leave();
2789       __ ret(lr);
2790 
2791       return start;
2792   }
2793 
2794   // Arguments:
2795   //
2796   // Inputs:
2797   //   c_rarg0   - source byte array address
2798   //   c_rarg1   - destination byte array address
2799   //   c_rarg2   - K (key) in little endian int array
2800   //   c_rarg3   - r vector byte array address
2801   //   c_rarg4   - input length
2802   //
2803   // Output:
2804   //   r0        - input length
2805   //
2806   address generate_cipherBlockChaining_decryptAESCrypt() {
2807     assert(UseAES, "need AES cryptographic extension support");
2808     __ align(CodeEntryAlignment);
2809     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2810 
2811     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2812 
2813     const Register from        = c_rarg0;  // source array address
2814     const Register to          = c_rarg1;  // destination array address
2815     const Register key         = c_rarg2;  // key array address
2816     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2817                                            // and left with the results of the last encryption block
2818     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2819     const Register keylen      = rscratch1;
2820 
2821     address start = __ pc();
2822 
2823       __ enter();
2824 
2825       __ movw(rscratch2, len_reg);
2826 
2827       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2828 
2829       __ ld1(v2, __ T16B, rvec);
2830 
2831       __ ld1(v31, __ T16B, __ post(key, 16));
2832       __ rev32(v31, __ T16B, v31);
2833 
2834       __ cmpw(keylen, 52);
2835       __ br(Assembler::CC, L_loadkeys_44);
2836       __ br(Assembler::EQ, L_loadkeys_52);
2837 
2838       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2839       __ rev32(v17, __ T16B, v17);
2840       __ rev32(v18, __ T16B, v18);
2841     __ BIND(L_loadkeys_52);
2842       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2843       __ rev32(v19, __ T16B, v19);
2844       __ rev32(v20, __ T16B, v20);
2845     __ BIND(L_loadkeys_44);
2846       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2847       __ rev32(v21, __ T16B, v21);
2848       __ rev32(v22, __ T16B, v22);
2849       __ rev32(v23, __ T16B, v23);
2850       __ rev32(v24, __ T16B, v24);
2851       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2852       __ rev32(v25, __ T16B, v25);
2853       __ rev32(v26, __ T16B, v26);
2854       __ rev32(v27, __ T16B, v27);
2855       __ rev32(v28, __ T16B, v28);
2856       __ ld1(v29, v30, __ T16B, key);
2857       __ rev32(v29, __ T16B, v29);
2858       __ rev32(v30, __ T16B, v30);
2859 
2860     __ BIND(L_aes_loop);
2861       __ ld1(v0, __ T16B, __ post(from, 16));
2862       __ orr(v1, __ T16B, v0, v0);
2863 
2864       __ br(Assembler::CC, L_rounds_44);
2865       __ br(Assembler::EQ, L_rounds_52);
2866 
2867       __ aesd(v0, v17); __ aesimc(v0, v0);
2868       __ aesd(v0, v18); __ aesimc(v0, v0);
2869     __ BIND(L_rounds_52);
2870       __ aesd(v0, v19); __ aesimc(v0, v0);
2871       __ aesd(v0, v20); __ aesimc(v0, v0);
2872     __ BIND(L_rounds_44);
2873       __ aesd(v0, v21); __ aesimc(v0, v0);
2874       __ aesd(v0, v22); __ aesimc(v0, v0);
2875       __ aesd(v0, v23); __ aesimc(v0, v0);
2876       __ aesd(v0, v24); __ aesimc(v0, v0);
2877       __ aesd(v0, v25); __ aesimc(v0, v0);
2878       __ aesd(v0, v26); __ aesimc(v0, v0);
2879       __ aesd(v0, v27); __ aesimc(v0, v0);
2880       __ aesd(v0, v28); __ aesimc(v0, v0);
2881       __ aesd(v0, v29); __ aesimc(v0, v0);
2882       __ aesd(v0, v30);
2883       __ eor(v0, __ T16B, v0, v31);
2884       __ eor(v0, __ T16B, v0, v2);
2885 
2886       __ st1(v0, __ T16B, __ post(to, 16));
2887       __ orr(v2, __ T16B, v1, v1);
2888 
2889       __ subw(len_reg, len_reg, 16);
2890       __ cbnzw(len_reg, L_aes_loop);
2891 
2892       __ st1(v2, __ T16B, rvec);
2893 
2894       __ mov(r0, rscratch2);
2895 
2896       __ leave();
2897       __ ret(lr);
2898 
2899     return start;
2900   }
2901 
2902   // Arguments:
2903   //
2904   // Inputs:
2905   //   c_rarg0   - byte[]  source+offset
2906   //   c_rarg1   - int[]   SHA.state
2907   //   c_rarg2   - int     offset
2908   //   c_rarg3   - int     limit
2909   //
2910   address generate_sha1_implCompress(bool multi_block, const char *name) {
2911     __ align(CodeEntryAlignment);
2912     StubCodeMark mark(this, "StubRoutines", name);
2913     address start = __ pc();
2914 
2915     Register buf   = c_rarg0;
2916     Register state = c_rarg1;
2917     Register ofs   = c_rarg2;
2918     Register limit = c_rarg3;
2919 
2920     Label keys;
2921     Label sha1_loop;
2922 
2923     // load the keys into v0..v3
2924     __ adr(rscratch1, keys);
2925     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2926     // load 5 words state into v6, v7
2927     __ ldrq(v6, Address(state, 0));
2928     __ ldrs(v7, Address(state, 16));
2929 
2930 
2931     __ BIND(sha1_loop);
2932     // load 64 bytes of data into v16..v19
2933     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2934     __ rev32(v16, __ T16B, v16);
2935     __ rev32(v17, __ T16B, v17);
2936     __ rev32(v18, __ T16B, v18);
2937     __ rev32(v19, __ T16B, v19);
2938 
2939     // do the sha1
2940     __ addv(v4, __ T4S, v16, v0);
2941     __ orr(v20, __ T16B, v6, v6);
2942 
2943     FloatRegister d0 = v16;
2944     FloatRegister d1 = v17;
2945     FloatRegister d2 = v18;
2946     FloatRegister d3 = v19;
2947 
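         // Each iteration of the loop below performs four of SHA-1's 80 rounds: sha1c,
         // sha1p and sha1m apply the Ch, Parity and Maj round functions (rounds 0-19,
         // 20-39/60-79 and 40-59 respectively), sha1su0/sha1su1 extend the message
         // schedule, and d0..d3 rotate through the four schedule registers.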
2948     for (int round = 0; round < 20; round++) {
2949       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2950       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2951       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2952       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2953       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2954 
2955       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2956       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2957       __ sha1h(tmp2, __ T4S, v20);
2958       if (round < 5)
2959         __ sha1c(v20, __ T4S, tmp3, tmp4);
2960       else if (round < 10 || round >= 15)
2961         __ sha1p(v20, __ T4S, tmp3, tmp4);
2962       else
2963         __ sha1m(v20, __ T4S, tmp3, tmp4);
2964       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2965 
2966       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2967     }
2968 
2969     __ addv(v7, __ T2S, v7, v21);
2970     __ addv(v6, __ T4S, v6, v20);
2971 
2972     if (multi_block) {
2973       __ add(ofs, ofs, 64);
2974       __ cmp(ofs, limit);
2975       __ br(Assembler::LE, sha1_loop);
2976       __ mov(c_rarg0, ofs); // return ofs
2977     }
2978 
2979     __ strq(v6, Address(state, 0));
2980     __ strs(v7, Address(state, 16));
2981 
2982     __ ret(lr);
2983 
2984     __ bind(keys);
2985     __ emit_int32(0x5a827999);
2986     __ emit_int32(0x6ed9eba1);
2987     __ emit_int32(0x8f1bbcdc);
2988     __ emit_int32(0xca62c1d6);
2989 
2990     return start;
2991   }
2992 
2993 
2994   // Arguments:
2995   //
2996   // Inputs:
2997   //   c_rarg0   - byte[]  source+offset
2998   //   c_rarg1   - int[]   SHA.state
2999   //   c_rarg2   - int     offset
3000   //   c_rarg3   - int     limit
3001   //
3002   address generate_sha256_implCompress(bool multi_block, const char *name) {
3003     static const uint32_t round_consts[64] = {
3004       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3005       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3006       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3007       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3008       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3009       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3010       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3011       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3012       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3013       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3014       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3015       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3016       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3017       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3018       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3019       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3020     };
3021     __ align(CodeEntryAlignment);
3022     StubCodeMark mark(this, "StubRoutines", name);
3023     address start = __ pc();
3024 
3025     Register buf   = c_rarg0;
3026     Register state = c_rarg1;
3027     Register ofs   = c_rarg2;
3028     Register limit = c_rarg3;
3029 
3030     Label sha1_loop;
3031 
3032     __ stpd(v8, v9, __ pre(sp, -32));
3033     __ stpd(v10, v11, Address(sp, 16));
3034 
3035 // dga == v0
3036 // dgb == v1
3037 // dg0 == v2
3038 // dg1 == v3
3039 // dg2 == v4
3040 // t0 == v6
3041 // t1 == v7
3042 
3043     // load the 64 round constants into v16..v31 (four per register)
3044     __ lea(rscratch1, ExternalAddress((address)round_consts));
3045     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3046     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3047     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3048     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3049 
3050     // load 8 words (256 bits) state
3051     __ ldpq(v0, v1, state);
3052 
3053     __ BIND(sha1_loop);
3054     // load 64 bytes of data into v8..v11
3055     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3056     __ rev32(v8, __ T16B, v8);
3057     __ rev32(v9, __ T16B, v9);
3058     __ rev32(v10, __ T16B, v10);
3059     __ rev32(v11, __ T16B, v11);
3060 
3061     __ addv(v6, __ T4S, v8, v16);
3062     __ orr(v2, __ T16B, v0, v0);
3063     __ orr(v3, __ T16B, v1, v1);
3064 
3065     FloatRegister d0 = v8;
3066     FloatRegister d1 = v9;
3067     FloatRegister d2 = v10;
3068     FloatRegister d3 = v11;
3069 
3070 
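         // Each iteration of the loop below performs four of SHA-256's 64 rounds:
         // sha256h/sha256h2 update the two halves of the working state,
         // sha256su0/sha256su1 extend the message schedule, and the round constants
         // consumed here were preloaded into v16..v31 (four 32-bit constants per register).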
3071     for (int round = 0; round < 16; round++) {
3072       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3073       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3074       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3075       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3076 
3077       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3078        __ orr(v4, __ T16B, v2, v2);
3079       if (round < 15)
3080         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3081       __ sha256h(v2, __ T4S, v3, tmp2);
3082       __ sha256h2(v3, __ T4S, v4, tmp2);
3083       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3084 
3085       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3086     }
3087 
3088     __ addv(v0, __ T4S, v0, v2);
3089     __ addv(v1, __ T4S, v1, v3);
3090 
3091     if (multi_block) {
3092       __ add(ofs, ofs, 64);
3093       __ cmp(ofs, limit);
3094       __ br(Assembler::LE, sha1_loop);
3095       __ mov(c_rarg0, ofs); // return ofs
3096     }
3097 
3098     __ ldpd(v10, v11, Address(sp, 16));
3099     __ ldpd(v8, v9, __ post(sp, 32));
3100 
3101     __ stpq(v0, v1, state);
3102 
3103     __ ret(lr);
3104 
3105     return start;
3106   }
3107 
3108 #ifndef BUILTIN_SIM
3109   // Safefetch stubs.
3110   void generate_safefetch(const char* name, int size, address* entry,
3111                           address* fault_pc, address* continuation_pc) {
3112     // safefetch signatures:
3113     //   int      SafeFetch32(int*      adr, int      errValue);
3114     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3115     //
3116     // arguments:
3117     //   c_rarg0 = adr
3118     //   c_rarg1 = errValue
3119     //
3120     // result:
3121     //   r0 = *adr or errValue
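         //
         // A fault at *fault_pc is caught by the VM signal handler, which resumes
         // execution at *continuation_pc; c_rarg1 then still holds errValue, so the
         // caller simply sees errValue for an unreadable address.  Hedged usage sketch,
         // via the SafeFetch32 wrapper whose signature is given above:
         //
         //   int v = SafeFetch32((int*) addr, -1);   // -1 if addr cannot be read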
3122 
3123     StubCodeMark mark(this, "StubRoutines", name);
3124 
3125     // Entry point, pc or function descriptor.
3126     *entry = __ pc();
3127 
3128     // Load *adr into c_rarg1, may fault.
3129     *fault_pc = __ pc();
3130     switch (size) {
3131       case 4:
3132         // int32_t
3133         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3134         break;
3135       case 8:
3136         // int64_t
3137         __ ldr(c_rarg1, Address(c_rarg0, 0));
3138         break;
3139       default:
3140         ShouldNotReachHere();
3141     }
3142 
3143     // return errValue or *adr
3144     *continuation_pc = __ pc();
3145     __ mov(r0, c_rarg1);
3146     __ ret(lr);
3147   }
3148 #endif
3149 
3150   /**
3151    *  Arguments:
3152    *
3153    * Inputs:
3154    *   c_rarg0   - int crc
3155    *   c_rarg1   - byte* buf
3156    *   c_rarg2   - int length
3157    *
3158    * Output:
3159    *       r0   - int crc result
3160    */
3161   address generate_updateBytesCRC32() {
3162     assert(UseCRC32Intrinsics, "what are we doing here?");
3163 
3164     __ align(CodeEntryAlignment);
3165     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3166 
3167     address start = __ pc();
3168 
3169     const Register crc   = c_rarg0;  // crc
3170     const Register buf   = c_rarg1;  // source java byte array address
3171     const Register len   = c_rarg2;  // length
3172     const Register table0 = c_rarg3; // crc_table address
3173     const Register table1 = c_rarg4;
3174     const Register table2 = c_rarg5;
3175     const Register table3 = c_rarg6;
3176     const Register tmp3 = c_rarg7;
3177 
3178     BLOCK_COMMENT("Entry:");
3179     __ enter(); // required for proper stackwalking of RuntimeStub frame
3180 
3181     __ kernel_crc32(crc, buf, len,
3182               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3183 
3184     __ leave(); // required for proper stackwalking of RuntimeStub frame
3185     __ ret(lr);
3186 
3187     return start;
3188   }
3189 
3190   /**
3191    *  Arguments:
3192    *
3193    * Inputs:
3194    *   c_rarg0   - int crc
3195    *   c_rarg1   - byte* buf
3196    *   c_rarg2   - int length
3197    *   c_rarg3   - int* table
3198    *
3199    * Output:
3200    *       r0   - int crc result
3201    */
3202   address generate_updateBytesCRC32C() {
3203     assert(UseCRC32CIntrinsics, "what are we doing here?");
3204 
3205     __ align(CodeEntryAlignment);
3206     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3207 
3208     address start = __ pc();
3209 
3210     const Register crc   = c_rarg0;  // crc
3211     const Register buf   = c_rarg1;  // source java byte array address
3212     const Register len   = c_rarg2;  // length
3213     const Register table0 = c_rarg3; // crc_table address
3214     const Register table1 = c_rarg4;
3215     const Register table2 = c_rarg5;
3216     const Register table3 = c_rarg6;
3217     const Register tmp3 = c_rarg7;
3218 
3219     BLOCK_COMMENT("Entry:");
3220     __ enter(); // required for proper stackwalking of RuntimeStub frame
3221 
3222     __ kernel_crc32c(crc, buf, len,
3223               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3224 
3225     __ leave(); // required for proper stackwalking of RuntimeStub frame
3226     __ ret(lr);
3227 
3228     return start;
3229   }
3230 
3231   /***
3232    *  Arguments:
3233    *
3234    *  Inputs:
3235    *   c_rarg0   - int   adler
3236    *   c_rarg1   - byte* buff
3237    *   c_rarg2   - int   len
3238    *
3239    * Output:
3240    *   c_rarg0   - int adler result
3241    */
3242   address generate_updateBytesAdler32() {
3243     __ align(CodeEntryAlignment);
3244     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3245     address start = __ pc();
3246 
3247     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3248 
3249     // Aliases
3250     Register adler  = c_rarg0;
3251     Register s1     = c_rarg0;
3252     Register s2     = c_rarg3;
3253     Register buff   = c_rarg1;
3254     Register len    = c_rarg2;
3255     Register nmax  = r4;
3256     Register base  = r5;
3257     Register count = r6;
3258     Register temp0 = rscratch1;
3259     Register temp1 = rscratch2;
3260     FloatRegister vbytes = v0;
3261     FloatRegister vs1acc = v1;
3262     FloatRegister vs2acc = v2;
3263     FloatRegister vtable = v3;
3264 
3265     // Max number of bytes we can process before having to take the mod
3266     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
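         // As a quick check of that bound, for n = 5552:
         //   255*5552*5553/2 + 5553*(0xfff1 - 1) = 3930857640 + 363832560 = 4294690200 <= 4294967295,
         // while n = 5553 already exceeds 2^32-1.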
3267     unsigned long BASE = 0xfff1;
3268     unsigned long NMAX = 0x15B0;
3269 
3270     __ mov(base, BASE);
3271     __ mov(nmax, NMAX);
3272 
3273     // Load accumulation coefficients for the upper 16 bits
3274     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3275     __ ld1(vtable, __ T16B, Address(temp0));
3276 
3277     // s1 is initialized to the lower 16 bits of adler
3278     // s2 is initialized to the upper 16 bits of adler
3279     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3280     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3281 
3282     // The pipelined loop needs at least 16 elements for one iteration.
3283     // It would check this itself, but it is more effective to skip straight to the cleanup loop.
3284     __ cmp(len, (u1)16);
3285     __ br(Assembler::HS, L_nmax);
3286     __ cbz(len, L_combine);
3287 
3288     __ bind(L_simple_by1_loop);
3289     __ ldrb(temp0, Address(__ post(buff, 1)));
3290     __ add(s1, s1, temp0);
3291     __ add(s2, s2, s1);
3292     __ subs(len, len, 1);
3293     __ br(Assembler::HI, L_simple_by1_loop);
3294 
3295     // s1 = s1 % BASE
3296     __ subs(temp0, s1, base);
3297     __ csel(s1, temp0, s1, Assembler::HS);
3298 
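         // The reductions below rely on 2^16 mod 65521 == 15: writing x = hi * 2^16 + lo
         // gives x == hi * 15 + lo (mod BASE), so each lsr/lsl/sub/add group performs one
         // folding step and a final conditional subtract of BASE finishes x % BASE.
         // Scalar sketch of one folding step:  x = (x >> 16) * 15 + (x & 0xffff);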
3299     // s2 = s2 % BASE
3300     __ lsr(temp0, s2, 16);
3301     __ lsl(temp1, temp0, 4);
3302     __ sub(temp1, temp1, temp0);
3303     __ add(s2, temp1, s2, ext::uxth);
3304 
3305     __ subs(temp0, s2, base);
3306     __ csel(s2, temp0, s2, Assembler::HS);
3307 
3308     __ b(L_combine);
3309 
3310     __ bind(L_nmax);
3311     __ subs(len, len, nmax);
3312     __ sub(count, nmax, 16);
3313     __ br(Assembler::LO, L_by16);
3314 
3315     __ bind(L_nmax_loop);
3316 
3317     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3318                                       vbytes, vs1acc, vs2acc, vtable);
3319 
3320     __ subs(count, count, 16);
3321     __ br(Assembler::HS, L_nmax_loop);
3322 
3323     // s1 = s1 % BASE
3324     __ lsr(temp0, s1, 16);
3325     __ lsl(temp1, temp0, 4);
3326     __ sub(temp1, temp1, temp0);
3327     __ add(temp1, temp1, s1, ext::uxth);
3328 
3329     __ lsr(temp0, temp1, 16);
3330     __ lsl(s1, temp0, 4);
3331     __ sub(s1, s1, temp0);
3332     __ add(s1, s1, temp1, ext::uxth);
3333 
3334     __ subs(temp0, s1, base);
3335     __ csel(s1, temp0, s1, Assembler::HS);
3336 
3337     // s2 = s2 % BASE
3338     __ lsr(temp0, s2, 16);
3339     __ lsl(temp1, temp0, 4);
3340     __ sub(temp1, temp1, temp0);
3341     __ add(temp1, temp1, s2, ext::uxth);
3342 
3343     __ lsr(temp0, temp1, 16);
3344     __ lsl(s2, temp0, 4);
3345     __ sub(s2, s2, temp0);
3346     __ add(s2, s2, temp1, ext::uxth);
3347 
3348     __ subs(temp0, s2, base);
3349     __ csel(s2, temp0, s2, Assembler::HS);
3350 
3351     __ subs(len, len, nmax);
3352     __ sub(count, nmax, 16);
3353     __ br(Assembler::HS, L_nmax_loop);
3354 
3355     __ bind(L_by16);
3356     __ adds(len, len, count);
3357     __ br(Assembler::LO, L_by1);
3358 
3359     __ bind(L_by16_loop);
3360 
3361     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3362                                       vbytes, vs1acc, vs2acc, vtable);
3363 
3364     __ subs(len, len, 16);
3365     __ br(Assembler::HS, L_by16_loop);
3366 
3367     __ bind(L_by1);
3368     __ adds(len, len, 15);
3369     __ br(Assembler::LO, L_do_mod);
3370 
3371     __ bind(L_by1_loop);
3372     __ ldrb(temp0, Address(__ post(buff, 1)));
3373     __ add(s1, temp0, s1);
3374     __ add(s2, s2, s1);
3375     __ subs(len, len, 1);
3376     __ br(Assembler::HS, L_by1_loop);
3377 
3378     __ bind(L_do_mod);
3379     // s1 = s1 % BASE
3380     __ lsr(temp0, s1, 16);
3381     __ lsl(temp1, temp0, 4);
3382     __ sub(temp1, temp1, temp0);
3383     __ add(temp1, temp1, s1, ext::uxth);
3384 
3385     __ lsr(temp0, temp1, 16);
3386     __ lsl(s1, temp0, 4);
3387     __ sub(s1, s1, temp0);
3388     __ add(s1, s1, temp1, ext::uxth);
3389 
3390     __ subs(temp0, s1, base);
3391     __ csel(s1, temp0, s1, Assembler::HS);
3392 
3393     // s2 = s2 % BASE
3394     __ lsr(temp0, s2, 16);
3395     __ lsl(temp1, temp0, 4);
3396     __ sub(temp1, temp1, temp0);
3397     __ add(temp1, temp1, s2, ext::uxth);
3398 
3399     __ lsr(temp0, temp1, 16);
3400     __ lsl(s2, temp0, 4);
3401     __ sub(s2, s2, temp0);
3402     __ add(s2, s2, temp1, ext::uxth);
3403 
3404     __ subs(temp0, s2, base);
3405     __ csel(s2, temp0, s2, Assembler::HS);
3406 
3407     // Combine lower bits and higher bits
3408     __ bind(L_combine);
3409     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3410 
3411     __ ret(lr);
3412 
3413     return start;
3414   }
3415 
3416   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3417           Register temp0, Register temp1, FloatRegister vbytes,
3418           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3419     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3420     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3421     // In non-vectorized code, we update s1 and s2 as:
3422     //   s1 <- s1 + b1
3423     //   s2 <- s2 + s1
3424     //   s1 <- s1 + b2
3425     //   s2 <- s2 + s1
3426     //   ...
3427     //   s1 <- s1 + b16
3428     //   s2 <- s2 + s1
3429     // Putting above assignments together, we have:
3430     //   s1_new = s1 + b1 + b2 + ... + b16
3431     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3432     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3433     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3434     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3435 
3436     // s2 = s2 + s1 * 16
3437     __ add(s2, s2, s1, Assembler::LSL, 4);
3438 
3439     // vs1acc = b1 + b2 + b3 + ... + b16
3440     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3441     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3442     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3443     __ uaddlv(vs1acc, __ T16B, vbytes);
3444     __ uaddlv(vs2acc, __ T8H, vs2acc);
3445 
3446     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3447     __ fmovd(temp0, vs1acc);
3448     __ fmovd(temp1, vs2acc);
3449     __ add(s1, s1, temp0);
3450     __ add(s2, s2, temp1);
3451   }
3452 
3453   /**
3454    *  Arguments:
3455    *
3456    *  Input:
3457    *    c_rarg0   - x address
3458    *    c_rarg1   - x length
3459    *    c_rarg2   - y address
3460    *    c_rarg3   - y length
3461    *    c_rarg4   - z address
3462    *    c_rarg5   - z length
3463    */
3464   address generate_multiplyToLen() {
3465     __ align(CodeEntryAlignment);
3466     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3467 
3468     address start = __ pc();
3469     const Register x     = r0;
3470     const Register xlen  = r1;
3471     const Register y     = r2;
3472     const Register ylen  = r3;
3473     const Register z     = r4;
3474     const Register zlen  = r5;
3475 
3476     const Register tmp1  = r10;
3477     const Register tmp2  = r11;
3478     const Register tmp3  = r12;
3479     const Register tmp4  = r13;
3480     const Register tmp5  = r14;
3481     const Register tmp6  = r15;
3482     const Register tmp7  = r16;
3483 
3484     BLOCK_COMMENT("Entry:");
3485     __ enter(); // required for proper stackwalking of RuntimeStub frame
3486     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3487     __ leave(); // required for proper stackwalking of RuntimeStub frame
3488     __ ret(lr);
3489 
3490     return start;
3491   }
3492 
3493   address generate_squareToLen() {
3494     // The squareToLen algorithm for sizes 1..127, described in Java code, works
3495     // faster than multiply_to_len on some CPUs and slower on others, but
3496     // multiply_to_len shows slightly better results overall.
3497     __ align(CodeEntryAlignment);
3498     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3499     address start = __ pc();
3500 
3501     const Register x     = r0;
3502     const Register xlen  = r1;
3503     const Register z     = r2;
3504     const Register zlen  = r3;
3505     const Register y     = r4; // == x
3506     const Register ylen  = r5; // == xlen
3507 
3508     const Register tmp1  = r10;
3509     const Register tmp2  = r11;
3510     const Register tmp3  = r12;
3511     const Register tmp4  = r13;
3512     const Register tmp5  = r14;
3513     const Register tmp6  = r15;
3514     const Register tmp7  = r16;
3515 
3516     RegSet spilled_regs = RegSet::of(y, ylen);
3517     BLOCK_COMMENT("Entry:");
3518     __ enter();
3519     __ push(spilled_regs, sp);
3520     __ mov(y, x);
3521     __ mov(ylen, xlen);
3522     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3523     __ pop(spilled_regs, sp);
3524     __ leave();
3525     __ ret(lr);
3526     return start;
3527   }
3528 
3529   address generate_mulAdd() {
3530     __ align(CodeEntryAlignment);
3531     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3532 
3533     address start = __ pc();
3534 
3535     const Register out     = r0;
3536     const Register in      = r1;
3537     const Register offset  = r2;
3538     const Register len     = r3;
3539     const Register k       = r4;
3540 
3541     BLOCK_COMMENT("Entry:");
3542     __ enter();
3543     __ mul_add(out, in, offset, len, k);
3544     __ leave();
3545     __ ret(lr);
3546 
3547     return start;
3548   }
3549 
3550   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3551                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3552                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3553     // Karatsuba multiplication performs a 128*128 -> 256-bit
3554     // multiplication in three 128-bit multiplications and a few
3555     // additions.
3556     //
3557     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3558     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3559     //
3560     // Inputs:
3561     //
3562     // A0 in a.d[0]     (subkey)
3563     // A1 in a.d[1]
3564     // (A1+A0) in a1_xor_a0.d[0]
3565     //
3566     // B0 in b.d[0]     (state)
3567     // B1 in b.d[1]
3568 
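         // In Karatsuba form the middle term is (E1:E0) + (C1:C0) + (D1:D0), added at bit
         // offset 64.  tmp2 below computes that shifted middle term directly (its halves
         // are C0+C1+D1+E1 and D1+C0+D0+E0), and the two ins() instructions splice those
         // halves into result_hi/result_lo to give the layout shown above.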
3569     __ ext(tmp1, __ T16B, b, b, 0x08);
3570     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3571     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3572     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3573     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3574 
3575     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3576     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3577     __ eor(tmp2, __ T16B, tmp2, tmp4);
3578     __ eor(tmp2, __ T16B, tmp2, tmp3);
3579 
3580     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3581     __ ins(result_hi, __ D, tmp2, 0, 1);
3582     __ ins(result_lo, __ D, tmp2, 1, 0);
3583   }
3584 
3585   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3586                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3587     const FloatRegister t0 = result;
3588 
3589     // The GCM field polynomial f is z^128 + p(z), where p =
3590     // z^7+z^2+z+1.
3591     //
3592     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3593     //
3594     // so, given that the product we're reducing is
3595     //    a == lo + hi * z^128
3596     // substituting,
3597     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3598     //
3599     // we reduce by multiplying hi by p(z) and subtracting the result
3600     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3601     // bits we can do this with two 64-bit multiplications, lo*p and
3602     // hi*p.
3603 
3604     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3605     __ ext(t1, __ T16B, t0, z, 8);
3606     __ eor(hi, __ T16B, hi, t1);
3607     __ ext(t1, __ T16B, z, t0, 8);
3608     __ eor(lo, __ T16B, lo, t1);
3609     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3610     __ eor(result, __ T16B, lo, t0);
3611   }
3612 
3613   address generate_has_negatives(address &has_negatives_long) {
3614     const u1 large_loop_size = 64;
3615     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3616     int dcache_line = VM_Version::dcache_line_size();
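         // A byte is "negative" iff its sign bit is set, so OR-ing loaded chunks together
         // and testing the result against UPPER_BIT_MASK (0x80 in every byte lane) tells
         // whether any byte seen so far is negative.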
3617 
3618     Register ary1 = r1, len = r2, result = r0;
3619 
3620     __ align(CodeEntryAlignment);
3621 
3622     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3623 
3624     address entry = __ pc();
3625 
3626     __ enter();
3627 
3628   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3629         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3630 
3631   __ cmp(len, (u1)15);
3632   __ br(Assembler::GT, LEN_OVER_15);
3633   // The only case where execution falls into this code is when the pointer is near
3634   // the end of a memory page and we have to avoid reading from the next page
3635   __ add(ary1, ary1, len);
3636   __ subs(len, len, 8);
3637   __ br(Assembler::GT, LEN_OVER_8);
3638   __ ldr(rscratch2, Address(ary1, -8));
3639   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3640   __ lsrv(rscratch2, rscratch2, rscratch1);
3641   __ tst(rscratch2, UPPER_BIT_MASK);
3642   __ cset(result, Assembler::NE);
3643   __ leave();
3644   __ ret(lr);
3645   __ bind(LEN_OVER_8);
3646   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3647   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3648   __ tst(rscratch2, UPPER_BIT_MASK);
3649   __ br(Assembler::NE, RET_TRUE_NO_POP);
3650   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3651   __ lsrv(rscratch1, rscratch1, rscratch2);
3652   __ tst(rscratch1, UPPER_BIT_MASK);
3653   __ cset(result, Assembler::NE);
3654   __ leave();
3655   __ ret(lr);
3656 
3657   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3658   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3659 
3660   has_negatives_long = __ pc(); // 2nd entry point
3661 
3662   __ enter();
3663 
3664   __ bind(LEN_OVER_15);
3665     __ push(spilled_regs, sp);
3666     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3667     __ cbz(rscratch2, ALIGNED);
3668     __ ldp(tmp6, tmp1, Address(ary1));
3669     __ mov(tmp5, 16);
3670     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3671     __ add(ary1, ary1, rscratch1);
3672     __ sub(len, len, rscratch1);
3673     __ orr(tmp6, tmp6, tmp1);
3674     __ tst(tmp6, UPPER_BIT_MASK);
3675     __ br(Assembler::NE, RET_TRUE);
3676 
3677   __ bind(ALIGNED);
3678     __ cmp(len, large_loop_size);
3679     __ br(Assembler::LT, CHECK_16);
3680     // Perform a 16-byte load as an early-return pre-loop to handle the situation
3681     // where an initially aligned large array has negative values at its starting bytes,
3682     // in which case LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
3683     // slower. Cases with negative bytes further ahead won't be affected that
3684     // much; in fact, they will be faster due to early loads, fewer instructions and
3685     // fewer branches in LARGE_LOOP.
3686     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3687     __ sub(len, len, 16);
3688     __ orr(tmp6, tmp6, tmp1);
3689     __ tst(tmp6, UPPER_BIT_MASK);
3690     __ br(Assembler::NE, RET_TRUE);
3691     __ cmp(len, large_loop_size);
3692     __ br(Assembler::LT, CHECK_16);
3693 
3694     if (SoftwarePrefetchHintDistance >= 0
3695         && SoftwarePrefetchHintDistance >= dcache_line) {
3696       // initial prefetch
3697       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3698     }
3699   __ bind(LARGE_LOOP);
3700     if (SoftwarePrefetchHintDistance >= 0) {
3701       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3702     }
3703     // Issue the load instructions first, since that can save a few CPU/MEM cycles. Also,
3704     // instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one for each ldp) it is
3705     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3706     // instructions per iteration and has fewer branches; the downside is that this disables
3707     // early return, so all 64 bytes are loaded and checked every time.
3708     __ ldp(tmp2, tmp3, Address(ary1));
3709     __ ldp(tmp4, tmp5, Address(ary1, 16));
3710     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3711     __ ldp(tmp6, tmp1, Address(ary1, 48));
3712     __ add(ary1, ary1, large_loop_size);
3713     __ sub(len, len, large_loop_size);
3714     __ orr(tmp2, tmp2, tmp3);
3715     __ orr(tmp4, tmp4, tmp5);
3716     __ orr(rscratch1, rscratch1, rscratch2);
3717     __ orr(tmp6, tmp6, tmp1);
3718     __ orr(tmp2, tmp2, tmp4);
3719     __ orr(rscratch1, rscratch1, tmp6);
3720     __ orr(tmp2, tmp2, rscratch1);
3721     __ tst(tmp2, UPPER_BIT_MASK);
3722     __ br(Assembler::NE, RET_TRUE);
3723     __ cmp(len, large_loop_size);
3724     __ br(Assembler::GE, LARGE_LOOP);
3725 
3726   __ bind(CHECK_16); // small 16-byte load pre-loop
3727     __ cmp(len, (u1)16);
3728     __ br(Assembler::LT, POST_LOOP16);
3729 
3730   __ bind(LOOP16); // small 16-byte load loop
3731     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3732     __ sub(len, len, 16);
3733     __ orr(tmp2, tmp2, tmp3);
3734     __ tst(tmp2, UPPER_BIT_MASK);
3735     __ br(Assembler::NE, RET_TRUE);
3736     __ cmp(len, (u1)16);
3737     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3738 
3739   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3740     __ cmp(len, (u1)8);
3741     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3742     __ ldr(tmp3, Address(__ post(ary1, 8)));
3743     __ sub(len, len, 8);
3744     __ tst(tmp3, UPPER_BIT_MASK);
3745     __ br(Assembler::NE, RET_TRUE);
3746 
3747   __ bind(POST_LOOP16_LOAD_TAIL);
3748     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3749     __ ldr(tmp1, Address(ary1));
3750     __ mov(tmp2, 64);
3751     __ sub(tmp4, tmp2, len, __ LSL, 3);
3752     __ lslv(tmp1, tmp1, tmp4);
3753     __ tst(tmp1, UPPER_BIT_MASK);
3754     __ br(Assembler::NE, RET_TRUE);
3755     // Fallthrough
3756 
3757   __ bind(RET_FALSE);
3758     __ pop(spilled_regs, sp);
3759     __ leave();
3760     __ mov(result, zr);
3761     __ ret(lr);
3762 
3763   __ bind(RET_TRUE);
3764     __ pop(spilled_regs, sp);
3765   __ bind(RET_TRUE_NO_POP);
3766     __ leave();
3767     __ mov(result, 1);
3768     __ ret(lr);
3769 
3770   __ bind(DONE);
3771     __ pop(spilled_regs, sp);
3772     __ leave();
3773     __ ret(lr);
3774     return entry;
3775   }
3776 
3777   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3778         bool usePrefetch, Label &NOT_EQUAL) {
3779     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3780         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3781         tmp7 = r12, tmp8 = r13;
3782     Label LOOP;
3783 
3784     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3785     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3786     __ bind(LOOP);
3787     if (usePrefetch) {
3788       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3789       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3790     }
3791     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3792     __ eor(tmp1, tmp1, tmp2);
3793     __ eor(tmp3, tmp3, tmp4);
3794     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3795     __ orr(tmp1, tmp1, tmp3);
3796     __ cbnz(tmp1, NOT_EQUAL);
3797     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3798     __ eor(tmp5, tmp5, tmp6);
3799     __ eor(tmp7, tmp7, tmp8);
3800     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3801     __ orr(tmp5, tmp5, tmp7);
3802     __ cbnz(tmp5, NOT_EQUAL);
3803     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3804     __ eor(tmp1, tmp1, tmp2);
3805     __ eor(tmp3, tmp3, tmp4);
3806     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3807     __ orr(tmp1, tmp1, tmp3);
3808     __ cbnz(tmp1, NOT_EQUAL);
3809     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3810     __ eor(tmp5, tmp5, tmp6);
3811     __ sub(cnt1, cnt1, 8 * wordSize);
3812     __ eor(tmp7, tmp7, tmp8);
3813     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3814     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3815     // cmp) because subs allows an unlimited range of immediate operands.
3816     __ subs(tmp6, cnt1, loopThreshold);
3817     __ orr(tmp5, tmp5, tmp7);
3818     __ cbnz(tmp5, NOT_EQUAL);
3819     __ br(__ GE, LOOP);
3820     // post-loop
3821     __ eor(tmp1, tmp1, tmp2);
3822     __ eor(tmp3, tmp3, tmp4);
3823     __ orr(tmp1, tmp1, tmp3);
3824     __ sub(cnt1, cnt1, 2 * wordSize);
3825     __ cbnz(tmp1, NOT_EQUAL);
3826   }
3827 
3828   void generate_large_array_equals_loop_simd(int loopThreshold,
3829         bool usePrefetch, Label &NOT_EQUAL) {
3830     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3831         tmp2 = rscratch2;
3832     Label LOOP;
3833 
3834     __ bind(LOOP);
3835     if (usePrefetch) {
3836       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3837       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3838     }
3839     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3840     __ sub(cnt1, cnt1, 8 * wordSize);
3841     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3842     __ subs(tmp1, cnt1, loopThreshold);
3843     __ eor(v0, __ T16B, v0, v4);
3844     __ eor(v1, __ T16B, v1, v5);
3845     __ eor(v2, __ T16B, v2, v6);
3846     __ eor(v3, __ T16B, v3, v7);
3847     __ orr(v0, __ T16B, v0, v1);
3848     __ orr(v1, __ T16B, v2, v3);
3849     __ orr(v0, __ T16B, v0, v1);
3850     __ umov(tmp1, v0, __ D, 0);
3851     __ umov(tmp2, v0, __ D, 1);
3852     __ orr(tmp1, tmp1, tmp2);
3853     __ cbnz(tmp1, NOT_EQUAL);
3854     __ br(__ GE, LOOP);
3855   }
3856 
3857   // a1 = r1 - array1 address
3858   // a2 = r2 - array2 address
3859   // result = r0 - return value. Already contains "false"
3860   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3861   // r3-r5 are reserved temporary registers
3862   address generate_large_array_equals() {
3863     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3864         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3865         tmp7 = r12, tmp8 = r13;
3866     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3867         SMALL_LOOP, POST_LOOP;
3868     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3869     // calculate if at least 32 prefetched bytes are used
3870     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3871     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3872     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3873     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3874         tmp5, tmp6, tmp7, tmp8);
3875 
3876     __ align(CodeEntryAlignment);
3877 
3878     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3879 
3880     address entry = __ pc();
3881     __ enter();
3882     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3883     // also advance pointers to use post-increment instead of pre-increment
3884     __ add(a1, a1, wordSize);
3885     __ add(a2, a2, wordSize);
3886     if (AvoidUnalignedAccesses) {
3887       // Both implementations (SIMD/non-SIMD) use relatively large load
3888       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
3889       // on some CPUs when the address is not at least 16-byte aligned.
3890       // Arrays are currently 8-byte aligned, so do an additional 8-byte load
3891       // if needed, at least for the 1st address, to make it 16-byte aligned.
3892       Label ALIGNED16;
3893       __ tbz(a1, 3, ALIGNED16);
3894       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3895       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3896       __ sub(cnt1, cnt1, wordSize);
3897       __ eor(tmp1, tmp1, tmp2);
3898       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3899       __ bind(ALIGNED16);
3900     }
3901     if (UseSIMDForArrayEquals) {
3902       if (SoftwarePrefetchHintDistance >= 0) {
3903         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3904         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3905         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3906             /* prfm = */ true, NOT_EQUAL);
3907         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3908         __ br(__ LT, TAIL);
3909       }
3910       __ bind(NO_PREFETCH_LARGE_LOOP);
3911       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3912           /* prfm = */ false, NOT_EQUAL);
3913     } else {
3914       __ push(spilled_regs, sp);
3915       if (SoftwarePrefetchHintDistance >= 0) {
3916         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3917         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3918         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3919             /* prfm = */ true, NOT_EQUAL);
3920         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3921         __ br(__ LT, TAIL);
3922       }
3923       __ bind(NO_PREFETCH_LARGE_LOOP);
3924       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3925           /* prfm = */ false, NOT_EQUAL);
3926     }
3927     __ bind(TAIL);
3928       __ cbz(cnt1, EQUAL);
3929       __ subs(cnt1, cnt1, wordSize);
3930       __ br(__ LE, POST_LOOP);
3931     __ bind(SMALL_LOOP);
3932       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3933       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3934       __ subs(cnt1, cnt1, wordSize);
3935       __ eor(tmp1, tmp1, tmp2);
3936       __ cbnz(tmp1, NOT_EQUAL);
3937       __ br(__ GT, SMALL_LOOP);
3938     __ bind(POST_LOOP);
3939       __ ldr(tmp1, Address(a1, cnt1));
3940       __ ldr(tmp2, Address(a2, cnt1));
3941       __ eor(tmp1, tmp1, tmp2);
3942       __ cbnz(tmp1, NOT_EQUAL);
3943     __ bind(EQUAL);
3944       __ mov(result, true);
3945     __ bind(NOT_EQUAL);
3946       if (!UseSIMDForArrayEquals) {
3947         __ pop(spilled_regs, sp);
3948       }
3949     __ bind(NOT_EQUAL_NO_POP);
3950     __ leave();
3951     __ ret(lr);
3952     return entry;
3953   }
3954 
3955   address generate_dsin_dcos(bool isCos) {
3956     __ align(CodeEntryAlignment);
3957     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3958     address start = __ pc();
3959     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3960         (address)StubRoutines::aarch64::_two_over_pi,
3961         (address)StubRoutines::aarch64::_pio2,
3962         (address)StubRoutines::aarch64::_dsin_coef,
3963         (address)StubRoutines::aarch64::_dcos_coef);
3964     return start;
3965   }
3966 
3967   address generate_dlog() {
3968     __ align(CodeEntryAlignment);
3969     StubCodeMark mark(this, "StubRoutines", "dlog");
3970     address entry = __ pc();
3971     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3972         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3973     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3974     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3975         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3976     return entry;
3977   }
3978 
3979   // code for comparing 16 bytes of strings with same encoding
3980   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3981     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3982     __ ldr(rscratch1, Address(__ post(str1, 8)));
3983     __ eor(rscratch2, tmp1, tmp2);
3984     __ ldr(cnt1, Address(__ post(str2, 8)));
3985     __ cbnz(rscratch2, DIFF1);
3986     __ ldr(tmp1, Address(__ post(str1, 8)));
3987     __ eor(rscratch2, rscratch1, cnt1);
3988     __ ldr(tmp2, Address(__ post(str2, 8)));
3989     __ cbnz(rscratch2, DIFF2);
3990   }
3991 
3992   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3993   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3994       Label &DIFF2) {
3995     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
3996     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
3997 
3998     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
3999     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4000     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4001     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4002 
4003     __ fmovd(tmpL, vtmp3);
4004     __ eor(rscratch2, tmp3, tmpL);
4005     __ cbnz(rscratch2, DIFF2);
4006 
4007     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4008     __ umov(tmpL, vtmp3, __ D, 1);
4009     __ eor(rscratch2, tmpU, tmpL);
4010     __ cbnz(rscratch2, DIFF1);
4011 
4012     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4013     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4014     __ fmovd(tmpL, vtmp);
4015     __ eor(rscratch2, tmp3, tmpL);
4016     __ cbnz(rscratch2, DIFF2);
4017 
4018     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4019     __ umov(tmpL, vtmp, __ D, 1);
4020     __ eor(rscratch2, tmpU, tmpL);
4021     __ cbnz(rscratch2, DIFF1);
4022   }
4023 
4024   // r0  = result
4025   // r1  = str1
4026   // r2  = cnt1
4027   // r3  = str2
4028   // r4  = cnt2
4029   // r10 = tmp1
4030   // r11 = tmp2
4031   address generate_compare_long_string_different_encoding(bool isLU) {
4032     __ align(CodeEntryAlignment);
4033     StubCodeMark mark(this, "StubRoutines", isLU
4034         ? "compare_long_string_different_encoding LU"
4035         : "compare_long_string_different_encoding UL");
4036     address entry = __ pc();
4037     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4038         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4039         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4040     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4041         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4042     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4043     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4044 
4045     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4046 
4047     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4048     // cnt2 == number of characters left to compare
4049     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4050     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4051     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4052     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4053     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4054     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4055     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4056     __ eor(rscratch2, tmp1, tmp2);
4057     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4058     __ mov(rscratch1, tmp2);
4059     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4060     Register strU = isLU ? str2 : str1,
4061              strL = isLU ? str1 : str2,
4062              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4063              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4064     __ push(spilled_regs, sp);
4065     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4066     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4067 
4068     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4069 
4070     if (SoftwarePrefetchHintDistance >= 0) {
4071       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4072       __ br(__ LT, NO_PREFETCH);
4073       __ bind(LARGE_LOOP_PREFETCH);
4074         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4075         __ mov(tmp4, 2);
4076         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4077         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4078           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4079           __ subs(tmp4, tmp4, 1);
4080           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4081           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4082           __ mov(tmp4, 2);
4083         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4084           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4085           __ subs(tmp4, tmp4, 1);
4086           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4087           __ sub(cnt2, cnt2, 64);
4088           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4089           __ br(__ GE, LARGE_LOOP_PREFETCH);
4090     }
4091     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4092     __ bind(NO_PREFETCH);
4093     __ subs(cnt2, cnt2, 16);
4094     __ br(__ LT, TAIL);
4095     __ bind(SMALL_LOOP); // smaller loop
4096       __ subs(cnt2, cnt2, 16);
4097       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4098       __ br(__ GE, SMALL_LOOP);
4099       __ cmn(cnt2, (u1)16);
4100       __ br(__ EQ, LOAD_LAST);
4101     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4102       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4103       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4104       __ ldr(tmp3, Address(cnt1, -8));
4105       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4106       __ b(LOAD_LAST);
4107     __ bind(DIFF2);
4108       __ mov(tmpU, tmp3);
4109     __ bind(DIFF1);
4110       __ pop(spilled_regs, sp);
4111       __ b(CALCULATE_DIFFERENCE);
4112     __ bind(LOAD_LAST);
4113       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
4114       // so there is no need to load them again
4115       __ mov(tmpU, tmp3);
4116       __ pop(spilled_regs, sp);
4117 
4118       __ ldrs(vtmp, Address(strL));
4119       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4120       __ fmovd(tmpL, vtmp);
4121 
4122       __ eor(rscratch2, tmpU, tmpL);
4123       __ cbz(rscratch2, DONE);
4124 
4125     // Find the first different characters in the longwords and
4126     // compute their difference.
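         // rscratch2 holds the XOR of the two mismatching chunks: rev + clz yield the bit
         // offset of the first differing byte in memory order, and rounding it down to a
         // multiple of 16 (andr with -16) selects the differing char, which lsrv/uxthw
         // then extract from each side for the subtraction.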
4127     __ bind(CALCULATE_DIFFERENCE);
4128       __ rev(rscratch2, rscratch2);
4129       __ clz(rscratch2, rscratch2);
4130       __ andr(rscratch2, rscratch2, -16);
4131       __ lsrv(tmp1, tmp1, rscratch2);
4132       __ uxthw(tmp1, tmp1);
4133       __ lsrv(rscratch1, rscratch1, rscratch2);
4134       __ uxthw(rscratch1, rscratch1);
4135       __ subw(result, tmp1, rscratch1);
4136     __ bind(DONE);
4137       __ ret(lr);
4138     return entry;
4139   }
4140 
4141   // r0  = result
4142   // r1  = str1
4143   // r2  = cnt1
4144   // r3  = str2
4145   // r4  = cnt2
4146   // r10 = tmp1
4147   // r11 = tmp2
4148   address generate_compare_long_string_same_encoding(bool isLL) {
4149     __ align(CodeEntryAlignment);
4150     StubCodeMark mark(this, "StubRoutines", isLL
4151         ? "compare_long_string_same_encoding LL"
4152         : "compare_long_string_same_encoding UU");
4153     address entry = __ pc();
4154     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4155         tmp1 = r10, tmp2 = r11;
4156     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4157         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4158         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4159     // exit from the large loop when fewer than 64 bytes are left to read or we're about
4160     // to prefetch memory beyond the array border
4161     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4162     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4163     // Update the cnt2 counter to account for the 8 bytes already loaded
4164     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4165     // update pointers, because of previous read
4166     __ add(str1, str1, wordSize);
4167     __ add(str2, str2, wordSize);
4168     if (SoftwarePrefetchHintDistance >= 0) {
4169       __ bind(LARGE_LOOP_PREFETCH);
4170         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4171         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4172         compare_string_16_bytes_same(DIFF, DIFF2);
4173         compare_string_16_bytes_same(DIFF, DIFF2);
4174         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4175         compare_string_16_bytes_same(DIFF, DIFF2);
4176         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4177         compare_string_16_bytes_same(DIFF, DIFF2);
4178         __ br(__ GT, LARGE_LOOP_PREFETCH);
4179         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4180     }
4181     // less than 16 bytes left?
4182     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4183     __ br(__ LT, TAIL);
4184     __ bind(SMALL_LOOP);
4185       compare_string_16_bytes_same(DIFF, DIFF2);
4186       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4187       __ br(__ GE, SMALL_LOOP);
4188     __ bind(TAIL);
4189       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4190       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4191       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4192       __ br(__ LE, CHECK_LAST);
4193       __ eor(rscratch2, tmp1, tmp2);
4194       __ cbnz(rscratch2, DIFF);
4195       __ ldr(tmp1, Address(__ post(str1, 8)));
4196       __ ldr(tmp2, Address(__ post(str2, 8)));
4197       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4198     __ bind(CHECK_LAST);
4199       if (!isLL) {
4200         __ add(cnt2, cnt2, cnt2); // now in bytes
4201       }
4202       __ eor(rscratch2, tmp1, tmp2);
4203       __ cbnz(rscratch2, DIFF);
4204       __ ldr(rscratch1, Address(str1, cnt2));
4205       __ ldr(cnt1, Address(str2, cnt2));
4206       __ eor(rscratch2, rscratch1, cnt1);
4207       __ cbz(rscratch2, LENGTH_DIFF);
4208       // Find the first different characters in the longwords and
4209       // compute their difference.
4210     __ bind(DIFF2);
4211       __ rev(rscratch2, rscratch2);
4212       __ clz(rscratch2, rscratch2);
4213       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4214       __ lsrv(rscratch1, rscratch1, rscratch2);
4215       if (isLL) {
4216         __ lsrv(cnt1, cnt1, rscratch2);
4217         __ uxtbw(rscratch1, rscratch1);
4218         __ uxtbw(cnt1, cnt1);
4219       } else {
4220         __ lsrv(cnt1, cnt1, rscratch2);
4221         __ uxthw(rscratch1, rscratch1);
4222         __ uxthw(cnt1, cnt1);
4223       }
4224       __ subw(result, rscratch1, cnt1);
4225       __ b(LENGTH_DIFF);
4226     __ bind(DIFF);
4227       __ rev(rscratch2, rscratch2);
4228       __ clz(rscratch2, rscratch2);
4229       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4230       __ lsrv(tmp1, tmp1, rscratch2);
4231       if (isLL) {
4232         __ lsrv(tmp2, tmp2, rscratch2);
4233         __ uxtbw(tmp1, tmp1);
4234         __ uxtbw(tmp2, tmp2);
4235       } else {
4236         __ lsrv(tmp2, tmp2, rscratch2);
4237         __ uxthw(tmp1, tmp1);
4238         __ uxthw(tmp2, tmp2);
4239       }
4240       __ subw(result, tmp1, tmp2);
4241       __ b(LENGTH_DIFF);
4242     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4243       __ eor(rscratch2, tmp1, tmp2);
4244       __ cbnz(rscratch2, DIFF);
4245     __ bind(LENGTH_DIFF);
4246       __ ret(lr);
4247     return entry;
4248   }
4249 
4250   void generate_compare_long_strings() {
4251       StubRoutines::aarch64::_compare_long_string_LL
4252           = generate_compare_long_string_same_encoding(true);
4253       StubRoutines::aarch64::_compare_long_string_UU
4254           = generate_compare_long_string_same_encoding(false);
4255       StubRoutines::aarch64::_compare_long_string_LU
4256           = generate_compare_long_string_different_encoding(true);
4257       StubRoutines::aarch64::_compare_long_string_UL
4258           = generate_compare_long_string_different_encoding(false);
4259   }
4260 
4261   // R0 = result
4262   // R1 = str2
4263   // R2 = cnt1
4264   // R3 = str1
4265   // R4 = cnt2
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip its initial loading (helps on systems with
  // a single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character, with
  // fewer branches (1 branch per loaded register instead of a branch per
  // character); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from (illustrated just below)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving
  // a few loads compared with a "simpler-but-slower" implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations
  // are issued during loads or branches, so the penalty is minimal
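  //
  // A sketch of idea 2) (not the exact generated code): with the first
  // pattern character replicated into every byte of "first",
  //   eor ch2, first, ch2
  // zeroes exactly those bytes of the loaded str2 word that match, and the
  // classic SWAR zero-byte test
  //   tmp2 = (ch2 - 0x0101010101010101) & ~(ch2 | 0x7f7f7f7f7f7f7f7f)
  // is non-zero iff some byte of ch2 is zero, i.e. iff this str2 word
  // contains the first pattern character.  One compare-and-branch per loaded
  // register therefore finds all candidate positions; each candidate is
  // still verified by the character-compare loops below.  The 16-bit
  // constants play the same role for UTF-16 strings.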
4280   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4281     const char* stubName = str1_isL
4282         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4283         : "indexof_linear_uu";
4284     __ align(CodeEntryAlignment);
4285     StubCodeMark mark(this, "StubRoutines", stubName);
4286     address entry = __ pc();
4287 
4288     int str1_chr_size = str1_isL ? 1 : 2;
4289     int str2_chr_size = str2_isL ? 1 : 2;
4290     int str1_chr_shift = str1_isL ? 0 : 1;
4291     int str2_chr_shift = str2_isL ? 0 : 1;
4292     bool isL = str1_isL && str2_isL;
    // parameters
4294     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4295     // temporary registers
4296     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4297     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4298     // redefinitions
4299     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4300 
4301     __ push(spilled_regs, sp);
4302     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4303         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4304         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4305         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4306         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4307         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read whole register from str1. It is safe because length >= 8 here
4309     __ ldr(ch1, Address(str1));
    // Read whole register from str2. It is safe because length >= 8 here
4311     __ ldr(ch2, Address(str2));
4312     __ sub(cnt2, cnt2, cnt1);
4313     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4314     if (str1_isL != str2_isL) {
4315       __ eor(v0, __ T16B, v0, v0);
4316     }
4317     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4318     __ mul(first, first, tmp1);
    // check if we have fewer than one register's worth of characters to check
4320     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4321     if (str1_isL != str2_isL) {
4322       __ fmovd(v1, ch1);
4323     }
4324     __ br(__ LE, L_SMALL);
4325     __ eor(ch2, first, ch2);
4326     if (str1_isL != str2_isL) {
4327       __ zip1(v1, __ T16B, v1, v0);
4328     }
4329     __ sub(tmp2, ch2, tmp1);
4330     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4331     __ bics(tmp2, tmp2, ch2);
4332     if (str1_isL != str2_isL) {
4333       __ fmovd(ch1, v1);
4334     }
4335     __ br(__ NE, L_HAS_ZERO);
4336     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4337     __ add(result, result, wordSize/str2_chr_size);
4338     __ add(str2, str2, wordSize);
4339     __ br(__ LT, L_POST_LOOP);
4340     __ BIND(L_LOOP);
4341       __ ldr(ch2, Address(str2));
4342       __ eor(ch2, first, ch2);
4343       __ sub(tmp2, ch2, tmp1);
4344       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4345       __ bics(tmp2, tmp2, ch2);
4346       __ br(__ NE, L_HAS_ZERO);
4347     __ BIND(L_LOOP_PROCEED);
4348       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4349       __ add(str2, str2, wordSize);
4350       __ add(result, result, wordSize/str2_chr_size);
4351       __ br(__ GE, L_LOOP);
4352     __ BIND(L_POST_LOOP);
4353       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4354       __ br(__ LE, NOMATCH);
4355       __ ldr(ch2, Address(str2));
4356       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4357       __ eor(ch2, first, ch2);
4358       __ sub(tmp2, ch2, tmp1);
4359       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4360       __ mov(tmp4, -1); // all bits set
4361       __ b(L_SMALL_PROCEED);
4362     __ align(OptoLoopAlignment);
4363     __ BIND(L_SMALL);
4364       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4365       __ eor(ch2, first, ch2);
4366       if (str1_isL != str2_isL) {
4367         __ zip1(v1, __ T16B, v1, v0);
4368       }
4369       __ sub(tmp2, ch2, tmp1);
4370       __ mov(tmp4, -1); // all bits set
4371       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4372       if (str1_isL != str2_isL) {
4373         __ fmovd(ch1, v1); // move converted 4 symbols
4374       }
4375     __ BIND(L_SMALL_PROCEED);
4376       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4377       __ bic(tmp2, tmp2, ch2);
4378       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4379       __ rbit(tmp2, tmp2);
4380       __ br(__ EQ, NOMATCH);
4381     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4383       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4384       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4385       if (str2_isL) { // LL
4386         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4387         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4388         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4389         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4390         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4391       } else {
4392         __ mov(ch2, 0xE); // all bits in byte set except last one
4393         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4394         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4395         __ lslv(tmp2, tmp2, tmp4);
4396         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4397         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4398         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4399         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4400       }
4401       __ cmp(ch1, ch2);
4402       __ mov(tmp4, wordSize/str2_chr_size);
4403       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4404     __ BIND(L_SMALL_CMP_LOOP);
4405       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4406                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4407       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4408                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4409       __ add(tmp4, tmp4, 1);
4410       __ cmp(tmp4, cnt1);
4411       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4412       __ cmp(first, ch2);
4413       __ br(__ EQ, L_SMALL_CMP_LOOP);
4414     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4415       __ cbz(tmp2, NOMATCH); // no more matches. exit
4416       __ clz(tmp4, tmp2);
4417       __ add(result, result, 1); // advance index
4418       __ add(str2, str2, str2_chr_size); // advance pointer
4419       __ b(L_SMALL_HAS_ZERO_LOOP);
4420     __ align(OptoLoopAlignment);
4421     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4422       __ cmp(first, ch2);
4423       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4424       __ b(DONE);
4425     __ align(OptoLoopAlignment);
4426     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4427       if (str2_isL) { // LL
4428         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4429         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4430         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4431         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4432         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4433       } else {
4434         __ mov(ch2, 0xE); // all bits in byte set except last one
4435         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4436         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4437         __ lslv(tmp2, tmp2, tmp4);
4438         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4439         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4440         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4441         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4442       }
4443       __ cmp(ch1, ch2);
4444       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4445       __ b(DONE);
4446     __ align(OptoLoopAlignment);
4447     __ BIND(L_HAS_ZERO);
4448       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; they are restored on exit, so cnt1 can be re-used in this
      // loop.
4453       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4454       __ sub(result, result, 1);
4455     __ BIND(L_HAS_ZERO_LOOP);
4456       __ mov(cnt1, wordSize/str2_chr_size);
4457       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4458       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4459       if (str2_isL) {
4460         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4461         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4462         __ lslv(tmp2, tmp2, tmp4);
4463         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4464         __ add(tmp4, tmp4, 1);
4465         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4466         __ lsl(tmp2, tmp2, 1);
4467         __ mov(tmp4, wordSize/str2_chr_size);
4468       } else {
4469         __ mov(ch2, 0xE);
4470         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4471         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4472         __ lslv(tmp2, tmp2, tmp4);
4473         __ add(tmp4, tmp4, 1);
4474         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4475         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4476         __ lsl(tmp2, tmp2, 1);
4477         __ mov(tmp4, wordSize/str2_chr_size);
4478         __ sub(str2, str2, str2_chr_size);
4479       }
4480       __ cmp(ch1, ch2);
4481       __ mov(tmp4, wordSize/str2_chr_size);
4482       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4483     __ BIND(L_CMP_LOOP);
4484       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4485                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4486       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4487                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4488       __ add(tmp4, tmp4, 1);
4489       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4490       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4491       __ cmp(cnt1, ch2);
4492       __ br(__ EQ, L_CMP_LOOP);
4493     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
4495       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4496       __ clz(tmp4, tmp2);
4497       __ add(str2, str2, str2_chr_size); // advance pointer
4498       __ b(L_HAS_ZERO_LOOP);
4499     __ align(OptoLoopAlignment);
4500     __ BIND(L_CMP_LOOP_LAST_CMP);
4501       __ cmp(cnt1, ch2);
4502       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4503       __ b(DONE);
4504     __ align(OptoLoopAlignment);
4505     __ BIND(L_CMP_LOOP_LAST_CMP2);
4506       if (str2_isL) {
4507         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4508         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4509         __ lslv(tmp2, tmp2, tmp4);
4510         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4511         __ add(tmp4, tmp4, 1);
4512         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4513         __ lsl(tmp2, tmp2, 1);
4514       } else {
4515         __ mov(ch2, 0xE);
4516         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4517         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4518         __ lslv(tmp2, tmp2, tmp4);
4519         __ add(tmp4, tmp4, 1);
4520         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4521         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4522         __ lsl(tmp2, tmp2, 1);
4523         __ sub(str2, str2, str2_chr_size);
4524       }
4525       __ cmp(ch1, ch2);
4526       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4527       __ b(DONE);
4528     __ align(OptoLoopAlignment);
4529     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4530       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
4531       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
4532       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
4533       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
4534       // result by analyzed characters value, so, we can just reset lower bits
4535       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
4536       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
4537       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
4538       // index of last analyzed substring inside current octet. So, str2 in at
4539       // respective start address. We need to advance it to next octet
4540       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4541       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4542       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4543       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4544       __ movw(cnt2, cnt2);
4545       __ b(L_LOOP_PROCEED);
4546     __ align(OptoLoopAlignment);
4547     __ BIND(NOMATCH);
4548       __ mov(result, -1);
4549     __ BIND(DONE);
4550       __ pop(spilled_regs, sp);
4551       __ ret(lr);
4552     return entry;
4553   }
4554 
4555   void generate_string_indexof_stubs() {
4556     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4557     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4558     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4559   }
4560 
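  // Helper for large_byte_array_inflate: widens 2*16 Latin-1 bytes held in
  // src1/src2 to 32 UTF-16 chars and stores them at dst (r1), post-
  // incrementing dst by 64 bytes.  v0 is expected to be zero on entry (see
  // the register comment on generate_large_byte_array_inflate): zip1/zip2
  // interleave each source byte with a zero byte, which is exactly the
  // little-endian widening of a Latin-1 byte to a 16-bit char, i.e. roughly
  //   dst_char[i] = (jchar)(src_byte[i] & 0xff);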
4561   void inflate_and_store_2_fp_registers(bool generatePrfm,
4562       FloatRegister src1, FloatRegister src2) {
4563     Register dst = r1;
4564     __ zip1(v1, __ T16B, src1, v0);
4565     __ zip2(v2, __ T16B, src1, v0);
4566     if (generatePrfm) {
4567       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4568     }
4569     __ zip1(v3, __ T16B, src2, v0);
4570     __ zip2(v4, __ T16B, src2, v0);
4571     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4572   }
4573 
4574   // R0 = src
4575   // R1 = dst
4576   // R2 = len
4577   // R3 = len >> 3
  // v0 = 0
  // v1 = loaded 8 bytes
4580   address generate_large_byte_array_inflate() {
4581     __ align(CodeEntryAlignment);
4582     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4583     address entry = __ pc();
4584     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4585     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4586     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4587 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can be used
4590     __ ldrd(v2, __ post(src, 8));
4591     __ sub(octetCounter, octetCounter, 2);
4592     __ zip1(v1, __ T16B, v1, v0);
4593     __ zip1(v2, __ T16B, v2, v0);
4594     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4595     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4596     __ subs(rscratch1, octetCounter, large_loop_threshold);
4597     __ br(__ LE, LOOP_START);
4598     __ b(LOOP_PRFM_START);
4599     __ bind(LOOP_PRFM);
4600       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4601     __ bind(LOOP_PRFM_START);
4602       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4603       __ sub(octetCounter, octetCounter, 8);
4604       __ subs(rscratch1, octetCounter, large_loop_threshold);
4605       inflate_and_store_2_fp_registers(true, v3, v4);
4606       inflate_and_store_2_fp_registers(true, v5, v6);
4607       __ br(__ GT, LOOP_PRFM);
4608       __ cmp(octetCounter, (u1)8);
4609       __ br(__ LT, DONE);
4610     __ bind(LOOP);
4611       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4612       __ bind(LOOP_START);
4613       __ sub(octetCounter, octetCounter, 8);
4614       __ cmp(octetCounter, (u1)8);
4615       inflate_and_store_2_fp_registers(false, v3, v4);
4616       inflate_and_store_2_fp_registers(false, v5, v6);
4617       __ br(__ GE, LOOP);
4618     __ bind(DONE);
4619       __ ret(lr);
4620     return entry;
4621   }
4622 
4623   /**
4624    *  Arguments:
4625    *
4626    *  Input:
4627    *  c_rarg0   - current state address
4628    *  c_rarg1   - H key address
4629    *  c_rarg2   - data address
4630    *  c_rarg3   - number of blocks
4631    *
4632    *  Output:
4633    *  Updated state at c_rarg0
4634    */
4635   address generate_ghash_processBlocks() {
4636     // Bafflingly, GCM uses little-endian for the byte order, but
4637     // big-endian for the bit order.  For example, the polynomial 1 is
4638     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4639     //
4640     // So, we must either reverse the bytes in each word and do
4641     // everything big-endian or reverse the bits in each byte and do
4642     // it little-endian.  On AArch64 it's more idiomatic to reverse
4643     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4645     // calculation, bit-reversing the inputs and outputs.
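    //
    // For example, the field element 1 (the byte string 80 00 ... 00)
    // becomes 01 00 ... 00 once the bits of each byte are reversed, which is
    // just the ordinary little-endian integer 1.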
4646 
4647     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4648     __ align(wordSize * 2);
4649     address p = __ pc();
4650     __ emit_int64(0x87);  // The low-order bits of the field
4651                           // polynomial (i.e. p = z^7+z^2+z+1)
4652                           // repeated in the low and high parts of a
4653                           // 128-bit vector
4654     __ emit_int64(0x87);
4655 
4656     __ align(CodeEntryAlignment);
4657     address start = __ pc();
4658 
4659     Register state   = c_rarg0;
4660     Register subkeyH = c_rarg1;
4661     Register data    = c_rarg2;
4662     Register blocks  = c_rarg3;
4663 
4664     FloatRegister vzr = v30;
4665     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4666 
4667     __ ldrq(v0, Address(state));
4668     __ ldrq(v1, Address(subkeyH));
4669 
4670     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4671     __ rbit(v0, __ T16B, v0);
4672     __ rev64(v1, __ T16B, v1);
4673     __ rbit(v1, __ T16B, v1);
4674 
4675     __ ldrq(v26, p);
4676 
4677     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4678     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
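    // (Karatsuba for the carry-less product: A*B is assembled from three
    // 64x64-bit multiplies - lo = A0*B0, hi = A1*B1 and
    // mid = (A1^A0)*(B1^B0) ^ lo ^ hi - so the (A1^A0) term of the fixed
    // subkey is precomputed once here and handed to ghash_multiply below.)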
4679 
4680     {
4681       Label L_ghash_loop;
4682       __ bind(L_ghash_loop);
4683 
4684       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4685                                                  // reversing each byte
4686       __ rbit(v2, __ T16B, v2);
4687       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4688 
4689       // Multiply state in v2 by subkey in v1
4690       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4691                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4692                      /*temps*/v6, v20, v18, v21);
4693       // Reduce v7:v5 by the field polynomial
4694       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4695 
4696       __ sub(blocks, blocks, 1);
4697       __ cbnz(blocks, L_ghash_loop);
4698     }
4699 
4700     // The bit-reversed result is at this point in v0
4701     __ rev64(v1, __ T16B, v0);
4702     __ rbit(v1, __ T16B, v1);
4703 
4704     __ st1(v1, __ T16B, state);
4705     __ ret(lr);
4706 
4707     return start;
4708   }
4709 
4710   // Continuation point for throwing of implicit exceptions that are
4711   // not handled in the current activation. Fabricates an exception
4712   // oop and initiates normal exception dispatching in this
4713   // frame. Since we need to preserve callee-saved values (currently
4714   // only for C2, but done for C1 as well) we need a callee-saved oop
4715   // map and therefore have to make these stubs into RuntimeStubs
4716   // rather than BufferBlobs.  If the compiler needs all registers to
4717   // be preserved between the fault point and the exception handler
4718   // then it must assume responsibility for that in
4719   // AbstractCompiler::continuation_for_implicit_null_exception or
4720   // continuation_for_implicit_division_by_zero_exception. All other
4721   // implicit exceptions (e.g., NullPointerException or
4722   // AbstractMethodError on entry) are either at call sites or
4723   // otherwise assume that stack unwinding will be initiated, so
4724   // caller saved registers were assumed volatile in the compiler.
4725 
4726 #undef __
4727 #define __ masm->
4728 
4729   address generate_throw_exception(const char* name,
4730                                    address runtime_entry,
4731                                    Register arg1 = noreg,
4732                                    Register arg2 = noreg) {
4733     // Information about frame layout at time of blocking runtime call.
4734     // Note that we only have to preserve callee-saved registers since
4735     // the compilers are responsible for supplying a continuation point
4736     // if they expect all registers to be preserved.
4737     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4738     enum layout {
4739       rfp_off = 0,
4740       rfp_off2,
4741       return_off,
4742       return_off2,
4743       framesize // inclusive of return address
4744     };
4745 
4746     int insts_size = 512;
4747     int locs_size  = 64;
4748 
4749     CodeBuffer code(name, insts_size, locs_size);
4750     OopMapSet* oop_maps  = new OopMapSet();
4751     MacroAssembler* masm = new MacroAssembler(&code);
4752 
4753     address start = __ pc();
4754 
4755     // This is an inlined and slightly modified version of call_VM
4756     // which has the ability to fetch the return PC out of
4757     // thread-local storage and also sets up last_Java_sp slightly
4758     // differently than the real call_VM
4759 
4760     __ enter(); // Save FP and LR before call
4761 
4762     assert(is_even(framesize/2), "sp not 16-byte aligned");
4763 
4764     // lr and fp are already in place
4765     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4766 
4767     int frame_complete = __ pc() - start;
4768 
4769     // Set up last_Java_sp and last_Java_fp
4770     address the_pc = __ pc();
4771     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4772 
4773     // Call runtime
4774     if (arg1 != noreg) {
4775       assert(arg2 != c_rarg1, "clobbered");
4776       __ mov(c_rarg1, arg1);
4777     }
4778     if (arg2 != noreg) {
4779       __ mov(c_rarg2, arg2);
4780     }
4781     __ mov(c_rarg0, rthread);
4782     BLOCK_COMMENT("call runtime_entry");
4783     __ mov(rscratch1, runtime_entry);
4784     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4785 
4786     // Generate oop map
4787     OopMap* map = new OopMap(framesize, 0);
4788 
4789     oop_maps->add_gc_map(the_pc - start, map);
4790 
4791     __ reset_last_Java_frame(true);
4792     __ maybe_isb();
4793 
4794     __ leave();
4795 
4796     // check for pending exceptions
4797 #ifdef ASSERT
4798     Label L;
4799     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4800     __ cbnz(rscratch1, L);
4801     __ should_not_reach_here();
4802     __ bind(L);
4803 #endif // ASSERT
4804     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4805 
4806 
4807     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4808     RuntimeStub* stub =
4809       RuntimeStub::new_runtime_stub(name,
4810                                     &code,
4811                                     frame_complete,
4812                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4813                                     oop_maps, false);
4814     return stub->entry_point();
4815   }
4816 
4817   class MontgomeryMultiplyGenerator : public MacroAssembler {
4818 
4819     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4820       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4821 
4822     RegSet _toSave;
4823     bool _squaring;
4824 
4825   public:
4826     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4827       : MacroAssembler(as->code()), _squaring(squaring) {
4828 
4829       // Register allocation
4830 
4831       Register reg = c_rarg0;
4832       Pa_base = reg;       // Argument registers
4833       if (squaring)
4834         Pb_base = Pa_base;
4835       else
4836         Pb_base = ++reg;
4837       Pn_base = ++reg;
4838       Rlen= ++reg;
4839       inv = ++reg;
4840       Pm_base = ++reg;
4841 
4842                           // Working registers:
4843       Ra =  ++reg;        // The current digit of a, b, n, and m.
4844       Rb =  ++reg;
4845       Rm =  ++reg;
4846       Rn =  ++reg;
4847 
4848       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4849       Pb =  ++reg;
4850       Pm =  ++reg;
4851       Pn =  ++reg;
4852 
4853       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4855       t2 =  ++reg;
4856 
4857       Ri =  ++reg;        // Inner and outer loop indexes.
4858       Rj =  ++reg;
4859 
4860       Rhi_ab = ++reg;     // Product registers: low and high parts
4861       Rlo_ab = ++reg;     // of a*b and m*n.
4862       Rhi_mn = ++reg;
4863       Rlo_mn = ++reg;
4864 
4865       // r19 and up are callee-saved.
4866       _toSave = RegSet::range(r19, reg) + Pm_base;
4867     }
4868 
4869   private:
4870     void save_regs() {
4871       push(_toSave, sp);
4872     }
4873 
4874     void restore_regs() {
4875       pop(_toSave, sp);
4876     }
4877 
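    // Run "block" exactly "count" times (count may be zero), unrolled two at
    // a time: an odd count enters the loop at its second copy of the block,
    // so every pass through the loop proper executes the block twice.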
4878     template <typename T>
4879     void unroll_2(Register count, T block) {
4880       Label loop, end, odd;
4881       tbnz(count, 0, odd);
4882       cbz(count, end);
4883       align(16);
4884       bind(loop);
4885       (this->*block)();
4886       bind(odd);
4887       (this->*block)();
4888       subs(count, count, 2);
4889       br(Assembler::GT, loop);
4890       bind(end);
4891     }
4892 
4893     template <typename T>
4894     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4895       Label loop, end, odd;
4896       tbnz(count, 0, odd);
4897       cbz(count, end);
4898       align(16);
4899       bind(loop);
4900       (this->*block)(d, s, tmp);
4901       bind(odd);
4902       (this->*block)(d, s, tmp);
4903       subs(count, count, 2);
4904       br(Assembler::GT, loop);
4905       bind(end);
4906     }
4907 
4908     void pre1(RegisterOrConstant i) {
4909       block_comment("pre1");
4910       // Pa = Pa_base;
4911       // Pb = Pb_base + i;
4912       // Pm = Pm_base;
4913       // Pn = Pn_base + i;
4914       // Ra = *Pa;
4915       // Rb = *Pb;
4916       // Rm = *Pm;
4917       // Rn = *Pn;
4918       ldr(Ra, Address(Pa_base));
4919       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4920       ldr(Rm, Address(Pm_base));
4921       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4922       lea(Pa, Address(Pa_base));
4923       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4924       lea(Pm, Address(Pm_base));
4925       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4926 
4927       // Zero the m*n result.
4928       mov(Rhi_mn, zr);
4929       mov(Rlo_mn, zr);
4930     }
4931 
4932     // The core multiply-accumulate step of a Montgomery
4933     // multiplication.  The idea is to schedule operations as a
4934     // pipeline so that instructions with long latencies (loads and
4935     // multiplies) have time to complete before their results are
4936     // used.  This most benefits in-order implementations of the
4937     // architecture but out-of-order ones also benefit.
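    // In the commented-out C here and in the pseudocode after
    // generate_multiply(), MACC(A, B, t0, t1, t2) denotes a multiply-
    // accumulate into the triple-precision accumulator, roughly
    //   t2:t1:t0 += (unsigned __int128)A * B;
    // which is what the umulh/mul pair together with acc() implement.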
4938     void step() {
4939       block_comment("step");
4940       // MACC(Ra, Rb, t0, t1, t2);
4941       // Ra = *++Pa;
4942       // Rb = *--Pb;
4943       umulh(Rhi_ab, Ra, Rb);
4944       mul(Rlo_ab, Ra, Rb);
4945       ldr(Ra, pre(Pa, wordSize));
4946       ldr(Rb, pre(Pb, -wordSize));
4947       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4948                                        // previous iteration.
4949       // MACC(Rm, Rn, t0, t1, t2);
4950       // Rm = *++Pm;
4951       // Rn = *--Pn;
4952       umulh(Rhi_mn, Rm, Rn);
4953       mul(Rlo_mn, Rm, Rn);
4954       ldr(Rm, pre(Pm, wordSize));
4955       ldr(Rn, pre(Pn, -wordSize));
4956       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4957     }
4958 
4959     void post1() {
4960       block_comment("post1");
4961 
4962       // MACC(Ra, Rb, t0, t1, t2);
4963       // Ra = *++Pa;
4964       // Rb = *--Pb;
4965       umulh(Rhi_ab, Ra, Rb);
4966       mul(Rlo_ab, Ra, Rb);
4967       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4968       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4969 
4970       // *Pm = Rm = t0 * inv;
4971       mul(Rm, t0, inv);
4972       str(Rm, Address(Pm));
4973 
4974       // MACC(Rm, Rn, t0, t1, t2);
4975       // t0 = t1; t1 = t2; t2 = 0;
4976       umulh(Rhi_mn, Rm, Rn);
4977 
4978 #ifndef PRODUCT
4979       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4980       {
4981         mul(Rlo_mn, Rm, Rn);
4982         add(Rlo_mn, t0, Rlo_mn);
4983         Label ok;
4984         cbz(Rlo_mn, ok); {
4985           stop("broken Montgomery multiply");
4986         } bind(ok);
4987       }
4988 #endif
4989       // We have very carefully set things up so that
4990       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4991       // the lower half of Rm * Rn because we know the result already:
4992       // it must be -t0.  t0 + (-t0) must generate a carry iff
4993       // t0 != 0.  So, rather than do a mul and an adds we just set
4994       // the carry flag iff t0 is nonzero.
4995       //
4996       // mul(Rlo_mn, Rm, Rn);
4997       // adds(zr, t0, Rlo_mn);
4998       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4999       adcs(t0, t1, Rhi_mn);
5000       adc(t1, t2, zr);
5001       mov(t2, zr);
5002     }
5003 
5004     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5005       block_comment("pre2");
5006       // Pa = Pa_base + i-len;
5007       // Pb = Pb_base + len;
5008       // Pm = Pm_base + i-len;
5009       // Pn = Pn_base + len;
5010 
5011       if (i.is_register()) {
5012         sub(Rj, i.as_register(), len);
5013       } else {
5014         mov(Rj, i.as_constant());
5015         sub(Rj, Rj, len);
5016       }
5017       // Rj == i-len
5018 
5019       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5020       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5021       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5022       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5023 
5024       // Ra = *++Pa;
5025       // Rb = *--Pb;
5026       // Rm = *++Pm;
5027       // Rn = *--Pn;
5028       ldr(Ra, pre(Pa, wordSize));
5029       ldr(Rb, pre(Pb, -wordSize));
5030       ldr(Rm, pre(Pm, wordSize));
5031       ldr(Rn, pre(Pn, -wordSize));
5032 
5033       mov(Rhi_mn, zr);
5034       mov(Rlo_mn, zr);
5035     }
5036 
5037     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5038       block_comment("post2");
5039       if (i.is_constant()) {
5040         mov(Rj, i.as_constant()-len.as_constant());
5041       } else {
5042         sub(Rj, i.as_register(), len);
5043       }
5044 
5045       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5046 
5047       // As soon as we know the least significant digit of our result,
5048       // store it.
5049       // Pm_base[i-len] = t0;
5050       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5051 
5052       // t0 = t1; t1 = t2; t2 = 0;
5053       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5054       adc(t1, t2, zr);
5055       mov(t2, zr);
5056     }
5057 
5058     // A carry in t0 after Montgomery multiplication means that we
5059     // should subtract multiples of n from our result in m.  We'll
5060     // keep doing that until there is no carry.
5061     void normalize(RegisterOrConstant len) {
5062       block_comment("normalize");
5063       // while (t0)
5064       //   t0 = sub(Pm_base, Pn_base, t0, len);
5065       Label loop, post, again;
5066       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5067       cbz(t0, post); {
5068         bind(again); {
5069           mov(i, zr);
5070           mov(cnt, len);
5071           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5072           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5073           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5074           align(16);
5075           bind(loop); {
5076             sbcs(Rm, Rm, Rn);
5077             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5078             add(i, i, 1);
5079             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5080             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5081             sub(cnt, cnt, 1);
5082           } cbnz(cnt, loop);
5083           sbc(t0, t0, zr);
5084         } cbnz(t0, again);
5085       } bind(post);
5086     }
5087 
5088     // Move memory at s to d, reversing words.
5089     //    Increments d to end of copied memory
5090     //    Destroys tmp1, tmp2
5091     //    Preserves len
5092     //    Leaves s pointing to the address which was in d at start
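    //    Net effect: the order of the 32-bit ints in the buffer is reversed,
    //    since the 64-bit words are copied in reverse order and each one is
    //    rotated by 32 bits on the way through.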
5093     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5094       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5095 
5096       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5097       mov(tmp1, len);
5098       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5099       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5100     }
5101     // where
5102     void reverse1(Register d, Register s, Register tmp) {
5103       ldr(tmp, pre(s, -wordSize));
5104       ror(tmp, tmp, 32);
5105       str(tmp, post(d, wordSize));
5106     }
5107 
5108     void step_squaring() {
5109       // An extra ACC
5110       step();
5111       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5112     }
5113 
5114     void last_squaring(RegisterOrConstant i) {
5115       Label dont;
5116       // if ((i & 1) == 0) {
5117       tbnz(i.as_register(), 0, dont); {
5118         // MACC(Ra, Rb, t0, t1, t2);
5119         // Ra = *++Pa;
5120         // Rb = *--Pb;
5121         umulh(Rhi_ab, Ra, Rb);
5122         mul(Rlo_ab, Ra, Rb);
5123         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5124       } bind(dont);
5125     }
5126 
5127     void extra_step_squaring() {
5128       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5129 
5130       // MACC(Rm, Rn, t0, t1, t2);
5131       // Rm = *++Pm;
5132       // Rn = *--Pn;
5133       umulh(Rhi_mn, Rm, Rn);
5134       mul(Rlo_mn, Rm, Rn);
5135       ldr(Rm, pre(Pm, wordSize));
5136       ldr(Rn, pre(Pn, -wordSize));
5137     }
5138 
5139     void post1_squaring() {
5140       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5141 
5142       // *Pm = Rm = t0 * inv;
5143       mul(Rm, t0, inv);
5144       str(Rm, Address(Pm));
5145 
5146       // MACC(Rm, Rn, t0, t1, t2);
5147       // t0 = t1; t1 = t2; t2 = 0;
5148       umulh(Rhi_mn, Rm, Rn);
5149 
5150 #ifndef PRODUCT
5151       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5152       {
5153         mul(Rlo_mn, Rm, Rn);
5154         add(Rlo_mn, t0, Rlo_mn);
5155         Label ok;
5156         cbz(Rlo_mn, ok); {
5157           stop("broken Montgomery multiply");
5158         } bind(ok);
5159       }
5160 #endif
5161       // We have very carefully set things up so that
5162       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5163       // the lower half of Rm * Rn because we know the result already:
5164       // it must be -t0.  t0 + (-t0) must generate a carry iff
5165       // t0 != 0.  So, rather than do a mul and an adds we just set
5166       // the carry flag iff t0 is nonzero.
5167       //
5168       // mul(Rlo_mn, Rm, Rn);
5169       // adds(zr, t0, Rlo_mn);
5170       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5171       adcs(t0, t1, Rhi_mn);
5172       adc(t1, t2, zr);
5173       mov(t2, zr);
5174     }
5175 
5176     void acc(Register Rhi, Register Rlo,
5177              Register t0, Register t1, Register t2) {
5178       adds(t0, t0, Rlo);
5179       adcs(t1, t1, Rhi);
5180       adc(t2, t2, zr);
5181     }
5182 
5183   public:
5184     /**
5185      * Fast Montgomery multiplication.  The derivation of the
5186      * algorithm is in A Cryptographic Library for the Motorola
5187      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5188      *
5189      * Arguments:
5190      *
5191      * Inputs for multiplication:
5192      *   c_rarg0   - int array elements a
5193      *   c_rarg1   - int array elements b
5194      *   c_rarg2   - int array elements n (the modulus)
5195      *   c_rarg3   - int length
5196      *   c_rarg4   - int inv
5197      *   c_rarg5   - int array elements m (the result)
5198      *
5199      * Inputs for squaring:
5200      *   c_rarg0   - int array elements a
5201      *   c_rarg1   - int array elements n (the modulus)
5202      *   c_rarg2   - int length
5203      *   c_rarg3   - int inv
5204      *   c_rarg4   - int array elements m (the result)
5205      *
5206      */
5207     address generate_multiply() {
5208       Label argh, nothing;
5209       bind(argh);
5210       stop("MontgomeryMultiply total_allocation must be <= 8192");
5211 
5212       align(CodeEntryAlignment);
5213       address entry = pc();
5214 
5215       cbzw(Rlen, nothing);
5216 
5217       enter();
5218 
5219       // Make room.
5220       cmpw(Rlen, 512);
5221       br(Assembler::HI, argh);
5222       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5223       andr(sp, Ra, -2 * wordSize);
5224 
5225       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5226 
5227       {
5228         // Copy input args, reversing as we go.  We use Ra as a
5229         // temporary variable.
5230         reverse(Ra, Pa_base, Rlen, t0, t1);
5231         if (!_squaring)
5232           reverse(Ra, Pb_base, Rlen, t0, t1);
5233         reverse(Ra, Pn_base, Rlen, t0, t1);
5234       }
5235 
5236       // Push all call-saved registers and also Pm_base which we'll need
5237       // at the end.
5238       save_regs();
5239 
5240 #ifndef PRODUCT
5241       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5242       {
5243         ldr(Rn, Address(Pn_base, 0));
5244         mul(Rlo_mn, Rn, inv);
5245         subs(zr, Rlo_mn, -1);
5246         Label ok;
5247         br(EQ, ok); {
5248           stop("broken inverse in Montgomery multiply");
5249         } bind(ok);
5250       }
5251 #endif
5252 
5253       mov(Pm_base, Ra);
5254 
5255       mov(t0, zr);
5256       mov(t1, zr);
5257       mov(t2, zr);
5258 
5259       block_comment("for (int i = 0; i < len; i++) {");
5260       mov(Ri, zr); {
5261         Label loop, end;
5262         cmpw(Ri, Rlen);
5263         br(Assembler::GE, end);
5264 
5265         bind(loop);
5266         pre1(Ri);
5267 
5268         block_comment("  for (j = i; j; j--) {"); {
5269           movw(Rj, Ri);
5270           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5271         } block_comment("  } // j");
5272 
5273         post1();
5274         addw(Ri, Ri, 1);
5275         cmpw(Ri, Rlen);
5276         br(Assembler::LT, loop);
5277         bind(end);
5278         block_comment("} // i");
5279       }
5280 
5281       block_comment("for (int i = len; i < 2*len; i++) {");
5282       mov(Ri, Rlen); {
5283         Label loop, end;
5284         cmpw(Ri, Rlen, Assembler::LSL, 1);
5285         br(Assembler::GE, end);
5286 
5287         bind(loop);
5288         pre2(Ri, Rlen);
5289 
5290         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5291           lslw(Rj, Rlen, 1);
5292           subw(Rj, Rj, Ri);
5293           subw(Rj, Rj, 1);
5294           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5295         } block_comment("  } // j");
5296 
5297         post2(Ri, Rlen);
5298         addw(Ri, Ri, 1);
5299         cmpw(Ri, Rlen, Assembler::LSL, 1);
5300         br(Assembler::LT, loop);
5301         bind(end);
5302       }
5303       block_comment("} // i");
5304 
5305       normalize(Rlen);
5306 
5307       mov(Ra, Pm_base);  // Save Pm_base in Ra
5308       restore_regs();  // Restore caller's Pm_base
5309 
5310       // Copy our result into caller's Pm_base
5311       reverse(Pm_base, Ra, Rlen, t0, t1);
5312 
5313       leave();
5314       bind(nothing);
5315       ret(lr);
5316 
5317       return entry;
5318     }
5319     // In C, approximately:
5320 
5321     // void
5322     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5323     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5324     //                     unsigned long inv, int len) {
5325     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5326     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5327     //   unsigned long Ra, Rb, Rn, Rm;
5328 
5329     //   int i;
5330 
5331     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5332 
5333     //   for (i = 0; i < len; i++) {
5334     //     int j;
5335 
5336     //     Pa = Pa_base;
5337     //     Pb = Pb_base + i;
5338     //     Pm = Pm_base;
5339     //     Pn = Pn_base + i;
5340 
5341     //     Ra = *Pa;
5342     //     Rb = *Pb;
5343     //     Rm = *Pm;
5344     //     Rn = *Pn;
5345 
5346     //     int iters = i;
5347     //     for (j = 0; iters--; j++) {
5348     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5349     //       MACC(Ra, Rb, t0, t1, t2);
5350     //       Ra = *++Pa;
5351     //       Rb = *--Pb;
5352     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5353     //       MACC(Rm, Rn, t0, t1, t2);
5354     //       Rm = *++Pm;
5355     //       Rn = *--Pn;
5356     //     }
5357 
5358     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5359     //     MACC(Ra, Rb, t0, t1, t2);
5360     //     *Pm = Rm = t0 * inv;
5361     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5362     //     MACC(Rm, Rn, t0, t1, t2);
5363 
5364     //     assert(t0 == 0, "broken Montgomery multiply");
5365 
5366     //     t0 = t1; t1 = t2; t2 = 0;
5367     //   }
5368 
5369     //   for (i = len; i < 2*len; i++) {
5370     //     int j;
5371 
5372     //     Pa = Pa_base + i-len;
5373     //     Pb = Pb_base + len;
5374     //     Pm = Pm_base + i-len;
5375     //     Pn = Pn_base + len;
5376 
5377     //     Ra = *++Pa;
5378     //     Rb = *--Pb;
5379     //     Rm = *++Pm;
5380     //     Rn = *--Pn;
5381 
5382     //     int iters = len*2-i-1;
5383     //     for (j = i-len+1; iters--; j++) {
5384     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5385     //       MACC(Ra, Rb, t0, t1, t2);
5386     //       Ra = *++Pa;
5387     //       Rb = *--Pb;
5388     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5389     //       MACC(Rm, Rn, t0, t1, t2);
5390     //       Rm = *++Pm;
5391     //       Rn = *--Pn;
5392     //     }
5393 
5394     //     Pm_base[i-len] = t0;
5395     //     t0 = t1; t1 = t2; t2 = 0;
5396     //   }
5397 
5398     //   while (t0)
5399     //     t0 = sub(Pm_base, Pn_base, t0, len);
5400     // }
5401 
5402     /**
5403      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5404      * multiplies than Montgomery multiplication so it should be up to
5405      * 25% faster.  However, its loop control is more complex and it
5406      * may actually run slower on some machines.
5407      *
5408      * Arguments:
5409      *
5410      * Inputs:
5411      *   c_rarg0   - int array elements a
5412      *   c_rarg1   - int array elements n (the modulus)
5413      *   c_rarg2   - int length
5414      *   c_rarg3   - int inv
5415      *   c_rarg4   - int array elements m (the result)
5416      *
5417      */
5418     address generate_square() {
5419       Label argh;
5420       bind(argh);
5421       stop("MontgomeryMultiply total_allocation must be <= 8192");
5422 
5423       align(CodeEntryAlignment);
5424       address entry = pc();
5425 
5426       enter();
5427 
5428       // Make room.
5429       cmpw(Rlen, 512);
5430       br(Assembler::HI, argh);
5431       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5432       andr(sp, Ra, -2 * wordSize);
5433 
5434       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5435 
5436       {
5437         // Copy input args, reversing as we go.  We use Ra as a
5438         // temporary variable.
5439         reverse(Ra, Pa_base, Rlen, t0, t1);
5440         reverse(Ra, Pn_base, Rlen, t0, t1);
5441       }
5442 
5443       // Push all call-saved registers and also Pm_base which we'll need
5444       // at the end.
5445       save_regs();
5446 
5447       mov(Pm_base, Ra);
5448 
5449       mov(t0, zr);
5450       mov(t1, zr);
5451       mov(t2, zr);
5452 
5453       block_comment("for (int i = 0; i < len; i++) {");
5454       mov(Ri, zr); {
5455         Label loop, end;
5456         bind(loop);
5457         cmp(Ri, Rlen);
5458         br(Assembler::GE, end);
5459 
5460         pre1(Ri);
5461 
5462         block_comment("for (j = (i+1)/2; j; j--) {"); {
5463           add(Rj, Ri, 1);
5464           lsr(Rj, Rj, 1);
5465           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5466         } block_comment("  } // j");
5467 
5468         last_squaring(Ri);
5469 
5470         block_comment("  for (j = i/2; j; j--) {"); {
5471           lsr(Rj, Ri, 1);
5472           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5473         } block_comment("  } // j");
5474 
5475         post1_squaring();
5476         add(Ri, Ri, 1);
5477         cmp(Ri, Rlen);
5478         br(Assembler::LT, loop);
5479 
5480         bind(end);
5481         block_comment("} // i");
5482       }
5483 
5484       block_comment("for (int i = len; i < 2*len; i++) {");
5485       mov(Ri, Rlen); {
5486         Label loop, end;
5487         bind(loop);
5488         cmp(Ri, Rlen, Assembler::LSL, 1);
5489         br(Assembler::GE, end);
5490 
5491         pre2(Ri, Rlen);
5492 
5493         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5494           lsl(Rj, Rlen, 1);
5495           sub(Rj, Rj, Ri);
5496           sub(Rj, Rj, 1);
5497           lsr(Rj, Rj, 1);
5498           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5499         } block_comment("  } // j");
5500 
5501         last_squaring(Ri);
5502 
5503         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5504           lsl(Rj, Rlen, 1);
5505           sub(Rj, Rj, Ri);
5506           lsr(Rj, Rj, 1);
5507           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5508         } block_comment("  } // j");
5509 
5510         post2(Ri, Rlen);
5511         add(Ri, Ri, 1);
5512         cmp(Ri, Rlen, Assembler::LSL, 1);
5513 
5514         br(Assembler::LT, loop);
5515         bind(end);
5516         block_comment("} // i");
5517       }
5518 
5519       normalize(Rlen);
5520 
5521       mov(Ra, Pm_base);  // Save Pm_base in Ra
5522       restore_regs();  // Restore caller's Pm_base
5523 
5524       // Copy our result into caller's Pm_base
5525       reverse(Pm_base, Ra, Rlen, t0, t1);
5526 
5527       leave();
5528       ret(lr);
5529 
5530       return entry;
5531     }
5532     // In C, approximately:
5533 
5534     // void
5535     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5536     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5537     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5538     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5539     //   unsigned long Ra, Rb, Rn, Rm;
5540 
5541     //   int i;
5542 
5543     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5544 
5545     //   for (i = 0; i < len; i++) {
5546     //     int j;
5547 
5548     //     Pa = Pa_base;
5549     //     Pb = Pa_base + i;
5550     //     Pm = Pm_base;
5551     //     Pn = Pn_base + i;
5552 
5553     //     Ra = *Pa;
5554     //     Rb = *Pb;
5555     //     Rm = *Pm;
5556     //     Rn = *Pn;
5557 
5558     //     int iters = (i+1)/2;
5559     //     for (j = 0; iters--; j++) {
5560     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5561     //       MACC2(Ra, Rb, t0, t1, t2);
5562     //       Ra = *++Pa;
5563     //       Rb = *--Pb;
5564     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5565     //       MACC(Rm, Rn, t0, t1, t2);
5566     //       Rm = *++Pm;
5567     //       Rn = *--Pn;
5568     //     }
5569     //     if ((i & 1) == 0) {
5570     //       assert(Ra == Pa_base[j], "must be");
5571     //       MACC(Ra, Ra, t0, t1, t2);
5572     //     }
5573     //     iters = i/2;
5574     //     assert(iters == i-j, "must be");
5575     //     for (; iters--; j++) {
5576     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5577     //       MACC(Rm, Rn, t0, t1, t2);
5578     //       Rm = *++Pm;
5579     //       Rn = *--Pn;
5580     //     }
5581 
5582     //     *Pm = Rm = t0 * inv;
5583     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5584     //     MACC(Rm, Rn, t0, t1, t2);
5585 
5586     //     assert(t0 == 0, "broken Montgomery multiply");
5587 
5588     //     t0 = t1; t1 = t2; t2 = 0;
5589     //   }
5590 
5591     //   for (i = len; i < 2*len; i++) {
5592     //     int start = i-len+1;
5593     //     int end = start + (len - start)/2;
5594     //     int j;
5595 
5596     //     Pa = Pa_base + i-len;
5597     //     Pb = Pa_base + len;
5598     //     Pm = Pm_base + i-len;
5599     //     Pn = Pn_base + len;
5600 
5601     //     Ra = *++Pa;
5602     //     Rb = *--Pb;
5603     //     Rm = *++Pm;
5604     //     Rn = *--Pn;
5605 
5606     //     int iters = (2*len-i-1)/2;
5607     //     assert(iters == end-start, "must be");
5608     //     for (j = start; iters--; j++) {
5609     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5610     //       MACC2(Ra, Rb, t0, t1, t2);
5611     //       Ra = *++Pa;
5612     //       Rb = *--Pb;
5613     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5614     //       MACC(Rm, Rn, t0, t1, t2);
5615     //       Rm = *++Pm;
5616     //       Rn = *--Pn;
5617     //     }
5618     //     if ((i & 1) == 0) {
5619     //       assert(Ra == Pa_base[j], "must be");
5620     //       MACC(Ra, Ra, t0, t1, t2);
5621     //     }
5622     //     iters =  (2*len-i)/2;
5623     //     assert(iters == len-j, "must be");
5624     //     for (; iters--; j++) {
5625     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5626     //       MACC(Rm, Rn, t0, t1, t2);
5627     //       Rm = *++Pm;
5628     //       Rn = *--Pn;
5629     //     }
5630     //     Pm_base[i-len] = t0;
5631     //     t0 = t1; t1 = t2; t2 = 0;
5632     //   }
5633 
5634     //   while (t0)
5635     //     t0 = sub(Pm_base, Pn_base, t0, len);
5636     // }
5637   };
5638 
5639 
5640   // Initialization
5641   void generate_initial() {
    // Generate initial stubs and initialize the entry points
5643 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
5646     // benefit seems to be smaller than the disadvantage of having a
5647     // much more complicated generator structure. See also comment in
5648     // stubRoutines.hpp.
5649 
5650     StubRoutines::_forward_exception_entry = generate_forward_exception();
5651 
5652     StubRoutines::_call_stub_entry =
5653       generate_call_stub(StubRoutines::_call_stub_return_address);
5654 
5655     // is referenced by megamorphic call
5656     StubRoutines::_catch_exception_entry = generate_catch_exception();
5657 
5658     // Build this early so it's available for the interpreter.
5659     StubRoutines::_throw_StackOverflowError_entry =
5660       generate_throw_exception("StackOverflowError throw_exception",
5661                                CAST_FROM_FN_PTR(address,
5662                                                 SharedRuntime::throw_StackOverflowError));
5663     StubRoutines::_throw_delayed_StackOverflowError_entry =
5664       generate_throw_exception("delayed StackOverflowError throw_exception",
5665                                CAST_FROM_FN_PTR(address,
5666                                                 SharedRuntime::throw_delayed_StackOverflowError));
5667     if (UseCRC32Intrinsics) {
      // set table address before generating stubs that use it
5669       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5670       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5671     }
5672 
5673     if (UseCRC32CIntrinsics) {
5674       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5675     }
5676 
5677     // Disabled until JDK-8210858 is fixed
5678     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5679     //   StubRoutines::_dlog = generate_dlog();
5680     // }
5681 
5682     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5683       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5684     }
5685 
5686     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5687       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5688     }
5689   }
5690 
5691   void generate_all() {
5692     // support for verify_oop (must happen after universe_init)
5693     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5694     StubRoutines::_throw_AbstractMethodError_entry =
5695       generate_throw_exception("AbstractMethodError throw_exception",
5696                                CAST_FROM_FN_PTR(address,
5697                                                 SharedRuntime::
5698                                                 throw_AbstractMethodError));
5699 
5700     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5701       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5702                                CAST_FROM_FN_PTR(address,
5703                                                 SharedRuntime::
5704                                                 throw_IncompatibleClassChangeError));
5705 
5706     StubRoutines::_throw_NullPointerException_at_call_entry =
5707       generate_throw_exception("NullPointerException at call throw_exception",
5708                                CAST_FROM_FN_PTR(address,
5709                                                 SharedRuntime::
5710                                                 throw_NullPointerException_at_call));
5711 
5712     // arraycopy stubs used by compilers
5713     generate_arraycopy_stubs();
5714 
5715     // has negatives stub for large arrays.
5716     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5717 
5718     // array equals stub for large arrays.
5719     if (!UseSimpleArrayEquals) {
5720       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5721     }
5722 
5723     generate_compare_long_strings();
5724 
5725     generate_string_indexof_stubs();
5726 
5727     // byte_array_inflate stub for large arrays.
5728     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5729 
5730 #ifdef COMPILER2
5731     if (UseMultiplyToLenIntrinsic) {
5732       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5733     }
5734 
5735     if (UseSquareToLenIntrinsic) {
5736       StubRoutines::_squareToLen = generate_squareToLen();
5737     }
5738 
5739     if (UseMulAddIntrinsic) {
5740       StubRoutines::_mulAdd = generate_mulAdd();
5741     }
5742 
5743     if (UseMontgomeryMultiplyIntrinsic) {
5744       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5745       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5746       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5747     }
5748 
5749     if (UseMontgomerySquareIntrinsic) {
5750       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5751       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5752       // We use generate_multiply() rather than generate_square()
5753       // because it's faster for the sizes of modulus we care about.
5754       StubRoutines::_montgomerySquare = g.generate_multiply();
5755     }
5756 #endif // COMPILER2
5757 
5758 #ifndef BUILTIN_SIM
5759     // generate GHASH intrinsics code
5760     if (UseGHASHIntrinsics) {
5761       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5762     }
5763 
5764     if (UseAESIntrinsics) {
5765       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5766       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5767       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5768       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5769     }
5770 
5771     if (UseSHA1Intrinsics) {
5772       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5773       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5774     }
5775     if (UseSHA256Intrinsics) {
5776       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5777       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5778     }
5779 
5780     // generate Adler32 intrinsics code
5781     if (UseAdler32Intrinsics) {
5782       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5783     }
5784 
5785     // Safefetch stubs.
5786     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5787                                                        &StubRoutines::_safefetch32_fault_pc,
5788                                                        &StubRoutines::_safefetch32_continuation_pc);
5789     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5790                                                        &StubRoutines::_safefetchN_fault_pc,
5791                                                        &StubRoutines::_safefetchN_continuation_pc);
5792 #endif
5793     StubRoutines::aarch64::set_completed();
5794   }
5795 
5796  public:
5797   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5798     if (all) {
5799       generate_all();
5800     } else {
5801       generate_initial();
5802     }
5803   }
5804 }; // end class declaration
5805 
5806 void StubGenerator_generate(CodeBuffer* code, bool all) {
5807   StubGenerator g(code, all);
5808 }