/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/macros.hpp"
#include "utilities/top.hpp"

#include "stubRoutines_aarch64.hpp"

#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ALL_GCS
#include "shenandoahBarrierSetAssembler_aarch64.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
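  // n.b. the load/increment/store below is not atomic; these counters
  // are only statistics, so occasional lost updates are acceptable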
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install the
  // updated sp (r31) into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly, do we need to save them all? they are treated as
  // volatile by C so we could omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call.
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
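  //
  // a sketch of the matching C++-side signature, for orientation (see
  // the CallStub typedef in stubRoutines.hpp); the eight arguments
  // arrive in c_rarg0..c_rarg7 in this order:
  //
  //   typedef void (*CallStub)(address   link,              // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);                      // the Thread*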

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
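
  // n.b. only every other register has a named offset in the enum
  // because the registers are saved and restored in pairs: e.g.
  // stp(r20, r19, r20_save) below writes r20 at word -10 and r19 at
  // word -9, matching the layout diagram above.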

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
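    // sp now sits sp_after_call_off (26) words below fp, just past
    // the register save area laid out above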

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
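    // sp is now rounded down to a 16-byte boundary, as AArch64
    // requires for memory accesses through sp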

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => we need to set up
  // the sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. non-zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);

    return start;
  }

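  // n.b. the overlap decision in this port is made up front by
  // generate_conjoint_copy (an unsigned compare against
  // nooverlap_target, see below), so this helper degenerates to an
  // unconditional branch to the no-overlap path.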
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     src     - starting address of the source array (only needed
  //               by the Shenandoah barrier)
  //     addr    - starting address of the destination array
  //     count   - element count
  //     dest_uninitialized - the destination is statically known to
  //               be uninitialized
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register src, Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
#if INCLUDE_ALL_GCS
    case BarrierSet::ShenandoahBarrierSet:
      ShenandoahBarrierSetAssembler::bsasm()->arraycopy_prologue(_masm, dest_uninitialized, src, addr, count);
      break;
#endif
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    Label L_done;

    // "end" is inclusive end pointer == start + (count - 1) * array_element_size
    // If count == 0, "end" is less than "start" and we need to skip card marking.
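    // (e.g. with 8-byte heap oops, count == 4 gives end == start + 24,
    //  while count == 0 gives end == start - 8, i.e. below start)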
    __ cmp(end, start);
    __ br(__ LO, L_done);

    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:

        {
          __ push_call_clobbered_registers();
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);
          __ lea(scratch, Address(end, BytesPerHeapOop));
          __ sub(scratch, scratch, start);               // subtract start to get #bytes
          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop_call_clobbered_registers();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // number of bytes to copy

          const Register count = end; // 'end' register contains bytes count now
          __ load_byte_map_base(scratch);
          __ add(start, start, scratch);
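          // start now addresses the first card of the region and count
          // holds the index of the last card relative to it; storing
          // zero below dirties a card (dirty_card_val() == 0)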
          if (UseConcMarkSweepGC) {
            __ membar(__ StoreStore);
          }
          __ BIND(L_loop);
          __ strb(zr, Address(start, count));
          __ subs(count, count, 1);
          __ br(Assembler::GE, L_loop);
        }
        break;
#if INCLUDE_ALL_GCS
      case BarrierSet::ShenandoahBarrierSet:
        break;
#endif
      default:
        ShouldNotReachHere();

    }
    __ bind(L_done);
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not, just return and let the caller handle it.
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);
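    // n.b. this is a computed branch into the table of stp
    // instructions below: each stp is 4 bytes of code and zeroes 16
    // bytes, so branching tmp / 4 bytes back from the end of the
    // table zeroes exactly the tmp alignment bytes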

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
 822       __ stop("genrate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
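    //
    // n.b. count is in units of granularity bytes, so bit
    // (3 - exact_log2(granularity)) of count is set iff a whole
    // 8-byte word remains: e.g. for a short copy (granularity == 2) a
    // residue of 7 elements == 14 bytes tests bits 2, 1 and 0 in turn
    // and copies one word (4 shorts), one int (2 shorts) and one short.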

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

1171     // <= 96 bytes do inline. Direction doesn't matter because we always
1172     // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now that we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
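    // rscratch2 now holds the word count; the bulk copy stubs bound
    // at copy_f and copy_b are instantiated elsewhere in this file
    // with rscratch2 as their count register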
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   entry, if non-null, is set to the no-overlap entry point used
  //   by generate_conjoint_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                  const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1461   //
1462   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1463                                  address *entry, const char *name,
1464                                  bool dest_uninitialized = false) {
1465     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1466 
1467     StubCodeMark mark(this, "StubRoutines", name);
1468     address start = __ pc();
1469 
1470     __ enter();
1471 
1472     if (entry != NULL) {
1473       *entry = __ pc();
1474       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1475       BLOCK_COMMENT("Entry:");
1476     }
1477 
1478     // use fwd copy when (d-s) above_equal (count*size)
1479     __ sub(rscratch1, d, s);
1480     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1481     __ br(Assembler::HS, nooverlap_target);
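         // Note that the comparison above is unsigned: if d < s, (d - s)
         // wraps to a very large value and we branch to the forward copy,
         // so the backward copy below only runs when the regions genuinely
         // overlap, i.e. when s < d < s + count*size.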
1482 
1483     if (is_oop) {
1484       __ push(RegSet::of(d, count), sp);
1485       // no registers are destroyed by this call
1486       gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
1487     }
1488     copy_memory(aligned, s, d, count, rscratch1, -size);
1489     if (is_oop) {
1490       __ pop(RegSet::of(d, count), sp);
1491       if (VerifyOops)
1492         verify_oop_array(size, d, count, r16);
1493       __ sub(count, count, 1); // make an inclusive end pointer
1494       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1495       gen_write_ref_array_post_barrier(d, count, rscratch1);
1496     }
1497     __ leave();
1498     __ mov(r0, zr); // return 0
1499     __ ret(lr);
1500     return start;
1501   }
1502 
1503   // Arguments:
1504   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1505   //             ignored
1506   //   name    - stub name string
1507   //
1508   // Inputs:
1509   //   c_rarg0   - source array address
1510   //   c_rarg1   - destination array address
1511   //   c_rarg2   - element count, treated as ssize_t, can be zero
1512   //
1513   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1514   // we let the hardware handle it.  The one to eight bytes within words,
1515   // dwords or qwords that span cache line boundaries will still be loaded
1516   // and stored atomically.
1517   //
1518   // Side Effects:
1526   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1527   //   used by generate_conjoint_byte_copy().
1528   //
1529   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1530     const bool not_oop = false;
1531     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1532   }
1533 
1534   // Arguments:
1535   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1536   //             ignored
1537   //   name    - stub name string
1538   //
1539   // Inputs:
1540   //   c_rarg0   - source array address
1541   //   c_rarg1   - destination array address
1542   //   c_rarg2   - element count, treated as ssize_t, can be zero
1543   //
1544   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1545   // we let the hardware handle it.  The one to eight bytes within words,
1546   // dwords or qwords that span cache line boundaries will still be loaded
1547   // and stored atomically.
1548   //
1549   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1550                                       address* entry, const char *name) {
1551     const bool not_oop = false;
1552     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1553   }
1554 
1555   // Arguments:
1556   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1557   //             ignored
1558   //   name    - stub name string
1559   //
1560   // Inputs:
1561   //   c_rarg0   - source array address
1562   //   c_rarg1   - destination array address
1563   //   c_rarg2   - element count, treated as ssize_t, can be zero
1564   //
1565   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1566   // let the hardware handle it.  The two or four words within dwords
1567   // or qwords that span cache line boundaries will still be loaded
1568   // and stored atomically.
1569   //
1570   // Side Effects:
1571   //   disjoint_short_copy_entry is set to the no-overlap entry point
1572   //   used by generate_conjoint_short_copy().
1573   //
1574   address generate_disjoint_short_copy(bool aligned,
1575                                        address* entry, const char *name) {
1576     const bool not_oop = false;
1577     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1578   }
1579 
1580   // Arguments:
1581   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1582   //             ignored
1583   //   name    - stub name string
1584   //
1585   // Inputs:
1586   //   c_rarg0   - source array address
1587   //   c_rarg1   - destination array address
1588   //   c_rarg2   - element count, treated as ssize_t, can be zero
1589   //
1590   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1591   // let the hardware handle it.  The two or four words within dwords
1592   // or qwords that span cache line boundaries will still be loaded
1593   // and stored atomically.
1594   //
1595   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1596                                        address *entry, const char *name) {
1597     const bool not_oop = false;
1598     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1599   }
1600 
1601   // Arguments:
1602   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1603   //             ignored
1604   //   name    - stub name string
1605   //
1606   // Inputs:
1607   //   c_rarg0   - source array address
1608   //   c_rarg1   - destination array address
1609   //   c_rarg2   - element count, treated as ssize_t, can be zero
1610   //
1611   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1612   // the hardware handle it.  The two dwords within qwords that span
1613   // cache line boundaries will still be loaded and stored atomically.
1614   //
1615   // Side Effects:
1616   //   disjoint_int_copy_entry is set to the no-overlap entry point
1617   //   used by generate_conjoint_int_copy().
1618   //
1619   address generate_disjoint_int_copy(bool aligned, address *entry,
1620                                         const char *name) {
1621     const bool not_oop = false;
1622     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1623   }
1624 
1625   // Arguments:
1626   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1627   //             ignored
1628   //   name    - stub name string
1629   //
1630   // Inputs:
1631   //   c_rarg0   - source array address
1632   //   c_rarg1   - destination array address
1633   //   c_rarg2   - element count, treated as ssize_t, can be zero
1634   //
1635   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1636   // the hardware handle it.  The two dwords within qwords that span
1637   // cache line boundaries will still be loaded and stored atomically.
1638   //
1639   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1640                                      address *entry, const char *name,
1641                                      bool dest_uninitialized = false) {
1642     const bool not_oop = false;
1643     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1644   }
1645 
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as size_t, can be zero
1656   //
1657   // Side Effects:
1658   //   disjoint_long_copy_entry is set to the no-overlap entry point
1659   //   used by generate_conjoint_long_copy().
1660   //
1661   address generate_disjoint_long_copy(bool aligned, address *entry,
1662                                           const char *name, bool dest_uninitialized = false) {
1663     const bool not_oop = false;
1664     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1665   }
1666 
1667   // Arguments:
1668   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1669   //             ignored
1670   //   name    - stub name string
1671   //
1672   // Inputs:
1673   //   c_rarg0   - source array address
1674   //   c_rarg1   - destination array address
1675   //   c_rarg2   - element count, treated as size_t, can be zero
1676   //
1677   address generate_conjoint_long_copy(bool aligned,
1678                                       address nooverlap_target, address *entry,
1679                                       const char *name, bool dest_uninitialized = false) {
1680     const bool not_oop = false;
1681     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1682   }
1683 
1684   // Arguments:
1685   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686   //             ignored
1687   //   name    - stub name string
1688   //
1689   // Inputs:
1690   //   c_rarg0   - source array address
1691   //   c_rarg1   - destination array address
1692   //   c_rarg2   - element count, treated as size_t, can be zero
1693   //
1694   // Side Effects:
1695   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1696   //   used by generate_conjoint_oop_copy().
1697   //
1698   address generate_disjoint_oop_copy(bool aligned, address *entry,
1699                                      const char *name, bool dest_uninitialized) {
1700     const bool is_oop = true;
1701     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1702     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1703   }
1704 
1705   // Arguments:
1706   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1707   //             ignored
1708   //   name    - stub name string
1709   //
1710   // Inputs:
1711   //   c_rarg0   - source array address
1712   //   c_rarg1   - destination array address
1713   //   c_rarg2   - element count, treated as size_t, can be zero
1714   //
1715   address generate_conjoint_oop_copy(bool aligned,
1716                                      address nooverlap_target, address *entry,
1717                                      const char *name, bool dest_uninitialized) {
1718     const bool is_oop = true;
1719     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1720     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1721                                   name, dest_uninitialized);
1722   }
1723 
1724 
1725   // Helper for generating a dynamic type check.
1726   // Smashes rscratch1.
1727   void generate_type_check(Register sub_klass,
1728                            Register super_check_offset,
1729                            Register super_klass,
1730                            Label& L_success) {
1731     assert_different_registers(sub_klass, super_check_offset, super_klass);
1732 
1733     BLOCK_COMMENT("type_check:");
1734 
1735     Label L_miss;
1736 
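         // A sketch of the check: the fast path handles an exact klass match
         // and the superclass display / super cache probe; the slow path
         // linearly scans the secondary supers array.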
1737     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1738                                      super_check_offset);
1739     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1740 
1741     // Fall through on failure!
1742     __ BIND(L_miss);
1743   }
1744 
1745   //
1746   //  Generate checkcasting array copy stub
1747   //
1748   //  Input:
1749   //    c_rarg0   - source array address
1750   //    c_rarg1   - destination array address
1751   //    c_rarg2   - element count, treated as ssize_t, can be zero
1752   //    c_rarg3   - size_t ckoff (super_check_offset)
1753   //    c_rarg4   - oop ckval (super_klass)
1754   //
1755   //  Output:
1756   //    r0 ==  0  -  success
1757   //    r0 == -1^K - failure, where K is partial transfer count
1758   //
1759   address generate_checkcast_copy(const char *name, address *entry,
1760                                   bool dest_uninitialized = false) {
1761 
1762     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1763 
1764     // Input registers (after setup_arg_regs)
1765     const Register from        = c_rarg0;   // source array address
1766     const Register to          = c_rarg1;   // destination array address
1767     const Register count       = c_rarg2;   // elements count
1768     const Register ckoff       = c_rarg3;   // super_check_offset
1769     const Register ckval       = c_rarg4;   // super_klass
1770 
1771     // Registers used as temps (r18, r19, r20 are save-on-entry)
1772     const Register count_save  = r21;       // original elements count
1773     const Register start_to    = r20;       // destination array start address
1774     const Register copied_oop  = r18;       // actual oop copied
1775     const Register r19_klass   = r19;       // oop._klass
1776 
1777     //---------------------------------------------------------------
1778     // Assembler stub will be used for this call to arraycopy
1779     // if the two arrays are subtypes of Object[] but the
1780     // destination array type is not equal to or a supertype
1781     // of the source type.  Each element must be separately
1782     // checked.
1783 
1784     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1785                                copied_oop, r19_klass, count_save);
1786 
1787     __ align(CodeEntryAlignment);
1788     StubCodeMark mark(this, "StubRoutines", name);
1789     address start = __ pc();
1790 
1791     __ enter(); // required for proper stackwalking of RuntimeStub frame
1792 
1793 #ifdef ASSERT
1794     // caller guarantees that the arrays really are different
1795     // otherwise, we would have to make conjoint checks
1796     { Label L;
1797       array_overlap_test(L, TIMES_OOP);
1798       __ stop("checkcast_copy within a single array");
1799       __ bind(L);
1800     }
1801 #endif //ASSERT
1802 
1803     // Caller of this entry point must set up the argument registers.
1804     if (entry != NULL) {
1805       *entry = __ pc();
1806       BLOCK_COMMENT("Entry:");
1807     }
1808 
1809     // Empty array:  Nothing to do.
1810     __ cbz(count, L_done);
1811 
1812     __ push(RegSet::of(r18, r19, r20, r21), sp);
1813 
1814 #ifdef ASSERT
1815     BLOCK_COMMENT("assert consistent ckoff/ckval");
1816     // The ckoff and ckval must be mutually consistent,
1817     // even though caller generates both.
1818     { Label L;
1819       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1820       __ ldrw(start_to, Address(ckval, sco_offset));
1821       __ cmpw(ckoff, start_to);
1822       __ br(Assembler::EQ, L);
1823       __ stop("super_check_offset inconsistent");
1824       __ bind(L);
1825     }
1826 #endif //ASSERT
1827 
1828     gen_write_ref_array_pre_barrier(from, to, count, dest_uninitialized);
1829 
1830     // save the original count
1831     __ mov(count_save, count);
1832 
1833     // Copy from low to high addresses
1834     __ mov(start_to, to);              // Save destination array start address
1835     __ b(L_load_element);
1836 
1837     // ======== begin loop ========
1838     // (Loop is rotated; its entry is L_load_element.)
1839     // Loop control:
1840     //   for (; count != 0; count--) {
1841     //     copied_oop = load_heap_oop(from++);
1842     //     ... generate_type_check ...;
1843     //     store_heap_oop(to++, copied_oop);
1844     //   }
1845     __ align(OptoLoopAlignment);
1846 
1847     __ BIND(L_store_element);
1848     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1849     __ sub(count, count, 1);
1850     __ cbz(count, L_do_card_marks);
1851 
1852     // ======== loop entry is here ========
1853     __ BIND(L_load_element);
1854     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1855     __ cbz(copied_oop, L_store_element);
1856 
1857     __ load_klass(r19_klass, copied_oop); // query the object klass
1858     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1859     // ======== end loop ========
1860 
1861     // It was a real error; we must depend on the caller to finish the job.
1862     // Register count = remaining oops, count_save = total oops.
1863     // Emit GC store barriers for the oops we have copied and report
1864     // their number to the caller.
1865 
1866     __ subs(count, count_save, count);     // K = partially copied oop count
1867     __ eon(count, count, zr);                   // report (-1^K) to caller
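         // eon with zr is bitwise NOT, so count now holds -1 ^ K (== ~K).
         // For example, if 3 oops were copied before the failure we return
         // ~3 = -4, and the caller recovers K as ~r0.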
1868     __ br(Assembler::EQ, L_done_pop);
1869 
1870     __ BIND(L_do_card_marks);
1871     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1872     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1873 
1874     __ bind(L_done_pop);
1875     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1876     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1877 
1878     __ bind(L_done);
1879     __ mov(r0, count);
1880     __ leave();
1881     __ ret(lr);
1882 
1883     return start;
1884   }
1885 
1886   // Perform range checks on the proposed arraycopy.
1887   // Kills temp, but nothing else.
1888   // Also, clean the sign bits of src_pos and dst_pos.
1889   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1890                               Register src_pos, // source position (c_rarg1)
1891                               Register dst,     // destination array oop (c_rarg2)
1892                               Register dst_pos, // destination position (c_rarg3)
1893                               Register length,
1894                               Register temp,
1895                               Label& L_failed) {
1896     BLOCK_COMMENT("arraycopy_range_checks:");
1897 
1898     assert_different_registers(rscratch1, temp);
1899 
1900     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1901     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1902     __ addw(temp, length, src_pos);
1903     __ cmpw(temp, rscratch1);
1904     __ br(Assembler::HI, L_failed);
1905 
1906     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1907     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1908     __ addw(temp, length, dst_pos);
1909     __ cmpw(temp, rscratch1);
1910     __ br(Assembler::HI, L_failed);
1911 
1912     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1913     __ movw(src_pos, src_pos);
1914     __ movw(dst_pos, dst_pos);
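         // (a 32-bit register write zero-extends into the 64-bit register,
         //  so this clears bits 63:32 without changing the low word)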
1915 
1916     BLOCK_COMMENT("arraycopy_range_checks done");
1917   }
1918 
1919   // These stubs are currently only reached from a simple test routine;
1920   // they can be given a real implementation once something actually
1921   // calls them.
1922   static void fake_arraycopy_stub(address src, address dst, int count) {
1923     assert(count == 0, "huh?");
1924   }
1925 
1926 
1927   //
1928   // Generate stub for array fill. If "aligned" is true, the
1929   // "to" address is assumed to be heapword aligned.
1930   //
1931   // Arguments for generated stub:
1932   //   to:    c_rarg0
1933   //   value: c_rarg1
1934   //   count: c_rarg2 treated as signed
1935   //
1936   address generate_fill(BasicType t, bool aligned, const char *name) {
1937     __ align(CodeEntryAlignment);
1938     StubCodeMark mark(this, "StubRoutines", name);
1939     address start = __ pc();
1940 
1941     BLOCK_COMMENT("Entry:");
1942 
1943     const Register to        = c_rarg0;  // source array address
1944     const Register value     = c_rarg1;  // value
1945     const Register count     = c_rarg2;  // elements count
1946 
1947     const Register bz_base = r10;        // base for block_zero routine
1948     const Register cnt_words = r11;      // temp register
1949 
1950     __ enter();
1951 
1952     Label L_fill_elements;
1953 
1954     int shift = -1;
1955     switch (t) {
1956       case T_BYTE:
1957         shift = 0;
1958         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1959         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
1960         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1961         __ br(Assembler::LO, L_fill_elements);
1962         break;
1963       case T_SHORT:
1964         shift = 1;
1965         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1966         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1967         __ br(Assembler::LO, L_fill_elements);
1968         break;
1969       case T_INT:
1970         shift = 2;
1971         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1972         __ br(Assembler::LO, L_fill_elements);
1973         break;
1974       default: ShouldNotReachHere();
1975     }
1976 
1977     // Align source address at 8 bytes address boundary.
1978     Label L_skip_align1, L_skip_align2, L_skip_align4;
1979     if (!aligned) {
1980       switch (t) {
1981         case T_BYTE:
1982           // One byte misalignment happens only for byte arrays.
1983           __ tbz(to, 0, L_skip_align1);
1984           __ strb(value, Address(__ post(to, 1)));
1985           __ subw(count, count, 1);
1986           __ bind(L_skip_align1);
1987           // Fallthrough
1988         case T_SHORT:
1989           // Two bytes misalignment happens only for byte and short (char) arrays.
1990           __ tbz(to, 1, L_skip_align2);
1991           __ strh(value, Address(__ post(to, 2)));
1992           __ subw(count, count, 2 >> shift);
1993           __ bind(L_skip_align2);
1994           // Fallthrough
1995         case T_INT:
1996           // Align to 8 bytes, we know we are 4 byte aligned to start.
1997           __ tbz(to, 2, L_skip_align4);
1998           __ strw(value, Address(__ post(to, 4)));
1999           __ subw(count, count, 4 >> shift);
2000           __ bind(L_skip_align4);
2001           break;
2002         default: ShouldNotReachHere();
2003       }
2004     }
2005 
2006     //
2007     //  Fill large chunks
2008     //
2009     __ lsrw(cnt_words, count, 3 - shift); // number of words
2010     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
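         // 'value' now holds the fill pattern replicated to 64 bits, e.g. a
         // byte fill of 0xAB has become 0xABABABABABABABAB.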
2011     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2012     if (UseBlockZeroing) {
2013       Label non_block_zeroing, rest;
2014       // count >= BlockZeroingLowLimit && value == 0
2015       __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
2016       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
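           // ccmp tests value == 0 only when cnt_words >= the limit (GE held);
           // otherwise it sets NZCV to 0, which clears Z and forces the NE
           // branch below into the non-block-zeroing path.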
2017       __ br(Assembler::NE, non_block_zeroing);
2018       __ mov(bz_base, to);
2019       __ block_zero(bz_base, cnt_words, true);
2020       __ mov(to, bz_base);
2021       __ b(rest);
2022       __ bind(non_block_zeroing);
2023       __ fill_words(to, cnt_words, value);
2024       __ bind(rest);
2025     } else {
2026       __ fill_words(to, cnt_words, value);
2027     }
2029 
2030     // Remaining count is less than 8 bytes. Fill it by a single store.
2031     // Note that the total length is no less than 8 bytes.
2032     if (t == T_BYTE || t == T_SHORT) {
2033       Label L_exit1;
2034       __ cbzw(count, L_exit1);
2035       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2036       __ str(value, Address(to, -8));    // overwrite some elements
2037       __ bind(L_exit1);
2038       __ leave();
2039       __ ret(lr);
2040     }
2041 
2042     // Handle fills of less than 8 bytes.
2043     Label L_fill_2, L_fill_4, L_exit2;
2044     __ bind(L_fill_elements);
2045     switch (t) {
2046       case T_BYTE:
2047         __ tbz(count, 0, L_fill_2);
2048         __ strb(value, Address(__ post(to, 1)));
2049         __ bind(L_fill_2);
2050         __ tbz(count, 1, L_fill_4);
2051         __ strh(value, Address(__ post(to, 2)));
2052         __ bind(L_fill_4);
2053         __ tbz(count, 2, L_exit2);
2054         __ strw(value, Address(to));
2055         break;
2056       case T_SHORT:
2057         __ tbz(count, 0, L_fill_4);
2058         __ strh(value, Address(__ post(to, 2)));
2059         __ bind(L_fill_4);
2060         __ tbz(count, 1, L_exit2);
2061         __ strw(value, Address(to));
2062         break;
2063       case T_INT:
2064         __ cbzw(count, L_exit2);
2065         __ strw(value, Address(to));
2066         break;
2067       default: ShouldNotReachHere();
2068     }
2069     __ bind(L_exit2);
2070     __ leave();
2071     __ ret(lr);
2072     return start;
2073   }
2074 
2075   //
2076   //  Generate 'unsafe' array copy stub
2077   //  Though just as safe as the other stubs, it takes an unscaled
2078   //  size_t argument instead of an element count.
2079   //
2080   //  Input:
2081   //    c_rarg0   - source array address
2082   //    c_rarg1   - destination array address
2083   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2084   //
2085   // Examines the alignment of the operands and dispatches
2086   // to a long, int, short, or byte copy loop.
2087   //
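       // A rough C sketch of the dispatch below:
       //
       //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | count;
       //   if ((bits & 7) == 0) goto long_copy;    // all 8-byte aligned
       //   if ((bits & 3) == 0) goto int_copy;     // all 4-byte aligned
       //   if ((bits & 1) == 0) goto short_copy;   // all 2-byte aligned
       //   goto byte_copy;
       //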
2088   address generate_unsafe_copy(const char *name,
2089                                address byte_copy_entry,
2090                                address short_copy_entry,
2091                                address int_copy_entry,
2092                                address long_copy_entry) {
2093     Label L_long_aligned, L_int_aligned, L_short_aligned;
2094     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2095 
2096     __ align(CodeEntryAlignment);
2097     StubCodeMark mark(this, "StubRoutines", name);
2098     address start = __ pc();
2099     __ enter(); // required for proper stackwalking of RuntimeStub frame
2100 
2101     // bump this on entry, not on exit:
2102     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2103 
2104     __ orr(rscratch1, s, d);
2105     __ orr(rscratch1, rscratch1, count);
2106 
2107     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2108     __ cbz(rscratch1, L_long_aligned);
2109     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2110     __ cbz(rscratch1, L_int_aligned);
2111     __ tbz(rscratch1, 0, L_short_aligned);
2112     __ b(RuntimeAddress(byte_copy_entry));
2113 
2114     __ BIND(L_short_aligned);
2115     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2116     __ b(RuntimeAddress(short_copy_entry));
2117     __ BIND(L_int_aligned);
2118     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2119     __ b(RuntimeAddress(int_copy_entry));
2120     __ BIND(L_long_aligned);
2121     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2122     __ b(RuntimeAddress(long_copy_entry));
2123 
2124     return start;
2125   }
2126 
2127   //
2128   //  Generate generic array copy stubs
2129   //
2130   //  Input:
2131   //    c_rarg0    -  src oop
2132   //    c_rarg1    -  src_pos (32-bits)
2133   //    c_rarg2    -  dst oop
2134   //    c_rarg3    -  dst_pos (32-bits)
2135   //    c_rarg4    -  element count (32-bits)
2136   //
2137   //  Output:
2138   //    r0 ==  0  -  success
2139   //    r0 == -1^K - failure, where K is partial transfer count
2140   //
2141   address generate_generic_copy(const char *name,
2142                                 address byte_copy_entry, address short_copy_entry,
2143                                 address int_copy_entry, address oop_copy_entry,
2144                                 address long_copy_entry, address checkcast_copy_entry) {
2145 
2146     Label L_failed, L_objArray;
2147     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2148 
2149     // Input registers
2150     const Register src        = c_rarg0;  // source array oop
2151     const Register src_pos    = c_rarg1;  // source position
2152     const Register dst        = c_rarg2;  // destination array oop
2153     const Register dst_pos    = c_rarg3;  // destination position
2154     const Register length     = c_rarg4;
2155 
2156     __ align(CodeEntryAlignment);
2157 
2158     StubCodeMark mark(this, "StubRoutines", name);
2159 
2160     address start = __ pc();
2161 
2162     __ enter(); // required for proper stackwalking of RuntimeStub frame
2163 
2164     // bump this on entry, not on exit:
2165     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2166 
2167     //-----------------------------------------------------------------------
2168     // Assembler stub will be used for this call to arraycopy
2169     // if the following conditions are met:
2170     //
2171     // (1) src and dst must not be null.
2172     // (2) src_pos must not be negative.
2173     // (3) dst_pos must not be negative.
2174     // (4) length  must not be negative.
2175     // (5) src klass and dst klass should be the same and not NULL.
2176     // (6) src and dst should be arrays.
2177     // (7) src_pos + length must not exceed length of src.
2178     // (8) dst_pos + length must not exceed length of dst.
2179     //
2180 
2181     //  if (src == NULL) return -1;
2182     __ cbz(src, L_failed);
2183 
2184     //  if (src_pos < 0) return -1;
2185     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2186 
2187     //  if (dst == NULL) return -1;
2188     __ cbz(dst, L_failed);
2189 
2190     //  if (dst_pos < 0) return -1;
2191     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2192 
2193     // registers used as temp
2194     const Register scratch_length    = r16; // elements count to copy
2195     const Register scratch_src_klass = r17; // array klass
2196     const Register lh                = r18; // layout helper
2197 
2198     //  if (length < 0) return -1;
2199     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2200     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2201 
2202     __ load_klass(scratch_src_klass, src);
2203 #ifdef ASSERT
2204     //  assert(src->klass() != NULL);
2205     {
2206       BLOCK_COMMENT("assert klasses not null {");
2207       Label L1, L2;
2208       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2209       __ bind(L1);
2210       __ stop("broken null klass");
2211       __ bind(L2);
2212       __ load_klass(rscratch1, dst);
2213       __ cbz(rscratch1, L1);     // this would be broken also
2214       BLOCK_COMMENT("} assert klasses not null done");
2215     }
2216 #endif
2217 
2218     // Load layout helper (32-bits)
2219     //
2220     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2221     // 32        30    24            16              8     2                 0
2222     //
2223     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2224     //
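         //   For example, Klass::array_layout_helper(T_OBJECT) is roughly
         //   (0x2 << 30) | (header_size << 16) | (T_OBJECT << 8) | log2(heapOopSize),
         //   so one 32-bit compare distinguishes objArrays from typeArrays.
         //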
2225 
2226     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2227 
2228     // Handle objArrays completely differently...
2229     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2230     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2231     __ movw(rscratch1, objArray_lh);
2232     __ eorw(rscratch2, lh, rscratch1);
2233     __ cbzw(rscratch2, L_objArray);
2234 
2235     //  if (src->klass() != dst->klass()) return -1;
2236     __ load_klass(rscratch2, dst);
2237     __ eor(rscratch2, rscratch2, scratch_src_klass);
2238     __ cbnz(rscratch2, L_failed);
2239 
2240     //  if (!src->is_Array()) return -1;
2241     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2242 
2243     // At this point, it is known to be a typeArray (array_tag 0x3).
2244 #ifdef ASSERT
2245     {
2246       BLOCK_COMMENT("assert primitive array {");
2247       Label L;
2248       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2249       __ cmpw(lh, rscratch2);
2250       __ br(Assembler::GE, L);
2251       __ stop("must be a primitive array");
2252       __ bind(L);
2253       BLOCK_COMMENT("} assert primitive array done");
2254     }
2255 #endif
2256 
2257     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2258                            rscratch2, L_failed);
2259 
2260     // TypeArrayKlass
2261     //
2262     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2263     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2264     //
2265 
2266     const Register rscratch1_offset = rscratch1;    // array offset
2267     const Register r18_elsize = lh; // element size
2268 
2269     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2270            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2271     __ add(src, src, rscratch1_offset);           // src array offset
2272     __ add(dst, dst, rscratch1_offset);           // dst array offset
2273     BLOCK_COMMENT("choose copy loop based on element size");
2274 
2275     // next registers should be set before the jump to corresponding stub
2276     const Register from     = c_rarg0;  // source array address
2277     const Register to       = c_rarg1;  // destination array address
2278     const Register count    = c_rarg2;  // elements count
2279 
2280     // 'from', 'to' and 'count' must be set in this order, since they
2281     // alias 'src', 'src_pos' and 'dst' respectively.
2282 
2283     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2284 
2285     // The possible values of elsize are 0-3, i.e. exact_log2(element
2286     // size in bytes).  We do a simple bitwise binary search.
2287   __ BIND(L_copy_bytes);
2288     __ tbnz(r18_elsize, 1, L_copy_ints);
2289     __ tbnz(r18_elsize, 0, L_copy_shorts);
2290     __ lea(from, Address(src, src_pos));// src_addr
2291     __ lea(to,   Address(dst, dst_pos));// dst_addr
2292     __ movw(count, scratch_length); // length
2293     __ b(RuntimeAddress(byte_copy_entry));
2294 
2295   __ BIND(L_copy_shorts);
2296     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2297     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2298     __ movw(count, scratch_length); // length
2299     __ b(RuntimeAddress(short_copy_entry));
2300 
2301   __ BIND(L_copy_ints);
2302     __ tbnz(r18_elsize, 0, L_copy_longs);
2303     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2304     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2305     __ movw(count, scratch_length); // length
2306     __ b(RuntimeAddress(int_copy_entry));
2307 
2308   __ BIND(L_copy_longs);
2309 #ifdef ASSERT
2310     {
2311       BLOCK_COMMENT("assert long copy {");
2312       Label L;
2313       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2314       __ cmpw(r18_elsize, LogBytesPerLong);
2315       __ br(Assembler::EQ, L);
2316       __ stop("must be long copy, but elsize is wrong");
2317       __ bind(L);
2318       BLOCK_COMMENT("} assert long copy done");
2319     }
2320 #endif
2321     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2322     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2323     __ movw(count, scratch_length); // length
2324     __ b(RuntimeAddress(long_copy_entry));
2325 
2326     // ObjArrayKlass
2327   __ BIND(L_objArray);
2328     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2329 
2330     Label L_plain_copy, L_checkcast_copy;
2331     //  test array classes for subtyping
2332     __ load_klass(r18, dst);
2333     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2334     __ br(Assembler::NE, L_checkcast_copy);
2335 
2336     // Identically typed arrays can be copied without element-wise checks.
2337     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2338                            rscratch2, L_failed);
2339 
2340     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2341     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2342     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2343     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2344     __ movw(count, scratch_length); // length
2345   __ BIND(L_plain_copy);
2346     __ b(RuntimeAddress(oop_copy_entry));
2347 
2348   __ BIND(L_checkcast_copy);
2349     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2350     {
2351       // Before looking at dst.length, make sure dst is also an objArray.
2352       __ ldrw(rscratch1, Address(r18, lh_offset));
2353       __ movw(rscratch2, objArray_lh);
2354       __ eorw(rscratch1, rscratch1, rscratch2);
2355       __ cbnzw(rscratch1, L_failed);
2356 
2357       // It is safe to examine both src.length and dst.length.
2358       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2359                              r18, L_failed);
2360 
2361       const Register rscratch2_dst_klass = rscratch2;
2362       __ load_klass(rscratch2_dst_klass, dst); // reload
2363 
2364       // Marshal the base address arguments now, freeing registers.
2365       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2366       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2367       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2368       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2369       __ movw(count, length);           // length (reloaded)
2370       Register sco_temp = c_rarg3;      // this register is free now
2371       assert_different_registers(from, to, count, sco_temp,
2372                                  rscratch2_dst_klass, scratch_src_klass);
2373       // assert_clean_int(count, sco_temp);
2374 
2375       // Generate the type check.
2376       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2377       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2378       // assert_clean_int(sco_temp, r18);
2379       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2380 
2381       // Fetch destination element klass from the ObjArrayKlass header.
2382       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2383       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2384       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2385 
2386       // the checkcast_copy loop needs two extra arguments:
2387       assert(c_rarg3 == sco_temp, "#3 already in place");
2388       // Set up arguments for checkcast_copy_entry.
2389       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2390       __ b(RuntimeAddress(checkcast_copy_entry));
2391     }
2392 
2393   __ BIND(L_failed);
2394     __ mov(r0, -1);
2395     __ leave();   // required for proper stackwalking of RuntimeStub frame
2396     __ ret(lr);
2397 
2398     return start;
2399   }
2400 
2401   void generate_arraycopy_stubs() {
2402     address entry;
2403     address entry_jbyte_arraycopy;
2404     address entry_jshort_arraycopy;
2405     address entry_jint_arraycopy;
2406     address entry_oop_arraycopy;
2407     address entry_jlong_arraycopy;
2408     address entry_checkcast_arraycopy;
2409 
2410     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2411     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2412 
2413     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2414 
2415     //*** jbyte
2416     // Always need aligned and unaligned versions
2417     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2418                                                                                   "jbyte_disjoint_arraycopy");
2419     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2420                                                                                   &entry_jbyte_arraycopy,
2421                                                                                   "jbyte_arraycopy");
2422     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2423                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2425                                                                                   "arrayof_jbyte_arraycopy");
2426 
2427     //*** jshort
2428     // Always need aligned and unaligned versions
2429     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2430                                                                                     "jshort_disjoint_arraycopy");
2431     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2432                                                                                     &entry_jshort_arraycopy,
2433                                                                                     "jshort_arraycopy");
2434     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2435                                                                                     "arrayof_jshort_disjoint_arraycopy");
2436     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2437                                                                                     "arrayof_jshort_arraycopy");
2438 
2439     //*** jint
2440     // Aligned versions
2441     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2442                                                                                 "arrayof_jint_disjoint_arraycopy");
2443     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2444                                                                                 "arrayof_jint_arraycopy");
2445     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2446     // entry_jint_arraycopy always points to the unaligned version
2447     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2448                                                                                 "jint_disjoint_arraycopy");
2449     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2450                                                                                 &entry_jint_arraycopy,
2451                                                                                 "jint_arraycopy");
2452 
2453     //*** jlong
2454     // It is always aligned
2455     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2456                                                                                   "arrayof_jlong_disjoint_arraycopy");
2457     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2458                                                                                   "arrayof_jlong_arraycopy");
2459     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2460     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2461 
2462     //*** oops
2463     {
2464       // With compressed oops we need unaligned versions; notice that
2465       // we overwrite entry_oop_arraycopy.
2466       bool aligned = !UseCompressedOops;
2467 
2468       StubRoutines::_arrayof_oop_disjoint_arraycopy
2469         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2470                                      /*dest_uninitialized*/false);
2471       StubRoutines::_arrayof_oop_arraycopy
2472         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2473                                      /*dest_uninitialized*/false);
2474       // Aligned versions without pre-barriers
2475       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2476         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2477                                      /*dest_uninitialized*/true);
2478       StubRoutines::_arrayof_oop_arraycopy_uninit
2479         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2480                                      /*dest_uninitialized*/true);
2481     }
2482 
2483     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2484     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2485     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2486     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2487 
2488     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2489     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2490                                                                         /*dest_uninitialized*/true);
2491 
2492     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2493                                                               entry_jbyte_arraycopy,
2494                                                               entry_jshort_arraycopy,
2495                                                               entry_jint_arraycopy,
2496                                                               entry_jlong_arraycopy);
2497 
2498     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2499                                                                entry_jbyte_arraycopy,
2500                                                                entry_jshort_arraycopy,
2501                                                                entry_jint_arraycopy,
2502                                                                entry_oop_arraycopy,
2503                                                                entry_jlong_arraycopy,
2504                                                                entry_checkcast_arraycopy);
2505 
2506     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2507     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2508     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2509     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2510     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2511     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2512   }
2513 
2514   // Arguments:
2515   //
2516   // Inputs:
2517   //   c_rarg0   - source byte array address
2518   //   c_rarg1   - destination byte array address
2519   //   c_rarg2   - K (key) in little endian int array
2520   //
2521   address generate_aescrypt_encryptBlock() {
2522     __ align(CodeEntryAlignment);
2523     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2524 
2525     Label L_doLast;
2526 
2527     const Register from        = c_rarg0;  // source array address
2528     const Register to          = c_rarg1;  // destination array address
2529     const Register key         = c_rarg2;  // key array address
2530     const Register keylen      = rscratch1;
2531 
2532     address start = __ pc();
2533     __ enter();
2534 
2535     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
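         // keylen is the expanded key length in ints: 44, 52 or 60 round-key
         // words for AES-128, AES-192 and AES-256 respectively ((rounds + 1) * 4).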
2536 
2537     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2538 
2539     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2540     __ rev32(v1, __ T16B, v1);
2541     __ rev32(v2, __ T16B, v2);
2542     __ rev32(v3, __ T16B, v3);
2543     __ rev32(v4, __ T16B, v4);
2544     __ aese(v0, v1);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v2);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v3);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v4);
2551     __ aesmc(v0, v0);
2552 
2553     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2554     __ rev32(v1, __ T16B, v1);
2555     __ rev32(v2, __ T16B, v2);
2556     __ rev32(v3, __ T16B, v3);
2557     __ rev32(v4, __ T16B, v4);
2558     __ aese(v0, v1);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v2);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v3);
2563     __ aesmc(v0, v0);
2564     __ aese(v0, v4);
2565     __ aesmc(v0, v0);
2566 
2567     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2568     __ rev32(v1, __ T16B, v1);
2569     __ rev32(v2, __ T16B, v2);
2570 
2571     __ cmpw(keylen, 44);
2572     __ br(Assembler::EQ, L_doLast);
2573 
2574     __ aese(v0, v1);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v2);
2577     __ aesmc(v0, v0);
2578 
2579     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2580     __ rev32(v1, __ T16B, v1);
2581     __ rev32(v2, __ T16B, v2);
2582 
2583     __ cmpw(keylen, 52);
2584     __ br(Assembler::EQ, L_doLast);
2585 
2586     __ aese(v0, v1);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v2);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594 
2595     __ BIND(L_doLast);
2596 
2597     __ aese(v0, v1);
2598     __ aesmc(v0, v0);
2599     __ aese(v0, v2);
2600 
2601     __ ld1(v1, __ T16B, key);
2602     __ rev32(v1, __ T16B, v1);
2603     __ eor(v0, __ T16B, v0, v1);
2604 
2605     __ st1(v0, __ T16B, to);
2606 
2607     __ mov(r0, 0);
2608 
2609     __ leave();
2610     __ ret(lr);
2611 
2612     return start;
2613   }
2614 
2615   // Arguments:
2616   //
2617   // Inputs:
2618   //   c_rarg0   - source byte array address
2619   //   c_rarg1   - destination byte array address
2620   //   c_rarg2   - K (key) in little endian int array
2621   //
2622   address generate_aescrypt_decryptBlock() {
2623     assert(UseAES, "need AES instructions");
2624     __ align(CodeEntryAlignment);
2625     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2626     Label L_doLast;
2627 
2628     const Register from        = c_rarg0;  // source array address
2629     const Register to          = c_rarg1;  // destination array address
2630     const Register key         = c_rarg2;  // key array address
2631     const Register keylen      = rscratch1;
2632 
2633     address start = __ pc();
2634     __ enter(); // required for proper stackwalking of RuntimeStub frame
2635 
2636     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2637 
2638     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2639 
2640     __ ld1(v5, __ T16B, __ post(key, 16));
2641     __ rev32(v5, __ T16B, v5);
2642 
2643     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2644     __ rev32(v1, __ T16B, v1);
2645     __ rev32(v2, __ T16B, v2);
2646     __ rev32(v3, __ T16B, v3);
2647     __ rev32(v4, __ T16B, v4);
2648     __ aesd(v0, v1);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v2);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v3);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v4);
2655     __ aesimc(v0, v0);
2656 
2657     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2658     __ rev32(v1, __ T16B, v1);
2659     __ rev32(v2, __ T16B, v2);
2660     __ rev32(v3, __ T16B, v3);
2661     __ rev32(v4, __ T16B, v4);
2662     __ aesd(v0, v1);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v2);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v3);
2667     __ aesimc(v0, v0);
2668     __ aesd(v0, v4);
2669     __ aesimc(v0, v0);
2670 
2671     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2672     __ rev32(v1, __ T16B, v1);
2673     __ rev32(v2, __ T16B, v2);
2674 
2675     __ cmpw(keylen, 44);
2676     __ br(Assembler::EQ, L_doLast);
2677 
2678     __ aesd(v0, v1);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v2);
2681     __ aesimc(v0, v0);
2682 
2683     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2684     __ rev32(v1, __ T16B, v1);
2685     __ rev32(v2, __ T16B, v2);
2686 
2687     __ cmpw(keylen, 52);
2688     __ br(Assembler::EQ, L_doLast);
2689 
2690     __ aesd(v0, v1);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v2);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698 
2699     __ BIND(L_doLast);
2700 
2701     __ aesd(v0, v1);
2702     __ aesimc(v0, v0);
2703     __ aesd(v0, v2);
2704 
2705     __ eor(v0, __ T16B, v0, v5);
2706 
2707     __ st1(v0, __ T16B, to);
2708 
2709     __ mov(r0, 0);
2710 
2711     __ leave();
2712     __ ret(lr);
2713 
2714     return start;
2715   }
2716 
2717   // Arguments:
2718   //
2719   // Inputs:
2720   //   c_rarg0   - source byte array address
2721   //   c_rarg1   - destination byte array address
2722   //   c_rarg2   - K (key) in little endian int array
2723   //   c_rarg3   - r vector byte array address
2724   //   c_rarg4   - input length
2725   //
2726   // Output:
2727   //   r0        - input length
2728   //
2729   address generate_cipherBlockChaining_encryptAESCrypt() {
2730     assert(UseAES, "need AES instructions");
2731     __ align(CodeEntryAlignment);
2732     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2733 
2734     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2735 
2736     const Register from        = c_rarg0;  // source array address
2737     const Register to          = c_rarg1;  // destination array address
2738     const Register key         = c_rarg2;  // key array address
2739     const Register rvec        = c_rarg3;  // r vector byte array: initialized from the
2740                                            // init vector; holds the last encrypted block on exit
2741     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2742     const Register keylen      = rscratch1;
2743 
2744     address start = __ pc();
2745 
2746       __ enter();
2747 
2748       __ subsw(rscratch2, len_reg, zr);
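           // subsw copies len_reg into rscratch2 (len - 0) and sets the flags;
           // rscratch2 preserves the original length for the return value below.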
2749       __ br(Assembler::LE, _L_finish);
2750 
2751       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2752 
2753       __ ld1(v0, __ T16B, rvec);
2754 
2755       __ cmpw(keylen, 52);
2756       __ br(Assembler::CC, L_loadkeys_44);
2757       __ br(Assembler::EQ, L_loadkeys_52);
2758 
2759       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2760       __ rev32(v17, __ T16B, v17);
2761       __ rev32(v18, __ T16B, v18);
2762     __ BIND(L_loadkeys_52);
2763       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2764       __ rev32(v19, __ T16B, v19);
2765       __ rev32(v20, __ T16B, v20);
2766     __ BIND(L_loadkeys_44);
2767       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2768       __ rev32(v21, __ T16B, v21);
2769       __ rev32(v22, __ T16B, v22);
2770       __ rev32(v23, __ T16B, v23);
2771       __ rev32(v24, __ T16B, v24);
2772       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2773       __ rev32(v25, __ T16B, v25);
2774       __ rev32(v26, __ T16B, v26);
2775       __ rev32(v27, __ T16B, v27);
2776       __ rev32(v28, __ T16B, v28);
2777       __ ld1(v29, v30, v31, __ T16B, key);
2778       __ rev32(v29, __ T16B, v29);
2779       __ rev32(v30, __ T16B, v30);
2780       __ rev32(v31, __ T16B, v31);
2781 
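           // Main loop: v0 carries the CBC chaining value.  Each iteration
           // computes C[i] = E_K(P[i] ^ C[i-1]) and stores the ciphertext block.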
2782     __ BIND(L_aes_loop);
2783       __ ld1(v1, __ T16B, __ post(from, 16));
2784       __ eor(v0, __ T16B, v0, v1);
2785 
2786       __ br(Assembler::CC, L_rounds_44);
2787       __ br(Assembler::EQ, L_rounds_52);
2788 
2789       __ aese(v0, v17); __ aesmc(v0, v0);
2790       __ aese(v0, v18); __ aesmc(v0, v0);
2791     __ BIND(L_rounds_52);
2792       __ aese(v0, v19); __ aesmc(v0, v0);
2793       __ aese(v0, v20); __ aesmc(v0, v0);
2794     __ BIND(L_rounds_44);
2795       __ aese(v0, v21); __ aesmc(v0, v0);
2796       __ aese(v0, v22); __ aesmc(v0, v0);
2797       __ aese(v0, v23); __ aesmc(v0, v0);
2798       __ aese(v0, v24); __ aesmc(v0, v0);
2799       __ aese(v0, v25); __ aesmc(v0, v0);
2800       __ aese(v0, v26); __ aesmc(v0, v0);
2801       __ aese(v0, v27); __ aesmc(v0, v0);
2802       __ aese(v0, v28); __ aesmc(v0, v0);
2803       __ aese(v0, v29); __ aesmc(v0, v0);
2804       __ aese(v0, v30);
2805       __ eor(v0, __ T16B, v0, v31);
2806 
2807       __ st1(v0, __ T16B, __ post(to, 16));
2808 
2809       __ subw(len_reg, len_reg, 16);
2810       __ cbnzw(len_reg, L_aes_loop);
2811 
2812       __ st1(v0, __ T16B, rvec);
2813 
2814     __ BIND(_L_finish);
2815       __ mov(r0, rscratch2);
2816 
2817       __ leave();
2818       __ ret(lr);
2819 
2820       return start;
2821   }
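
  // In C, approximately (a hedged sketch of what this stub computes;
  // xor_block() and aes_encrypt_block() are stand-ins for the eor and
  // aese/aesmc sequences above, not real functions):
  //
  // int cbc_encrypt(u8 *from, u8 *to, u32 *key, u8 *rvec, int len) {
  //   int result = len;
  //   u8 v[16];
  //   if (len <= 0) return result;
  //   memcpy(v, rvec, 16);                 // load the chaining value
  //   for (; len > 0; len -= 16, from += 16, to += 16) {
  //     xor_block(v, from);                // v ^= plaintext block
  //     aes_encrypt_block(v, key);         // v = AES_K(v)
  //     memcpy(to, v, 16);
  //   }
  //   memcpy(rvec, v, 16);                 // last ciphertext is the new IV
  //   return result;
  // }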
2822 
2823   // Arguments:
2824   //
2825   // Inputs:
2826   //   c_rarg0   - source byte array address
2827   //   c_rarg1   - destination byte array address
2828   //   c_rarg2   - K (key) in little endian int array
2829   //   c_rarg3   - r vector byte array address
2830   //   c_rarg4   - input length
2831   //
2832   // Output:
2833   //   r0       - input length
2834   //
2835   address generate_cipherBlockChaining_decryptAESCrypt() {
2836     assert(UseAES, "need AES cryptographic extension support");
2837     __ align(CodeEntryAlignment);
2838     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2839 
2840     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2841 
2842     const Register from        = c_rarg0;  // source array address
2843     const Register to          = c_rarg1;  // destination array address
2844     const Register key         = c_rarg2;  // key array address
2845     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2846                                            // and updated with the last ciphertext block on exit
2847     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2848     const Register keylen      = rscratch1;
2849 
2850     address start = __ pc();
2851 
2852       __ enter();
2853 
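      // Stash len in rscratch2: it is both the value returned in r0
      // at _L_finish and the source of the flags for the len <= 0 test.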
2854       __ subsw(rscratch2, len_reg, zr);
2855       __ br(Assembler::LE, _L_finish);
2856 
2857       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2858 
2859       __ ld1(v2, __ T16B, rvec);
2860 
2861       __ ld1(v31, __ T16B, __ post(key, 16));
2862       __ rev32(v31, __ T16B, v31);
2863 
2864       __ cmpw(keylen, 52);
2865       __ br(Assembler::CC, L_loadkeys_44);
2866       __ br(Assembler::EQ, L_loadkeys_52);
2867 
2868       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2869       __ rev32(v17, __ T16B, v17);
2870       __ rev32(v18, __ T16B, v18);
2871     __ BIND(L_loadkeys_52);
2872       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2873       __ rev32(v19, __ T16B, v19);
2874       __ rev32(v20, __ T16B, v20);
2875     __ BIND(L_loadkeys_44);
2876       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2877       __ rev32(v21, __ T16B, v21);
2878       __ rev32(v22, __ T16B, v22);
2879       __ rev32(v23, __ T16B, v23);
2880       __ rev32(v24, __ T16B, v24);
2881       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2882       __ rev32(v25, __ T16B, v25);
2883       __ rev32(v26, __ T16B, v26);
2884       __ rev32(v27, __ T16B, v27);
2885       __ rev32(v28, __ T16B, v28);
2886       __ ld1(v29, v30, __ T16B, key);
2887       __ rev32(v29, __ T16B, v29);
2888       __ rev32(v30, __ T16B, v30);
2889 
2890     __ BIND(L_aes_loop);
2891       __ ld1(v0, __ T16B, __ post(from, 16));
2892       __ orr(v1, __ T16B, v0, v0);
2893 
2894       __ br(Assembler::CC, L_rounds_44);
2895       __ br(Assembler::EQ, L_rounds_52);
2896 
2897       __ aesd(v0, v17); __ aesimc(v0, v0);
2898       __ aesd(v0, v18); __ aesimc(v0, v0);
2899     __ BIND(L_rounds_52);
2900       __ aesd(v0, v19); __ aesimc(v0, v0);
2901       __ aesd(v0, v20); __ aesimc(v0, v0);
2902     __ BIND(L_rounds_44);
2903       __ aesd(v0, v21); __ aesimc(v0, v0);
2904       __ aesd(v0, v22); __ aesimc(v0, v0);
2905       __ aesd(v0, v23); __ aesimc(v0, v0);
2906       __ aesd(v0, v24); __ aesimc(v0, v0);
2907       __ aesd(v0, v25); __ aesimc(v0, v0);
2908       __ aesd(v0, v26); __ aesimc(v0, v0);
2909       __ aesd(v0, v27); __ aesimc(v0, v0);
2910       __ aesd(v0, v28); __ aesimc(v0, v0);
2911       __ aesd(v0, v29); __ aesimc(v0, v0);
2912       __ aesd(v0, v30);
2913       __ eor(v0, __ T16B, v0, v31);
2914       __ eor(v0, __ T16B, v0, v2);
2915 
2916       __ st1(v0, __ T16B, __ post(to, 16));
2917       __ orr(v2, __ T16B, v1, v1);
2918 
2919       __ subw(len_reg, len_reg, 16);
2920       __ cbnzw(len_reg, L_aes_loop);
2921 
2922       __ st1(v2, __ T16B, rvec);
2923 
2924     __ BIND(_L_finish);
2925       __ mov(r0, rscratch2);
2926 
2927       __ leave();
2928       __ ret(lr);
2929 
2930     return start;
2931   }
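
  // In C, approximately (same caveats as the encrypt sketch above;
  // aes_decrypt_block() stands for the aesd/aesimc round sequence,
  // including the final xor with the last round key in v31):
  //
  // int cbc_decrypt(u8 *from, u8 *to, u32 *key, u8 *rvec, int len) {
  //   int result = len;
  //   u8 v[16], c[16], t[16];
  //   if (len <= 0) return result;
  //   memcpy(v, rvec, 16);
  //   for (; len > 0; len -= 16, from += 16, to += 16) {
  //     memcpy(c, from, 16);               // keep the ciphertext (v1 above)
  //     memcpy(t, c, 16);
  //     aes_decrypt_block(t, key);
  //     xor_block(t, v);                   // t ^= previous ciphertext (or IV)
  //     memcpy(to, t, 16);
  //     memcpy(v, c, 16);                  // ciphertext is the new chaining value
  //   }
  //   memcpy(rvec, v, 16);
  //   return result;
  // }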
2932 
2933   // Arguments:
2934   //
2935   // Inputs:
2936   //   c_rarg0   - byte[]  source+offset
2937   //   c_rarg1   - int[]   SHA.state
2938   //   c_rarg2   - int     offset
2939   //   c_rarg3   - int     limit
2940   //
2941   address generate_sha1_implCompress(bool multi_block, const char *name) {
2942     __ align(CodeEntryAlignment);
2943     StubCodeMark mark(this, "StubRoutines", name);
2944     address start = __ pc();
2945 
2946     Register buf   = c_rarg0;
2947     Register state = c_rarg1;
2948     Register ofs   = c_rarg2;
2949     Register limit = c_rarg3;
2950 
2951     Label keys;
2952     Label sha1_loop;
2953 
2954     // load the keys into v0..v3
2955     __ adr(rscratch1, keys);
2956     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2957     // load the 5-word (160-bit) state into v6, v7
2958     __ ldrq(v6, Address(state, 0));
2959     __ ldrs(v7, Address(state, 16));
2960 
2961 
2962     __ BIND(sha1_loop);
2963     // load 64 bytes of data into v16..v19
2964     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2965     __ rev32(v16, __ T16B, v16);
2966     __ rev32(v17, __ T16B, v17);
2967     __ rev32(v18, __ T16B, v18);
2968     __ rev32(v19, __ T16B, v19);
2969 
2970     // do the sha1
2971     __ addv(v4, __ T4S, v16, v0);
2972     __ orr(v20, __ T16B, v6, v6);
2973 
2974     FloatRegister d0 = v16;
2975     FloatRegister d1 = v17;
2976     FloatRegister d2 = v18;
2977     FloatRegister d3 = v19;
2978 
2979     for (int round = 0; round < 20; round++) {
2980       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2981       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2982       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2983       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2984       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2985 
2986       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2987       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2988       __ sha1h(tmp2, __ T4S, v20);
2989       if (round < 5)
2990         __ sha1c(v20, __ T4S, tmp3, tmp4);
2991       else if (round < 10 || round >= 15)
2992         __ sha1p(v20, __ T4S, tmp3, tmp4);
2993       else
2994         __ sha1m(v20, __ T4S, tmp3, tmp4);
2995       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2996 
2997       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2998     }
2999 
3000     __ addv(v7, __ T2S, v7, v21);
3001     __ addv(v6, __ T4S, v6, v20);
3002 
3003     if (multi_block) {
3004       __ add(ofs, ofs, 64);
3005       __ cmp(ofs, limit);
3006       __ br(Assembler::LE, sha1_loop);
3007       __ mov(c_rarg0, ofs); // return ofs
3008     }
3009 
3010     __ strq(v6, Address(state, 0));
3011     __ strs(v7, Address(state, 16));
3012 
3013     __ ret(lr);
3014 
3015     __ bind(keys);
3016     __ emit_int32(0x5a827999);
3017     __ emit_int32(0x6ed9eba1);
3018     __ emit_int32(0x8f1bbcdc);
3019     __ emit_int32(0xca62c1d6);
3020 
3021     return start;
3022   }
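
  // A hedged reading of the schedule above, not a drop-in
  // implementation: SHA-1 has 80 rounds and each sha1c/sha1p/sha1m
  // instruction retires four of them, so the loop body runs 20 times.
  // Iterations 0..4 use Ch (sha1c), 5..9 and 15..19 use parity
  // (sha1p), and 10..14 use Maj (sha1m), matching the standard
  // 20-round groups; the addv computes the next group's W + K one
  // iteration ahead.  In C, approximately:
  //
  // uint32x4_t abcd = state[0..3]; uint32_t e = state[4];
  // for (int i = 0; i < 20; i++) {
  //   if (i < 16) w[i] = sha1_expand(w[i], w[i+1], w[i+2], w[i+3]); // sha1su0/su1
  //   uint32_t e_next = rol32(lane0(abcd), 30);                     // sha1h
  //   abcd = sha1_4rounds(abcd, e, w[i] + k[i/5]);                  // sha1c/p/m
  //   e = e_next;
  // }
  // state[0..3] += abcd; state[4] += e;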
3023 
3024 
3025   // Arguments:
3026   //
3027   // Inputs:
3028   //   c_rarg0   - byte[]  source+offset
3029   //   c_rarg1   - int[]   SHA.state
3030   //   c_rarg2   - int     offset
3031   //   c_rarg3   - int     limit
3032   //
3033   address generate_sha256_implCompress(bool multi_block, const char *name) {
3034     static const uint32_t round_consts[64] = {
3035       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3036       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3037       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3038       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3039       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3040       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3041       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3042       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3043       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3044       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3045       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3046       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3047       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3048       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3049       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3050       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3051     };
3052     __ align(CodeEntryAlignment);
3053     StubCodeMark mark(this, "StubRoutines", name);
3054     address start = __ pc();
3055 
3056     Register buf   = c_rarg0;
3057     Register state = c_rarg1;
3058     Register ofs   = c_rarg2;
3059     Register limit = c_rarg3;
3060 
3061     Label sha256_loop;
3062 
3063     __ stpd(v8, v9, __ pre(sp, -32));
3064     __ stpd(v10, v11, Address(sp, 16));
3065 
3066 // dga == v0
3067 // dgb == v1
3068 // dg0 == v2
3069 // dg1 == v3
3070 // dg2 == v4
3071 // t0 == v6
3072 // t1 == v7
3073 
3074     // load the 64 round constants into v16..v31, four per vector
3075     __ lea(rscratch1, ExternalAddress((address)round_consts));
3076     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3077     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3078     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3079     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3080 
3081     // load 8 words (256 bits) state
3082     __ ldpq(v0, v1, state);
3083 
3084     __ BIND(sha256_loop);
3085     // load 64 bytes of data into v8..v11
3086     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3087     __ rev32(v8, __ T16B, v8);
3088     __ rev32(v9, __ T16B, v9);
3089     __ rev32(v10, __ T16B, v10);
3090     __ rev32(v11, __ T16B, v11);
3091 
3092     __ addv(v6, __ T4S, v8, v16);
3093     __ orr(v2, __ T16B, v0, v0);
3094     __ orr(v3, __ T16B, v1, v1);
3095 
3096     FloatRegister d0 = v8;
3097     FloatRegister d1 = v9;
3098     FloatRegister d2 = v10;
3099     FloatRegister d3 = v11;
3100 
3101 
3102     for (int round = 0; round < 16; round++) {
3103       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3104       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3105       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3106       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3107 
3108       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3109        __ orr(v4, __ T16B, v2, v2);
3110       if (round < 15)
3111         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3112       __ sha256h(v2, __ T4S, v3, tmp2);
3113       __ sha256h2(v3, __ T4S, v4, tmp2);
3114       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3115 
3116       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3117     }
3118 
3119     __ addv(v0, __ T4S, v0, v2);
3120     __ addv(v1, __ T4S, v1, v3);
3121 
3122     if (multi_block) {
3123       __ add(ofs, ofs, 64);
3124       __ cmp(ofs, limit);
3125       __ br(Assembler::LE, sha256_loop);
3126       __ mov(c_rarg0, ofs); // return ofs
3127     }
3128 
3129     __ ldpd(v10, v11, Address(sp, 16));
3130     __ ldpd(v8, v9, __ post(sp, 32));
3131 
3132     __ stpq(v0, v1, state);
3133 
3134     __ ret(lr);
3135 
3136     return start;
3137   }
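
  // A hedged reading of the loop above: SHA-256 has 64 rounds and
  // each sha256h/sha256h2 pair retires four, so the loop runs 16
  // times.  v16..v31 hold the 64 round constants, four per vector;
  // sha256su0/sha256su1 expand the message schedule for the first
  // twelve iterations, and the W + K add (the addv) is pipelined one
  // iteration ahead.  In C, approximately:
  //
  // uint32x4_t dg0 = state[0..3], dg1 = state[4..7];
  // for (int i = 0; i < 16; i++) {
  //   if (i < 12) w[i] = sha256_expand(w[i], w[i+1], w[i+2], w[i+3]);
  //   uint32x4_t t   = w[i] + k[i];       // addv with v16..v31
  //   uint32x4_t dg2 = dg0;               // saved copy (v4 above)
  //   dg0 = sha256h (dg0, dg1, t);        // four rounds, first half
  //   dg1 = sha256h2(dg1, dg2, t);        // four rounds, second half
  // }
  // state[0..3] += dg0; state[4..7] += dg1;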
3138 
3139   // Safefetch stubs.
3140   void generate_safefetch(const char* name, int size, address* entry,
3141                           address* fault_pc, address* continuation_pc) {
3142     // safefetch signatures:
3143     //   int      SafeFetch32(int*      adr, int      errValue);
3144     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3145     //
3146     // arguments:
3147     //   c_rarg0 = adr
3148     //   c_rarg1 = errValue
3149     //
3150     // result:
3151     //   r0 = *adr or errValue
3152 
3153     StubCodeMark mark(this, "StubRoutines", name);
3154 
3155     // Entry point (just the pc on aarch64; no function descriptors).
3156     *entry = __ pc();
3157 
3158     // Load *adr into c_rarg1, may fault.
3159     *fault_pc = __ pc();
3160     switch (size) {
3161       case 4:
3162         // int32_t
3163         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3164         break;
3165       case 8:
3166         // int64_t
3167         __ ldr(c_rarg1, Address(c_rarg0, 0));
3168         break;
3169       default:
3170         ShouldNotReachHere();
3171     }
3172 
3173     // return errValue or *adr
3174     *continuation_pc = __ pc();
3175     __ mov(r0, c_rarg1);
3176     __ ret(lr);
3177   }
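
  // In C, approximately: the fault_pc/continuation_pc pair makes the
  // load safe.  If the load faults, the signal handler resumes at the
  // continuation point with c_rarg1 (errValue) untouched, so the
  // caller sees errValue instead of a crash.
  //
  // int SafeFetch32(int* adr, int errValue) {
  //   int v = errValue;
  //   v = *adr;          // may fault; the handler skips this assignment
  //   return v;
  // }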
3178 
3179   /**
3180    *  Arguments:
3181    *
3182    * Inputs:
3183    *   c_rarg0   - int crc
3184    *   c_rarg1   - byte* buf
3185    *   c_rarg2   - int length
3186    *
3187    * Output:
3188    *       r0   - int crc result
3189    *
3190    * Preserves:
3191    *       r13
3192    *
3193    */
3194   address generate_updateBytesCRC32() {
3195     assert(UseCRC32Intrinsics, "what are we doing here?");
3196 
3197     __ align(CodeEntryAlignment);
3198     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3199 
3200     address start = __ pc();
3201 
3202     const Register crc   = c_rarg0;  // crc
3203     const Register buf   = c_rarg1;  // source java byte array address
3204     const Register len   = c_rarg2;  // length
3205     const Register table0 = c_rarg3; // crc_table address
3206     const Register table1 = c_rarg4;
3207     const Register table2 = c_rarg5;
3208     const Register table3 = c_rarg6;
3209     const Register tmp3 = c_rarg7;
3210 
3211     BLOCK_COMMENT("Entry:");
3212     __ enter(); // required for proper stackwalking of RuntimeStub frame
3213 
3214     __ kernel_crc32(crc, buf, len,
3215               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3216 
3217     __ leave(); // required for proper stackwalking of RuntimeStub frame
3218     __ ret(lr);
3219 
3220     return start;
3221   }
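
  // A hedged sketch of what kernel_crc32 computes: the standard
  // CRC-32 used by java.util.zip (reflected polynomial 0xEDB88320,
  // pre- and post-inverted), byte-at-a-time in C:
  //
  // uint32_t crc32(uint32_t crc, const uint8_t* buf, int len) {
  //   crc = ~crc;
  //   while (len-- > 0)
  //     crc = (crc >> 8) ^ table0[(crc ^ *buf++) & 0xff];
  //   return ~crc;
  // }
  //
  // The stub itself processes several bytes per step using the tables
  // passed in table0..table3, or CRC32 instructions where available.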
3222 
3223   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3224                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3225                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3226     // Karatsuba multiplication performs a 128*128 -> 256-bit
3227     // multiplication in three 128-bit multiplications and a few
3228     // additions.
3229     //
3230     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3231     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3232     //
3233     // Inputs:
3234     //
3235     // A0 in a.d[0]     (subkey)
3236     // A1 in a.d[1]
3237     // (A1+A0) in a1_xor_a0.d[0]
3238     //
3239     // B0 in b.d[0]     (state)
3240     // B1 in b.d[1]
3241 
3242     __ ext(tmp1, __ T16B, b, b, 0x08);
3243     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3244     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3245     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3246     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3247 
3248     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3249     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3250     __ eor(tmp2, __ T16B, tmp2, tmp4);
3251     __ eor(tmp2, __ T16B, tmp2, tmp3);
3252 
3253     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3254     __ ins(result_hi, __ D, tmp2, 0, 1);
3255     __ ins(result_lo, __ D, tmp2, 1, 0);
3256   }
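
  // In C, approximately (u128 and clmul() are stand-ins for the
  // pmull/pmull2 64x64 -> 128-bit carry-less multiplies; '^' is
  // addition in GF(2)[z]):
  //
  // u128 c   = clmul(a1, b1);             // pmull2: A1*B1
  // u128 d   = clmul(a0, b0);             // pmull:  A0*B0
  // u128 e   = clmul(a1_xor_a0, b1 ^ b0); // pmull:  (A1+A0)*(B1+B0)
  // u128 mid = c ^ d ^ e;                 // the middle 128 bits
  // result_lo = d ^ (mid << 64);          // bits 0..127
  // result_hi = c ^ (mid >> 64);          // bits 128..255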
3257 
3258   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3259                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3260     const FloatRegister t0 = result;
3261 
3262     // The GCM field polynomial f is z^128 + p(z), where p =
3263     // z^7+z^2+z+1.
3264     //
3265     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3266     //
3267     // so, given that the product we're reducing is
3268     //    a == lo + hi * z^128
3269     // substituting,
3270     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3271     //
3272     // we reduce by multiplying hi by p(z) and subtracting the result
3273     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3274     // bits we can do this with two 64-bit multiplications, lo*p and
3275     // hi*p.
3276 
3277     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3278     __ ext(t1, __ T16B, t0, z, 8);
3279     __ eor(hi, __ T16B, hi, t1);
3280     __ ext(t1, __ T16B, z, t0, 8);
3281     __ eor(lo, __ T16B, lo, t1);
3282     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3283     __ eor(result, __ T16B, lo, t0);
3284   }
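
  // In C, approximately (same clmul() stand-in; p holds 0x87, the
  // low-order bits of the polynomial, in both 64-bit halves):
  //
  // u128 t = clmul(hi.d[1], 0x87);     // pmull2: fold the top 64 bits
  // lo ^= t << 64;                     // low half of t*z^64
  // hi ^= t >> 64;                     // high half lands back in hi.d[0]
  // t = clmul(hi.d[0], 0x87);          // pmull: fold the remaining bits
  // result = lo ^ t;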
3285 
3286   /**
3287    *  Arguments:
3288    *
3289    *  Input:
3290    *    c_rarg0   - x address
3291    *    c_rarg1   - x length
3292    *    c_rarg2   - y address
3293    *    c_rarg3   - y length
3294    *    c_rarg4   - z address
3295    *    c_rarg5   - z length
3296    */
3297   address generate_multiplyToLen() {
3298     __ align(CodeEntryAlignment);
3299     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3300 
3301     address start = __ pc();
3302     const Register x     = r0;
3303     const Register xlen  = r1;
3304     const Register y     = r2;
3305     const Register ylen  = r3;
3306     const Register z     = r4;
3307     const Register zlen  = r5;
3308 
3309     const Register tmp1  = r10;
3310     const Register tmp2  = r11;
3311     const Register tmp3  = r12;
3312     const Register tmp4  = r13;
3313     const Register tmp5  = r14;
3314     const Register tmp6  = r15;
3315     const Register tmp7  = r16;
3316 
3317     BLOCK_COMMENT("Entry:");
3318     __ enter(); // required for proper stackwalking of RuntimeStub frame
3319     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3320     __ leave(); // required for proper stackwalking of RuntimeStub frame
3321     __ ret(lr);
3322 
3323     return start;
3324   }
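
  // In C, approximately (a hedged sketch: multiply_to_len itself is
  // heavily unrolled, but it computes the schoolbook product of two
  // big-endian jint arrays as in BigInteger.multiplyToLen):
  //
  // void multiply_to_len(jint x[], int xlen, jint y[], int ylen, jint z[]) {
  //   for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     unsigned long long carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       unsigned long long p =
  //         (unsigned long long)(unsigned)x[i] * (unsigned)y[j]
  //         + (unsigned)z[k] + carry;
  //       z[k]  = (jint)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (jint)carry;
  //   }
  // }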
3325 
3326   /**
3327    *  Arguments:
3328    *
3329    *  Input:
3330    *  c_rarg0   - current state address
3331    *  c_rarg1   - H key address
3332    *  c_rarg2   - data address
3333    *  c_rarg3   - number of blocks
3334    *
3335    *  Output:
3336    *  Updated state at c_rarg0
3337    */
3338   address generate_ghash_processBlocks() {
3339     // Bafflingly, GCM uses little-endian for the byte order, but
3340     // big-endian for the bit order.  For example, the polynomial 1 is
3341     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3342     //
3343     // So, we must either reverse the bytes in each word and do
3344     // everything big-endian or reverse the bits in each byte and do
3345     // it little-endian.  On AArch64 it's more idiomatic to reverse
3346     // the bits in each byte (we have an instruction, RBIT, to do
3347     // that) and keep the data in little-endian bit order throughout the
3348     // calculation, bit-reversing the inputs and outputs.
3349 
3350     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3351     __ align(wordSize * 2);
3352     address p = __ pc();
3353     __ emit_int64(0x87);  // The low-order bits of the field
3354                           // polynomial (i.e. p = z^7+z^2+z+1)
3355                           // repeated in the low and high parts of a
3356                           // 128-bit vector
3357     __ emit_int64(0x87);
3358 
3359     __ align(CodeEntryAlignment);
3360     address start = __ pc();
3361 
3362     Register state   = c_rarg0;
3363     Register subkeyH = c_rarg1;
3364     Register data    = c_rarg2;
3365     Register blocks  = c_rarg3;
3366 
3367     FloatRegister vzr = v30;
3368     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3369 
3370     __ ldrq(v0, Address(state));
3371     __ ldrq(v1, Address(subkeyH));
3372 
3373     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3374     __ rbit(v0, __ T16B, v0);
3375     __ rev64(v1, __ T16B, v1);
3376     __ rbit(v1, __ T16B, v1);
3377 
3378     __ ldrq(v26, p);
3379 
3380     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3381     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3382 
3383     {
3384       Label L_ghash_loop;
3385       __ bind(L_ghash_loop);
3386 
3387       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3388                                                  // reversing each byte
3389       __ rbit(v2, __ T16B, v2);
3390       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3391 
3392       // Multiply state in v2 by subkey in v1
3393       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3394                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3395                      /*temps*/v6, v20, v18, v21);
3396       // Reduce v7:v5 by the field polynomial
3397       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3398 
3399       __ sub(blocks, blocks, 1);
3400       __ cbnz(blocks, L_ghash_loop);
3401     }
3402 
3403     // The bit-reversed result is at this point in v0
3404     __ rev64(v1, __ T16B, v0);
3405     __ rbit(v1, __ T16B, v1);
3406 
3407     __ st1(v1, __ T16B, state);
3408     __ ret(lr);
3409 
3410     return start;
3411   }
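
  // In C, approximately (gf128_mul() stands for the
  // ghash_multiply/ghash_reduce pair, i.e. carry-less multiplication
  // modulo z^128 + z^7 + z^2 + z + 1 on bit-reflected operands):
  //
  // void ghash_process_blocks(u128 *state, u128 *subkeyH,
  //                           u128 *data, int blocks) {
  //   u128 x = bit_reverse(*state);
  //   u128 h = bit_reverse(*subkeyH);
  //   while (blocks-- > 0)
  //     x = gf128_mul(x ^ bit_reverse(*data++), h);
  //   *state = bit_reverse(x);
  // }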
3412 
3413   // Continuation point for throwing of implicit exceptions that are
3414   // not handled in the current activation. Fabricates an exception
3415   // oop and initiates normal exception dispatching in this
3416   // frame. Since we need to preserve callee-saved values (currently
3417   // only for C2, but done for C1 as well) we need a callee-saved oop
3418   // map and therefore have to make these stubs into RuntimeStubs
3419   // rather than BufferBlobs.  If the compiler needs all registers to
3420   // be preserved between the fault point and the exception handler
3421   // then it must assume responsibility for that in
3422   // AbstractCompiler::continuation_for_implicit_null_exception or
3423   // continuation_for_implicit_division_by_zero_exception. All other
3424   // implicit exceptions (e.g., NullPointerException or
3425   // AbstractMethodError on entry) are either at call sites or
3426   // otherwise assume that stack unwinding will be initiated, so
3427   // caller saved registers were assumed volatile in the compiler.
3428 
3429 #undef __
3430 #define __ masm->
3431 
3432   address generate_throw_exception(const char* name,
3433                                    address runtime_entry,
3434                                    Register arg1 = noreg,
3435                                    Register arg2 = noreg) {
3436     // Information about frame layout at time of blocking runtime call.
3437     // Note that we only have to preserve callee-saved registers since
3438     // the compilers are responsible for supplying a continuation point
3439     // if they expect all registers to be preserved.
3440     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3441     enum layout {
3442       rfp_off = 0,
3443       rfp_off2,
3444       return_off,
3445       return_off2,
3446       framesize // inclusive of return address
3447     };
3448 
3449     int insts_size = 512;
3450     int locs_size  = 64;
3451 
3452     CodeBuffer code(name, insts_size, locs_size);
3453     OopMapSet* oop_maps  = new OopMapSet();
3454     MacroAssembler* masm = new MacroAssembler(&code);
3455 
3456     address start = __ pc();
3457 
3458     // This is an inlined and slightly modified version of call_VM
3459     // which has the ability to fetch the return PC out of
3460     // thread-local storage and also sets up last_Java_sp slightly
3461     // differently than the real call_VM
3462 
3463     __ enter(); // Save FP and LR before call
3464 
3465     assert(is_even(framesize/2), "sp not 16-byte aligned");
3466 
3467     // lr and fp are already in place
3468     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3469 
3470     int frame_complete = __ pc() - start;
3471 
3472     // Set up last_Java_sp and last_Java_fp
3473     address the_pc = __ pc();
3474     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3475 
3476     // Call runtime
3477     if (arg1 != noreg) {
3478       assert(arg2 != c_rarg1, "clobbered");
3479       __ mov(c_rarg1, arg1);
3480     }
3481     if (arg2 != noreg) {
3482       __ mov(c_rarg2, arg2);
3483     }
3484     __ mov(c_rarg0, rthread);
3485     BLOCK_COMMENT("call runtime_entry");
3486     __ mov(rscratch1, runtime_entry);
3487     __ blr(rscratch1);
3488 
3489     // Generate oop map
3490     OopMap* map = new OopMap(framesize, 0);
3491 
3492     oop_maps->add_gc_map(the_pc - start, map);
3493 
3494     __ reset_last_Java_frame(true);
3495     __ maybe_isb();
3496 
3497     __ leave();
3498 
3499     // check for pending exceptions
3500 #ifdef ASSERT
3501     Label L;
3502     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3503     __ cbnz(rscratch1, L);
3504     __ should_not_reach_here();
3505     __ bind(L);
3506 #endif // ASSERT
3507     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3508 
3509 
3510     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3511     RuntimeStub* stub =
3512       RuntimeStub::new_runtime_stub(name,
3513                                     &code,
3514                                     frame_complete,
3515                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3516                                     oop_maps, false);
3517     return stub->entry_point();
3518   }
3519 
3520   class MontgomeryMultiplyGenerator : public MacroAssembler {
3521 
3522     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3523       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3524 
3525     RegSet _toSave;
3526     bool _squaring;
3527 
3528   public:
3529     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3530       : MacroAssembler(as->code()), _squaring(squaring) {
3531 
3532       // Register allocation
3533 
3534       Register reg = c_rarg0;
3535       Pa_base = reg;       // Argument registers
3536       if (squaring)
3537         Pb_base = Pa_base;
3538       else
3539         Pb_base = ++reg;
3540       Pn_base = ++reg;
3541       Rlen= ++reg;
3542       inv = ++reg;
3543       Pm_base = ++reg;
3544 
3545                           // Working registers:
3546       Ra =  ++reg;        // The current digit of a, b, n, and m.
3547       Rb =  ++reg;
3548       Rm =  ++reg;
3549       Rn =  ++reg;
3550 
3551       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3552       Pb =  ++reg;
3553       Pm =  ++reg;
3554       Pn =  ++reg;
3555 
3556       t0 =  ++reg;        // Three registers which form a
3557       t1 =  ++reg;        // triple-precision accumulator.
3558       t2 =  ++reg;
3559 
3560       Ri =  ++reg;        // Inner and outer loop indexes.
3561       Rj =  ++reg;
3562 
3563       Rhi_ab = ++reg;     // Product registers: low and high parts
3564       Rlo_ab = ++reg;     // of a*b and m*n.
3565       Rhi_mn = ++reg;
3566       Rlo_mn = ++reg;
3567 
3568       // r19 and up are callee-saved.
3569       _toSave = RegSet::range(r19, reg) + Pm_base;
3570     }
3571 
3572   private:
3573     void save_regs() {
3574       push(_toSave, sp);
3575     }
3576 
3577     void restore_regs() {
3578       pop(_toSave, sp);
3579     }
3580 
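    // Emit 'count' executions of 'block', two per loop iteration: an
    // odd count branches into the middle of the unrolled pair, so the
    // loop back-edge always retires two at a time.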
3581     template <typename T>
3582     void unroll_2(Register count, T block) {
3583       Label loop, end, odd;
3584       tbnz(count, 0, odd);
3585       cbz(count, end);
3586       align(16);
3587       bind(loop);
3588       (this->*block)();
3589       bind(odd);
3590       (this->*block)();
3591       subs(count, count, 2);
3592       br(Assembler::GT, loop);
3593       bind(end);
3594     }
3595 
3596     template <typename T>
3597     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3598       Label loop, end, odd;
3599       tbnz(count, 0, odd);
3600       cbz(count, end);
3601       align(16);
3602       bind(loop);
3603       (this->*block)(d, s, tmp);
3604       bind(odd);
3605       (this->*block)(d, s, tmp);
3606       subs(count, count, 2);
3607       br(Assembler::GT, loop);
3608       bind(end);
3609     }
3610 
3611     void pre1(RegisterOrConstant i) {
3612       block_comment("pre1");
3613       // Pa = Pa_base;
3614       // Pb = Pb_base + i;
3615       // Pm = Pm_base;
3616       // Pn = Pn_base + i;
3617       // Ra = *Pa;
3618       // Rb = *Pb;
3619       // Rm = *Pm;
3620       // Rn = *Pn;
3621       ldr(Ra, Address(Pa_base));
3622       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3623       ldr(Rm, Address(Pm_base));
3624       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3625       lea(Pa, Address(Pa_base));
3626       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3627       lea(Pm, Address(Pm_base));
3628       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3629 
3630       // Zero the m*n result.
3631       mov(Rhi_mn, zr);
3632       mov(Rlo_mn, zr);
3633     }
3634 
3635     // The core multiply-accumulate step of a Montgomery
3636     // multiplication.  The idea is to schedule operations as a
3637     // pipeline so that instructions with long latencies (loads and
3638     // multiplies) have time to complete before their results are
3639     // used.  This most benefits in-order implementations of the
3640     // architecture but out-of-order ones also benefit.
3641     void step() {
3642       block_comment("step");
3643       // MACC(Ra, Rb, t0, t1, t2);
3644       // Ra = *++Pa;
3645       // Rb = *--Pb;
3646       umulh(Rhi_ab, Ra, Rb);
3647       mul(Rlo_ab, Ra, Rb);
3648       ldr(Ra, pre(Pa, wordSize));
3649       ldr(Rb, pre(Pb, -wordSize));
3650       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3651                                        // previous iteration.
3652       // MACC(Rm, Rn, t0, t1, t2);
3653       // Rm = *++Pm;
3654       // Rn = *--Pn;
3655       umulh(Rhi_mn, Rm, Rn);
3656       mul(Rlo_mn, Rm, Rn);
3657       ldr(Rm, pre(Pm, wordSize));
3658       ldr(Rn, pre(Pn, -wordSize));
3659       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3660     }
3661 
3662     void post1() {
3663       block_comment("post1");
3664 
3665       // MACC(Ra, Rb, t0, t1, t2);
3666       // Ra = *++Pa;
3667       // Rb = *--Pb;
3668       umulh(Rhi_ab, Ra, Rb);
3669       mul(Rlo_ab, Ra, Rb);
3670       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3671       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3672 
3673       // *Pm = Rm = t0 * inv;
3674       mul(Rm, t0, inv);
3675       str(Rm, Address(Pm));
3676 
3677       // MACC(Rm, Rn, t0, t1, t2);
3678       // t0 = t1; t1 = t2; t2 = 0;
3679       umulh(Rhi_mn, Rm, Rn);
3680 
3681 #ifndef PRODUCT
3682       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3683       {
3684         mul(Rlo_mn, Rm, Rn);
3685         add(Rlo_mn, t0, Rlo_mn);
3686         Label ok;
3687         cbz(Rlo_mn, ok); {
3688           stop("broken Montgomery multiply");
3689         } bind(ok);
3690       }
3691 #endif
3692       // We have very carefully set things up so that
3693       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3694       // the lower half of Rm * Rn because we know the result already:
3695       // it must be -t0.  t0 + (-t0) must generate a carry iff
3696       // t0 != 0.  So, rather than do a mul and an adds we just set
3697       // the carry flag iff t0 is nonzero.
3698       //
3699       // mul(Rlo_mn, Rm, Rn);
3700       // adds(zr, t0, Rlo_mn);
3701       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3702       adcs(t0, t1, Rhi_mn);
3703       adc(t1, t2, zr);
3704       mov(t2, zr);
3705     }
3706 
3707     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3708       block_comment("pre2");
3709       // Pa = Pa_base + i-len;
3710       // Pb = Pb_base + len;
3711       // Pm = Pm_base + i-len;
3712       // Pn = Pn_base + len;
3713 
3714       if (i.is_register()) {
3715         sub(Rj, i.as_register(), len);
3716       } else {
3717         mov(Rj, i.as_constant());
3718         sub(Rj, Rj, len);
3719       }
3720       // Rj == i-len
3721 
3722       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3723       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3724       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3725       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3726 
3727       // Ra = *++Pa;
3728       // Rb = *--Pb;
3729       // Rm = *++Pm;
3730       // Rn = *--Pn;
3731       ldr(Ra, pre(Pa, wordSize));
3732       ldr(Rb, pre(Pb, -wordSize));
3733       ldr(Rm, pre(Pm, wordSize));
3734       ldr(Rn, pre(Pn, -wordSize));
3735 
3736       mov(Rhi_mn, zr);
3737       mov(Rlo_mn, zr);
3738     }
3739 
3740     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3741       block_comment("post2");
3742       if (i.is_constant()) {
3743         mov(Rj, i.as_constant()-len.as_constant());
3744       } else {
3745         sub(Rj, i.as_register(), len);
3746       }
3747 
3748       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3749 
3750       // As soon as we know the least significant digit of our result,
3751       // store it.
3752       // Pm_base[i-len] = t0;
3753       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3754 
3755       // t0 = t1; t1 = t2; t2 = 0;
3756       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3757       adc(t1, t2, zr);
3758       mov(t2, zr);
3759     }
3760 
3761     // A carry in t0 after Montgomery multiplication means that we
3762     // should subtract multiples of n from our result in m.  We'll
3763     // keep doing that until there is no carry.
3764     void normalize(RegisterOrConstant len) {
3765       block_comment("normalize");
3766       // while (t0)
3767       //   t0 = sub(Pm_base, Pn_base, t0, len);
3768       Label loop, post, again;
3769       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3770       cbz(t0, post); {
3771         bind(again); {
3772           mov(i, zr);
3773           mov(cnt, len);
3774           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3775           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3776           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3777           align(16);
3778           bind(loop); {
3779             sbcs(Rm, Rm, Rn);
3780             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3781             add(i, i, 1);
3782             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3783             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3784             sub(cnt, cnt, 1);
3785           } cbnz(cnt, loop);
3786           sbc(t0, t0, zr);
3787         } cbnz(t0, again);
3788       } bind(post);
3789     }
3790 
3791     // Move memory at s to d, reversing words.
3792     //    Increments d to end of copied memory
3793     //    Destroys tmp1, tmp2
3794     //    Preserves len
3795     //    Leaves s pointing to the address which was in d at start
3796     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3797       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3798 
3799       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3800       mov(tmp1, len);
3801       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3802       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3803     }
3804     // where
3805     void reverse1(Register d, Register s, Register tmp) {
3806       ldr(tmp, pre(s, -wordSize));
3807       ror(tmp, tmp, 32);
3808       str(tmp, post(d, wordSize));
3809     }
3810 
3811     void step_squaring() {
3812       // An extra ACC
3813       step();
3814       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3815     }
3816 
3817     void last_squaring(RegisterOrConstant i) {
3818       Label dont;
3819       // if ((i & 1) == 0) {
3820       tbnz(i.as_register(), 0, dont); {
3821         // MACC(Ra, Rb, t0, t1, t2);
3822         // Ra = *++Pa;
3823         // Rb = *--Pb;
3824         umulh(Rhi_ab, Ra, Rb);
3825         mul(Rlo_ab, Ra, Rb);
3826         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3827       } bind(dont);
3828     }
3829 
3830     void extra_step_squaring() {
3831       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3832 
3833       // MACC(Rm, Rn, t0, t1, t2);
3834       // Rm = *++Pm;
3835       // Rn = *--Pn;
3836       umulh(Rhi_mn, Rm, Rn);
3837       mul(Rlo_mn, Rm, Rn);
3838       ldr(Rm, pre(Pm, wordSize));
3839       ldr(Rn, pre(Pn, -wordSize));
3840     }
3841 
3842     void post1_squaring() {
3843       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3844 
3845       // *Pm = Rm = t0 * inv;
3846       mul(Rm, t0, inv);
3847       str(Rm, Address(Pm));
3848 
3849       // MACC(Rm, Rn, t0, t1, t2);
3850       // t0 = t1; t1 = t2; t2 = 0;
3851       umulh(Rhi_mn, Rm, Rn);
3852 
3853 #ifndef PRODUCT
3854       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3855       {
3856         mul(Rlo_mn, Rm, Rn);
3857         add(Rlo_mn, t0, Rlo_mn);
3858         Label ok;
3859         cbz(Rlo_mn, ok); {
3860           stop("broken Montgomery multiply");
3861         } bind(ok);
3862       }
3863 #endif
3864       // We have very carefully set things up so that
3865       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3866       // the lower half of Rm * Rn because we know the result already:
3867       // it must be -t0.  t0 + (-t0) must generate a carry iff
3868       // t0 != 0.  So, rather than do a mul and an adds we just set
3869       // the carry flag iff t0 is nonzero.
3870       //
3871       // mul(Rlo_mn, Rm, Rn);
3872       // adds(zr, t0, Rlo_mn);
3873       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3874       adcs(t0, t1, Rhi_mn);
3875       adc(t1, t2, zr);
3876       mov(t2, zr);
3877     }
3878 
3879     void acc(Register Rhi, Register Rlo,
3880              Register t0, Register t1, Register t2) {
3881       adds(t0, t0, Rlo);
3882       adcs(t1, t1, Rhi);
3883       adc(t2, t2, zr);
3884     }
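
    // For the C sketches below: MACC(A, B, T0, T1, T2) is the
    // multiply-accumulate that acc() completes, and MACC2 is the same
    // with the product added twice (the squaring cross-terms).  A
    // hedged C equivalent, assuming a compiler with unsigned __int128:
    //
    // #define MACC(A, B, T0, T1, T2) do {                            \
    //   unsigned __int128 p = (unsigned __int128)(A) * (B);          \
    //   unsigned __int128 s = (unsigned __int128)(T0) + (unsigned long)p; \
    //   T0 = (unsigned long)s;                                       \
    //   s = (s >> 64) + (T1) + (unsigned long)(p >> 64);             \
    //   T1 = (unsigned long)s;                                       \
    //   T2 += (unsigned long)(s >> 64);                              \
    // } while (0)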
3885 
3886   public:
3887     /**
3888      * Fast Montgomery multiplication.  The derivation of the
3889      * algorithm is in A Cryptographic Library for the Motorola
3890      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3891      *
3892      * Arguments:
3893      *
3894      * Inputs for multiplication:
3895      *   c_rarg0   - int array elements a
3896      *   c_rarg1   - int array elements b
3897      *   c_rarg2   - int array elements n (the modulus)
3898      *   c_rarg3   - int length
3899      *   c_rarg4   - int inv
3900      *   c_rarg5   - int array elements m (the result)
3901      *
3902      * Inputs for squaring:
3903      *   c_rarg0   - int array elements a
3904      *   c_rarg1   - int array elements n (the modulus)
3905      *   c_rarg2   - int length
3906      *   c_rarg3   - int inv
3907      *   c_rarg4   - int array elements m (the result)
3908      *
3909      */
3910     address generate_multiply() {
3911       Label argh, nothing;
3912       bind(argh);
3913       stop("MontgomeryMultiply total_allocation must be <= 8192");
3914 
3915       align(CodeEntryAlignment);
3916       address entry = pc();
3917 
3918       cbzw(Rlen, nothing);
3919 
3920       enter();
3921 
3922       // Make room.
3923       cmpw(Rlen, 512);
3924       br(Assembler::HI, argh);
3925       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3926       andr(sp, Ra, -2 * wordSize);
3927 
3928       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3929 
3930       {
3931         // Copy input args, reversing as we go.  We use Ra as a
3932         // temporary variable.
3933         reverse(Ra, Pa_base, Rlen, t0, t1);
3934         if (!_squaring)
3935           reverse(Ra, Pb_base, Rlen, t0, t1);
3936         reverse(Ra, Pn_base, Rlen, t0, t1);
3937       }
3938 
3939       // Push all call-saved registers and also Pm_base which we'll need
3940       // at the end.
3941       save_regs();
3942 
3943 #ifndef PRODUCT
3944       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3945       {
3946         ldr(Rn, Address(Pn_base, 0));
3947         mul(Rlo_mn, Rn, inv);
3948         cmp(Rlo_mn, -1);
3949         Label ok;
3950         br(EQ, ok); {
3951           stop("broken inverse in Montgomery multiply");
3952         } bind(ok);
3953       }
3954 #endif
3955 
3956       mov(Pm_base, Ra);
3957 
3958       mov(t0, zr);
3959       mov(t1, zr);
3960       mov(t2, zr);
3961 
3962       block_comment("for (int i = 0; i < len; i++) {");
3963       mov(Ri, zr); {
3964         Label loop, end;
3965         cmpw(Ri, Rlen);
3966         br(Assembler::GE, end);
3967 
3968         bind(loop);
3969         pre1(Ri);
3970 
3971         block_comment("  for (j = i; j; j--) {"); {
3972           movw(Rj, Ri);
3973           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3974         } block_comment("  } // j");
3975 
3976         post1();
3977         addw(Ri, Ri, 1);
3978         cmpw(Ri, Rlen);
3979         br(Assembler::LT, loop);
3980         bind(end);
3981         block_comment("} // i");
3982       }
3983 
3984       block_comment("for (int i = len; i < 2*len; i++) {");
3985       mov(Ri, Rlen); {
3986         Label loop, end;
3987         cmpw(Ri, Rlen, Assembler::LSL, 1);
3988         br(Assembler::GE, end);
3989 
3990         bind(loop);
3991         pre2(Ri, Rlen);
3992 
3993         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3994           lslw(Rj, Rlen, 1);
3995           subw(Rj, Rj, Ri);
3996           subw(Rj, Rj, 1);
3997           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3998         } block_comment("  } // j");
3999 
4000         post2(Ri, Rlen);
4001         addw(Ri, Ri, 1);
4002         cmpw(Ri, Rlen, Assembler::LSL, 1);
4003         br(Assembler::LT, loop);
4004         bind(end);
4005       }
4006       block_comment("} // i");
4007 
4008       normalize(Rlen);
4009 
4010       mov(Ra, Pm_base);  // Save Pm_base in Ra
4011       restore_regs();  // Restore caller's Pm_base
4012 
4013       // Copy our result into caller's Pm_base
4014       reverse(Pm_base, Ra, Rlen, t0, t1);
4015 
4016       leave();
4017       bind(nothing);
4018       ret(lr);
4019 
4020       return entry;
4021     }
4022     // In C, approximately:
4023 
4024     // void
4025     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4026     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4027     //                     unsigned long inv, int len) {
4028     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4029     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4030     //   unsigned long Ra, Rb, Rn, Rm;
4031 
4032     //   int i;
4033 
4034     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4035 
4036     //   for (i = 0; i < len; i++) {
4037     //     int j;
4038 
4039     //     Pa = Pa_base;
4040     //     Pb = Pb_base + i;
4041     //     Pm = Pm_base;
4042     //     Pn = Pn_base + i;
4043 
4044     //     Ra = *Pa;
4045     //     Rb = *Pb;
4046     //     Rm = *Pm;
4047     //     Rn = *Pn;
4048 
4049     //     int iters = i;
4050     //     for (j = 0; iters--; j++) {
4051     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4052     //       MACC(Ra, Rb, t0, t1, t2);
4053     //       Ra = *++Pa;
4054     //       Rb = *--Pb;
4055     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4056     //       MACC(Rm, Rn, t0, t1, t2);
4057     //       Rm = *++Pm;
4058     //       Rn = *--Pn;
4059     //     }
4060 
4061     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4062     //     MACC(Ra, Rb, t0, t1, t2);
4063     //     *Pm = Rm = t0 * inv;
4064     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4065     //     MACC(Rm, Rn, t0, t1, t2);
4066 
4067     //     assert(t0 == 0, "broken Montgomery multiply");
4068 
4069     //     t0 = t1; t1 = t2; t2 = 0;
4070     //   }
4071 
4072     //   for (i = len; i < 2*len; i++) {
4073     //     int j;
4074 
4075     //     Pa = Pa_base + i-len;
4076     //     Pb = Pb_base + len;
4077     //     Pm = Pm_base + i-len;
4078     //     Pn = Pn_base + len;
4079 
4080     //     Ra = *++Pa;
4081     //     Rb = *--Pb;
4082     //     Rm = *++Pm;
4083     //     Rn = *--Pn;
4084 
4085     //     int iters = len*2-i-1;
4086     //     for (j = i-len+1; iters--; j++) {
4087     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4088     //       MACC(Ra, Rb, t0, t1, t2);
4089     //       Ra = *++Pa;
4090     //       Rb = *--Pb;
4091     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4092     //       MACC(Rm, Rn, t0, t1, t2);
4093     //       Rm = *++Pm;
4094     //       Rn = *--Pn;
4095     //     }
4096 
4097     //     Pm_base[i-len] = t0;
4098     //     t0 = t1; t1 = t2; t2 = 0;
4099     //   }
4100 
4101     //   while (t0)
4102     //     t0 = sub(Pm_base, Pn_base, t0, len);
4103     // }
4104 
4105     /**
4106      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4107      * multiplies than Montgomery multiplication so it should be up to
4108      * 25% faster.  However, its loop control is more complex and it
4109      * may actually run slower on some machines.
4110      *
4111      * Arguments:
4112      *
4113      * Inputs:
4114      *   c_rarg0   - int array elements a
4115      *   c_rarg1   - int array elements n (the modulus)
4116      *   c_rarg2   - int length
4117      *   c_rarg3   - int inv
4118      *   c_rarg4   - int array elements m (the result)
4119      *
4120      */
4121     address generate_square() {
4122       Label argh;
4123       bind(argh);
4124       stop("MontgomeryMultiply total_allocation must be <= 8192");
4125 
4126       align(CodeEntryAlignment);
4127       address entry = pc();
4128 
4129       enter();
4130 
4131       // Make room.
4132       cmpw(Rlen, 512);
4133       br(Assembler::HI, argh);
4134       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4135       andr(sp, Ra, -2 * wordSize);
4136 
4137       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4138 
4139       {
4140         // Copy input args, reversing as we go.  We use Ra as a
4141         // temporary variable.
4142         reverse(Ra, Pa_base, Rlen, t0, t1);
4143         reverse(Ra, Pn_base, Rlen, t0, t1);
4144       }
4145 
4146       // Push all call-saved registers and also Pm_base which we'll need
4147       // at the end.
4148       save_regs();
4149 
4150       mov(Pm_base, Ra);
4151 
4152       mov(t0, zr);
4153       mov(t1, zr);
4154       mov(t2, zr);
4155 
4156       block_comment("for (int i = 0; i < len; i++) {");
4157       mov(Ri, zr); {
4158         Label loop, end;
4159         bind(loop);
4160         cmp(Ri, Rlen);
4161         br(Assembler::GE, end);
4162 
4163         pre1(Ri);
4164 
4165         block_comment("for (j = (i+1)/2; j; j--) {"); {
4166           add(Rj, Ri, 1);
4167           lsr(Rj, Rj, 1);
4168           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4169         } block_comment("  } // j");
4170 
4171         last_squaring(Ri);
4172 
4173         block_comment("  for (j = i/2; j; j--) {"); {
4174           lsr(Rj, Ri, 1);
4175           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4176         } block_comment("  } // j");
4177 
4178         post1_squaring();
4179         add(Ri, Ri, 1);
4180         cmp(Ri, Rlen);
4181         br(Assembler::LT, loop);
4182 
4183         bind(end);
4184         block_comment("} // i");
4185       }
4186 
4187       block_comment("for (int i = len; i < 2*len; i++) {");
4188       mov(Ri, Rlen); {
4189         Label loop, end;
4190         bind(loop);
4191         cmp(Ri, Rlen, Assembler::LSL, 1);
4192         br(Assembler::GE, end);
4193 
4194         pre2(Ri, Rlen);
4195 
4196         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4197           lsl(Rj, Rlen, 1);
4198           sub(Rj, Rj, Ri);
4199           sub(Rj, Rj, 1);
4200           lsr(Rj, Rj, 1);
4201           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4202         } block_comment("  } // j");
4203 
4204         last_squaring(Ri);
4205 
4206         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4207           lsl(Rj, Rlen, 1);
4208           sub(Rj, Rj, Ri);
4209           lsr(Rj, Rj, 1);
4210           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4211         } block_comment("  } // j");
4212 
4213         post2(Ri, Rlen);
4214         add(Ri, Ri, 1);
4215         cmp(Ri, Rlen, Assembler::LSL, 1);
4216 
4217         br(Assembler::LT, loop);
4218         bind(end);
4219         block_comment("} // i");
4220       }
4221 
4222       normalize(Rlen);
4223 
4224       mov(Ra, Pm_base);  // Save Pm_base in Ra
4225       restore_regs();  // Restore caller's Pm_base
4226 
4227       // Copy our result into caller's Pm_base
4228       reverse(Pm_base, Ra, Rlen, t0, t1);
4229 
4230       leave();
4231       ret(lr);
4232 
4233       return entry;
4234     }
4235     // In C, approximately:
4236 
4237     // void
4238     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4239     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4240     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4241     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4242     //   unsigned long Ra, Rb, Rn, Rm;
4243 
4244     //   int i;
4245 
4246     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4247 
4248     //   for (i = 0; i < len; i++) {
4249     //     int j;
4250 
4251     //     Pa = Pa_base;
4252     //     Pb = Pa_base + i;
4253     //     Pm = Pm_base;
4254     //     Pn = Pn_base + i;
4255 
4256     //     Ra = *Pa;
4257     //     Rb = *Pb;
4258     //     Rm = *Pm;
4259     //     Rn = *Pn;
4260 
4261     //     int iters = (i+1)/2;
4262     //     for (j = 0; iters--; j++) {
4263     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4264     //       MACC2(Ra, Rb, t0, t1, t2);
4265     //       Ra = *++Pa;
4266     //       Rb = *--Pb;
4267     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4268     //       MACC(Rm, Rn, t0, t1, t2);
4269     //       Rm = *++Pm;
4270     //       Rn = *--Pn;
4271     //     }
4272     //     if ((i & 1) == 0) {
4273     //       assert(Ra == Pa_base[j], "must be");
4274     //       MACC(Ra, Ra, t0, t1, t2);
4275     //     }
4276     //     iters = i/2;
4277     //     assert(iters == i-j, "must be");
4278     //     for (; iters--; j++) {
4279     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4280     //       MACC(Rm, Rn, t0, t1, t2);
4281     //       Rm = *++Pm;
4282     //       Rn = *--Pn;
4283     //     }
4284 
4285     //     *Pm = Rm = t0 * inv;
4286     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4287     //     MACC(Rm, Rn, t0, t1, t2);
4288 
4289     //     assert(t0 == 0, "broken Montgomery multiply");
4290 
4291     //     t0 = t1; t1 = t2; t2 = 0;
4292     //   }
4293 
4294     //   for (i = len; i < 2*len; i++) {
4295     //     int start = i-len+1;
4296     //     int end = start + (len - start)/2;
4297     //     int j;
4298 
4299     //     Pa = Pa_base + i-len;
4300     //     Pb = Pa_base + len;
4301     //     Pm = Pm_base + i-len;
4302     //     Pn = Pn_base + len;
4303 
4304     //     Ra = *++Pa;
4305     //     Rb = *--Pb;
4306     //     Rm = *++Pm;
4307     //     Rn = *--Pn;
4308 
4309     //     int iters = (2*len-i-1)/2;
4310     //     assert(iters == end-start, "must be");
4311     //     for (j = start; iters--; j++) {
4312     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4313     //       MACC2(Ra, Rb, t0, t1, t2);
4314     //       Ra = *++Pa;
4315     //       Rb = *--Pb;
4316     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4317     //       MACC(Rm, Rn, t0, t1, t2);
4318     //       Rm = *++Pm;
4319     //       Rn = *--Pn;
4320     //     }
4321     //     if ((i & 1) == 0) {
4322     //       assert(Ra == Pa_base[j], "must be");
4323     //       MACC(Ra, Ra, t0, t1, t2);
4324     //     }
4325     //     iters =  (2*len-i)/2;
4326     //     assert(iters == len-j, "must be");
4327     //     for (; iters--; j++) {
4328     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4329     //       MACC(Rm, Rn, t0, t1, t2);
4330     //       Rm = *++Pm;
4331     //       Rn = *--Pn;
4332     //     }
4333     //     Pm_base[i-len] = t0;
4334     //     t0 = t1; t1 = t2; t2 = 0;
4335     //   }
4336 
4337     //   while (t0)
4338     //     t0 = sub(Pm_base, Pn_base, t0, len);
4339     // }
4340   };
4341 
4342   // Initialization
4343   void generate_initial() {
4344     // Generate the initial stubs and initialize the entry points
4345 
4346     // entry points that exist in all platforms. Note: This is code
4347     // that could be shared among different platforms - however the
4348     // benefit seems to be smaller than the disadvantage of having a
4349     // much more complicated generator structure. See also comment in
4350     // stubRoutines.hpp.
4351 
4352     StubRoutines::_forward_exception_entry = generate_forward_exception();
4353 
4354     StubRoutines::_call_stub_entry =
4355       generate_call_stub(StubRoutines::_call_stub_return_address);
4356 
4357     // is referenced by megamorphic call
4358     StubRoutines::_catch_exception_entry = generate_catch_exception();
4359 
4360     // Build this early so it's available for the interpreter.
4361     StubRoutines::_throw_StackOverflowError_entry =
4362       generate_throw_exception("StackOverflowError throw_exception",
4363                                CAST_FROM_FN_PTR(address,
4364                                                 SharedRuntime::
4365                                                 throw_StackOverflowError));
4366     if (UseCRC32Intrinsics) {
4367       // set the table address before generating the stubs that use it
4368       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4369       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4370     }
4371   }
4372 
4373   void generate_all() {
4374     // support for verify_oop (must happen after universe_init)
4375     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4376     StubRoutines::_throw_AbstractMethodError_entry =
4377       generate_throw_exception("AbstractMethodError throw_exception",
4378                                CAST_FROM_FN_PTR(address,
4379                                                 SharedRuntime::
4380                                                 throw_AbstractMethodError));
4381 
4382     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4383       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4384                                CAST_FROM_FN_PTR(address,
4385                                                 SharedRuntime::
4386                                                 throw_IncompatibleClassChangeError));
4387 
4388     StubRoutines::_throw_NullPointerException_at_call_entry =
4389       generate_throw_exception("NullPointerException at call throw_exception",
4390                                CAST_FROM_FN_PTR(address,
4391                                                 SharedRuntime::
4392                                                 throw_NullPointerException_at_call));
4393 
4394     // arraycopy stubs used by compilers
4395     generate_arraycopy_stubs();
4396 
4397     if (UseMultiplyToLenIntrinsic) {
4398       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4399     }
4400 
4401     if (UseMontgomeryMultiplyIntrinsic) {
4402       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4403       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4404       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4405     }
4406 
4407     if (UseMontgomerySquareIntrinsic) {
4408       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4409       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4410       // We use generate_multiply() rather than generate_square()
4411       // because it's faster for the sizes of modulus we care about.
4412       StubRoutines::_montgomerySquare = g.generate_multiply();
4413     }
4414 
4415     if (UseAESIntrinsics) {
4416       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4417       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4418       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4419       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4420     }
4421 
4422     // generate GHASH intrinsics code
4423     if (UseGHASHIntrinsics) {
4424       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4425     }
4426 
4427     if (UseSHA1Intrinsics) {
4428       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4429       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4430     }
4431     if (UseSHA256Intrinsics) {
4432       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4433       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4434     }
4435 
4436     // Safefetch stubs.
4437     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4438                                                        &StubRoutines::_safefetch32_fault_pc,
4439                                                        &StubRoutines::_safefetch32_continuation_pc);
4440     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4441                                                        &StubRoutines::_safefetchN_fault_pc,
4442                                                        &StubRoutines::_safefetchN_continuation_pc);
4443   }
4444 
4445  public:
4446   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4447     if (all) {
4448       generate_all();
4449     } else {
4450       generate_initial();
4451     }
4452   }
4453 }; // end class declaration
4454 
4455 void StubGenerator_generate(CodeBuffer* code, bool all) {
4456   StubGenerator g(code, all);
4457 }