/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/macros.hpp"
#include "utilities/top.hpp"

#include "stubRoutines_aarch64.hpp"

#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ALL_GCS
#include "shenandoahBarrierSetAssembler_aarch64.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
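// n.b. TIMES_OOP scales an array index register by the heap oop size:
// a sign-extended shift by 2 (4-byte narrow oops) when compressed oops
// are in use, otherwise by 3 (8-byte oops).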

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif
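
  // n.b. the load/add/store sequence above is not atomic, so
  // concurrent increments can be lost; that is acceptable for these
  // statistics-only counters.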

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
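
  // n.b. each enum value above is a word offset from the saved frame
  // pointer, so with an 8-byte word the method slot, for example,
  // lives at rfp + method_off * wordSize == rfp - 40; that is how the
  // Address constants in generate_call_stub below are formed.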

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);
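    // n.b. the loop above copies one word per iteration from the
    // parameter array (param 1 first, via post-increment of c_rarg5)
    // onto the descending stack, with c_rarg6 counting down to zero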

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
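
  // For reference, the C++ side enters this stub through the function
  // pointer type declared in stubRoutines.hpp, which is roughly (see
  // that file for the authoritative declaration):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // i.e. exactly the c_rarg0..c_rarg7 layout documented above.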

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);

    return start;
  }
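
  // n.b. callers are expected to follow the register contract above:
  // MacroAssembler::verify_oop reaches this stub with the oop in r0
  // and the address of the error message string in rscratch1.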

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     src     - source array address
  //     addr    - destination array starting address
  //     count   - element count
  //     dest_uninitialized - destination statically known to be uninitialized?
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void  gen_write_ref_array_pre_barrier(Register src, Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
#if INCLUDE_ALL_GCS
    case BarrierSet::ShenandoahBarrierSet:
      ShenandoahBarrierSetAssembler::bsasm()->arraycopy_prologue(_masm, dest_uninitialized, src, addr, count);
      break;
#endif
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    Label L_done;

    // "end" is inclusive end pointer == start + (count - 1) * array_element_size
    // If count == 0, "end" is less than "start" and we need to skip card marking.
    __ cmp(end, start);
    __ br(__ LO, L_done);

    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          __ push_call_clobbered_registers();
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);
          __ lea(scratch, Address(end, BytesPerHeapOop));
          __ sub(scratch, scratch, start);               // subtract start to get #bytes
          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop_call_clobbered_registers();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // number of bytes to copy

          const Register count = end; // 'end' register contains bytes count now
          __ load_byte_map_base(scratch);
          __ add(start, start, scratch);
          if (UseConcMarkSweepGC) {
            __ membar(__ StoreStore);
          }
          __ BIND(L_loop);
          __ strb(zr, Address(start, count));
          __ subs(count, count, 1);
          __ br(Assembler::GE, L_loop);
        }
        break;
#if INCLUDE_ALL_GCS
      case BarrierSet::ShenandoahBarrierSet:
        break;
#endif
      default:
        ShouldNotReachHere();
    }
    __ bind(L_done);
  }
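
  // n.b. in the card table cases above the card index for an address
  // is addr >> CardTableModRefBS::card_shift; assuming the usual
  // 512-byte cards, the loop dirties one card table byte (by storing
  // zero, the dirty value) for each 512-byte span of the destination.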

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
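    // each stp in the table below is 4 bytes of code and zeroes 16
    // bytes, so to pre-fill tmp alignment bytes we enter the table
    // (tmp / 16) * 4 == tmp >> 2 bytes before its end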
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 bit block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 bit block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
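
  // Worked example for the bit tests above: an int copy (granularity
  // == 4) with count == 3 has bit 1 set, triggering the single 8-byte
  // ldr/str pair, and bit 0 set, triggering the trailing 4-byte
  // ldrw/strw, moving all 12 bytes.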

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
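
  // n.b. the dispatch above copies counts of at most 96 (SIMD) or 80
  // bytes inline by size class (0..16, 17..32, 33..64, 65..80/96),
  // using overlapping head/tail accesses, while larger counts align
  // the source and bulk-copy 8-word blocks via copy_f/copy_b, with
  // copy_memory_small mopping up the tail.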
1362 
1363 
1364   void clobber_registers() {
1365 #ifdef ASSERT
1366     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1367     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1368     for (Register r = r3; r <= r18; r++)
1369       if (r != rscratch1) __ mov(r, rscratch1);
1370 #endif
1371   }
1372 
1373   // Scan over array at a for count oops, verifying each one.
1374   // Preserves a and count, clobbers rscratch1 and rscratch2.
1375   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1376     Label loop, end;
1377     __ mov(rscratch1, a);
1378     __ mov(rscratch2, zr);
1379     __ bind(loop);
1380     __ cmp(rscratch2, count);
1381     __ br(Assembler::HS, end);
1382     if (size == (size_t)wordSize) {
1383       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1384       __ verify_oop(temp);
1385     } else {
1386       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1387       __ decode_heap_oop(temp); // calls verify_oop
1388     }
1389     __ add(rscratch2, rscratch2, size);
1390     __ b(loop);
1391     __ bind(end);
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1396   //             ignored
1397   //   is_oop  - true => oop array, so generate store check code
1398   //   name    - stub name string
1399   //
1400   // Inputs:
1401   //   c_rarg0   - source array address
1402   //   c_rarg1   - destination array address
1403   //   c_rarg2   - element count, treated as ssize_t, can be zero
1404   //
1405   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1406   // the hardware handle it.  The two dwords within qwords that span
1407   // cache line boundaries will still be loaded and stored atomicly.
1408   //
1409   // Side Effects:
1410   //   disjoint_int_copy_entry is set to the no-overlap entry point
1411   //   used by generate_conjoint_int_oop_copy().
1412   //
1413   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1414                                   const char *name, bool dest_uninitialized = false) {
1415     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1416     __ align(CodeEntryAlignment);
1417     StubCodeMark mark(this, "StubRoutines", name);
1418     address start = __ pc();
1419     __ enter();
1420 
1421     if (entry != NULL) {
1422       *entry = __ pc();
1423       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1424       BLOCK_COMMENT("Entry:");
1425     }
1426 
1427     if (is_oop) {
1428       __ push(RegSet::of(d, count), sp);
1429       // no registers are destroyed by this call
1430       gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
1431     }
1432     copy_memory(aligned, s, d, count, rscratch1, size);
1433     if (is_oop) {
1434       __ pop(RegSet::of(d, count), sp);
1435       if (VerifyOops)
1436         verify_oop_array(size, d, count, r16);
1437       __ sub(count, count, 1); // make an inclusive end pointer
1438       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1439       gen_write_ref_array_post_barrier(d, count, rscratch1);
1440     }
1441     __ leave();
1442     __ mov(r0, zr); // return 0
1443     __ ret(lr);
1444     return start;
1445   }
1446 
1447   // Arguments:
1448   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1449   //             ignored
1450   //   is_oop  - true => oop array, so generate store check code
1451   //   name    - stub name string
1452   //
1453   // Inputs:
1454   //   c_rarg0   - source array address
1455   //   c_rarg1   - destination array address
1456   //   c_rarg2   - element count, treated as ssize_t, can be zero
1457   //
1458   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1459   // the hardware handle it.  The two dwords within qwords that span
1460   // cache line boundaries will still be loaded and stored atomicly.
1461   //
1462   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1463                                  address *entry, const char *name,
1464                                  bool dest_uninitialized = false) {
1465     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1466 
1467     StubCodeMark mark(this, "StubRoutines", name);
1468     address start = __ pc();
1469 
1470     __ enter();
1471 
1472     if (entry != NULL) {
1473       *entry = __ pc();
1474       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1475       BLOCK_COMMENT("Entry:");
1476     }
1477 
1478     // use fwd copy when (d-s) above_equal (count*size)
1479     __ sub(rscratch1, d, s);
1480     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1481     __ br(Assembler::HS, nooverlap_target);
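    // The unsigned comparison above covers both cases at once: if
    // d - s (taken as an unsigned value) is at least count*size bytes,
    // the destination either starts above the end of the source or
    // below its start, so a forward copy cannot clobber unread source
    // data and we tail-call the disjoint stub; otherwise copy_memory
    // runs in the backward direction (note the -size below).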
1482 
1483     if (is_oop) {
1484       __ push(RegSet::of(d, count), sp);
1485       // no registers are destroyed by this call
1486       gen_write_ref_array_pre_barrier(s, d, count, dest_uninitialized);
1487     }
1488     copy_memory(aligned, s, d, count, rscratch1, -size);
1489     if (is_oop) {
1490       __ pop(RegSet::of(d, count), sp);
1491       if (VerifyOops)
1492         verify_oop_array(size, d, count, r16);
1493       __ sub(count, count, 1); // make an inclusive end pointer
1494       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1495       gen_write_ref_array_post_barrier(d, count, rscratch1);
1496     }
1497     __ leave();
1498     __ mov(r0, zr); // return 0
1499     __ ret(lr);
1500     return start;
  }
1502 
1503   // Arguments:
1504   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1505   //             ignored
1506   //   name    - stub name string
1507   //
1508   // Inputs:
1509   //   c_rarg0   - source array address
1510   //   c_rarg1   - destination array address
1511   //   c_rarg2   - element count, treated as ssize_t, can be zero
1512   //
1513   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1514   // we let the hardware handle it.  The one to eight bytes within words,
1515   // dwords or qwords that span cache line boundaries will still be loaded
1516   // and stored atomically.
1517   //
1518   // Side Effects:
1526   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1527   //   used by generate_conjoint_byte_copy().
1528   //
1529   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1530     const bool not_oop = false;
1531     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1532   }
1533 
1534   // Arguments:
1535   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1536   //             ignored
1537   //   name    - stub name string
1538   //
1539   // Inputs:
1540   //   c_rarg0   - source array address
1541   //   c_rarg1   - destination array address
1542   //   c_rarg2   - element count, treated as ssize_t, can be zero
1543   //
1544   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1545   // we let the hardware handle it.  The one to eight bytes within words,
1546   // dwords or qwords that span cache line boundaries will still be loaded
1547   // and stored atomically.
1548   //
1549   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1550                                       address* entry, const char *name) {
1551     const bool not_oop = false;
1552     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1553   }
1554 
1555   // Arguments:
1556   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1557   //             ignored
1558   //   name    - stub name string
1559   //
1560   // Inputs:
1561   //   c_rarg0   - source array address
1562   //   c_rarg1   - destination array address
1563   //   c_rarg2   - element count, treated as ssize_t, can be zero
1564   //
1565   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1566   // let the hardware handle it.  The two or four words within dwords
1567   // or qwords that span cache line boundaries will still be loaded
1568   // and stored atomically.
1569   //
1570   // Side Effects:
1571   //   disjoint_short_copy_entry is set to the no-overlap entry point
1572   //   used by generate_conjoint_short_copy().
1573   //
1574   address generate_disjoint_short_copy(bool aligned,
1575                                        address* entry, const char *name) {
1576     const bool not_oop = false;
1577     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1578   }
1579 
1580   // Arguments:
1581   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1582   //             ignored
1583   //   name    - stub name string
1584   //
1585   // Inputs:
1586   //   c_rarg0   - source array address
1587   //   c_rarg1   - destination array address
1588   //   c_rarg2   - element count, treated as ssize_t, can be zero
1589   //
1590   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1591   // let the hardware handle it.  The two or four words within dwords
1592   // or qwords that span cache line boundaries will still be loaded
1593   // and stored atomically.
1594   //
1595   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1596                                        address *entry, const char *name) {
1597     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1602   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1603   //             ignored
1604   //   name    - stub name string
1605   //
1606   // Inputs:
1607   //   c_rarg0   - source array address
1608   //   c_rarg1   - destination array address
1609   //   c_rarg2   - element count, treated as ssize_t, can be zero
1610   //
1611   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1612   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1618   //
1619   address generate_disjoint_int_copy(bool aligned, address *entry,
1620                                         const char *name) {
1621     const bool not_oop = false;
1622     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1623   }
1624 
1625   // Arguments:
1626   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1627   //             ignored
1628   //   name    - stub name string
1629   //
1630   // Inputs:
1631   //   c_rarg0   - source array address
1632   //   c_rarg1   - destination array address
1633   //   c_rarg2   - element count, treated as ssize_t, can be zero
1634   //
1635   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1636   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1638   //
1639   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1640                                      address *entry, const char *name,
1641                                      bool dest_uninitialized = false) {
1642     const bool not_oop = false;
1643     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1644   }
1645 
1646 
1647   // Arguments:
1648   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1649   //             ignored
1650   //   name    - stub name string
1651   //
1652   // Inputs:
1653   //   c_rarg0   - source array address
1654   //   c_rarg1   - destination array address
1655   //   c_rarg2   - element count, treated as size_t, can be zero
1656   //
1657   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1660   //
1661   address generate_disjoint_long_copy(bool aligned, address *entry,
1662                                           const char *name, bool dest_uninitialized = false) {
1663     const bool not_oop = false;
1664     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1665   }
1666 
1667   // Arguments:
1668   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1669   //             ignored
1670   //   name    - stub name string
1671   //
1672   // Inputs:
1673   //   c_rarg0   - source array address
1674   //   c_rarg1   - destination array address
1675   //   c_rarg2   - element count, treated as size_t, can be zero
1676   //
1677   address generate_conjoint_long_copy(bool aligned,
1678                                       address nooverlap_target, address *entry,
1679                                       const char *name, bool dest_uninitialized = false) {
1680     const bool not_oop = false;
1681     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1682   }
1683 
1684   // Arguments:
1685   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686   //             ignored
1687   //   name    - stub name string
1688   //
1689   // Inputs:
1690   //   c_rarg0   - source array address
1691   //   c_rarg1   - destination array address
1692   //   c_rarg2   - element count, treated as size_t, can be zero
1693   //
1694   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1697   //
1698   address generate_disjoint_oop_copy(bool aligned, address *entry,
1699                                      const char *name, bool dest_uninitialized) {
1700     const bool is_oop = true;
1701     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1702     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1703   }
1704 
1705   // Arguments:
1706   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1707   //             ignored
1708   //   name    - stub name string
1709   //
1710   // Inputs:
1711   //   c_rarg0   - source array address
1712   //   c_rarg1   - destination array address
1713   //   c_rarg2   - element count, treated as size_t, can be zero
1714   //
1715   address generate_conjoint_oop_copy(bool aligned,
1716                                      address nooverlap_target, address *entry,
1717                                      const char *name, bool dest_uninitialized) {
1718     const bool is_oop = true;
1719     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1720     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1721                                   name, dest_uninitialized);
1722   }
1723 
1724 
1725   // Helper for generating a dynamic type check.
1726   // Smashes rscratch1.
1727   void generate_type_check(Register sub_klass,
1728                            Register super_check_offset,
1729                            Register super_klass,
1730                            Label& L_success) {
1731     assert_different_registers(sub_klass, super_check_offset, super_klass);
1732 
1733     BLOCK_COMMENT("type_check:");
1734 
1735     Label L_miss;
1736 
1737     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1738                                      super_check_offset);
1739     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1740 
1741     // Fall through on failure!
1742     __ BIND(L_miss);
1743   }
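
  // A rough sketch of the check emitted above (illustrative only;
  // the actual fast and slow paths live in MacroAssembler):
  //
  //   if (sub_klass == super_klass) goto L_success;        // trivial hit
  //   if (*(sub_klass + super_check_offset) == super_klass)
  //     goto L_success;                                    // cached hit
  //   if (secondary_supers(sub_klass) contains super_klass)
  //     goto L_success;                                    // slow scan
  //   // otherwise fall through to L_miss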
1744 
1745   //
1746   //  Generate checkcasting array copy stub
1747   //
1748   //  Input:
1749   //    c_rarg0   - source array address
1750   //    c_rarg1   - destination array address
1751   //    c_rarg2   - element count, treated as ssize_t, can be zero
1752   //    c_rarg3   - size_t ckoff (super_check_offset)
1753   //    c_rarg4   - oop ckval (super_klass)
1754   //
1755   //  Output:
1756   //    r0 ==  0  -  success
1757   //    r0 == -1^K - failure, where K is partial transfer count
1758   //
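  //  Note that -1^K is bitwise: -1 ^ K == ~K == -1 - K, so the
  //  caller can recover the partial transfer count as ~r0 whenever
  //  the result is nonzero.
  //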
1759   address generate_checkcast_copy(const char *name, address *entry,
1760                                   bool dest_uninitialized = false) {
1761 
1762     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1763 
1764     // Input registers (after setup_arg_regs)
1765     const Register from        = c_rarg0;   // source array address
1766     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1768     const Register ckoff       = c_rarg3;   // super_check_offset
1769     const Register ckval       = c_rarg4;   // super_klass
1770 
1771     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // original elements count
1773     const Register start_to    = r20;       // destination array start address
1774     const Register copied_oop  = r18;       // actual oop copied
1775     const Register r19_klass   = r19;       // oop._klass
1776 
1777     //---------------------------------------------------------------
1778     // Assembler stub will be used for this call to arraycopy
1779     // if the two arrays are subtypes of Object[] but the
1780     // destination array type is not equal to or a supertype
1781     // of the source type.  Each element must be separately
1782     // checked.
1783 
1784     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1785                                copied_oop, r19_klass, count_save);
1786 
1787     __ align(CodeEntryAlignment);
1788     StubCodeMark mark(this, "StubRoutines", name);
1789     address start = __ pc();
1790 
1791     __ enter(); // required for proper stackwalking of RuntimeStub frame
1792 
1793 #ifdef ASSERT
1794     // caller guarantees that the arrays really are different
1795     // otherwise, we would have to make conjoint checks
1796     { Label L;
1797       array_overlap_test(L, TIMES_OOP);
1798       __ stop("checkcast_copy within a single array");
1799       __ bind(L);
1800     }
1801 #endif //ASSERT
1802 
1803     // Caller of this entry point must set up the argument registers.
1804     if (entry != NULL) {
1805       *entry = __ pc();
1806       BLOCK_COMMENT("Entry:");
1807     }
1808 
    // Empty array:  Nothing to do.
1810     __ cbz(count, L_done);
1811 
1812     __ push(RegSet::of(r18, r19, r20, r21), sp);
1813 
1814 #ifdef ASSERT
1815     BLOCK_COMMENT("assert consistent ckoff/ckval");
1816     // The ckoff and ckval must be mutually consistent,
1817     // even though caller generates both.
1818     { Label L;
1819       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1820       __ ldrw(start_to, Address(ckval, sco_offset));
1821       __ cmpw(ckoff, start_to);
1822       __ br(Assembler::EQ, L);
1823       __ stop("super_check_offset inconsistent");
1824       __ bind(L);
1825     }
1826 #endif //ASSERT
1827 
1828     gen_write_ref_array_pre_barrier(from, to, count, dest_uninitialized);
1829 
1830     // save the original count
1831     __ mov(count_save, count);
1832 
1833     // Copy from low to high addresses
1834     __ mov(start_to, to);              // Save destination array start address
1835     __ b(L_load_element);
1836 
1837     // ======== begin loop ========
1838     // (Loop is rotated; its entry is L_load_element.)
1839     // Loop control:
1840     //   for (; count != 0; count--) {
1841     //     copied_oop = load_heap_oop(from++);
1842     //     ... generate_type_check ...;
1843     //     store_heap_oop(to++, copied_oop);
1844     //   }
1845     __ align(OptoLoopAlignment);
1846 
1847     __ BIND(L_store_element);
1848     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1849     __ sub(count, count, 1);
1850     __ cbz(count, L_do_card_marks);
1851 
1852     // ======== loop entry is here ========
1853     __ BIND(L_load_element);
1854     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1855     __ cbz(copied_oop, L_store_element);
1856 
    __ load_klass(r19_klass, copied_oop); // query the object klass
1858     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1859     // ======== end loop ========
1860 
1861     // It was a real error; we must depend on the caller to finish the job.
1862     // Register count = remaining oops, count_orig = total oops.
1863     // Emit GC store barriers for the oops we have copied and report
1864     // their number to the caller.
1865 
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
1868     __ br(Assembler::EQ, L_done_pop);
1869 
1870     __ BIND(L_do_card_marks);
1871     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1872     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1873 
1874     __ bind(L_done_pop);
1875     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1876     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1877 
1878     __ bind(L_done);
1879     __ mov(r0, count);
1880     __ leave();
1881     __ ret(lr);
1882 
1883     return start;
1884   }
1885 
1886   // Perform range checks on the proposed arraycopy.
1887   // Kills temp, but nothing else.
1888   // Also, clean the sign bits of src_pos and dst_pos.
1889   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1890                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1892                               Register dst_pos, // destination position (c_rarg3)
1893                               Register length,
1894                               Register temp,
1895                               Label& L_failed) {
1896     BLOCK_COMMENT("arraycopy_range_checks:");
1897 
1898     assert_different_registers(rscratch1, temp);
1899 
1900     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1901     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1902     __ addw(temp, length, src_pos);
1903     __ cmpw(temp, rscratch1);
1904     __ br(Assembler::HI, L_failed);
1905 
1906     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1907     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1908     __ addw(temp, length, dst_pos);
1909     __ cmpw(temp, rscratch1);
1910     __ br(Assembler::HI, L_failed);
1911 
1912     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1913     __ movw(src_pos, src_pos);
1914     __ movw(dst_pos, dst_pos);
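    // (A 32-bit register-to-register mov writes the W register and
    // therefore zero-extends, clearing bits 63:32 without changing
    // the low word.)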
1915 
1916     BLOCK_COMMENT("arraycopy_range_checks done");
1917   }
1918 
  // This stub is only reached from a trivial test routine; it should
  // be implemented properly once a real caller needs it.
1922   static void fake_arraycopy_stub(address src, address dst, int count) {
1923     assert(count == 0, "huh?");
1924   }
1925 
1926 
1927   //
1928   // Generate stub for array fill. If "aligned" is true, the
1929   // "to" address is assumed to be heapword aligned.
1930   //
1931   // Arguments for generated stub:
1932   //   to:    c_rarg0
1933   //   value: c_rarg1
1934   //   count: c_rarg2 treated as signed
1935   //
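  // In C terms the stub is roughly (illustrative sketch; elem_t is
  // jbyte, jshort or jint depending on 't'):
  //
  //   void fill(elem_t *to, elem_t value, int count) {
  //     for (int i = 0; i < count; i++)
  //       to[i] = value;
  //   }
  //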
1936   address generate_fill(BasicType t, bool aligned, const char *name) {
1937     __ align(CodeEntryAlignment);
1938     StubCodeMark mark(this, "StubRoutines", name);
1939     address start = __ pc();
1940 
1941     BLOCK_COMMENT("Entry:");
1942 
1943     const Register to        = c_rarg0;  // source array address
1944     const Register value     = c_rarg1;  // value
1945     const Register count     = c_rarg2;  // elements count
1946 
1947     const Register bz_base = r10;        // base for block_zero routine
1948     const Register cnt_words = r11;      // temp register
1949 
1950     __ enter();
1951 
1952     Label L_fill_elements, L_exit1;
1953 
1954     int shift = -1;
1955     switch (t) {
1956       case T_BYTE:
1957         shift = 0;
1958         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1959         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
1960         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1961         __ br(Assembler::LO, L_fill_elements);
1962         break;
1963       case T_SHORT:
1964         shift = 1;
1965         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1966         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1967         __ br(Assembler::LO, L_fill_elements);
1968         break;
1969       case T_INT:
1970         shift = 2;
1971         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1972         __ br(Assembler::LO, L_fill_elements);
1973         break;
1974       default: ShouldNotReachHere();
1975     }
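    // For the narrower types, the bfi self-inserts above replicated
    // the element across the low 32 bits of 'value' (e.g. byte 0xAB
    // becomes 0xABABABAB); the pattern is widened to 64 bits below.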
1976 
1977     // Align source address at 8 bytes address boundary.
1978     Label L_skip_align1, L_skip_align2, L_skip_align4;
1979     if (!aligned) {
1980       switch (t) {
1981         case T_BYTE:
1982           // One byte misalignment happens only for byte arrays.
1983           __ tbz(to, 0, L_skip_align1);
1984           __ strb(value, Address(__ post(to, 1)));
1985           __ subw(count, count, 1);
1986           __ bind(L_skip_align1);
1987           // Fallthrough
1988         case T_SHORT:
1989           // Two bytes misalignment happens only for byte and short (char) arrays.
1990           __ tbz(to, 1, L_skip_align2);
1991           __ strh(value, Address(__ post(to, 2)));
1992           __ subw(count, count, 2 >> shift);
1993           __ bind(L_skip_align2);
1994           // Fallthrough
1995         case T_INT:
1996           // Align to 8 bytes, we know we are 4 byte aligned to start.
1997           __ tbz(to, 2, L_skip_align4);
1998           __ strw(value, Address(__ post(to, 4)));
1999           __ subw(count, count, 4 >> shift);
2000           __ bind(L_skip_align4);
2001           break;
2002         default: ShouldNotReachHere();
2003       }
2004     }
2005 
2006     //
2007     //  Fill large chunks
2008     //
2009     __ lsrw(cnt_words, count, 3 - shift); // number of words
2010     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2011     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
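    // cnt_words = count >> (3 - shift) is the number of complete
    // 8-byte words to fill; the subw leaves 'count' holding only the
    // leftover elements (count -= cnt_words << (3 - shift)).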
2012     if (UseBlockZeroing) {
2013       Label non_block_zeroing, rest;
2014       // count >= BlockZeroingLowLimit && value == 0
2015       __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
2016       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2017       __ br(Assembler::NE, non_block_zeroing);
2018       __ mov(bz_base, to);
2019       __ block_zero(bz_base, cnt_words, true);
2020       __ mov(to, bz_base);
2021       __ b(rest);
2022       __ bind(non_block_zeroing);
2023       __ fill_words(to, cnt_words, value);
2024       __ bind(rest);
2025     }
2026     else {
2027       __ fill_words(to, cnt_words, value);
2028     }
2029 
2030     // Remaining count is less than 8 bytes. Fill it by a single store.
2031     // Note that the total length is no less than 8 bytes.
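    // The single 64-bit store below writes the last 8 bytes of the
    // array; it may rewrite up to 7 bytes that were already filled,
    // which is harmless since they are rewritten with the same
    // pattern.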
2032     if (t == T_BYTE || t == T_SHORT) {
2034       __ cbzw(count, L_exit1);
2035       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2036       __ str(value, Address(to, -8));    // overwrite some elements
2037       __ bind(L_exit1);
2038       __ leave();
2039       __ ret(lr);
2040     }
2041 
2042     // Handle copies less than 8 bytes.
2043     Label L_fill_2, L_fill_4, L_exit2;
2044     __ bind(L_fill_elements);
2045     switch (t) {
2046       case T_BYTE:
2047         __ tbz(count, 0, L_fill_2);
2048         __ strb(value, Address(__ post(to, 1)));
2049         __ bind(L_fill_2);
2050         __ tbz(count, 1, L_fill_4);
2051         __ strh(value, Address(__ post(to, 2)));
2052         __ bind(L_fill_4);
2053         __ tbz(count, 2, L_exit2);
2054         __ strw(value, Address(to));
2055         break;
2056       case T_SHORT:
2057         __ tbz(count, 0, L_fill_4);
2058         __ strh(value, Address(__ post(to, 2)));
2059         __ bind(L_fill_4);
2060         __ tbz(count, 1, L_exit2);
2061         __ strw(value, Address(to));
2062         break;
2063       case T_INT:
2064         __ cbzw(count, L_exit2);
2065         __ strw(value, Address(to));
2066         break;
2067       default: ShouldNotReachHere();
2068     }
2069     __ bind(L_exit2);
2070     __ leave();
2071     __ ret(lr);
2072     return start;
2073   }
2074 
2075   //
2076   //  Generate 'unsafe' array copy stub
2077   //  Though just as safe as the other stubs, it takes an unscaled
2078   //  size_t argument instead of an element count.
2079   //
2080   //  Input:
2081   //    c_rarg0   - source array address
2082   //    c_rarg1   - destination array address
2083   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2084   //
2085   // Examines the alignment of the operands and dispatches
2086   // to a long, int, short, or byte copy loop.
2087   //
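  // Dispatch sketch (illustrative): with n = s | d | count,
  //
  //   if ((n & 7) == 0) goto long_copy;   // everything 8-byte aligned
  //   if ((n & 3) == 0) goto int_copy;    // everything 4-byte aligned
  //   if ((n & 1) == 0) goto short_copy;  // everything 2-byte aligned
  //   goto byte_copy;
  //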
2088   address generate_unsafe_copy(const char *name,
2089                                address byte_copy_entry,
2090                                address short_copy_entry,
2091                                address int_copy_entry,
2092                                address long_copy_entry) {
2093     Label L_long_aligned, L_int_aligned, L_short_aligned;
2094     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2095 
2096     __ align(CodeEntryAlignment);
2097     StubCodeMark mark(this, "StubRoutines", name);
2098     address start = __ pc();
2099     __ enter(); // required for proper stackwalking of RuntimeStub frame
2100 
2101     // bump this on entry, not on exit:
2102     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2103 
2104     __ orr(rscratch1, s, d);
2105     __ orr(rscratch1, rscratch1, count);
2106 
2107     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2108     __ cbz(rscratch1, L_long_aligned);
2109     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2110     __ cbz(rscratch1, L_int_aligned);
2111     __ tbz(rscratch1, 0, L_short_aligned);
2112     __ b(RuntimeAddress(byte_copy_entry));
2113 
2114     __ BIND(L_short_aligned);
2115     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2116     __ b(RuntimeAddress(short_copy_entry));
2117     __ BIND(L_int_aligned);
2118     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2119     __ b(RuntimeAddress(int_copy_entry));
2120     __ BIND(L_long_aligned);
2121     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2122     __ b(RuntimeAddress(long_copy_entry));
2123 
2124     return start;
2125   }
2126 
2127   //
2128   //  Generate generic array copy stubs
2129   //
2130   //  Input:
2131   //    c_rarg0    -  src oop
2132   //    c_rarg1    -  src_pos (32-bits)
2133   //    c_rarg2    -  dst oop
2134   //    c_rarg3    -  dst_pos (32-bits)
2135   //    c_rarg4    -  element count (32-bits)
2136   //
2137   //  Output:
2138   //    r0 ==  0  -  success
2139   //    r0 == -1^K - failure, where K is partial transfer count
2140   //
2141   address generate_generic_copy(const char *name,
2142                                 address byte_copy_entry, address short_copy_entry,
2143                                 address int_copy_entry, address oop_copy_entry,
2144                                 address long_copy_entry, address checkcast_copy_entry) {
2145 
2146     Label L_failed, L_failed_0, L_objArray;
2147     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2148 
2149     // Input registers
2150     const Register src        = c_rarg0;  // source array oop
2151     const Register src_pos    = c_rarg1;  // source position
2152     const Register dst        = c_rarg2;  // destination array oop
2153     const Register dst_pos    = c_rarg3;  // destination position
2154     const Register length     = c_rarg4;
2155 
2156     __ align(CodeEntryAlignment);
2157 
2158     StubCodeMark mark(this, "StubRoutines", name);
2159 
2160     address start = __ pc();
2161 
2162     __ enter(); // required for proper stackwalking of RuntimeStub frame
2163 
2164     // bump this on entry, not on exit:
2165     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2166 
2167     //-----------------------------------------------------------------------
2168     // Assembler stub will be used for this call to arraycopy
2169     // if the following conditions are met:
2170     //
2171     // (1) src and dst must not be null.
2172     // (2) src_pos must not be negative.
2173     // (3) dst_pos must not be negative.
2174     // (4) length  must not be negative.
2175     // (5) src klass and dst klass should be the same and not NULL.
2176     // (6) src and dst should be arrays.
2177     // (7) src_pos + length must not exceed length of src.
2178     // (8) dst_pos + length must not exceed length of dst.
2179     //
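    // If any of these checks fails we branch to L_failed and return -1,
    // leaving it to the caller to perform the copy (and raise any
    // required exception) via the slower runtime path.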
2180 
2181     //  if (src == NULL) return -1;
2182     __ cbz(src, L_failed);
2183 
2184     //  if (src_pos < 0) return -1;
2185     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2186 
2187     //  if (dst == NULL) return -1;
2188     __ cbz(dst, L_failed);
2189 
2190     //  if (dst_pos < 0) return -1;
2191     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2192 
2193     // registers used as temp
2194     const Register scratch_length    = r16; // elements count to copy
2195     const Register scratch_src_klass = r17; // array klass
2196     const Register lh                = r18; // layout helper
2197 
2198     //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2200     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2201 
2202     __ load_klass(scratch_src_klass, src);
2203 #ifdef ASSERT
2204     //  assert(src->klass() != NULL);
2205     {
2206       BLOCK_COMMENT("assert klasses not null {");
2207       Label L1, L2;
2208       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2209       __ bind(L1);
2210       __ stop("broken null klass");
2211       __ bind(L2);
2212       __ load_klass(rscratch1, dst);
2213       __ cbz(rscratch1, L1);     // this would be broken also
2214       BLOCK_COMMENT("} assert klasses not null done");
2215     }
2216 #endif
2217 
2218     // Load layout helper (32-bits)
2219     //
2220     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2221     // 32        30    24            16              8     2                 0
2222     //
2223     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2224     //
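    // For example, a jint array has array_tag 0x3 (typeArray) and
    // log2_element_size 2; the header_size field holds the byte
    // offset of element 0, which is what the ubfx below extracts.
    //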
2225 
2226     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2227 
2228     // Handle objArrays completely differently...
2229     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2230     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2231     __ movw(rscratch1, objArray_lh);
2232     __ eorw(rscratch2, lh, rscratch1);
2233     __ cbzw(rscratch2, L_objArray);
2234 
2235     //  if (src->klass() != dst->klass()) return -1;
2236     __ load_klass(rscratch2, dst);
2237     __ eor(rscratch2, rscratch2, scratch_src_klass);
2238     __ cbnz(rscratch2, L_failed);
2239 
2240     //  if (!src->is_Array()) return -1;
2241     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2242 
2243     // At this point, it is known to be a typeArray (array_tag 0x3).
2244 #ifdef ASSERT
2245     {
2246       BLOCK_COMMENT("assert primitive array {");
2247       Label L;
2248       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2249       __ cmpw(lh, rscratch2);
2250       __ br(Assembler::GE, L);
2251       __ stop("must be a primitive array");
2252       __ bind(L);
2253       BLOCK_COMMENT("} assert primitive array done");
2254     }
2255 #endif
2256 
2257     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2258                            rscratch2, L_failed);
2259 
2260     // TypeArrayKlass
2261     //
2262     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2263     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2264     //
2265 
2266     const Register rscratch1_offset = rscratch1;    // array offset
2267     const Register r18_elsize = lh; // element size
2268 
2269     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2270            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2271     __ add(src, src, rscratch1_offset);           // src array offset
2272     __ add(dst, dst, rscratch1_offset);           // dst array offset
2273     BLOCK_COMMENT("choose copy loop based on element size");
2274 
2275     // next registers should be set before the jump to corresponding stub
2276     const Register from     = c_rarg0;  // source array address
2277     const Register to       = c_rarg1;  // destination array address
2278     const Register count    = c_rarg2;  // elements count
2279 
2280     // 'from', 'to', 'count' registers should be set in such order
2281     // since they are the same as 'src', 'src_pos', 'dst'.
2282 
2283     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2284 
2285     // The possible values of elsize are 0-3, i.e. exact_log2(element
2286     // size in bytes).  We do a simple bitwise binary search.
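    // Dispatch (illustrative):
    //   bit 1 of elsize set -> 4- or 8-byte elements (L_copy_ints);
    //   else bit 0 set      -> 2-byte elements (L_copy_shorts);
    //   else                -> 1-byte elements (fall through).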
2287   __ BIND(L_copy_bytes);
2288     __ tbnz(r18_elsize, 1, L_copy_ints);
2289     __ tbnz(r18_elsize, 0, L_copy_shorts);
2290     __ lea(from, Address(src, src_pos));// src_addr
2291     __ lea(to,   Address(dst, dst_pos));// dst_addr
2292     __ movw(count, scratch_length); // length
2293     __ b(RuntimeAddress(byte_copy_entry));
2294 
2295   __ BIND(L_copy_shorts);
2296     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2297     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2298     __ movw(count, scratch_length); // length
2299     __ b(RuntimeAddress(short_copy_entry));
2300 
2301   __ BIND(L_copy_ints);
2302     __ tbnz(r18_elsize, 0, L_copy_longs);
2303     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2304     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2305     __ movw(count, scratch_length); // length
2306     __ b(RuntimeAddress(int_copy_entry));
2307 
2308   __ BIND(L_copy_longs);
2309 #ifdef ASSERT
2310     {
2311       BLOCK_COMMENT("assert long copy {");
2312       Label L;
2313       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2314       __ cmpw(r18_elsize, LogBytesPerLong);
2315       __ br(Assembler::EQ, L);
2316       __ stop("must be long copy, but elsize is wrong");
2317       __ bind(L);
2318       BLOCK_COMMENT("} assert long copy done");
2319     }
2320 #endif
2321     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2322     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2323     __ movw(count, scratch_length); // length
2324     __ b(RuntimeAddress(long_copy_entry));
2325 
2326     // ObjArrayKlass
2327   __ BIND(L_objArray);
2328     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2329 
2330     Label L_plain_copy, L_checkcast_copy;
2331     //  test array classes for subtyping
2332     __ load_klass(r18, dst);
2333     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2334     __ br(Assembler::NE, L_checkcast_copy);
2335 
2336     // Identically typed arrays can be copied without element-wise checks.
2337     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2338                            rscratch2, L_failed);
2339 
2340     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2341     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2342     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2343     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2344     __ movw(count, scratch_length); // length
2345   __ BIND(L_plain_copy);
2346     __ b(RuntimeAddress(oop_copy_entry));
2347 
2348   __ BIND(L_checkcast_copy);
2349     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2350     {
2351       // Before looking at dst.length, make sure dst is also an objArray.
2352       __ ldrw(rscratch1, Address(r18, lh_offset));
2353       __ movw(rscratch2, objArray_lh);
2354       __ eorw(rscratch1, rscratch1, rscratch2);
2355       __ cbnzw(rscratch1, L_failed);
2356 
2357       // It is safe to examine both src.length and dst.length.
2358       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2359                              r18, L_failed);
2360 
2361       const Register rscratch2_dst_klass = rscratch2;
2362       __ load_klass(rscratch2_dst_klass, dst); // reload
2363 
2364       // Marshal the base address arguments now, freeing registers.
2365       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2366       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2367       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2368       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2369       __ movw(count, length);           // length (reloaded)
2370       Register sco_temp = c_rarg3;      // this register is free now
2371       assert_different_registers(from, to, count, sco_temp,
2372                                  rscratch2_dst_klass, scratch_src_klass);
2373       // assert_clean_int(count, sco_temp);
2374 
2375       // Generate the type check.
2376       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2377       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2378       // assert_clean_int(sco_temp, r18);
2379       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2380 
2381       // Fetch destination element klass from the ObjArrayKlass header.
2382       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2383       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2384       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2385 
2386       // the checkcast_copy loop needs two extra arguments:
2387       assert(c_rarg3 == sco_temp, "#3 already in place");
2388       // Set up arguments for checkcast_copy_entry.
2389       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2390       __ b(RuntimeAddress(checkcast_copy_entry));
2391     }
2392 
2393   __ BIND(L_failed);
2394     __ mov(r0, -1);
2395     __ leave();   // required for proper stackwalking of RuntimeStub frame
2396     __ ret(lr);
2397 
2398     return start;
2399   }
2400 
2401   void generate_arraycopy_stubs() {
2402     address entry;
2403     address entry_jbyte_arraycopy;
2404     address entry_jshort_arraycopy;
2405     address entry_jint_arraycopy;
2406     address entry_oop_arraycopy;
2407     address entry_jlong_arraycopy;
2408     address entry_checkcast_arraycopy;
2409 
2410     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2411     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2412 
2413     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2414 
2415     //*** jbyte
2416     // Always need aligned and unaligned versions
2417     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2418                                                                                   "jbyte_disjoint_arraycopy");
2419     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2420                                                                                   &entry_jbyte_arraycopy,
2421                                                                                   "jbyte_arraycopy");
2422     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2423                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2424     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2425                                                                                   "arrayof_jbyte_arraycopy");
2426 
2427     //*** jshort
2428     // Always need aligned and unaligned versions
2429     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2430                                                                                     "jshort_disjoint_arraycopy");
2431     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2432                                                                                     &entry_jshort_arraycopy,
2433                                                                                     "jshort_arraycopy");
2434     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2435                                                                                     "arrayof_jshort_disjoint_arraycopy");
2436     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2437                                                                                     "arrayof_jshort_arraycopy");
2438 
2439     //*** jint
2440     // Aligned versions
2441     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2442                                                                                 "arrayof_jint_disjoint_arraycopy");
2443     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2444                                                                                 "arrayof_jint_arraycopy");
2445     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2446     // entry_jint_arraycopy always points to the unaligned version
2447     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2448                                                                                 "jint_disjoint_arraycopy");
2449     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2450                                                                                 &entry_jint_arraycopy,
2451                                                                                 "jint_arraycopy");
2452 
2453     //*** jlong
2454     // It is always aligned
2455     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2456                                                                                   "arrayof_jlong_disjoint_arraycopy");
2457     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2458                                                                                   "arrayof_jlong_arraycopy");
2459     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2460     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2461 
2462     //*** oops
2463     {
2464       // With compressed oops we need unaligned versions; notice that
2465       // we overwrite entry_oop_arraycopy.
2466       bool aligned = !UseCompressedOops;
2467 
2468       StubRoutines::_arrayof_oop_disjoint_arraycopy
2469         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2470                                      /*dest_uninitialized*/false);
2471       StubRoutines::_arrayof_oop_arraycopy
2472         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2473                                      /*dest_uninitialized*/false);
2474       // Aligned versions without pre-barriers
2475       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2476         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2477                                      /*dest_uninitialized*/true);
2478       StubRoutines::_arrayof_oop_arraycopy_uninit
2479         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2480                                      /*dest_uninitialized*/true);
2481     }
2482 
2483     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2484     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2485     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2486     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2487 
2488     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2489     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2490                                                                         /*dest_uninitialized*/true);
2491 
2492     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2493                                                               entry_jbyte_arraycopy,
2494                                                               entry_jshort_arraycopy,
2495                                                               entry_jint_arraycopy,
2496                                                               entry_jlong_arraycopy);
2497 
2498     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2499                                                                entry_jbyte_arraycopy,
2500                                                                entry_jshort_arraycopy,
2501                                                                entry_jint_arraycopy,
2502                                                                entry_oop_arraycopy,
2503                                                                entry_jlong_arraycopy,
2504                                                                entry_checkcast_arraycopy);
2505 
2506     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2507     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2508     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2509     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2510     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2511     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2512   }
2513 
2514   // Arguments:
2515   //
2516   // Inputs:
2517   //   c_rarg0   - source byte array address
2518   //   c_rarg1   - destination byte array address
2519   //   c_rarg2   - K (key) in little endian int array
2520   //
2521   address generate_aescrypt_encryptBlock() {
2522     __ align(CodeEntryAlignment);
2523     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2524 
2525     Label L_doLast;
2526 
2527     const Register from        = c_rarg0;  // source array address
2528     const Register to          = c_rarg1;  // destination array address
2529     const Register key         = c_rarg2;  // key array address
2530     const Register keylen      = rscratch1;
2531 
2532     address start = __ pc();
2533     __ enter();
2534 
2535     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
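    // keylen is the expanded key length in ints: 44, 52 or 60 for
    // AES-128, AES-192 and AES-256 respectively, which is why the
    // 44/52 comparisons below select 10, 12 or 14 rounds.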
2536 
2537     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2538 
2539     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2540     __ rev32(v1, __ T16B, v1);
2541     __ rev32(v2, __ T16B, v2);
2542     __ rev32(v3, __ T16B, v3);
2543     __ rev32(v4, __ T16B, v4);
2544     __ aese(v0, v1);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v2);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v3);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v4);
2551     __ aesmc(v0, v0);
2552 
2553     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2554     __ rev32(v1, __ T16B, v1);
2555     __ rev32(v2, __ T16B, v2);
2556     __ rev32(v3, __ T16B, v3);
2557     __ rev32(v4, __ T16B, v4);
2558     __ aese(v0, v1);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v2);
2561     __ aesmc(v0, v0);
2562     __ aese(v0, v3);
2563     __ aesmc(v0, v0);
2564     __ aese(v0, v4);
2565     __ aesmc(v0, v0);
2566 
2567     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2568     __ rev32(v1, __ T16B, v1);
2569     __ rev32(v2, __ T16B, v2);
2570 
2571     __ cmpw(keylen, 44);
2572     __ br(Assembler::EQ, L_doLast);
2573 
2574     __ aese(v0, v1);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v2);
2577     __ aesmc(v0, v0);
2578 
2579     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2580     __ rev32(v1, __ T16B, v1);
2581     __ rev32(v2, __ T16B, v2);
2582 
2583     __ cmpw(keylen, 52);
2584     __ br(Assembler::EQ, L_doLast);
2585 
2586     __ aese(v0, v1);
2587     __ aesmc(v0, v0);
2588     __ aese(v0, v2);
2589     __ aesmc(v0, v0);
2590 
2591     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2592     __ rev32(v1, __ T16B, v1);
2593     __ rev32(v2, __ T16B, v2);
2594 
2595     __ BIND(L_doLast);
2596 
2597     __ aese(v0, v1);
2598     __ aesmc(v0, v0);
2599     __ aese(v0, v2);
2600 
2601     __ ld1(v1, __ T16B, key);
2602     __ rev32(v1, __ T16B, v1);
2603     __ eor(v0, __ T16B, v0, v1);
2604 
2605     __ st1(v0, __ T16B, to);
2606 
2607     __ mov(r0, 0);
2608 
2609     __ leave();
2610     __ ret(lr);
2611 
2612     return start;
2613   }
2614 
2615   // Arguments:
2616   //
2617   // Inputs:
2618   //   c_rarg0   - source byte array address
2619   //   c_rarg1   - destination byte array address
2620   //   c_rarg2   - K (key) in little endian int array
2621   //
2622   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
2624     __ align(CodeEntryAlignment);
2625     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2626     Label L_doLast;
2627 
2628     const Register from        = c_rarg0;  // source array address
2629     const Register to          = c_rarg1;  // destination array address
2630     const Register key         = c_rarg2;  // key array address
2631     const Register keylen      = rscratch1;
2632 
2633     address start = __ pc();
2634     __ enter(); // required for proper stackwalking of RuntimeStub frame
2635 
2636     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2637 
2638     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2639 
2640     __ ld1(v5, __ T16B, __ post(key, 16));
2641     __ rev32(v5, __ T16B, v5);
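    // v5 holds the first 16 bytes of the key schedule; it is set
    // aside here and applied as the final whitening eor after the
    // last aesd round (see L_doLast below).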
2642 
2643     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2644     __ rev32(v1, __ T16B, v1);
2645     __ rev32(v2, __ T16B, v2);
2646     __ rev32(v3, __ T16B, v3);
2647     __ rev32(v4, __ T16B, v4);
2648     __ aesd(v0, v1);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v2);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v3);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v4);
2655     __ aesimc(v0, v0);
2656 
2657     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2658     __ rev32(v1, __ T16B, v1);
2659     __ rev32(v2, __ T16B, v2);
2660     __ rev32(v3, __ T16B, v3);
2661     __ rev32(v4, __ T16B, v4);
2662     __ aesd(v0, v1);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v2);
2665     __ aesimc(v0, v0);
2666     __ aesd(v0, v3);
2667     __ aesimc(v0, v0);
2668     __ aesd(v0, v4);
2669     __ aesimc(v0, v0);
2670 
2671     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2672     __ rev32(v1, __ T16B, v1);
2673     __ rev32(v2, __ T16B, v2);
2674 
2675     __ cmpw(keylen, 44);
2676     __ br(Assembler::EQ, L_doLast);
2677 
2678     __ aesd(v0, v1);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v2);
2681     __ aesimc(v0, v0);
2682 
2683     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2684     __ rev32(v1, __ T16B, v1);
2685     __ rev32(v2, __ T16B, v2);
2686 
2687     __ cmpw(keylen, 52);
2688     __ br(Assembler::EQ, L_doLast);
2689 
2690     __ aesd(v0, v1);
2691     __ aesimc(v0, v0);
2692     __ aesd(v0, v2);
2693     __ aesimc(v0, v0);
2694 
2695     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2696     __ rev32(v1, __ T16B, v1);
2697     __ rev32(v2, __ T16B, v2);
2698 
2699     __ BIND(L_doLast);
2700 
2701     __ aesd(v0, v1);
2702     __ aesimc(v0, v0);
2703     __ aesd(v0, v2);
2704 
2705     __ eor(v0, __ T16B, v0, v5);
2706 
2707     __ st1(v0, __ T16B, to);
2708 
2709     __ mov(r0, 0);
2710 
2711     __ leave();
2712     __ ret(lr);
2713 
2714     return start;
2715   }
2716 
2717   // Arguments:
2718   //
2719   // Inputs:
2720   //   c_rarg0   - source byte array address
2721   //   c_rarg1   - destination byte array address
2722   //   c_rarg2   - K (key) in little endian int array
2723   //   c_rarg3   - r vector byte array address
2724   //   c_rarg4   - input length
2725   //
2726   // Output:
  //   r0        - input length
2728   //
2729   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
2731     __ align(CodeEntryAlignment);
2732     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2733 
2734     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2735 
2736     const Register from        = c_rarg0;  // source array address
2737     const Register to          = c_rarg1;  // destination array address
2738     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array
                                           // address and left holding the last ciphertext block
2741     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2742     const Register keylen      = rscratch1;
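
    // CBC encryption computes, for each 16-byte block P[i],
    //   C[i] = E_K(P[i] ^ C[i-1]),  with C[-1] = rvec (the IV),
    // and leaves rvec holding the last ciphertext block.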
2743 
2744     address start = __ pc();
2745 
2746       __ enter();
2747 
2748       __ subsw(rscratch2, len_reg, zr);
2749       __ br(Assembler::LE, _L_finish);
2750 
2751       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2752 
2753       __ ld1(v0, __ T16B, rvec);
2754 
2755       __ cmpw(keylen, 52);
2756       __ br(Assembler::CC, L_loadkeys_44);
2757       __ br(Assembler::EQ, L_loadkeys_52);
2758 
2759       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2760       __ rev32(v17, __ T16B, v17);
2761       __ rev32(v18, __ T16B, v18);
2762     __ BIND(L_loadkeys_52);
2763       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2764       __ rev32(v19, __ T16B, v19);
2765       __ rev32(v20, __ T16B, v20);
2766     __ BIND(L_loadkeys_44);
2767       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2768       __ rev32(v21, __ T16B, v21);
2769       __ rev32(v22, __ T16B, v22);
2770       __ rev32(v23, __ T16B, v23);
2771       __ rev32(v24, __ T16B, v24);
2772       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2773       __ rev32(v25, __ T16B, v25);
2774       __ rev32(v26, __ T16B, v26);
2775       __ rev32(v27, __ T16B, v27);
2776       __ rev32(v28, __ T16B, v28);
2777       __ ld1(v29, v30, v31, __ T16B, key);
2778       __ rev32(v29, __ T16B, v29);
2779       __ rev32(v30, __ T16B, v30);
2780       __ rev32(v31, __ T16B, v31);
2781 
2782     __ BIND(L_aes_loop);
2783       __ ld1(v1, __ T16B, __ post(from, 16));
2784       __ eor(v0, __ T16B, v0, v1);
2785 
2786       __ br(Assembler::CC, L_rounds_44);
2787       __ br(Assembler::EQ, L_rounds_52);
2788 
2789       __ aese(v0, v17); __ aesmc(v0, v0);
2790       __ aese(v0, v18); __ aesmc(v0, v0);
2791     __ BIND(L_rounds_52);
2792       __ aese(v0, v19); __ aesmc(v0, v0);
2793       __ aese(v0, v20); __ aesmc(v0, v0);
2794     __ BIND(L_rounds_44);
2795       __ aese(v0, v21); __ aesmc(v0, v0);
2796       __ aese(v0, v22); __ aesmc(v0, v0);
2797       __ aese(v0, v23); __ aesmc(v0, v0);
2798       __ aese(v0, v24); __ aesmc(v0, v0);
2799       __ aese(v0, v25); __ aesmc(v0, v0);
2800       __ aese(v0, v26); __ aesmc(v0, v0);
2801       __ aese(v0, v27); __ aesmc(v0, v0);
2802       __ aese(v0, v28); __ aesmc(v0, v0);
2803       __ aese(v0, v29); __ aesmc(v0, v0);
2804       __ aese(v0, v30);
2805       __ eor(v0, __ T16B, v0, v31);
2806 
2807       __ st1(v0, __ T16B, __ post(to, 16));
2808 
2809       __ subw(len_reg, len_reg, 16);
2810       __ cbnzw(len_reg, L_aes_loop);
2811 
2812       __ st1(v0, __ T16B, rvec);
2813 
2814     __ BIND(_L_finish);
      __ mov(r0, rscratch2);               // return the input length
2816 
2817       __ leave();
2818       __ ret(lr);
2819 
2820       return start;
2821   }
2822 
2823   // Arguments:
2824   //
2825   // Inputs:
2826   //   c_rarg0   - source byte array address
2827   //   c_rarg1   - destination byte array address
2828   //   c_rarg2   - K (key) in little endian int array
2829   //   c_rarg3   - r vector byte array address
2830   //   c_rarg4   - input length
2831   //
2832   // Output:
  //   r0        - input length
2834   //
2835   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
2837     __ align(CodeEntryAlignment);
2838     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2839 
2840     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2841 
2842     const Register from        = c_rarg0;  // source array address
2843     const Register to          = c_rarg1;  // destination array address
2844     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
                                           // and left holding the last input ciphertext block
2847     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2848     const Register keylen      = rscratch1;
2849 
2850     address start = __ pc();
2851 
2852       __ enter();
2853 
      __ subsw(rscratch2, len_reg, zr);    // rscratch2 = len; set flags
      __ br(Assembler::LE, _L_finish);     // nothing to do if len <= 0
2856 
2857       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2858 
      __ ld1(v2, __ T16B, rvec);           // v2 = chaining value (IV)
2860 
2861       __ ld1(v31, __ T16B, __ post(key, 16));
2862       __ rev32(v31, __ T16B, v31);
2863 
2864       __ cmpw(keylen, 52);
2865       __ br(Assembler::CC, L_loadkeys_44);
2866       __ br(Assembler::EQ, L_loadkeys_52);
2867 
2868       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2869       __ rev32(v17, __ T16B, v17);
2870       __ rev32(v18, __ T16B, v18);
2871     __ BIND(L_loadkeys_52);
2872       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2873       __ rev32(v19, __ T16B, v19);
2874       __ rev32(v20, __ T16B, v20);
2875     __ BIND(L_loadkeys_44);
2876       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2877       __ rev32(v21, __ T16B, v21);
2878       __ rev32(v22, __ T16B, v22);
2879       __ rev32(v23, __ T16B, v23);
2880       __ rev32(v24, __ T16B, v24);
2881       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2882       __ rev32(v25, __ T16B, v25);
2883       __ rev32(v26, __ T16B, v26);
2884       __ rev32(v27, __ T16B, v27);
2885       __ rev32(v28, __ T16B, v28);
2886       __ ld1(v29, v30, __ T16B, key);
2887       __ rev32(v29, __ T16B, v29);
2888       __ rev32(v30, __ T16B, v30);
2889 
2890     __ BIND(L_aes_loop);
2891       __ ld1(v0, __ T16B, __ post(from, 16));
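      // save a copy of this ciphertext block; it becomes the
      // chaining value for the next block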
2892       __ orr(v1, __ T16B, v0, v0);
2893 
2894       __ br(Assembler::CC, L_rounds_44);
2895       __ br(Assembler::EQ, L_rounds_52);
2896 
2897       __ aesd(v0, v17); __ aesimc(v0, v0);
2898       __ aesd(v0, v18); __ aesimc(v0, v0);
2899     __ BIND(L_rounds_52);
2900       __ aesd(v0, v19); __ aesimc(v0, v0);
2901       __ aesd(v0, v20); __ aesimc(v0, v0);
2902     __ BIND(L_rounds_44);
2903       __ aesd(v0, v21); __ aesimc(v0, v0);
2904       __ aesd(v0, v22); __ aesimc(v0, v0);
2905       __ aesd(v0, v23); __ aesimc(v0, v0);
2906       __ aesd(v0, v24); __ aesimc(v0, v0);
2907       __ aesd(v0, v25); __ aesimc(v0, v0);
2908       __ aesd(v0, v26); __ aesimc(v0, v0);
2909       __ aesd(v0, v27); __ aesimc(v0, v0);
2910       __ aesd(v0, v28); __ aesimc(v0, v0);
2911       __ aesd(v0, v29); __ aesimc(v0, v0);
2912       __ aesd(v0, v30);
2913       __ eor(v0, __ T16B, v0, v31);
2914       __ eor(v0, __ T16B, v0, v2);
2915 
2916       __ st1(v0, __ T16B, __ post(to, 16));
      __ orr(v2, __ T16B, v1, v1);         // the saved ciphertext becomes the next chaining value
2918 
2919       __ subw(len_reg, len_reg, 16);
2920       __ cbnzw(len_reg, L_aes_loop);
2921 
2922       __ st1(v2, __ T16B, rvec);
2923 
2924     __ BIND(_L_finish);
      __ mov(r0, rscratch2);               // return the input length
2926 
2927       __ leave();
2928       __ ret(lr);
2929 
2930     return start;
2931   }
2932 
2933   // Arguments:
2934   //
2935   // Inputs:
2936   //   c_rarg0   - byte[]  source+offset
2937   //   c_rarg1   - int[]   SHA.state
2938   //   c_rarg2   - int     offset
2939   //   c_rarg3   - int     limit
2940   //
2941   address generate_sha1_implCompress(bool multi_block, const char *name) {
2942     __ align(CodeEntryAlignment);
2943     StubCodeMark mark(this, "StubRoutines", name);
2944     address start = __ pc();
2945 
2946     Register buf   = c_rarg0;
2947     Register state = c_rarg1;
2948     Register ofs   = c_rarg2;
2949     Register limit = c_rarg3;
2950 
2951     Label keys;
2952     Label sha1_loop;
2953 
2954     // load the keys into v0..v3
2955     __ adr(rscratch1, keys);
2956     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
2958     __ ldrq(v6, Address(state, 0));
2959     __ ldrs(v7, Address(state, 16));
2960 
2961 
2962     __ BIND(sha1_loop);
2963     // load 64 bytes of data into v16..v19
2964     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2965     __ rev32(v16, __ T16B, v16);
2966     __ rev32(v17, __ T16B, v17);
2967     __ rev32(v18, __ T16B, v18);
2968     __ rev32(v19, __ T16B, v19);
2969 
2970     // do the sha1
2971     __ addv(v4, __ T4S, v16, v0);
2972     __ orr(v20, __ T16B, v6, v6);
2973 
2974     FloatRegister d0 = v16;
2975     FloatRegister d1 = v17;
2976     FloatRegister d2 = v18;
2977     FloatRegister d3 = v19;
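
    // Each iteration of this loop performs four SHA-1 rounds (20
    // iterations x 4 = 80 rounds).  sha1c/sha1p/sha1m apply the Ch,
    // parity, and Maj round functions; d0..d3 rotate through the
    // message schedule.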
2978 
2979     for (int round = 0; round < 20; round++) {
2980       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2981       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2982       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2983       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2984       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2985 
2986       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2987       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2988       __ sha1h(tmp2, __ T4S, v20);
2989       if (round < 5)
2990         __ sha1c(v20, __ T4S, tmp3, tmp4);
2991       else if (round < 10 || round >= 15)
2992         __ sha1p(v20, __ T4S, tmp3, tmp4);
2993       else
2994         __ sha1m(v20, __ T4S, tmp3, tmp4);
2995       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2996 
2997       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2998     }
2999 
3000     __ addv(v7, __ T2S, v7, v21);
3001     __ addv(v6, __ T4S, v6, v20);
3002 
3003     if (multi_block) {
3004       __ add(ofs, ofs, 64);
3005       __ cmp(ofs, limit);
3006       __ br(Assembler::LE, sha1_loop);
3007       __ mov(c_rarg0, ofs); // return ofs
3008     }
3009 
3010     __ strq(v6, Address(state, 0));
3011     __ strs(v7, Address(state, 16));
3012 
3013     __ ret(lr);
3014 
3015     __ bind(keys);
3016     __ emit_int32(0x5a827999);
3017     __ emit_int32(0x6ed9eba1);
3018     __ emit_int32(0x8f1bbcdc);
3019     __ emit_int32(0xca62c1d6);
3020 
3021     return start;
3022   }
3023 
3024 
3025   // Arguments:
3026   //
3027   // Inputs:
3028   //   c_rarg0   - byte[]  source+offset
3029   //   c_rarg1   - int[]   SHA.state
3030   //   c_rarg2   - int     offset
3031   //   c_rarg3   - int     limit
3032   //
3033   address generate_sha256_implCompress(bool multi_block, const char *name) {
3034     static const uint32_t round_consts[64] = {
3035       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3036       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3037       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3038       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3039       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3040       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3041       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3042       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3043       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3044       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3045       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3046       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3047       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3048       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3049       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3050       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3051     };
3052     __ align(CodeEntryAlignment);
3053     StubCodeMark mark(this, "StubRoutines", name);
3054     address start = __ pc();
3055 
3056     Register buf   = c_rarg0;
3057     Register state = c_rarg1;
3058     Register ofs   = c_rarg2;
3059     Register limit = c_rarg3;
3060 
    Label sha256_loop;
3062 
    // the low 64 bits of v8..v15 are callee-saved; save v8..v11
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));
3065 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3073 
3074     // load 16 keys to v16..v31
3075     __ lea(rscratch1, ExternalAddress((address)round_consts));
3076     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3077     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3078     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3079     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3080 
3081     // load 8 words (256 bits) state
3082     __ ldpq(v0, v1, state);
3083 
    __ BIND(sha256_loop);
3085     // load 64 bytes of data into v8..v11
3086     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3087     __ rev32(v8, __ T16B, v8);
3088     __ rev32(v9, __ T16B, v9);
3089     __ rev32(v10, __ T16B, v10);
3090     __ rev32(v11, __ T16B, v11);
3091 
3092     __ addv(v6, __ T4S, v8, v16);
3093     __ orr(v2, __ T16B, v0, v0);
3094     __ orr(v3, __ T16B, v1, v1);
3095 
3096     FloatRegister d0 = v8;
3097     FloatRegister d1 = v9;
3098     FloatRegister d2 = v10;
3099     FloatRegister d3 = v11;
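
    // Each iteration of this loop performs four SHA-256 rounds (16
    // iterations x 4 = 64 rounds); the pre-loaded round constants in
    // v16..v31 hold four 32-bit constants each.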
3102     for (int round = 0; round < 16; round++) {
3103       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3104       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3105       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3106       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3107 
3108       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3109        __ orr(v4, __ T16B, v2, v2);
3110       if (round < 15)
3111         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3112       __ sha256h(v2, __ T4S, v3, tmp2);
3113       __ sha256h2(v3, __ T4S, v4, tmp2);
3114       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3115 
3116       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3117     }
3118 
3119     __ addv(v0, __ T4S, v0, v2);
3120     __ addv(v1, __ T4S, v1, v3);
3121 
3122     if (multi_block) {
3123       __ add(ofs, ofs, 64);
3124       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3126       __ mov(c_rarg0, ofs); // return ofs
3127     }
3128 
3129     __ ldpd(v10, v11, Address(sp, 16));
3130     __ ldpd(v8, v9, __ post(sp, 32));
3131 
3132     __ stpq(v0, v1, state);
3133 
3134     __ ret(lr);
3135 
3136     return start;
3137   }
3138 
3139   // Safefetch stubs.
3140   void generate_safefetch(const char* name, int size, address* entry,
3141                           address* fault_pc, address* continuation_pc) {
3142     // safefetch signatures:
3143     //   int      SafeFetch32(int*      adr, int      errValue);
3144     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3145     //
3146     // arguments:
3147     //   c_rarg0 = adr
3148     //   c_rarg1 = errValue
3149     //
3150     // result:
    //   r0 = *adr or errValue
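    //
    // If the load at fault_pc faults, the VM's signal handler
    // redirects execution to continuation_pc, so the stub returns
    // errValue instead.  Typical use from VM code (illustrative):
    //
    //   int v = SafeFetch32((int*) addr, -1);  // -1 if addr is unmapped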
3152 
3153     StubCodeMark mark(this, "StubRoutines", name);
3154 
    // Entry point (the pc; AArch64 has no function descriptors).
3156     *entry = __ pc();
3157 
3158     // Load *adr into c_rarg1, may fault.
3159     *fault_pc = __ pc();
3160     switch (size) {
3161       case 4:
3162         // int32_t
3163         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3164         break;
3165       case 8:
3166         // int64_t
3167         __ ldr(c_rarg1, Address(c_rarg0, 0));
3168         break;
3169       default:
3170         ShouldNotReachHere();
3171     }
3172 
3173     // return errValue or *adr
3174     *continuation_pc = __ pc();
3175     __ mov(r0, c_rarg1);
3176     __ ret(lr);
3177   }
3178 
3179   /**
3180    *  Arguments:
3181    *
3182    * Inputs:
3183    *   c_rarg0   - int crc
3184    *   c_rarg1   - byte* buf
3185    *   c_rarg2   - int length
3186    *
3187    * Output:
3188    *       r0   - int crc result
3189    *
3190    * Preserves:
3191    *       r13
3192    *
3193    */
3194   address generate_updateBytesCRC32() {
3195     assert(UseCRC32Intrinsics, "what are we doing here?");
3196 
3197     __ align(CodeEntryAlignment);
3198     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3199 
3200     address start = __ pc();
3201 
3202     const Register crc   = c_rarg0;  // crc
3203     const Register buf   = c_rarg1;  // source java byte array address
3204     const Register len   = c_rarg2;  // length
3205     const Register table0 = c_rarg3; // crc_table address
3206     const Register table1 = c_rarg4;
3207     const Register table2 = c_rarg5;
3208     const Register table3 = c_rarg6;
3209     const Register tmp3 = c_rarg7;
3210 
3211     BLOCK_COMMENT("Entry:");
3212     __ enter(); // required for proper stackwalking of RuntimeStub frame
3213 
3214     __ kernel_crc32(crc, buf, len,
3215               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3216 
3217     __ leave(); // required for proper stackwalking of RuntimeStub frame
3218     __ ret(lr);
3219 
3220     return start;
3221   }
3222 
3223   /**
3224    *  Arguments:
3225    *
3226    *  Input:
3227    *    c_rarg0   - x address
3228    *    c_rarg1   - x length
3229    *    c_rarg2   - y address
   *    c_rarg3   - y length
3231    *    c_rarg4   - z address
3232    *    c_rarg5   - z length
3233    */
3234   address generate_multiplyToLen() {
3235     __ align(CodeEntryAlignment);
3236     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3237 
3238     address start = __ pc();
3239     const Register x     = r0;
3240     const Register xlen  = r1;
3241     const Register y     = r2;
3242     const Register ylen  = r3;
3243     const Register z     = r4;
3244     const Register zlen  = r5;
3245 
3246     const Register tmp1  = r10;
3247     const Register tmp2  = r11;
3248     const Register tmp3  = r12;
3249     const Register tmp4  = r13;
3250     const Register tmp5  = r14;
3251     const Register tmp6  = r15;
3252     const Register tmp7  = r16;
3253 
3254     BLOCK_COMMENT("Entry:");
3255     __ enter(); // required for proper stackwalking of RuntimeStub frame
3256     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3257     __ leave(); // required for proper stackwalking of RuntimeStub frame
3258     __ ret(lr);
3259 
3260     return start;
3261   }
3262 
3263   // Continuation point for throwing of implicit exceptions that are
3264   // not handled in the current activation. Fabricates an exception
3265   // oop and initiates normal exception dispatching in this
3266   // frame. Since we need to preserve callee-saved values (currently
3267   // only for C2, but done for C1 as well) we need a callee-saved oop
3268   // map and therefore have to make these stubs into RuntimeStubs
3269   // rather than BufferBlobs.  If the compiler needs all registers to
3270   // be preserved between the fault point and the exception handler
3271   // then it must assume responsibility for that in
3272   // AbstractCompiler::continuation_for_implicit_null_exception or
3273   // continuation_for_implicit_division_by_zero_exception. All other
3274   // implicit exceptions (e.g., NullPointerException or
3275   // AbstractMethodError on entry) are either at call sites or
3276   // otherwise assume that stack unwinding will be initiated, so
3277   // caller saved registers were assumed volatile in the compiler.
3278 
3279 #undef __
3280 #define __ masm->
3281 
3282   address generate_throw_exception(const char* name,
3283                                    address runtime_entry,
3284                                    Register arg1 = noreg,
3285                                    Register arg2 = noreg) {
3286     // Information about frame layout at time of blocking runtime call.
3287     // Note that we only have to preserve callee-saved registers since
3288     // the compilers are responsible for supplying a continuation point
3289     // if they expect all registers to be preserved.
3290     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3291     enum layout {
3292       rfp_off = 0,
3293       rfp_off2,
3294       return_off,
3295       return_off2,
3296       framesize // inclusive of return address
3297     };
3298 
3299     int insts_size = 512;
3300     int locs_size  = 64;
3301 
3302     CodeBuffer code(name, insts_size, locs_size);
3303     OopMapSet* oop_maps  = new OopMapSet();
3304     MacroAssembler* masm = new MacroAssembler(&code);
3305 
3306     address start = __ pc();
3307 
3308     // This is an inlined and slightly modified version of call_VM
3309     // which has the ability to fetch the return PC out of
3310     // thread-local storage and also sets up last_Java_sp slightly
3311     // differently than the real call_VM
3312 
3313     __ enter(); // Save FP and LR before call
3314 
3315     assert(is_even(framesize/2), "sp not 16-byte aligned");
3316 
3317     // lr and fp are already in place
3318     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3319 
3320     int frame_complete = __ pc() - start;
3321 
3322     // Set up last_Java_sp and last_Java_fp
3323     address the_pc = __ pc();
3324     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3325 
3326     // Call runtime
3327     if (arg1 != noreg) {
3328       assert(arg2 != c_rarg1, "clobbered");
3329       __ mov(c_rarg1, arg1);
3330     }
3331     if (arg2 != noreg) {
3332       __ mov(c_rarg2, arg2);
3333     }
3334     __ mov(c_rarg0, rthread);
3335     BLOCK_COMMENT("call runtime_entry");
3336     __ mov(rscratch1, runtime_entry);
3337     __ blr(rscratch1);
3338 
3339     // Generate oop map
3340     OopMap* map = new OopMap(framesize, 0);
3341 
3342     oop_maps->add_gc_map(the_pc - start, map);
3343 
3344     __ reset_last_Java_frame(true);
3345     __ maybe_isb();
3346 
3347     __ leave();
3348 
3349     // check for pending exceptions
3350 #ifdef ASSERT
3351     Label L;
3352     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3353     __ cbnz(rscratch1, L);
3354     __ should_not_reach_here();
3355     __ bind(L);
3356 #endif // ASSERT
3357     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3358 
3359 
3360     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3361     RuntimeStub* stub =
3362       RuntimeStub::new_runtime_stub(name,
3363                                     &code,
3364                                     frame_complete,
3365                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3366                                     oop_maps, false);
3367     return stub->entry_point();
3368   }
3369 
3370   class MontgomeryMultiplyGenerator : public MacroAssembler {
3371 
3372     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3373       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3374 
3375     RegSet _toSave;
3376     bool _squaring;
3377 
3378   public:
3379     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3380       : MacroAssembler(as->code()), _squaring(squaring) {
3381 
3382       // Register allocation
3383 
3384       Register reg = c_rarg0;
3385       Pa_base = reg;       // Argument registers
3386       if (squaring)
3387         Pb_base = Pa_base;
3388       else
3389         Pb_base = ++reg;
3390       Pn_base = ++reg;
      Rlen = ++reg;
3392       inv = ++reg;
3393       Pm_base = ++reg;
3394 
3395                           // Working registers:
3396       Ra =  ++reg;        // The current digit of a, b, n, and m.
3397       Rb =  ++reg;
3398       Rm =  ++reg;
3399       Rn =  ++reg;
3400 
3401       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3402       Pb =  ++reg;
3403       Pm =  ++reg;
3404       Pn =  ++reg;
3405 
      t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
3408       t2 =  ++reg;
3409 
3410       Ri =  ++reg;        // Inner and outer loop indexes.
3411       Rj =  ++reg;
3412 
3413       Rhi_ab = ++reg;     // Product registers: low and high parts
3414       Rlo_ab = ++reg;     // of a*b and m*n.
3415       Rhi_mn = ++reg;
3416       Rlo_mn = ++reg;
3417 
3418       // r19 and up are callee-saved.
3419       _toSave = RegSet::range(r19, reg) + Pm_base;
3420     }
3421 
3422   private:
3423     void save_regs() {
3424       push(_toSave, sp);
3425     }
3426 
3427     void restore_regs() {
3428       pop(_toSave, sp);
3429     }
3430 
    // Run "block" exactly count times: the loop body contains two
    // copies of the block, and an odd count enters at the second copy.
    template <typename T>
3432     void unroll_2(Register count, T block) {
3433       Label loop, end, odd;
3434       tbnz(count, 0, odd);
3435       cbz(count, end);
3436       align(16);
3437       bind(loop);
3438       (this->*block)();
3439       bind(odd);
3440       (this->*block)();
3441       subs(count, count, 2);
3442       br(Assembler::GT, loop);
3443       bind(end);
3444     }
3445 
3446     template <typename T>
3447     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3448       Label loop, end, odd;
3449       tbnz(count, 0, odd);
3450       cbz(count, end);
3451       align(16);
3452       bind(loop);
3453       (this->*block)(d, s, tmp);
3454       bind(odd);
3455       (this->*block)(d, s, tmp);
3456       subs(count, count, 2);
3457       br(Assembler::GT, loop);
3458       bind(end);
3459     }
3460 
3461     void pre1(RegisterOrConstant i) {
3462       block_comment("pre1");
3463       // Pa = Pa_base;
3464       // Pb = Pb_base + i;
3465       // Pm = Pm_base;
3466       // Pn = Pn_base + i;
3467       // Ra = *Pa;
3468       // Rb = *Pb;
3469       // Rm = *Pm;
3470       // Rn = *Pn;
3471       ldr(Ra, Address(Pa_base));
3472       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3473       ldr(Rm, Address(Pm_base));
3474       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3475       lea(Pa, Address(Pa_base));
3476       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3477       lea(Pm, Address(Pm_base));
3478       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3479 
3480       // Zero the m*n result.
3481       mov(Rhi_mn, zr);
3482       mov(Rlo_mn, zr);
3483     }
3484 
3485     // The core multiply-accumulate step of a Montgomery
3486     // multiplication.  The idea is to schedule operations as a
3487     // pipeline so that instructions with long latencies (loads and
3488     // multiplies) have time to complete before their results are
3489     // used.  This most benefits in-order implementations of the
3490     // architecture but out-of-order ones also benefit.
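    // In the pseudocode comments below, MACC(a, b, t0, t1, t2)
    // denotes the triple-precision multiply-accumulate
    //   t2:t1:t0 += a * b
    // (see also the C sketch following generate_multiply()).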
3491     void step() {
3492       block_comment("step");
3493       // MACC(Ra, Rb, t0, t1, t2);
3494       // Ra = *++Pa;
3495       // Rb = *--Pb;
3496       umulh(Rhi_ab, Ra, Rb);
3497       mul(Rlo_ab, Ra, Rb);
3498       ldr(Ra, pre(Pa, wordSize));
3499       ldr(Rb, pre(Pb, -wordSize));
3500       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3501                                        // previous iteration.
3502       // MACC(Rm, Rn, t0, t1, t2);
3503       // Rm = *++Pm;
3504       // Rn = *--Pn;
3505       umulh(Rhi_mn, Rm, Rn);
3506       mul(Rlo_mn, Rm, Rn);
3507       ldr(Rm, pre(Pm, wordSize));
3508       ldr(Rn, pre(Pn, -wordSize));
3509       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3510     }
3511 
3512     void post1() {
3513       block_comment("post1");
3514 
3515       // MACC(Ra, Rb, t0, t1, t2);
3516       // Ra = *++Pa;
3517       // Rb = *--Pb;
3518       umulh(Rhi_ab, Ra, Rb);
3519       mul(Rlo_ab, Ra, Rb);
3520       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3521       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3522 
3523       // *Pm = Rm = t0 * inv;
3524       mul(Rm, t0, inv);
3525       str(Rm, Address(Pm));
3526 
3527       // MACC(Rm, Rn, t0, t1, t2);
3528       // t0 = t1; t1 = t2; t2 = 0;
3529       umulh(Rhi_mn, Rm, Rn);
3530 
3531 #ifndef PRODUCT
3532       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3533       {
3534         mul(Rlo_mn, Rm, Rn);
3535         add(Rlo_mn, t0, Rlo_mn);
3536         Label ok;
3537         cbz(Rlo_mn, ok); {
3538           stop("broken Montgomery multiply");
3539         } bind(ok);
3540       }
3541 #endif
3542       // We have very carefully set things up so that
3543       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3544       // the lower half of Rm * Rn because we know the result already:
3545       // it must be -t0.  t0 + (-t0) must generate a carry iff
3546       // t0 != 0.  So, rather than do a mul and an adds we just set
3547       // the carry flag iff t0 is nonzero.
3548       //
3549       // mul(Rlo_mn, Rm, Rn);
3550       // adds(zr, t0, Rlo_mn);
3551       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3552       adcs(t0, t1, Rhi_mn);
3553       adc(t1, t2, zr);
3554       mov(t2, zr);
3555     }
3556 
3557     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3558       block_comment("pre2");
3559       // Pa = Pa_base + i-len;
3560       // Pb = Pb_base + len;
3561       // Pm = Pm_base + i-len;
3562       // Pn = Pn_base + len;
3563 
3564       if (i.is_register()) {
3565         sub(Rj, i.as_register(), len);
3566       } else {
3567         mov(Rj, i.as_constant());
3568         sub(Rj, Rj, len);
3569       }
3570       // Rj == i-len
3571 
3572       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3573       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3574       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3575       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3576 
3577       // Ra = *++Pa;
3578       // Rb = *--Pb;
3579       // Rm = *++Pm;
3580       // Rn = *--Pn;
3581       ldr(Ra, pre(Pa, wordSize));
3582       ldr(Rb, pre(Pb, -wordSize));
3583       ldr(Rm, pre(Pm, wordSize));
3584       ldr(Rn, pre(Pn, -wordSize));
3585 
3586       mov(Rhi_mn, zr);
3587       mov(Rlo_mn, zr);
3588     }
3589 
3590     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3591       block_comment("post2");
3592       if (i.is_constant()) {
3593         mov(Rj, i.as_constant()-len.as_constant());
3594       } else {
3595         sub(Rj, i.as_register(), len);
3596       }
3597 
3598       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3599 
3600       // As soon as we know the least significant digit of our result,
3601       // store it.
3602       // Pm_base[i-len] = t0;
3603       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3604 
3605       // t0 = t1; t1 = t2; t2 = 0;
3606       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3607       adc(t1, t2, zr);
3608       mov(t2, zr);
3609     }
3610 
3611     // A carry in t0 after Montgomery multiplication means that we
3612     // should subtract multiples of n from our result in m.  We'll
3613     // keep doing that until there is no carry.
3614     void normalize(RegisterOrConstant len) {
3615       block_comment("normalize");
3616       // while (t0)
3617       //   t0 = sub(Pm_base, Pn_base, t0, len);
3618       Label loop, post, again;
3619       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3620       cbz(t0, post); {
3621         bind(again); {
3622           mov(i, zr);
3623           mov(cnt, len);
3624           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3625           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3626           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3627           align(16);
3628           bind(loop); {
3629             sbcs(Rm, Rm, Rn);
3630             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3631             add(i, i, 1);
3632             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3633             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3634             sub(cnt, cnt, 1);
3635           } cbnz(cnt, loop);
3636           sbc(t0, t0, zr);
3637         } cbnz(t0, again);
3638       } bind(post);
3639     }
3640 
3641     // Move memory at s to d, reversing words.
3642     //    Increments d to end of copied memory
3643     //    Destroys tmp1, tmp2
3644     //    Preserves len
3645     //    Leaves s pointing to the address which was in d at start
3646     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3647       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3648 
3649       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3650       mov(tmp1, len);
3651       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3652       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3653     }
3654     // where
3655     void reverse1(Register d, Register s, Register tmp) {
3656       ldr(tmp, pre(s, -wordSize));
3657       ror(tmp, tmp, 32);
3658       str(tmp, post(d, wordSize));
3659     }
3660 
3661     void step_squaring() {
3662       // An extra ACC
3663       step();
3664       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3665     }
3666 
3667     void last_squaring(RegisterOrConstant i) {
3668       Label dont;
3669       // if ((i & 1) == 0) {
3670       tbnz(i.as_register(), 0, dont); {
3671         // MACC(Ra, Rb, t0, t1, t2);
3672         // Ra = *++Pa;
3673         // Rb = *--Pb;
3674         umulh(Rhi_ab, Ra, Rb);
3675         mul(Rlo_ab, Ra, Rb);
3676         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3677       } bind(dont);
3678     }
3679 
3680     void extra_step_squaring() {
3681       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3682 
3683       // MACC(Rm, Rn, t0, t1, t2);
3684       // Rm = *++Pm;
3685       // Rn = *--Pn;
3686       umulh(Rhi_mn, Rm, Rn);
3687       mul(Rlo_mn, Rm, Rn);
3688       ldr(Rm, pre(Pm, wordSize));
3689       ldr(Rn, pre(Pn, -wordSize));
3690     }
3691 
3692     void post1_squaring() {
3693       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3694 
3695       // *Pm = Rm = t0 * inv;
3696       mul(Rm, t0, inv);
3697       str(Rm, Address(Pm));
3698 
3699       // MACC(Rm, Rn, t0, t1, t2);
3700       // t0 = t1; t1 = t2; t2 = 0;
3701       umulh(Rhi_mn, Rm, Rn);
3702 
3703 #ifndef PRODUCT
3704       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3705       {
3706         mul(Rlo_mn, Rm, Rn);
3707         add(Rlo_mn, t0, Rlo_mn);
3708         Label ok;
3709         cbz(Rlo_mn, ok); {
3710           stop("broken Montgomery multiply");
3711         } bind(ok);
3712       }
3713 #endif
3714       // We have very carefully set things up so that
3715       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3716       // the lower half of Rm * Rn because we know the result already:
3717       // it must be -t0.  t0 + (-t0) must generate a carry iff
3718       // t0 != 0.  So, rather than do a mul and an adds we just set
3719       // the carry flag iff t0 is nonzero.
3720       //
3721       // mul(Rlo_mn, Rm, Rn);
3722       // adds(zr, t0, Rlo_mn);
3723       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3724       adcs(t0, t1, Rhi_mn);
3725       adc(t1, t2, zr);
3726       mov(t2, zr);
3727     }
3728 
3729     void acc(Register Rhi, Register Rlo,
3730              Register t0, Register t1, Register t2) {
3731       adds(t0, t0, Rlo);
3732       adcs(t1, t1, Rhi);
3733       adc(t2, t2, zr);
3734     }
3735 
3736   public:
3737     /**
3738      * Fast Montgomery multiplication.  The derivation of the
3739      * algorithm is in A Cryptographic Library for the Motorola
3740      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3741      *
3742      * Arguments:
3743      *
3744      * Inputs for multiplication:
3745      *   c_rarg0   - int array elements a
3746      *   c_rarg1   - int array elements b
3747      *   c_rarg2   - int array elements n (the modulus)
3748      *   c_rarg3   - int length
3749      *   c_rarg4   - int inv
3750      *   c_rarg5   - int array elements m (the result)
3751      *
3752      * Inputs for squaring:
3753      *   c_rarg0   - int array elements a
3754      *   c_rarg1   - int array elements n (the modulus)
3755      *   c_rarg2   - int length
3756      *   c_rarg3   - int inv
3757      *   c_rarg4   - int array elements m (the result)
3758      *
3759      */
3760     address generate_multiply() {
3761       Label argh, nothing;
3762       bind(argh);
3763       stop("MontgomeryMultiply total_allocation must be <= 8192");
3764 
3765       align(CodeEntryAlignment);
3766       address entry = pc();
3767 
3768       cbzw(Rlen, nothing);
3769 
3770       enter();
3771 
3772       // Make room.
3773       cmpw(Rlen, 512);
3774       br(Assembler::HI, argh);
3775       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3776       andr(sp, Ra, -2 * wordSize);
3777 
3778       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3779 
3780       {
3781         // Copy input args, reversing as we go.  We use Ra as a
3782         // temporary variable.
3783         reverse(Ra, Pa_base, Rlen, t0, t1);
3784         if (!_squaring)
3785           reverse(Ra, Pb_base, Rlen, t0, t1);
3786         reverse(Ra, Pn_base, Rlen, t0, t1);
3787       }
3788 
3789       // Push all call-saved registers and also Pm_base which we'll need
3790       // at the end.
3791       save_regs();
3792 
3793 #ifndef PRODUCT
3794       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3795       {
3796         ldr(Rn, Address(Pn_base, 0));
3797         mul(Rlo_mn, Rn, inv);
3798         cmp(Rlo_mn, -1);
3799         Label ok;
3800         br(EQ, ok); {
3801           stop("broken inverse in Montgomery multiply");
3802         } bind(ok);
3803       }
3804 #endif
3805 
3806       mov(Pm_base, Ra);
3807 
3808       mov(t0, zr);
3809       mov(t1, zr);
3810       mov(t2, zr);
3811 
3812       block_comment("for (int i = 0; i < len; i++) {");
3813       mov(Ri, zr); {
3814         Label loop, end;
3815         cmpw(Ri, Rlen);
3816         br(Assembler::GE, end);
3817 
3818         bind(loop);
3819         pre1(Ri);
3820 
3821         block_comment("  for (j = i; j; j--) {"); {
3822           movw(Rj, Ri);
3823           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3824         } block_comment("  } // j");
3825 
3826         post1();
3827         addw(Ri, Ri, 1);
3828         cmpw(Ri, Rlen);
3829         br(Assembler::LT, loop);
3830         bind(end);
3831         block_comment("} // i");
3832       }
3833 
3834       block_comment("for (int i = len; i < 2*len; i++) {");
3835       mov(Ri, Rlen); {
3836         Label loop, end;
3837         cmpw(Ri, Rlen, Assembler::LSL, 1);
3838         br(Assembler::GE, end);
3839 
3840         bind(loop);
3841         pre2(Ri, Rlen);
3842 
3843         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3844           lslw(Rj, Rlen, 1);
3845           subw(Rj, Rj, Ri);
3846           subw(Rj, Rj, 1);
3847           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3848         } block_comment("  } // j");
3849 
3850         post2(Ri, Rlen);
3851         addw(Ri, Ri, 1);
3852         cmpw(Ri, Rlen, Assembler::LSL, 1);
3853         br(Assembler::LT, loop);
3854         bind(end);
3855       }
3856       block_comment("} // i");
3857 
3858       normalize(Rlen);
3859 
3860       mov(Ra, Pm_base);  // Save Pm_base in Ra
3861       restore_regs();  // Restore caller's Pm_base
3862 
3863       // Copy our result into caller's Pm_base
3864       reverse(Pm_base, Ra, Rlen, t0, t1);
3865 
3866       leave();
3867       bind(nothing);
3868       ret(lr);
3869 
3870       return entry;
3871     }
3872     // In C, approximately:
3873 
3874     // void
3875     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3876     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3877     //                     unsigned long inv, int len) {
3878     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3879     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3880     //   unsigned long Ra, Rb, Rn, Rm;
3881 
3882     //   int i;
3883 
3884     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3885 
3886     //   for (i = 0; i < len; i++) {
3887     //     int j;
3888 
3889     //     Pa = Pa_base;
3890     //     Pb = Pb_base + i;
3891     //     Pm = Pm_base;
3892     //     Pn = Pn_base + i;
3893 
3894     //     Ra = *Pa;
3895     //     Rb = *Pb;
3896     //     Rm = *Pm;
3897     //     Rn = *Pn;
3898 
3899     //     int iters = i;
3900     //     for (j = 0; iters--; j++) {
3901     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3902     //       MACC(Ra, Rb, t0, t1, t2);
3903     //       Ra = *++Pa;
3904     //       Rb = *--Pb;
3905     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3906     //       MACC(Rm, Rn, t0, t1, t2);
3907     //       Rm = *++Pm;
3908     //       Rn = *--Pn;
3909     //     }
3910 
3911     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3912     //     MACC(Ra, Rb, t0, t1, t2);
3913     //     *Pm = Rm = t0 * inv;
3914     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3915     //     MACC(Rm, Rn, t0, t1, t2);
3916 
3917     //     assert(t0 == 0, "broken Montgomery multiply");
3918 
3919     //     t0 = t1; t1 = t2; t2 = 0;
3920     //   }
3921 
3922     //   for (i = len; i < 2*len; i++) {
3923     //     int j;
3924 
3925     //     Pa = Pa_base + i-len;
3926     //     Pb = Pb_base + len;
3927     //     Pm = Pm_base + i-len;
3928     //     Pn = Pn_base + len;
3929 
3930     //     Ra = *++Pa;
3931     //     Rb = *--Pb;
3932     //     Rm = *++Pm;
3933     //     Rn = *--Pn;
3934 
3935     //     int iters = len*2-i-1;
3936     //     for (j = i-len+1; iters--; j++) {
3937     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3938     //       MACC(Ra, Rb, t0, t1, t2);
3939     //       Ra = *++Pa;
3940     //       Rb = *--Pb;
3941     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3942     //       MACC(Rm, Rn, t0, t1, t2);
3943     //       Rm = *++Pm;
3944     //       Rn = *--Pn;
3945     //     }
3946 
3947     //     Pm_base[i-len] = t0;
3948     //     t0 = t1; t1 = t2; t2 = 0;
3949     //   }
3950 
3951     //   while (t0)
3952     //     t0 = sub(Pm_base, Pn_base, t0, len);
3953     // }
3954 
3955     /**
3956      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3957      * multiplies than Montgomery multiplication so it should be up to
3958      * 25% faster.  However, its loop control is more complex and it
3959      * may actually run slower on some machines.
3960      *
3961      * Arguments:
3962      *
3963      * Inputs:
3964      *   c_rarg0   - int array elements a
3965      *   c_rarg1   - int array elements n (the modulus)
3966      *   c_rarg2   - int length
3967      *   c_rarg3   - int inv
3968      *   c_rarg4   - int array elements m (the result)
3969      *
3970      */
3971     address generate_square() {
3972       Label argh;
3973       bind(argh);
3974       stop("MontgomeryMultiply total_allocation must be <= 8192");
3975 
3976       align(CodeEntryAlignment);
3977       address entry = pc();
3978 
3979       enter();
3980 
3981       // Make room.
3982       cmpw(Rlen, 512);
3983       br(Assembler::HI, argh);
3984       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3985       andr(sp, Ra, -2 * wordSize);
3986 
3987       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3988 
3989       {
3990         // Copy input args, reversing as we go.  We use Ra as a
3991         // temporary variable.
3992         reverse(Ra, Pa_base, Rlen, t0, t1);
3993         reverse(Ra, Pn_base, Rlen, t0, t1);
3994       }
3995 
3996       // Push all call-saved registers and also Pm_base which we'll need
3997       // at the end.
3998       save_regs();
3999 
4000       mov(Pm_base, Ra);
4001 
4002       mov(t0, zr);
4003       mov(t1, zr);
4004       mov(t2, zr);
4005 
4006       block_comment("for (int i = 0; i < len; i++) {");
4007       mov(Ri, zr); {
4008         Label loop, end;
4009         bind(loop);
4010         cmp(Ri, Rlen);
4011         br(Assembler::GE, end);
4012 
4013         pre1(Ri);
4014 
4015         block_comment("for (j = (i+1)/2; j; j--) {"); {
4016           add(Rj, Ri, 1);
4017           lsr(Rj, Rj, 1);
4018           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4019         } block_comment("  } // j");
4020 
4021         last_squaring(Ri);
4022 
4023         block_comment("  for (j = i/2; j; j--) {"); {
4024           lsr(Rj, Ri, 1);
4025           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4026         } block_comment("  } // j");
4027 
4028         post1_squaring();
4029         add(Ri, Ri, 1);
4030         cmp(Ri, Rlen);
4031         br(Assembler::LT, loop);
4032 
4033         bind(end);
4034         block_comment("} // i");
4035       }
4036 
4037       block_comment("for (int i = len; i < 2*len; i++) {");
4038       mov(Ri, Rlen); {
4039         Label loop, end;
4040         bind(loop);
4041         cmp(Ri, Rlen, Assembler::LSL, 1);
4042         br(Assembler::GE, end);
4043 
4044         pre2(Ri, Rlen);
4045 
4046         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4047           lsl(Rj, Rlen, 1);
4048           sub(Rj, Rj, Ri);
4049           sub(Rj, Rj, 1);
4050           lsr(Rj, Rj, 1);
4051           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4052         } block_comment("  } // j");
4053 
4054         last_squaring(Ri);
4055 
4056         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4057           lsl(Rj, Rlen, 1);
4058           sub(Rj, Rj, Ri);
4059           lsr(Rj, Rj, 1);
4060           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4061         } block_comment("  } // j");
4062 
4063         post2(Ri, Rlen);
4064         add(Ri, Ri, 1);
4065         cmp(Ri, Rlen, Assembler::LSL, 1);
4066 
4067         br(Assembler::LT, loop);
4068         bind(end);
4069         block_comment("} // i");
4070       }
4071 
4072       normalize(Rlen);
4073 
4074       mov(Ra, Pm_base);  // Save Pm_base in Ra
4075       restore_regs();  // Restore caller's Pm_base
4076 
4077       // Copy our result into caller's Pm_base
4078       reverse(Pm_base, Ra, Rlen, t0, t1);
4079 
4080       leave();
4081       ret(lr);
4082 
4083       return entry;
4084     }
4085     // In C, approximately:
4086 
4087     // void
4088     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4089     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4090     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4091     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4092     //   unsigned long Ra, Rb, Rn, Rm;
4093 
4094     //   int i;
4095 
4096     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4097 
4098     //   for (i = 0; i < len; i++) {
4099     //     int j;
4100 
4101     //     Pa = Pa_base;
4102     //     Pb = Pa_base + i;
4103     //     Pm = Pm_base;
4104     //     Pn = Pn_base + i;
4105 
4106     //     Ra = *Pa;
4107     //     Rb = *Pb;
4108     //     Rm = *Pm;
4109     //     Rn = *Pn;
4110 
4111     //     int iters = (i+1)/2;
4112     //     for (j = 0; iters--; j++) {
4113     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4114     //       MACC2(Ra, Rb, t0, t1, t2);
4115     //       Ra = *++Pa;
4116     //       Rb = *--Pb;
4117     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4118     //       MACC(Rm, Rn, t0, t1, t2);
4119     //       Rm = *++Pm;
4120     //       Rn = *--Pn;
4121     //     }
4122     //     if ((i & 1) == 0) {
4123     //       assert(Ra == Pa_base[j], "must be");
4124     //       MACC(Ra, Ra, t0, t1, t2);
4125     //     }
4126     //     iters = i/2;
4127     //     assert(iters == i-j, "must be");
4128     //     for (; iters--; j++) {
4129     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4130     //       MACC(Rm, Rn, t0, t1, t2);
4131     //       Rm = *++Pm;
4132     //       Rn = *--Pn;
4133     //     }
4134 
4135     //     *Pm = Rm = t0 * inv;
4136     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4137     //     MACC(Rm, Rn, t0, t1, t2);
4138 
4139     //     assert(t0 == 0, "broken Montgomery multiply");
4140 
4141     //     t0 = t1; t1 = t2; t2 = 0;
4142     //   }
4143 
4144     //   for (i = len; i < 2*len; i++) {
4145     //     int start = i-len+1;
4146     //     int end = start + (len - start)/2;
4147     //     int j;
4148 
4149     //     Pa = Pa_base + i-len;
4150     //     Pb = Pa_base + len;
4151     //     Pm = Pm_base + i-len;
4152     //     Pn = Pn_base + len;
4153 
4154     //     Ra = *++Pa;
4155     //     Rb = *--Pb;
4156     //     Rm = *++Pm;
4157     //     Rn = *--Pn;
4158 
4159     //     int iters = (2*len-i-1)/2;
4160     //     assert(iters == end-start, "must be");
4161     //     for (j = start; iters--; j++) {
4162     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4163     //       MACC2(Ra, Rb, t0, t1, t2);
4164     //       Ra = *++Pa;
4165     //       Rb = *--Pb;
4166     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4167     //       MACC(Rm, Rn, t0, t1, t2);
4168     //       Rm = *++Pm;
4169     //       Rn = *--Pn;
4170     //     }
4171     //     if ((i & 1) == 0) {
4172     //       assert(Ra == Pa_base[j], "must be");
4173     //       MACC(Ra, Ra, t0, t1, t2);
4174     //     }
4175     //     iters =  (2*len-i)/2;
4176     //     assert(iters == len-j, "must be");
4177     //     for (; iters--; j++) {
4178     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4179     //       MACC(Rm, Rn, t0, t1, t2);
4180     //       Rm = *++Pm;
4181     //       Rn = *--Pn;
4182     //     }
4183     //     Pm_base[i-len] = t0;
4184     //     t0 = t1; t1 = t2; t2 = 0;
4185     //   }
4186 
4187     //   while (t0)
4188     //     t0 = sub(Pm_base, Pn_base, t0, len);
4189     // }
4190   };
4191 
4192   // Initialization
4193   void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
4201 
4202     StubRoutines::_forward_exception_entry = generate_forward_exception();
4203 
4204     StubRoutines::_call_stub_entry =
4205       generate_call_stub(StubRoutines::_call_stub_return_address);
4206 
    // This entry point is referenced by megamorphic calls
4208     StubRoutines::_catch_exception_entry = generate_catch_exception();
4209 
4210     // Build this early so it's available for the interpreter.
4211     StubRoutines::_throw_StackOverflowError_entry =
4212       generate_throw_exception("StackOverflowError throw_exception",
4213                                CAST_FROM_FN_PTR(address,
4214                                                 SharedRuntime::
4215                                                 throw_StackOverflowError));
4216     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it
4218       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4219       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4220     }
4221   }
4222 
4223   void generate_all() {
4224     // support for verify_oop (must happen after universe_init)
4225     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4226     StubRoutines::_throw_AbstractMethodError_entry =
4227       generate_throw_exception("AbstractMethodError throw_exception",
4228                                CAST_FROM_FN_PTR(address,
4229                                                 SharedRuntime::
4230                                                 throw_AbstractMethodError));
4231 
4232     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4233       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4234                                CAST_FROM_FN_PTR(address,
4235                                                 SharedRuntime::
4236                                                 throw_IncompatibleClassChangeError));
4237 
4238     StubRoutines::_throw_NullPointerException_at_call_entry =
4239       generate_throw_exception("NullPointerException at call throw_exception",
4240                                CAST_FROM_FN_PTR(address,
4241                                                 SharedRuntime::
4242                                                 throw_NullPointerException_at_call));
4243 
4244     // arraycopy stubs used by compilers
4245     generate_arraycopy_stubs();
4246 
4247     if (UseMultiplyToLenIntrinsic) {
4248       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4249     }
4250 
4251     if (UseMontgomeryMultiplyIntrinsic) {
4252       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4253       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4254       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4255     }
4256 
4257     if (UseMontgomerySquareIntrinsic) {
4258       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4259       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4260       // We use generate_multiply() rather than generate_square()
4261       // because it's faster for the sizes of modulus we care about.
4262       StubRoutines::_montgomerySquare = g.generate_multiply();
4263     }
4264 
4265     if (UseAESIntrinsics) {
4266       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4267       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4268       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4269       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4270     }
4271 
4272     if (UseSHA1Intrinsics) {
4273       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4274       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4275     }
4276     if (UseSHA256Intrinsics) {
4277       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4278       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4279     }
4280 
4281     // Safefetch stubs.
4282     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4283                                                        &StubRoutines::_safefetch32_fault_pc,
4284                                                        &StubRoutines::_safefetch32_continuation_pc);
4285     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4286                                                        &StubRoutines::_safefetchN_fault_pc,
4287                                                        &StubRoutines::_safefetchN_continuation_pc);
4288   }
4289 
4290  public:
4291   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4292     if (all) {
4293       generate_all();
4294     } else {
4295       generate_initial();
4296     }
4297   }
4298 }; // end class declaration
4299 
4300 void StubGenerator_generate(CodeBuffer* code, bool all) {
4301   StubGenerator g(code, all);
4302 }