1 /*
   2  * Copyright (c) 2013, Red Hat Inc.
   3  * Copyright (c) 2003, 2015, Oracle and/or its affiliates.
   4  * All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "precompiled.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "asm/macroAssembler.inline.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_aarch64.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/top.hpp"
  44 #ifdef COMPILER2
  45 #include "opto/runtime.hpp"
  46 #endif
  47 
  48 // Declaration and definition of StubGenerator (no .hpp file).
  49 // For a more detailed description of the stub routine structure
  50 // see the comment in stubRoutines.hpp
  51 
  52 #undef __
  53 #define __ _masm->
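// TIMES_OOP is the scaled-index form used when addressing elements of
// an oop array: the index register is sign-extended and shifted by
// log2 of the in-heap oop size (4 bytes with compressed oops, 8 bytes
// otherwise).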
  54 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #else
  59 #define BLOCK_COMMENT(str) __ block_comment(str)
  60 #endif
  61 
  62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  63 
  64 // Stub Code definitions
  65 
  66 class StubGenerator: public StubCodeGenerator {
  67  private:
  68 
  69 #ifdef PRODUCT
  70 #define inc_counter_np(counter) ((void)0)
  71 #else
  72   void inc_counter_np_(int& counter) {
  73     __ lea(rscratch2, ExternalAddress((address)&counter));
  74     __ ldrw(rscratch1, Address(rscratch2));
  75     __ addw(rscratch1, rscratch1, 1);
  76     __ strw(rscratch1, Address(rscratch2));
  77   }
  78 #define inc_counter_np(counter) \
  79   BLOCK_COMMENT("inc_counter " #counter); \
  80   inc_counter_np_(counter);
  81 #endif
  82 
  83   // Call stubs are used to call Java from C
  84   //
  85   // Arguments:
  86   //    c_rarg0:   call wrapper address                   address
  87   //    c_rarg1:   result                                 address
  88   //    c_rarg2:   result type                            BasicType
  89   //    c_rarg3:   method                                 Method*
  90   //    c_rarg4:   (interpreter) entry point              address
  91   //    c_rarg5:   parameters                             intptr_t*
  92   //    c_rarg6:   parameter size (in words)              int
  93   //    c_rarg7:   thread                                 Thread*
  94   //
  95   // There is no return from the stub itself as any Java result
  96   // is written to result
  97   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 101   //
 102   // we save r0-r7, which accounts for all the c arguments.
 103   //
 104   // TODO: strictly do we need to save them all? they are treated as
 105   // volatile by C so could we omit saving the ones we are going to
 106   // place in global registers (thread? method?) or those we only use
 107   // during setup of the Java call?
 108   //
 109   // we don't need to save r8 which C uses as an indirect result location
 110   // return register.
 111   //
 112   // we don't need to save r9-r15 which both C and Java treat as
 113   // volatile
 114   //
  // we don't need to save r16-r18 because Java does not use them
 116   //
 117   // we save r19-r28 which Java uses as scratch registers and C
 118   // expects to be callee-save
 119   //
 120   // we save the bottom 64 bits of each value stored in v8-v15; it is
 121   // the responsibility of the caller to preserve larger values.
 122   //
 123   // so the stub frame looks like this when we enter Java code
 124   //
 125   //     [ return_from_Java     ] <--- sp
 126   //     [ argument word n      ]
 127   //      ...
 128   // -27 [ argument word 1      ]
 129   // -26 [ saved v15            ] <--- sp_after_call
 130   // -25 [ saved v14            ]
 131   // -24 [ saved v13            ]
 132   // -23 [ saved v12            ]
 133   // -22 [ saved v11            ]
 134   // -21 [ saved v10            ]
 135   // -20 [ saved v9             ]
 136   // -19 [ saved v8             ]
 137   // -18 [ saved r28            ]
 138   // -17 [ saved r27            ]
 139   // -16 [ saved r26            ]
 140   // -15 [ saved r25            ]
 141   // -14 [ saved r24            ]
 142   // -13 [ saved r23            ]
 143   // -12 [ saved r22            ]
 144   // -11 [ saved r21            ]
 145   // -10 [ saved r20            ]
 146   //  -9 [ saved r19            ]
 147   //  -8 [ call wrapper    (r0) ]
 148   //  -7 [ result          (r1) ]
 149   //  -6 [ result type     (r2) ]
 150   //  -5 [ method          (r3) ]
 151   //  -4 [ entry point     (r4) ]
 152   //  -3 [ parameters      (r5) ]
 153   //  -2 [ parameter size  (r6) ]
 154   //  -1 [ thread (r7)          ]
 155   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 156   //   1 [ saved lr       (r30) ]
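  //
  // for reference, the C caller (JavaCalls::call_helper) reaches this
  // stub through the CallStub function pointer type declared in
  // stubRoutines.hpp, which is (roughly):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // the argument order matches the c_rarg0..c_rarg7 assignments listed
  // at the top of this comment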
 157 
 158   // Call stub stack layout word offsets from fp
 159   enum call_stub_layout {
 160     sp_after_call_off = -26,
 161 
 162     d15_off            = -26,
 163     d13_off            = -24,
 164     d11_off            = -22,
 165     d9_off             = -20,
 166 
 167     r28_off            = -18,
 168     r26_off            = -16,
 169     r24_off            = -14,
 170     r22_off            = -12,
 171     r20_off            = -10,
 172     call_wrapper_off   =  -8,
 173     result_off         =  -7,
 174     result_type_off    =  -6,
 175     method_off         =  -5,
 176     entry_point_off    =  -4,
 177     parameter_size_off =  -2,
 178     thread_off         =  -1,
 179     fp_f               =   0,
 180     retaddr_off        =   1,
 181   };
 182 
 183   address generate_call_stub(address& return_address) {
 184     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 185            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 186            "adjust this code");
 187 
 188     StubCodeMark mark(this, "StubRoutines", "call_stub");
 189     address start = __ pc();
 190 
 191     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 192 
 193     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 194     const Address result        (rfp, result_off         * wordSize);
 195     const Address result_type   (rfp, result_type_off    * wordSize);
 196     const Address method        (rfp, method_off         * wordSize);
 197     const Address entry_point   (rfp, entry_point_off    * wordSize);
 198     const Address parameter_size(rfp, parameter_size_off * wordSize);
 199 
 200     const Address thread        (rfp, thread_off         * wordSize);
 201 
 202     const Address d15_save      (rfp, d15_off * wordSize);
 203     const Address d13_save      (rfp, d13_off * wordSize);
 204     const Address d11_save      (rfp, d11_off * wordSize);
 205     const Address d9_save       (rfp, d9_off * wordSize);
 206 
 207     const Address r28_save      (rfp, r28_off * wordSize);
 208     const Address r26_save      (rfp, r26_off * wordSize);
 209     const Address r24_save      (rfp, r24_off * wordSize);
 210     const Address r22_save      (rfp, r22_off * wordSize);
 211     const Address r20_save      (rfp, r20_off * wordSize);
 212 
 213     // stub code
 214 
 215     address aarch64_entry = __ pc();
 216 
 217     // set up frame and move sp to end of save area
 218     __ enter();
 219     __ sub(sp, rfp, -sp_after_call_off * wordSize);
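    // sp now addresses the lowest slot of the register save area
    // (sp_after_call in the layout above)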
 220 
 221     // save register parameters and Java scratch/global registers
 222     // n.b. we save thread even though it gets installed in
 223     // rthread because we want to sanity check rthread later
 224     __ str(c_rarg7,  thread);
 225     __ strw(c_rarg6, parameter_size);
 226     __ stp(c_rarg4, c_rarg5,  entry_point);
 227     __ stp(c_rarg2, c_rarg3,  result_type);
 228     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 229 
 230     __ stp(r20, r19,   r20_save);
 231     __ stp(r22, r21,   r22_save);
 232     __ stp(r24, r23,   r24_save);
 233     __ stp(r26, r25,   r26_save);
 234     __ stp(r28, r27,   r28_save);
 235 
 236     __ stpd(v9,  v8,   d9_save);
 237     __ stpd(v11, v10,  d11_save);
 238     __ stpd(v13, v12,  d13_save);
 239     __ stpd(v15, v14,  d15_save);
 240 
 241     // install Java thread in global register now we have saved
 242     // whatever value it held
 243     __ mov(rthread, c_rarg7);
 244     // And method
 245     __ mov(rmethod, c_rarg3);
 246 
 247     // set up the heapbase register
 248     __ reinit_heapbase();
 249 
 250 #ifdef ASSERT
 251     // make sure we have no pending exceptions
 252     {
 253       Label L;
 254       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 255       __ cmp(rscratch1, (unsigned)NULL_WORD);
 256       __ br(Assembler::EQ, L);
 257       __ stop("StubRoutines::call_stub: entered with pending exception");
 258       __ BIND(L);
 259     }
 260 #endif
 261     // pass parameters if any
 262     __ mov(esp, sp);
 263     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 264     __ andr(sp, rscratch1, -2 * wordSize);
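    // n.b. sp must stay 16-byte aligned for sp-relative accesses on
    // AArch64, hence the round down to a 2-word boundary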
 265 
 266     BLOCK_COMMENT("pass parameters if any");
 267     Label parameters_done;
 268     // parameter count is still in c_rarg6
 269     // and parameter pointer identifying param 1 is in c_rarg5
 270     __ cbzw(c_rarg6, parameters_done);
 271 
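    // copy the parameters from the caller-supplied array onto the
    // (descending) stack, reading them first to last via the
    // post-incremented pointer in c_rarg5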
 272     address loop = __ pc();
 273     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 274     __ subsw(c_rarg6, c_rarg6, 1);
 275     __ push(rscratch1);
 276     __ br(Assembler::GT, loop);
 277 
 278     __ BIND(parameters_done);
 279 
    // call Java entry -- passing Method*, and current sp
 281     //      rmethod: Method*
 282     //      r13: sender sp
 283     BLOCK_COMMENT("call Java function");
 284     __ mov(r13, sp);
 285     __ blr(c_rarg4);
 286 
 287     // we do this here because the notify will already have been done
 288     // if we get to the next instruction via an exception
 289     //
 290     // n.b. adding this instruction here affects the calculation of
 291     // whether or not a routine returns to the call stub (used when
 292     // doing stack walks) since the normal test is to check the return
 293     // pc against the address saved below. so we may need to allow for
 294     // this extra instruction in the check.
 295 
 296     // save current address for use by exception handling code
 297 
 298     return_address = __ pc();
 299 
 300     // store result depending on type (everything that is not
 301     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 302     // n.b. this assumes Java returns an integral result in r0
 303     // and a floating result in j_farg0
 304     __ ldr(j_rarg2, result);
 305     Label is_long, is_float, is_double, exit;
 306     __ ldr(j_rarg1, result_type);
 307     __ cmp(j_rarg1, T_OBJECT);
 308     __ br(Assembler::EQ, is_long);
 309     __ cmp(j_rarg1, T_LONG);
 310     __ br(Assembler::EQ, is_long);
 311     __ cmp(j_rarg1, T_FLOAT);
 312     __ br(Assembler::EQ, is_float);
 313     __ cmp(j_rarg1, T_DOUBLE);
 314     __ br(Assembler::EQ, is_double);
 315 
 316     // handle T_INT case
 317     __ strw(r0, Address(j_rarg2));
 318 
 319     __ BIND(exit);
 320 
 321     // pop parameters
 322     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 323 
 324 #ifdef ASSERT
 325     // verify that threads correspond
 326     {
 327       Label L, S;
 328       __ ldr(rscratch1, thread);
 329       __ cmp(rthread, rscratch1);
 330       __ br(Assembler::NE, S);
 331       __ get_thread(rscratch1);
 332       __ cmp(rthread, rscratch1);
 333       __ br(Assembler::EQ, L);
 334       __ BIND(S);
 335       __ stop("StubRoutines::call_stub: threads must correspond");
 336       __ BIND(L);
 337     }
 338 #endif
 339 
 340     // restore callee-save registers
 341     __ ldpd(v15, v14,  d15_save);
 342     __ ldpd(v13, v12,  d13_save);
 343     __ ldpd(v11, v10,  d11_save);
 344     __ ldpd(v9,  v8,   d9_save);
 345 
 346     __ ldp(r28, r27,   r28_save);
 347     __ ldp(r26, r25,   r26_save);
 348     __ ldp(r24, r23,   r24_save);
 349     __ ldp(r22, r21,   r22_save);
 350     __ ldp(r20, r19,   r20_save);
 351 
 352     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 353     __ ldrw(c_rarg2, result_type);
 354     __ ldr(c_rarg3,  method);
 355     __ ldp(c_rarg4, c_rarg5,  entry_point);
 356     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 357 
 358     // leave frame and return to caller
 359     __ leave();
 360     __ ret(lr);
 361 
 362     // handle return types different from T_INT
 363 
 364     __ BIND(is_long);
 365     __ str(r0, Address(j_rarg2, 0));
 366     __ br(Assembler::AL, exit);
 367 
 368     __ BIND(is_float);
 369     __ strs(j_farg0, Address(j_rarg2, 0));
 370     __ br(Assembler::AL, exit);
 371 
 372     __ BIND(is_double);
 373     __ strd(j_farg0, Address(j_rarg2, 0));
 374     __ br(Assembler::AL, exit);
 375 
 376     return start;
 377   }
 378 
 379   // Return point for a Java call if there's an exception thrown in
 380   // Java code.  The exception is caught and transformed into a
 381   // pending exception stored in JavaThread that can be tested from
 382   // within the VM.
 383   //
 384   // Note: Usually the parameters are removed by the callee. In case
 385   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 388   //
 389   // r0: exception oop
 390 
 391   address generate_catch_exception() {
 392     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 393     address start = __ pc();
 394 
 395     // same as in generate_call_stub():
 396     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 397     const Address thread        (rfp, thread_off         * wordSize);
 398 
 399 #ifdef ASSERT
 400     // verify that threads correspond
 401     {
 402       Label L, S;
 403       __ ldr(rscratch1, thread);
 404       __ cmp(rthread, rscratch1);
 405       __ br(Assembler::NE, S);
 406       __ get_thread(rscratch1);
 407       __ cmp(rthread, rscratch1);
 408       __ br(Assembler::EQ, L);
 409       __ bind(S);
 410       __ stop("StubRoutines::catch_exception: threads must correspond");
 411       __ bind(L);
 412     }
 413 #endif
 414 
 415     // set pending exception
 416     __ verify_oop(r0);
 417 
 418     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 419     __ mov(rscratch1, (address)__FILE__);
 420     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 421     __ movw(rscratch1, (int)__LINE__);
 422     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 423 
 424     // complete return to VM
 425     assert(StubRoutines::_call_stub_return_address != NULL,
 426            "_call_stub_return_address must have been generated before");
 427     __ b(StubRoutines::_call_stub_return_address);
 428 
 429     return start;
 430   }
 431 
 432   // Continuation point for runtime calls returning with a pending
 433   // exception.  The pending exception check happened in the runtime
 434   // or native call stub.  The pending exception in Thread is
 435   // converted into a Java-level exception.
 436   //
 437   // Contract with Java-level exception handlers:
 438   // r0: exception
 439   // r3: throwing pc
 440   //
 441   // NOTE: At entry of this stub, exception-pc must be in LR !!
 442 
 443   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 445 
 446   address generate_forward_exception() {
 447     StubCodeMark mark(this, "StubRoutines", "forward exception");
 448     address start = __ pc();
 449 
 450     // Upon entry, LR points to the return address returning into
 451     // Java (interpreted or compiled) code; i.e., the return address
 452     // becomes the throwing pc.
 453     //
 454     // Arguments pushed before the runtime call are still on the stack
 455     // but the exception handler will reset the stack pointer ->
 456     // ignore them.  A potential result in registers can be ignored as
 457     // well.
 458 
 459 #ifdef ASSERT
 460     // make sure this code is only executed if there is a pending exception
 461     {
 462       Label L;
 463       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 464       __ cbnz(rscratch1, L);
 465       __ stop("StubRoutines::forward exception: no pending exception (1)");
 466       __ bind(L);
 467     }
 468 #endif
 469 
 470     // compute exception handler into r19
 471 
 472     // call the VM to find the handler address associated with the
 473     // caller address. pass thread in r0 and caller pc (ret address)
 474     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 475     // the stack.
 476     __ mov(c_rarg1, lr);
 477     // lr will be trashed by the VM call so we move it to R19
 478     // (callee-saved) because we also need to pass it to the handler
 479     // returned by this call.
 480     __ mov(r19, lr);
 481     BLOCK_COMMENT("call exception_handler_for_return_address");
 482     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 483                          SharedRuntime::exception_handler_for_return_address),
 484                     rthread, c_rarg1);
 485     // we should not really care that lr is no longer the callee
 486     // address. we saved the value the handler needs in r19 so we can
 487     // just copy it to r3. however, the C2 handler will push its own
  // frame and then call into the VM, and the VM code asserts that
 489     // the PC for the frame above the handler belongs to a compiled
 490     // Java method. So, we restore lr here to satisfy that assert.
 491     __ mov(lr, r19);
 492     // setup r0 & r3 & clear pending exception
 493     __ mov(r3, r19);
 494     __ mov(r19, r0);
 495     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 496     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 497 
 498 #ifdef ASSERT
 499     // make sure exception is set
 500     {
 501       Label L;
 502       __ cbnz(r0, L);
 503       __ stop("StubRoutines::forward exception: no pending exception (2)");
 504       __ bind(L);
 505     }
 506 #endif
 507 
 508     // continue at exception handler
 509     // r0: exception
 510     // r3: throwing pc
 511     // r19: exception handler
 512     __ verify_oop(r0);
 513     __ br(r19);
 514 
 515     return start;
 516   }
 517 
 518   // Non-destructive plausibility checks for oops
 519   //
 520   // Arguments:
 521   //    r0: oop to verify
 522   //    rscratch1: error message
 523   //
 524   // Stack after saving c_rarg3:
 525   //    [tos + 0]: saved c_rarg3
 526   //    [tos + 1]: saved c_rarg2
 527   //    [tos + 2]: saved lr
 528   //    [tos + 3]: saved rscratch2
 529   //    [tos + 4]: saved r0
 530   //    [tos + 5]: saved rscratch1
 531   address generate_verify_oop() {
 532 
 533     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 534     address start = __ pc();
 535 
 536     Label exit, error;
 537 
 538     // save c_rarg2 and c_rarg3
 539     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 540 
 541     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 542     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 543     __ ldr(c_rarg3, Address(c_rarg2));
 544     __ add(c_rarg3, c_rarg3, 1);
 545     __ str(c_rarg3, Address(c_rarg2));
 546 
 547     // object is in r0
 548     // make sure object is 'reasonable'
 549     __ cbz(r0, exit); // if obj is NULL it is OK
 550 
 551     // Check if the oop is in the right area of memory
 552     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 553     __ andr(c_rarg2, r0, c_rarg3);
 554     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 555 
 556     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 557     // instruction here because the flags register is live.
 558     __ eor(c_rarg2, c_rarg2, c_rarg3);
 559     __ cbnz(c_rarg2, error);
 560 
    // make sure klass is 'reasonable', i.e. not zero.
 562     __ load_klass(r0, r0);  // get klass
 563     __ cbz(r0, error);      // if klass is NULL it is broken
 564 
 565     // return if everything seems ok
 566     __ bind(exit);
 567 
 568     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 569     __ ret(lr);
 570 
 571     // handle errors
 572     __ bind(error);
 573     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 574 
 575     __ push(RegSet::range(r0, r29), sp);
 576     // debug(char* msg, int64_t pc, int64_t regs[])
 577     __ mov(c_rarg0, rscratch1);      // pass address of error message
 578     __ mov(c_rarg1, lr);             // pass return address
 579     __ mov(c_rarg2, sp);             // pass address of regs on stack
 580 #ifndef PRODUCT
 581     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 582 #endif
 583     BLOCK_COMMENT("call MacroAssembler::debug");
 584     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 585     __ blr(rscratch1);
 586 
 587     return start;
 588   }
 589 
 590   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 591 
 592   // Generate code for an array write pre barrier
 593   //
 594   //     addr    -  starting address
 595   //     count   -  element count
 596   //     tmp     - scratch register
 597   //
  //     Destroys no registers except rscratch1 and rscratch2
 599   //
 600   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 601     BarrierSet* bs = Universe::heap()->barrier_set();
 602     switch (bs->kind()) {
 603     case BarrierSet::G1SATBCT:
 604     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 635 
 636   //
 637   // Generate code for an array write post barrier
 638   //
 639   //  Input:
 640   //     start    - register containing starting address of destination array
 641   //     end      - register containing ending address of destination array
 642   //     scratch  - scratch register
 643   //
 644   //  The input registers are overwritten.
 645   //  The ending address is inclusive.
 646   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 647     assert_different_registers(start, end, scratch);
 648     Label L_done;
 649 
 650     // "end" is inclusive end pointer == start + (count - 1) * array_element_size
 651     // If count == 0, "end" is less than "start" and we need to skip card marking.
 652     __ cmp(end, start);
 653     __ br(__ LO, L_done);
 654 
 655     BarrierSet* bs = Universe::heap()->barrier_set();
 656     switch (bs->kind()) {
 657       case BarrierSet::G1SATBCT:
 658       case BarrierSet::G1SATBCTLogging:
 659 
 660         {
 661           __ push_call_clobbered_registers();
 662           // must compute element count unless barrier set interface is changed (other platforms supply count)
 663           assert_different_registers(start, end, scratch);
 664           __ lea(scratch, Address(end, BytesPerHeapOop));
 665           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 666           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 667           __ mov(c_rarg0, start);
 668           __ mov(c_rarg1, scratch);
 669           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 670           __ pop_call_clobbered_registers();
 671         }
 672         break;
 673       case BarrierSet::CardTableModRef:
 674       case BarrierSet::CardTableExtension:
 675         {
 676           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 677           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 678 
 679           Label L_loop;
 680 
          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // offset (in card table bytes) of the last card relative to the first

          const Register count = end; // 'end' register now holds that offset
 686           __ load_byte_map_base(scratch);
 687           __ add(start, start, scratch);
 688           if (UseConcMarkSweepGC) {
 689             __ membar(__ StoreStore);
 690           }
 691           __ BIND(L_loop);
 692           __ strb(zr, Address(start, count));
 693           __ subs(count, count, 1);
 694           __ br(Assembler::GE, L_loop);
 695         }
 696         break;
 697       default:
 698         ShouldNotReachHere();
 699 
 700     }
 701     __ bind(L_done);
 702   }
 703 
 704   address generate_zero_longs(Register base, Register cnt) {
 705     Register tmp = rscratch1;
 706     Register tmp2 = rscratch2;
 707     int zva_length = VM_Version::zva_length();
 708     Label initial_table_end, loop_zva;
 709     Label fini;
 710 
 711     __ align(CodeEntryAlignment);
 712     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 713     address start = __ pc();
 714 
    // Base must be 16 byte aligned. If not, just return and let the caller handle it
 716     __ tst(base, 0x0f);
 717     __ br(Assembler::NE, fini);
 718     // Align base with ZVA length.
 719     __ neg(tmp, base);
 720     __ andr(tmp, tmp, zva_length - 1);
 721 
 722     // tmp: the number of bytes to be filled to align the base with ZVA length.
 723     __ add(base, base, tmp);
 724     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 725     __ adr(tmp2, initial_table_end);
 726     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 727     __ br(tmp2);
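    // the branch above lands in the table of stp instructions emitted
    // below: each stp zeroes 16 bytes and occupies 4 bytes of code, so
    // backing off from initial_table_end by (tmp >> 2) bytes of code
    // executes exactly the tmp / 16 stores needed to reach ZVA alignment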
 728 
 729     for (int i = -zva_length + 16; i < 0; i += 16)
 730       __ stp(zr, zr, Address(base, i));
 731     __ bind(initial_table_end);
 732 
 733     __ sub(cnt, cnt, zva_length >> 3);
 734     __ bind(loop_zva);
 735     __ dc(Assembler::ZVA, base);
 736     __ subs(cnt, cnt, zva_length >> 3);
 737     __ add(base, base, zva_length);
 738     __ br(Assembler::GE, loop_zva);
 739     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 740     __ bind(fini);
 741     __ ret(lr);
 742 
 743     return start;
 744   }
 745 
 746   typedef enum {
 747     copy_forwards = 1,
 748     copy_backwards = -1
 749   } copy_direction;
 750 
 751   // Bulk copy of blocks of 8 words.
 752   //
 753   // count is a count of words.
 754   //
 755   // Precondition: count >= 8
 756   //
 757   // Postconditions:
 758   //
 759   // The least significant bit of count contains the remaining count
 760   // of words to copy.  The rest of count is trash.
 761   //
 762   // s and d are adjusted to point to the remaining words to copy
 763   //
 764   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 765                            copy_direction direction) {
 766     int unit = wordSize * direction;
 767     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
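    // unit is the signed word step for this copy direction; bias is
    // the amount by which s and d are pre-decremented on a forwards
    // copy so that the ldp/stp offsets below (multiples of unit) work
    // for both directions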
 768 
 769     int offset;
 770     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 771       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 772     const Register stride = r13;
 773 
 774     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 775     assert_different_registers(s, d, count, rscratch1);
 776 
 777     Label again, drain;
 778     const char *stub_name;
 779     if (direction == copy_forwards)
 780       stub_name = "foward_copy_longs";
 781     else
 782       stub_name = "backward_copy_longs";
 783 
 784     __ align(CodeEntryAlignment);
 785 
 786     StubCodeMark mark(this, "StubRoutines", stub_name);
 787 
 788     __ bind(start);
 789 
 790     Label unaligned_copy_long;
 791     if (AvoidUnalignedAccesses) {
 792       __ tbnz(d, 3, unaligned_copy_long);
 793     }
 794 
 795     if (direction == copy_forwards) {
 796       __ sub(s, s, bias);
 797       __ sub(d, d, bias);
 798     }
 799 
 800 #ifdef ASSERT
 801     // Make sure we are never given < 8 words
 802     {
 803       Label L;
 804       __ cmp(count, 8);
 805       __ br(Assembler::GE, L);
 806       __ stop("genrate_copy_longs called with < 8 words");
 807       __ bind(L);
 808     }
 809 #endif
 810 
 811     // Fill 8 registers
 812     if (UseSIMDForMemoryOps) {
 813       __ ldpq(v0, v1, Address(s, 4 * unit));
 814       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 815     } else {
 816       __ ldp(t0, t1, Address(s, 2 * unit));
 817       __ ldp(t2, t3, Address(s, 4 * unit));
 818       __ ldp(t4, t5, Address(s, 6 * unit));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 16);
 823     __ br(Assembler::LO, drain);
 824 
 825     int prefetch = PrefetchCopyIntervalInBytes;
 826     bool use_stride = false;
 827     if (direction == copy_backwards) {
 828        use_stride = prefetch > 256;
 829        prefetch = -prefetch;
 830        if (use_stride) __ mov(stride, prefetch);
 831     }
 832 
 833     __ bind(again);
 834 
 835     if (PrefetchCopyIntervalInBytes > 0)
 836       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 837 
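    // software-pipelined body: store the eight words fetched on the
    // previous iteration while loading the next eight so that loads
    // and stores overlap; the last block in flight is written out by
    // the drain code below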
 838     if (UseSIMDForMemoryOps) {
 839       __ stpq(v0, v1, Address(d, 4 * unit));
 840       __ ldpq(v0, v1, Address(s, 4 * unit));
 841       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 842       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 843     } else {
 844       __ stp(t0, t1, Address(d, 2 * unit));
 845       __ ldp(t0, t1, Address(s, 2 * unit));
 846       __ stp(t2, t3, Address(d, 4 * unit));
 847       __ ldp(t2, t3, Address(s, 4 * unit));
 848       __ stp(t4, t5, Address(d, 6 * unit));
 849       __ ldp(t4, t5, Address(s, 6 * unit));
 850       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 851       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 852     }
 853 
 854     __ subs(count, count, 8);
 855     __ br(Assembler::HS, again);
 856 
 857     // Drain
 858     __ bind(drain);
 859     if (UseSIMDForMemoryOps) {
 860       __ stpq(v0, v1, Address(d, 4 * unit));
 861       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 862     } else {
 863       __ stp(t0, t1, Address(d, 2 * unit));
 864       __ stp(t2, t3, Address(d, 4 * unit));
 865       __ stp(t4, t5, Address(d, 6 * unit));
 866       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 867     }
 868 
 869     {
 870       Label L1, L2;
 871       __ tbz(count, exact_log2(4), L1);
 872       if (UseSIMDForMemoryOps) {
 873         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 874         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 875       } else {
 876         __ ldp(t0, t1, Address(s, 2 * unit));
 877         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 878         __ stp(t0, t1, Address(d, 2 * unit));
 879         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 880       }
 881       __ bind(L1);
 882 
 883       if (direction == copy_forwards) {
 884         __ add(s, s, bias);
 885         __ add(d, d, bias);
 886       }
 887 
 888       __ tbz(count, 1, L2);
 889       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 890       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 891       __ bind(L2);
 892     }
 893 
 894     __ ret(lr);
 895 
 896     if (AvoidUnalignedAccesses) {
 897       Label drain, again;
 898       // Register order for storing. Order is different for backward copy.
 899 
 900       __ bind(unaligned_copy_long);
 901 
 902       // source address is even aligned, target odd aligned
 903       //
 904       // when forward copying word pairs we read long pairs at offsets
 905       // {0, 2, 4, 6} (in long words). when backwards copying we read
 906       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 907       // address by -2 in the forwards case so we can compute the
 908       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 909       // or -1.
 910       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 919 
 920       if (direction == copy_forwards) {
 921         __ sub(s, s, 16);
 922         __ sub(d, d, 8);
 923       }
 924 
 925       // Fill 8 registers
 926       //
 927       // for forwards copy s was offset by -16 from the original input
 928       // value of s so the register contents are at these offsets
 929       // relative to the 64 bit block addressed by that original input
 930       // and so on for each successive 64 byte block when s is updated
 931       //
 932       // t0 at offset 0,  t1 at offset 8
 933       // t2 at offset 16, t3 at offset 24
 934       // t4 at offset 32, t5 at offset 40
 935       // t6 at offset 48, t7 at offset 56
 936 
 937       // for backwards copy s was not offset so the register contents
 938       // are at these offsets into the preceding 64 byte block
 939       // relative to that original input and so on for each successive
 940       // preceding 64 byte block when s is updated. this explains the
 941       // slightly counter-intuitive looking pattern of register usage
 942       // in the stp instructions for backwards copy.
 943       //
 944       // t0 at offset -16, t1 at offset -8
 945       // t2 at offset -32, t3 at offset -24
 946       // t4 at offset -48, t5 at offset -40
 947       // t6 at offset -64, t7 at offset -56
 948 
 949       __ ldp(t0, t1, Address(s, 2 * unit));
 950       __ ldp(t2, t3, Address(s, 4 * unit));
 951       __ ldp(t4, t5, Address(s, 6 * unit));
 952       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 953 
 954       __ subs(count, count, 16);
 955       __ br(Assembler::LO, drain);
 956 
 957       int prefetch = PrefetchCopyIntervalInBytes;
 958       bool use_stride = false;
 959       if (direction == copy_backwards) {
 960          use_stride = prefetch > 256;
 961          prefetch = -prefetch;
 962          if (use_stride) __ mov(stride, prefetch);
 963       }
 964 
 965       __ bind(again);
 966 
 967       if (PrefetchCopyIntervalInBytes > 0)
 968         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 969 
 970       if (direction == copy_forwards) {
 971        // allowing for the offset of -8 the store instructions place
 972        // registers into the target 64 bit block at the following
 973        // offsets
 974        //
 975        // t0 at offset 0
 976        // t1 at offset 8,  t2 at offset 16
 977        // t3 at offset 24, t4 at offset 32
 978        // t5 at offset 40, t6 at offset 48
 979        // t7 at offset 56
 980 
 981         __ str(t0, Address(d, 1 * unit));
 982         __ stp(t1, t2, Address(d, 2 * unit));
 983         __ ldp(t0, t1, Address(s, 2 * unit));
 984         __ stp(t3, t4, Address(d, 4 * unit));
 985         __ ldp(t2, t3, Address(s, 4 * unit));
 986         __ stp(t5, t6, Address(d, 6 * unit));
 987         __ ldp(t4, t5, Address(s, 6 * unit));
 988         __ str(t7, Address(__ pre(d, 8 * unit)));
 989         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 990       } else {
 991        // d was not offset when we started so the registers are
 992        // written into the 64 bit block preceding d with the following
 993        // offsets
 994        //
 995        // t1 at offset -8
 996        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 998        // t7 at offset -56, t4 at offset -48
 999        //                   t6 at offset -64
1000        //
1001        // note that this matches the offsets previously noted for the
1002        // loads
1003 
1004         __ str(t1, Address(d, 1 * unit));
1005         __ stp(t3, t0, Address(d, 3 * unit));
1006         __ ldp(t0, t1, Address(s, 2 * unit));
1007         __ stp(t5, t2, Address(d, 5 * unit));
1008         __ ldp(t2, t3, Address(s, 4 * unit));
1009         __ stp(t7, t4, Address(d, 7 * unit));
1010         __ ldp(t4, t5, Address(s, 6 * unit));
1011         __ str(t6, Address(__ pre(d, 8 * unit)));
1012         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1013       }
1014 
1015       __ subs(count, count, 8);
1016       __ br(Assembler::HS, again);
1017 
1018       // Drain
1019       //
1020       // this uses the same pattern of offsets and register arguments
1021       // as above
1022       __ bind(drain);
1023       if (direction == copy_forwards) {
1024         __ str(t0, Address(d, 1 * unit));
1025         __ stp(t1, t2, Address(d, 2 * unit));
1026         __ stp(t3, t4, Address(d, 4 * unit));
1027         __ stp(t5, t6, Address(d, 6 * unit));
1028         __ str(t7, Address(__ pre(d, 8 * unit)));
1029       } else {
1030         __ str(t1, Address(d, 1 * unit));
1031         __ stp(t3, t0, Address(d, 3 * unit));
1032         __ stp(t5, t2, Address(d, 5 * unit));
1033         __ stp(t7, t4, Address(d, 7 * unit));
1034         __ str(t6, Address(__ pre(d, 8 * unit)));
1035       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
1040       {
1041         Label L1, L2;
1042         __ tbz(count, exact_log2(4), L1);
1043        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1045        // but note that the offsets and registers still follow the
1046        // same pattern
1047         __ ldp(t0, t1, Address(s, 2 * unit));
1048         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1049         if (direction == copy_forwards) {
1050           __ str(t0, Address(d, 1 * unit));
1051           __ stp(t1, t2, Address(d, 2 * unit));
1052           __ str(t3, Address(__ pre(d, 4 * unit)));
1053         } else {
1054           __ str(t1, Address(d, 1 * unit));
1055           __ stp(t3, t0, Address(d, 3 * unit));
1056           __ str(t2, Address(__ pre(d, 4 * unit)));
1057         }
1058         __ bind(L1);
1059 
1060         __ tbz(count, 1, L2);
1061        // this is the same as above but copying only 2 longs hence
1062        // there is no intervening stp between the str instructions
1063        // but note that the offset and register patterns are still
1064        // the same
1065         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1066         if (direction == copy_forwards) {
1067           __ str(t0, Address(d, 1 * unit));
1068           __ str(t1, Address(__ pre(d, 2 * unit)));
1069         } else {
1070           __ str(t1, Address(d, 1 * unit));
1071           __ str(t0, Address(__ pre(d, 2 * unit)));
1072         }
1073         __ bind(L2);
1074 
1075        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1077 
1078        if (direction == copy_forwards) {
1079          __ add(s, s, 16);
1080          __ add(d, d, 8);
1081        }
1082 
1083       }
1084 
1085       __ ret(lr);
1086       }
1087   }
1088 
1089   // Small copy: less than 16 bytes.
1090   //
1091   // NB: Ignores all of the bits of count which represent more than 15
1092   // bytes, so a caller doesn't have to mask them.
1093 
1094   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1095     bool is_backwards = step < 0;
1096     size_t granularity = uabs(step);
1097     int direction = is_backwards ? -1 : 1;
1098     int unit = wordSize * direction;
1099 
1100     Label Lpair, Lword, Lint, Lshort, Lbyte;
1101 
1102     assert(granularity
1103            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1104 
1105     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1106 
1107     // ??? I don't know if this bit-test-and-branch is the right thing
1108     // to do.  It does a lot of jumping, resulting in several
1109     // mispredicted branches.  It might make more sense to do this
1110     // with something like Duff's device with a single computed branch.
1111 
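    // count is expressed in units of granularity, so bit
    // (3 - log2(granularity)) of count is set exactly when an 8-byte
    // piece remains to be copied, the next lower bit when a 4-byte
    // piece remains, and so on down to a single byte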
1112     __ tbz(count, 3 - exact_log2(granularity), Lword);
1113     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1114     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1115     __ bind(Lword);
1116 
1117     if (granularity <= sizeof (jint)) {
1118       __ tbz(count, 2 - exact_log2(granularity), Lint);
1119       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1120       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1121       __ bind(Lint);
1122     }
1123 
1124     if (granularity <= sizeof (jshort)) {
1125       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1126       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1127       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1128       __ bind(Lshort);
1129     }
1130 
1131     if (granularity <= sizeof (jbyte)) {
1132       __ tbz(count, 0, Lbyte);
1133       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1134       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1135       __ bind(Lbyte);
1136     }
1137   }
1138 
1139   Label copy_f, copy_b;
1140 
1141   // All-singing all-dancing memory copy.
1142   //
1143   // Copy count units of memory from s to d.  The size of a unit is
1144   // step, which can be positive or negative depending on the direction
1145   // of copy.  If is_aligned is false, we align the source address.
1146   //
1147 
1148   void copy_memory(bool is_aligned, Register s, Register d,
1149                    Register count, Register tmp, int step) {
1150     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1151     bool is_backwards = step < 0;
1152     int granularity = uabs(step);
1153     const Register t0 = r3, t1 = r4;
1154 
    // copies of <= 96 bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything
1157     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1158     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1159     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1160     const Register send = r17, dend = r18;
1161 
1162     if (PrefetchCopyIntervalInBytes > 0)
1163       __ prfm(Address(s, 0), PLDL1KEEP);
1164     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1165     __ br(Assembler::HI, copy_big);
1166 
1167     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1168     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
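    // send and dend point just past the last source/destination byte.
    // the inline cases below copy a head chunk from s/d and a tail
    // chunk relative to send/dend; the two chunks may overlap, which
    // is safe because each case does all its loads before any stores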
1169 
1170     __ cmp(count, 16/granularity);
1171     __ br(Assembler::LS, copy16);
1172 
1173     __ cmp(count, 64/granularity);
1174     __ br(Assembler::HI, copy80);
1175 
1176     __ cmp(count, 32/granularity);
1177     __ br(Assembler::LS, copy32);
1178 
1179     // 33..64 bytes
1180     if (UseSIMDForMemoryOps) {
1181       __ ldpq(v0, v1, Address(s, 0));
1182       __ ldpq(v2, v3, Address(send, -32));
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(dend, -32));
1185     } else {
1186       __ ldp(t0, t1, Address(s, 0));
1187       __ ldp(t2, t3, Address(s, 16));
1188       __ ldp(t4, t5, Address(send, -32));
1189       __ ldp(t6, t7, Address(send, -16));
1190 
1191       __ stp(t0, t1, Address(d, 0));
1192       __ stp(t2, t3, Address(d, 16));
1193       __ stp(t4, t5, Address(dend, -32));
1194       __ stp(t6, t7, Address(dend, -16));
1195     }
1196     __ b(finish);
1197 
1198     // 17..32 bytes
1199     __ bind(copy32);
1200     __ ldp(t0, t1, Address(s, 0));
1201     __ ldp(t2, t3, Address(send, -16));
1202     __ stp(t0, t1, Address(d, 0));
1203     __ stp(t2, t3, Address(dend, -16));
1204     __ b(finish);
1205 
1206     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1208     __ bind(copy80);
1209     if (UseSIMDForMemoryOps) {
1210       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1211       __ ldpq(v4, v5, Address(send, -32));
1212       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1213       __ stpq(v4, v5, Address(dend, -32));
1214     } else {
1215       __ ldp(t0, t1, Address(s, 0));
1216       __ ldp(t2, t3, Address(s, 16));
1217       __ ldp(t4, t5, Address(s, 32));
1218       __ ldp(t6, t7, Address(s, 48));
1219       __ ldp(t8, t9, Address(send, -16));
1220 
1221       __ stp(t0, t1, Address(d, 0));
1222       __ stp(t2, t3, Address(d, 16));
1223       __ stp(t4, t5, Address(d, 32));
1224       __ stp(t6, t7, Address(d, 48));
1225       __ stp(t8, t9, Address(dend, -16));
1226     }
1227     __ b(finish);
1228 
1229     // 0..16 bytes
1230     __ bind(copy16);
1231     __ cmp(count, 8/granularity);
1232     __ br(Assembler::LO, copy8);
1233 
1234     // 8..16 bytes
1235     __ ldr(t0, Address(s, 0));
1236     __ ldr(t1, Address(send, -8));
1237     __ str(t0, Address(d, 0));
1238     __ str(t1, Address(dend, -8));
1239     __ b(finish);
1240 
1241     if (granularity < 8) {
1242       // 4..7 bytes
1243       __ bind(copy8);
1244       __ tbz(count, 2 - exact_log2(granularity), copy4);
1245       __ ldrw(t0, Address(s, 0));
1246       __ ldrw(t1, Address(send, -4));
1247       __ strw(t0, Address(d, 0));
1248       __ strw(t1, Address(dend, -4));
1249       __ b(finish);
1250       if (granularity < 4) {
1251         // 0..3 bytes
1252         __ bind(copy4);
1253         __ cbz(count, finish); // get rid of 0 case
1254         if (granularity == 2) {
1255           __ ldrh(t0, Address(s, 0));
1256           __ strh(t0, Address(d, 0));
1257         } else { // granularity == 1
1258           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1259           // the first and last byte.
1260           // Handle the 3 byte case by loading and storing base + count/2
1261           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
1263           // byte 3 times.
1264           __ lsr(count, count, 1);
1265           __ ldrb(t0, Address(s, 0));
1266           __ ldrb(t1, Address(send, -1));
1267           __ ldrb(t2, Address(s, count));
1268           __ strb(t0, Address(d, 0));
1269           __ strb(t1, Address(dend, -1));
1270           __ strb(t2, Address(d, count));
1271         }
1272         __ b(finish);
1273       }
1274     }
1275 
1276     __ bind(copy_big);
1277     if (is_backwards) {
1278       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1279       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1280     }
1281 
1282     // Now we've got the small case out of the way we can align the
1283     // source address on a 2-word boundary.
1284 
1285     Label aligned;
1286 
1287     if (is_aligned) {
1288       // We may have to adjust by 1 word to get s 2-word-aligned.
1289       __ tbz(s, exact_log2(wordSize), aligned);
1290       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1291       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1292       __ sub(count, count, wordSize/granularity);
1293     } else {
1294       if (is_backwards) {
1295         __ andr(rscratch2, s, 2 * wordSize - 1);
1296       } else {
1297         __ neg(rscratch2, s);
1298         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1299       }
1300       // rscratch2 is the byte adjustment needed to align s.
1301       __ cbz(rscratch2, aligned);
1302       int shift = exact_log2(granularity);
1303       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1304       __ sub(count, count, rscratch2);
1305 
1306 #if 0
1307       // ?? This code is only correct for a disjoint copy.  It may or
1308       // may not make sense to use it in that case.
1309 
1310       // Copy the first pair; s and d may not be aligned.
1311       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1312       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1313 
1314       // Align s and d, adjust count
1315       if (is_backwards) {
1316         __ sub(s, s, rscratch2);
1317         __ sub(d, d, rscratch2);
1318       } else {
1319         __ add(s, s, rscratch2);
1320         __ add(d, d, rscratch2);
1321       }
1322 #else
1323       copy_memory_small(s, d, rscratch2, rscratch1, step);
1324 #endif
1325     }
1326 
1327     __ bind(aligned);
1328 
1329     // s is now 2-word-aligned.
1330 
1331     // We have a count of units and some trailing bytes.  Adjust the
1332     // count and do a bulk copy of words.
1333     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1334     if (direction == copy_forwards)
1335       __ bl(copy_f);
1336     else
1337       __ bl(copy_b);
1338 
1339     // And the tail.
1340     copy_memory_small(s, d, count, tmp, step);
1341 
1342     if (granularity >= 8) __ bind(copy8);
1343     if (granularity >= 4) __ bind(copy4);
1344     __ bind(finish);
1345   }
1346 
1347 
1348   void clobber_registers() {
1349 #ifdef ASSERT
1350     __ mov(rscratch1, (uint64_t)0xdeadbeef);
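    // replicate the pattern into the high half: rscratch1 then holds
    // 0xdeadbeefdeadbeef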
1351     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1352     for (Register r = r3; r <= r18; r++)
1353       if (r != rscratch1) __ mov(r, rscratch1);
1354 #endif
1355   }
1356 
1357   // Scan over array at a for count oops, verifying each one.
1358   // Preserves a and count, clobbers rscratch1 and rscratch2.
1359   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1360     Label loop, end;
1361     __ mov(rscratch1, a);
1362     __ mov(rscratch2, zr);
1363     __ bind(loop);
1364     __ cmp(rscratch2, count);
1365     __ br(Assembler::HS, end);
1366     if (size == (size_t)wordSize) {
1367       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1368       __ verify_oop(temp);
1369     } else {
1370       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1371       __ decode_heap_oop(temp); // calls verify_oop
1372     }
1373     __ add(rscratch2, rscratch2, 1);
1374     __ b(loop);
1375     __ bind(end);
1376   }
1377 
1378   // Arguments:
1379   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1380   //             ignored
1381   //   is_oop  - true => oop array, so generate store check code
1382   //   name    - stub name string
1383   //
1384   // Inputs:
1385   //   c_rarg0   - source array address
1386   //   c_rarg1   - destination array address
1387   //   c_rarg2   - element count, treated as ssize_t, can be zero
1388   //
1389   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1390   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1392   //
1393   // Side Effects:
1394   //   disjoint_int_copy_entry is set to the no-overlap entry point
1395   //   used by generate_conjoint_int_oop_copy().
1396   //
1397   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1398                                   const char *name, bool dest_uninitialized = false) {
1399     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1400     __ align(CodeEntryAlignment);
1401     StubCodeMark mark(this, "StubRoutines", name);
1402     address start = __ pc();
1403     __ enter();
1404 
1405     if (entry != NULL) {
1406       *entry = __ pc();
1407       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1408       BLOCK_COMMENT("Entry:");
1409     }
1410 
1411     if (is_oop) {
1412       __ push(RegSet::of(d, count), sp);
1413       // no registers are destroyed by this call
1414       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1415     }
1416     copy_memory(aligned, s, d, count, rscratch1, size);
1417     if (is_oop) {
1418       __ pop(RegSet::of(d, count), sp);
1419       if (VerifyOops)
1420         verify_oop_array(size, d, count, r16);
1421       __ sub(count, count, 1); // make an inclusive end pointer
1422       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1423       gen_write_ref_array_post_barrier(d, count, rscratch1);
1424     }
1425     __ leave();
1426     __ mov(r0, zr); // return 0
1427     __ ret(lr);
1428     return start;
1429   }
1430 
1431   // Arguments:
1432   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1433   //             ignored
1434   //   is_oop  - true => oop array, so generate store check code
1435   //   name    - stub name string
1436   //
1437   // Inputs:
1438   //   c_rarg0   - source array address
1439   //   c_rarg1   - destination array address
1440   //   c_rarg2   - element count, treated as ssize_t, can be zero
1441   //
1442   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1443   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1445   //
1446   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1447                                  address *entry, const char *name,
1448                                  bool dest_uninitialized = false) {
1449     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1450 
1451     StubCodeMark mark(this, "StubRoutines", name);
1452     address start = __ pc();
1453 
1454     __ enter();
1455 
1456     if (entry != NULL) {
1457       *entry = __ pc();
1458       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1459       BLOCK_COMMENT("Entry:");
1460     }
1461 
1462     // use fwd copy when (d-s) above_equal (count*size)
1463     __ sub(rscratch1, d, s);
1464     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1465     __ br(Assembler::HS, nooverlap_target);
1466 
1467     if (is_oop) {
1468       __ push(RegSet::of(d, count), sp);
1469       // no registers are destroyed by this call
1470       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1471     }
1472     copy_memory(aligned, s, d, count, rscratch1, -size);
1473     if (is_oop) {
1474       __ pop(RegSet::of(d, count), sp);
1475       if (VerifyOops)
1476         verify_oop_array(size, d, count, r16);
1477       __ sub(count, count, 1); // make an inclusive end pointer
1478       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1479       gen_write_ref_array_post_barrier(d, count, rscratch1);
1480     }
1481     __ leave();
1482     __ mov(r0, zr); // return 0
1483     __ ret(lr);
1484     return start;
1485   }
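  // The overlap test above works out to the following sketch (illustrative
  // only): both the subtraction and the comparison are unsigned, so a
  // destination below the source also takes the no-overlap (forward) path.
  //
  //   if ((uintptr_t)(d - s) >= (uintptr_t)count << exact_log2(size))
  //     goto nooverlap_target;       // forward copy is safe
  //   backward_copy(s, d, count);    // copy_memory() with negative direction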
1486 
1487   // Arguments:
1488   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1489   //             ignored
1490   //   name    - stub name string
1491   //
1492   // Inputs:
1493   //   c_rarg0   - source array address
1494   //   c_rarg1   - destination array address
1495   //   c_rarg2   - element count, treated as ssize_t, can be zero
1496   //
1497   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1498   // we let the hardware handle it.  The one to eight bytes within words,
1499   // dwords or qwords that span cache line boundaries will still be loaded
1500   // and stored atomically.
1501   //
1502   // Side Effects:
1510   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1511   //   used by generate_conjoint_byte_copy().
1512   //
1513   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1514     const bool not_oop = false;
1515     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1516   }
1517 
1518   // Arguments:
1519   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1520   //             ignored
1521   //   name    - stub name string
1522   //
1523   // Inputs:
1524   //   c_rarg0   - source array address
1525   //   c_rarg1   - destination array address
1526   //   c_rarg2   - element count, treated as ssize_t, can be zero
1527   //
1528   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1529   // we let the hardware handle it.  The one to eight bytes within words,
1530   // dwords or qwords that span cache line boundaries will still be loaded
1531   // and stored atomically.
1532   //
1533   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1534                                       address* entry, const char *name) {
1535     const bool not_oop = false;
1536     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1537   }
1538 
1539   // Arguments:
1540   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1541   //             ignored
1542   //   name    - stub name string
1543   //
1544   // Inputs:
1545   //   c_rarg0   - source array address
1546   //   c_rarg1   - destination array address
1547   //   c_rarg2   - element count, treated as ssize_t, can be zero
1548   //
1549   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1550   // let the hardware handle it.  The two or four words within dwords
1551   // or qwords that span cache line boundaries will still be loaded
1552   // and stored atomically.
1553   //
1554   // Side Effects:
1555   //   disjoint_short_copy_entry is set to the no-overlap entry point
1556   //   used by generate_conjoint_short_copy().
1557   //
1558   address generate_disjoint_short_copy(bool aligned,
1559                                        address* entry, const char *name) {
1560     const bool not_oop = false;
1561     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1562   }
1563 
1564   // Arguments:
1565   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1566   //             ignored
1567   //   name    - stub name string
1568   //
1569   // Inputs:
1570   //   c_rarg0   - source array address
1571   //   c_rarg1   - destination array address
1572   //   c_rarg2   - element count, treated as ssize_t, can be zero
1573   //
1574   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1575   // let the hardware handle it.  The two or four words within dwords
1576   // or qwords that span cache line boundaries will still be loaded
1577   // and stored atomically.
1578   //
1579   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1580                                        address *entry, const char *name) {
1581     const bool not_oop = false;
1582     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1583   }
1584 
1585   // Arguments:
1586   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1587   //             ignored
1588   //   name    - stub name string
1589   //
1590   // Inputs:
1591   //   c_rarg0   - source array address
1592   //   c_rarg1   - destination array address
1593   //   c_rarg2   - element count, treated as ssize_t, can be zero
1594   //
1595   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1596   // the hardware handle it.  The two dwords within qwords that span
1597   // cache line boundaries will still be loaded and stored atomically.
1598   //
1599   // Side Effects:
1600   //   disjoint_int_copy_entry is set to the no-overlap entry point
1601   //   used by generate_conjoint_int_copy().
1602   //
1603   address generate_disjoint_int_copy(bool aligned, address *entry,
1604                                          const char *name, bool dest_uninitialized = false) {
1605     const bool not_oop = false;
1606     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1607   }
1608 
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as ssize_t, can be zero
1618   //
1619   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1620   // the hardware handle it.  The two dwords within qwords that span
1621   // cache line boundaries will still be loaded and stored atomically.
1622   //
1623   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1624                                      address *entry, const char *name,
1625                                      bool dest_uninitialized = false) {
1626     const bool not_oop = false;
1627     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1628   }
1629 
1630 
1631   // Arguments:
1632   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1633   //             ignored
1634   //   name    - stub name string
1635   //
1636   // Inputs:
1637   //   c_rarg0   - source array address
1638   //   c_rarg1   - destination array address
1639   //   c_rarg2   - element count, treated as size_t, can be zero
1640   //
1641   // Side Effects:
1642   //   disjoint_long_copy_entry is set to the no-overlap entry point
1643   //   used by generate_conjoint_long_copy().
1644   //
1645   address generate_disjoint_long_copy(bool aligned, address *entry,
1646                                           const char *name, bool dest_uninitialized = false) {
1647     const bool not_oop = false;
1648     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1649   }
1650 
1651   // Arguments:
1652   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1653   //             ignored
1654   //   name    - stub name string
1655   //
1656   // Inputs:
1657   //   c_rarg0   - source array address
1658   //   c_rarg1   - destination array address
1659   //   c_rarg2   - element count, treated as size_t, can be zero
1660   //
1661   address generate_conjoint_long_copy(bool aligned,
1662                                       address nooverlap_target, address *entry,
1663                                       const char *name, bool dest_uninitialized = false) {
1664     const bool not_oop = false;
1665     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1666   }
1667 
1668   // Arguments:
1669   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1670   //             ignored
1671   //   name    - stub name string
1672   //
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as size_t, can be zero
1677   //
1678   // Side Effects:
1679   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1680   //   used by generate_conjoint_oop_copy().
1681   //
1682   address generate_disjoint_oop_copy(bool aligned, address *entry,
1683                                      const char *name, bool dest_uninitialized) {
1684     const bool is_oop = true;
1685     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1686     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1687   }
1688 
1689   // Arguments:
1690   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1691   //             ignored
1692   //   name    - stub name string
1693   //
1694   // Inputs:
1695   //   c_rarg0   - source array address
1696   //   c_rarg1   - destination array address
1697   //   c_rarg2   - element count, treated as size_t, can be zero
1698   //
1699   address generate_conjoint_oop_copy(bool aligned,
1700                                      address nooverlap_target, address *entry,
1701                                      const char *name, bool dest_uninitialized) {
1702     const bool is_oop = true;
1703     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1704     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1705                                   name, dest_uninitialized);
1706   }
1707 
1708 
1709   // Helper for generating a dynamic type check.
1710   // Smashes rscratch1.
1711   void generate_type_check(Register sub_klass,
1712                            Register super_check_offset,
1713                            Register super_klass,
1714                            Label& L_success) {
1715     assert_different_registers(sub_klass, super_check_offset, super_klass);
1716 
1717     BLOCK_COMMENT("type_check:");
1718 
1719     Label L_miss;
1720 
1721     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1722                                      super_check_offset);
1723     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1724 
1725     // Fall through on failure!
1726     __ BIND(L_miss);
1727   }
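  // Roughly, the emitted check corresponds to this sketch of the shared
  // MacroAssembler fast/slow subtype paths (illustrative only):
  //
  //   if (sub_klass == super_klass)                          goto L_success;
  //   if (*(sub_klass + super_check_offset) == super_klass)  goto L_success;  // cached check
  //   if (secondary_supers(sub_klass) contains super_klass)  goto L_success;  // linear scan
  //   /* otherwise fall through to L_miss */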
1728 
1729   //
1730   //  Generate checkcasting array copy stub
1731   //
1732   //  Input:
1733   //    c_rarg0   - source array address
1734   //    c_rarg1   - destination array address
1735   //    c_rarg2   - element count, treated as ssize_t, can be zero
1736   //    c_rarg3   - size_t ckoff (super_check_offset)
1737   //    c_rarg4   - oop ckval (super_klass)
1738   //
1739   //  Output:
1740   //    r0 ==  0  -  success
1741   //    r0 == -1^K - failure, where K is partial transfer count
1742   //
1743   address generate_checkcast_copy(const char *name, address *entry,
1744                                   bool dest_uninitialized = false) {
1745 
1746     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1747 
1748     // Input registers (after setup_arg_regs)
1749     const Register from        = c_rarg0;   // source array address
1750     const Register to          = c_rarg1;   // destination array address
1751     const Register count       = c_rarg2;   // elements count
1752     const Register ckoff       = c_rarg3;   // super_check_offset
1753     const Register ckval       = c_rarg4;   // super_klass
1754 
1755     // Registers used as temps (r18, r19, r20 are save-on-entry)
1756     const Register count_save  = r21;       // orig elements count
1757     const Register start_to    = r20;       // destination array start address
1758     const Register copied_oop  = r18;       // actual oop copied
1759     const Register r19_klass   = r19;       // oop._klass
1760 
1761     //---------------------------------------------------------------
1762     // Assembler stub will be used for this call to arraycopy
1763     // if the two arrays are subtypes of Object[] but the
1764     // destination array type is not equal to or a supertype
1765     // of the source type.  Each element must be separately
1766     // checked.
1767 
1768     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1769                                copied_oop, r19_klass, count_save);
1770 
1771     __ align(CodeEntryAlignment);
1772     StubCodeMark mark(this, "StubRoutines", name);
1773     address start = __ pc();
1774 
1775     __ enter(); // required for proper stackwalking of RuntimeStub frame
1776 
1777 #ifdef ASSERT
1778     // caller guarantees that the arrays really are different
1779     // otherwise, we would have to make conjoint checks
1780     { Label L;
1781       array_overlap_test(L, TIMES_OOP);
1782       __ stop("checkcast_copy within a single array");
1783       __ bind(L);
1784     }
1785 #endif //ASSERT
1786 
1787     // Caller of this entry point must set up the argument registers.
1788     if (entry != NULL) {
1789       *entry = __ pc();
1790       BLOCK_COMMENT("Entry:");
1791     }
1792 
1793      // Empty array:  Nothing to do.
1794     __ cbz(count, L_done);
1795 
1796     __ push(RegSet::of(r18, r19, r20, r21), sp);
1797 
1798 #ifdef ASSERT
1799     BLOCK_COMMENT("assert consistent ckoff/ckval");
1800     // The ckoff and ckval must be mutually consistent,
1801     // even though caller generates both.
1802     { Label L;
1803       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1804       __ ldrw(start_to, Address(ckval, sco_offset));
1805       __ cmpw(ckoff, start_to);
1806       __ br(Assembler::EQ, L);
1807       __ stop("super_check_offset inconsistent");
1808       __ bind(L);
1809     }
1810 #endif //ASSERT
1811 
1812     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1813 
1814     // save the original count
1815     __ mov(count_save, count);
1816 
1817     // Copy from low to high addresses
1818     __ mov(start_to, to);              // Save destination array start address
1819     __ b(L_load_element);
1820 
1821     // ======== begin loop ========
1822     // (Loop is rotated; its entry is L_load_element.)
1823     // Loop control:
1824     //   for (; count != 0; count--) {
1825     //     copied_oop = load_heap_oop(from++);
1826     //     ... generate_type_check ...;
1827     //     store_heap_oop(to++, copied_oop);
1828     //   }
1829     __ align(OptoLoopAlignment);
1830 
1831     __ BIND(L_store_element);
1832     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1833     __ sub(count, count, 1);
1834     __ cbz(count, L_do_card_marks);
1835 
1836     // ======== loop entry is here ========
1837     __ BIND(L_load_element);
1838     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1839     __ cbz(copied_oop, L_store_element);
1840 
1841     __ load_klass(r19_klass, copied_oop); // query the object klass
1842     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1843     // ======== end loop ========
1844 
1845     // It was a real error; we must depend on the caller to finish the job.
1846     // Register count = remaining oops, count_orig = total oops.
1847     // Emit GC store barriers for the oops we have copied and report
1848     // their number to the caller.
1849 
1850     __ subs(count, count_save, count);     // K = partially copied oop count
1851     __ eon(count, count, zr);                   // report (-1^K) to caller
1852     __ br(Assembler::EQ, L_done_pop);
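    // Illustrative sketch of the failure-path result computed above:
    //
    //   size_t K = count_save - count;           // oops copied before the bad element
    //   if (K != 0) post_barrier(start_to, to);  // card-mark only what was copied
    //   return ~K;                               // i.e. -1 ^ K, as documented above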
1853 
1854     __ BIND(L_do_card_marks);
1855     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1856     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1857 
1858     __ bind(L_done_pop);
1859     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1860     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1861 
1862     __ bind(L_done);
1863     __ mov(r0, count);
1864     __ leave();
1865     __ ret(lr);
1866 
1867     return start;
1868   }
1869 
1870   // Perform range checks on the proposed arraycopy.
1871   // Kills temp, but nothing else.
1872   // Also, clean the sign bits of src_pos and dst_pos.
1873   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1874                               Register src_pos, // source position (c_rarg1)
1875                               Register dst,     // destination array oop (c_rarg2)
1876                               Register dst_pos, // destination position (c_rarg3)
1877                               Register length,
1878                               Register temp,
1879                               Label& L_failed) {
1880     BLOCK_COMMENT("arraycopy_range_checks:");
1881 
1882     assert_different_registers(rscratch1, temp);
1883 
1884     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1885     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1886     __ addw(temp, length, src_pos);
1887     __ cmpw(temp, rscratch1);
1888     __ br(Assembler::HI, L_failed);
1889 
1890     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1891     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1892     __ addw(temp, length, dst_pos);
1893     __ cmpw(temp, rscratch1);
1894     __ br(Assembler::HI, L_failed);
1895 
1896     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1897     __ movw(src_pos, src_pos);
1898     __ movw(dst_pos, dst_pos);
1899 
1900     BLOCK_COMMENT("arraycopy_range_checks done");
1901   }
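  // Equivalent C sketch of the checks above (illustrative only); the unsigned
  // 32-bit compares also reject a position or length whose sum wrapped around:
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length())  goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length())  goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // clear the high 32 bits
  //   dst_pos = (uint32_t)dst_pos;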
1902 
1903   // This stub is currently only called from a simple test routine;
1904   // it will be written properly once it is called from something
1905   // that actually does something.
1906   static void fake_arraycopy_stub(address src, address dst, int count) {
1907     assert(count == 0, "huh?");
1908   }
1909 
1910 
1911   //
1912   // Generate stub for array fill. If "aligned" is true, the
1913   // "to" address is assumed to be heapword aligned.
1914   //
1915   // Arguments for generated stub:
1916   //   to:    c_rarg0
1917   //   value: c_rarg1
1918   //   count: c_rarg2 treated as signed
1919   //
1920   address generate_fill(BasicType t, bool aligned, const char *name) {
1921     __ align(CodeEntryAlignment);
1922     StubCodeMark mark(this, "StubRoutines", name);
1923     address start = __ pc();
1924 
1925     BLOCK_COMMENT("Entry:");
1926 
1927     const Register to        = c_rarg0;  // source array address
1928     const Register value     = c_rarg1;  // value
1929     const Register count     = c_rarg2;  // elements count
1930 
1931     const Register bz_base = r10;        // base for block_zero routine
1932     const Register cnt_words = r11;      // temp register
1933 
1934     __ enter();
1935 
1936     Label L_fill_elements, L_exit1;
1937 
1938     int shift = -1;
1939     switch (t) {
1940       case T_BYTE:
1941         shift = 0;
1942         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1943         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
1944         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1945         __ br(Assembler::LO, L_fill_elements);
1946         break;
1947       case T_SHORT:
1948         shift = 1;
1949         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1950         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
1951         __ br(Assembler::LO, L_fill_elements);
1952         break;
1953       case T_INT:
1954         shift = 2;
1955         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
1956         __ br(Assembler::LO, L_fill_elements);
1957         break;
1958       default: ShouldNotReachHere();
1959     }
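    // The bfi chains above replicate the element value across a 32-bit word;
    // the remaining 32 -> 64 bit step happens just before the bulk fill below.
    // Roughly, for T_BYTE (illustrative):
    //
    //   v |= v << 8;    // 0x000000ab -> 0x0000abab
    //   v |= v << 16;   // 0x0000abab -> 0xabababab
    //   v |= v << 32;   // later:        0xabababababababab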
1960 
1961     // Align source address at 8 bytes address boundary.
1962     Label L_skip_align1, L_skip_align2, L_skip_align4;
1963     if (!aligned) {
1964       switch (t) {
1965         case T_BYTE:
1966           // One byte misalignment happens only for byte arrays.
1967           __ tbz(to, 0, L_skip_align1);
1968           __ strb(value, Address(__ post(to, 1)));
1969           __ subw(count, count, 1);
1970           __ bind(L_skip_align1);
1971           // Fallthrough
1972         case T_SHORT:
1973           // Two bytes misalignment happens only for byte and short (char) arrays.
1974           __ tbz(to, 1, L_skip_align2);
1975           __ strh(value, Address(__ post(to, 2)));
1976           __ subw(count, count, 2 >> shift);
1977           __ bind(L_skip_align2);
1978           // Fallthrough
1979         case T_INT:
1980           // Align to 8 bytes, we know we are 4 byte aligned to start.
1981           __ tbz(to, 2, L_skip_align4);
1982           __ strw(value, Address(__ post(to, 4)));
1983           __ subw(count, count, 4 >> shift);
1984           __ bind(L_skip_align4);
1985           break;
1986         default: ShouldNotReachHere();
1987       }
1988     }
1989 
1990     //
1991     //  Fill large chunks
1992     //
1993     __ lsrw(cnt_words, count, 3 - shift); // number of words
1994     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
1995     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
1996     if (UseBlockZeroing) {
1997       Label non_block_zeroing, rest;
1998       // count >= BlockZeroingLowLimit && value == 0
1999       __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
2000       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2001       __ br(Assembler::NE, non_block_zeroing);
2002       __ mov(bz_base, to);
2003       __ block_zero(bz_base, cnt_words, true);
2004       __ mov(to, bz_base);
2005       __ b(rest);
2006       __ bind(non_block_zeroing);
2007       __ fill_words(to, cnt_words, value);
2008       __ bind(rest);
2009     }
2010     else {
2011       __ fill_words(to, cnt_words, value);
2012     }
2013 
2014     // Remaining count is less than 8 bytes. Fill it by a single store.
2015     // Note that the total length is no less than 8 bytes.
2016     if (t == T_BYTE || t == T_SHORT) {
2017       Label L_exit1;
2018       __ cbzw(count, L_exit1);
2019       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2020       __ str(value, Address(to, -8));    // overwrite some elements
2021       __ bind(L_exit1);
2022       __ leave();
2023       __ ret(lr);
2024     }
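    // Illustrative sketch of the tail store above: the final (possibly
    // misaligned) eight bytes are rewritten with one str, overlapping bytes
    // the word loop already filled with the same value; this is safe because
    // the total length is known to be at least eight bytes on this path.
    //
    //   if (count != 0) {
    //     u_char *end = to + (count << shift);
    //     *(julong *)(end - 8) = value;
    //   }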
2025 
2026     // Handle fills of less than 8 bytes.
2027     Label L_fill_2, L_fill_4, L_exit2;
2028     __ bind(L_fill_elements);
2029     switch (t) {
2030       case T_BYTE:
2031         __ tbz(count, 0, L_fill_2);
2032         __ strb(value, Address(__ post(to, 1)));
2033         __ bind(L_fill_2);
2034         __ tbz(count, 1, L_fill_4);
2035         __ strh(value, Address(__ post(to, 2)));
2036         __ bind(L_fill_4);
2037         __ tbz(count, 2, L_exit2);
2038         __ strw(value, Address(to));
2039         break;
2040       case T_SHORT:
2041         __ tbz(count, 0, L_fill_4);
2042         __ strh(value, Address(__ post(to, 2)));
2043         __ bind(L_fill_4);
2044         __ tbz(count, 1, L_exit2);
2045         __ strw(value, Address(to));
2046         break;
2047       case T_INT:
2048         __ cbzw(count, L_exit2);
2049         __ strw(value, Address(to));
2050         break;
2051       default: ShouldNotReachHere();
2052     }
2053     __ bind(L_exit2);
2054     __ leave();
2055     __ ret(lr);
2056     return start;
2057   }
2058 
2059   //
2060   //  Generate 'unsafe' array copy stub
2061   //  Though just as safe as the other stubs, it takes an unscaled
2062   //  size_t argument instead of an element count.
2063   //
2064   //  Input:
2065   //    c_rarg0   - source array address
2066   //    c_rarg1   - destination array address
2067   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2068   //
2069   // Examines the alignment of the operands and dispatches
2070   // to a long, int, short, or byte copy loop.
2071   //
2072   address generate_unsafe_copy(const char *name,
2073                                address byte_copy_entry,
2074                                address short_copy_entry,
2075                                address int_copy_entry,
2076                                address long_copy_entry) {
2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2079 
2080     __ align(CodeEntryAlignment);
2081     StubCodeMark mark(this, "StubRoutines", name);
2082     address start = __ pc();
2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
2084 
2085     // bump this on entry, not on exit:
2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2087 
2088     __ orr(rscratch1, s, d);
2089     __ orr(rscratch1, rscratch1, count);
2090 
2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2092     __ cbz(rscratch1, L_long_aligned);
2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2094     __ cbz(rscratch1, L_int_aligned);
2095     __ tbz(rscratch1, 0, L_short_aligned);
2096     __ b(RuntimeAddress(byte_copy_entry));
2097 
2098     __ BIND(L_short_aligned);
2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2100     __ b(RuntimeAddress(short_copy_entry));
2101     __ BIND(L_int_aligned);
2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2103     __ b(RuntimeAddress(int_copy_entry));
2104     __ BIND(L_long_aligned);
2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2106     __ b(RuntimeAddress(long_copy_entry));
2107 
2108     return start;
2109   }
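  // The dispatch above in C terms (illustrative only): the stub examines the
  // common low bits of source, destination and byte count and branches to the
  // widest element copy they all permit.
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (s, d, count >> LogBytesPerLong);
  //   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (s, d, count >> LogBytesPerInt);
  //   else if ((bits & 1) == 0)                  short_copy(s, d, count >> LogBytesPerShort);
  //   else                                       byte_copy (s, d, count);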
2110 
2111   //
2112   //  Generate generic array copy stubs
2113   //
2114   //  Input:
2115   //    c_rarg0    -  src oop
2116   //    c_rarg1    -  src_pos (32-bits)
2117   //    c_rarg2    -  dst oop
2118   //    c_rarg3    -  dst_pos (32-bits)
2119   //    c_rarg4    -  element count (32-bits)
2120   //
2121   //  Output:
2122   //    r0 ==  0  -  success
2123   //    r0 == -1^K - failure, where K is partial transfer count
2124   //
2125   address generate_generic_copy(const char *name,
2126                                 address byte_copy_entry, address short_copy_entry,
2127                                 address int_copy_entry, address oop_copy_entry,
2128                                 address long_copy_entry, address checkcast_copy_entry) {
2129 
2130     Label L_failed, L_failed_0, L_objArray;
2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2132 
2133     // Input registers
2134     const Register src        = c_rarg0;  // source array oop
2135     const Register src_pos    = c_rarg1;  // source position
2136     const Register dst        = c_rarg2;  // destination array oop
2137     const Register dst_pos    = c_rarg3;  // destination position
2138     const Register length     = c_rarg4;
2139 
2140     __ align(CodeEntryAlignment);
2141 
2142     StubCodeMark mark(this, "StubRoutines", name);
2143 
2144     address start = __ pc();
2145 
2146     __ enter(); // required for proper stackwalking of RuntimeStub frame
2147 
2148     // bump this on entry, not on exit:
2149     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2150 
2151     //-----------------------------------------------------------------------
2152     // Assembler stub will be used for this call to arraycopy
2153     // if the following conditions are met:
2154     //
2155     // (1) src and dst must not be null.
2156     // (2) src_pos must not be negative.
2157     // (3) dst_pos must not be negative.
2158     // (4) length  must not be negative.
2159     // (5) src klass and dst klass should be the same and not NULL.
2160     // (6) src and dst should be arrays.
2161     // (7) src_pos + length must not exceed length of src.
2162     // (8) dst_pos + length must not exceed length of dst.
2163     //
2164 
2165     //  if (src == NULL) return -1;
2166     __ cbz(src, L_failed);
2167 
2168     //  if (src_pos < 0) return -1;
2169     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2170 
2171     //  if (dst == NULL) return -1;
2172     __ cbz(dst, L_failed);
2173 
2174     //  if (dst_pos < 0) return -1;
2175     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2176 
2177     // registers used as temp
2178     const Register scratch_length    = r16; // elements count to copy
2179     const Register scratch_src_klass = r17; // array klass
2180     const Register lh                = r18; // layout helper
2181 
2182     //  if (length < 0) return -1;
2183     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2184     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2185 
2186     __ load_klass(scratch_src_klass, src);
2187 #ifdef ASSERT
2188     //  assert(src->klass() != NULL);
2189     {
2190       BLOCK_COMMENT("assert klasses not null {");
2191       Label L1, L2;
2192       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2193       __ bind(L1);
2194       __ stop("broken null klass");
2195       __ bind(L2);
2196       __ load_klass(rscratch1, dst);
2197       __ cbz(rscratch1, L1);     // this would be broken also
2198       BLOCK_COMMENT("} assert klasses not null done");
2199     }
2200 #endif
2201 
2202     // Load layout helper (32-bits)
2203     //
2204     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2205     // 32        30    24            16              8     2                 0
2206     //
2207     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2208     //
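    // With that encoding, the fields used below are extracted roughly as
    // follows (illustrative; the real shifts and masks come from Klass):
    //
    //   header_size       = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   log2_element_size =  lh & Klass::_lh_log2_element_size_mask;   // 0..3 for primitives
    //   is_array          =  lh < 0;                                   // array_tag sets the sign bit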
2209 
2210     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2211 
2212     // Handle objArrays completely differently...
2213     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2214     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2215     __ movw(rscratch1, objArray_lh);
2216     __ eorw(rscratch2, lh, rscratch1);
2217     __ cbzw(rscratch2, L_objArray);
2218 
2219     //  if (src->klass() != dst->klass()) return -1;
2220     __ load_klass(rscratch2, dst);
2221     __ eor(rscratch2, rscratch2, scratch_src_klass);
2222     __ cbnz(rscratch2, L_failed);
2223 
2224     //  if (!src->is_Array()) return -1;
2225     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2226 
2227     // At this point, it is known to be a typeArray (array_tag 0x3).
2228 #ifdef ASSERT
2229     {
2230       BLOCK_COMMENT("assert primitive array {");
2231       Label L;
2232       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2233       __ cmpw(lh, rscratch2);
2234       __ br(Assembler::GE, L);
2235       __ stop("must be a primitive array");
2236       __ bind(L);
2237       BLOCK_COMMENT("} assert primitive array done");
2238     }
2239 #endif
2240 
2241     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2242                            rscratch2, L_failed);
2243 
2244     // TypeArrayKlass
2245     //
2246     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2247     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2248     //
2249 
2250     const Register rscratch1_offset = rscratch1;    // array offset
2251     const Register r18_elsize = lh; // element size
2252 
2253     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2254            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2255     __ add(src, src, rscratch1_offset);           // src array offset
2256     __ add(dst, dst, rscratch1_offset);           // dst array offset
2257     BLOCK_COMMENT("choose copy loop based on element size");
2258 
2259     // next registers should be set before the jump to corresponding stub
2260     const Register from     = c_rarg0;  // source array address
2261     const Register to       = c_rarg1;  // destination array address
2262     const Register count    = c_rarg2;  // elements count
2263 
2264     // 'from', 'to' and 'count' must be set up in this order, since they
2265     // occupy the same registers as 'src', 'src_pos' and 'dst'.
2266 
2267     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2268 
2269     // The possible values of elsize are 0-3, i.e. exact_log2(element
2270     // size in bytes).  We do a simple bitwise binary search.
2271   __ BIND(L_copy_bytes);
2272     __ tbnz(r18_elsize, 1, L_copy_ints);
2273     __ tbnz(r18_elsize, 0, L_copy_shorts);
2274     __ lea(from, Address(src, src_pos));// src_addr
2275     __ lea(to,   Address(dst, dst_pos));// dst_addr
2276     __ movw(count, scratch_length); // length
2277     __ b(RuntimeAddress(byte_copy_entry));
2278 
2279   __ BIND(L_copy_shorts);
2280     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2281     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2282     __ movw(count, scratch_length); // length
2283     __ b(RuntimeAddress(short_copy_entry));
2284 
2285   __ BIND(L_copy_ints);
2286     __ tbnz(r18_elsize, 0, L_copy_longs);
2287     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2288     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2289     __ movw(count, scratch_length); // length
2290     __ b(RuntimeAddress(int_copy_entry));
2291 
2292   __ BIND(L_copy_longs);
2293 #ifdef ASSERT
2294     {
2295       BLOCK_COMMENT("assert long copy {");
2296       Label L;
2297       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2298       __ cmpw(r18_elsize, LogBytesPerLong);
2299       __ br(Assembler::EQ, L);
2300       __ stop("must be long copy, but elsize is wrong");
2301       __ bind(L);
2302       BLOCK_COMMENT("} assert long copy done");
2303     }
2304 #endif
2305     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2307     __ movw(count, scratch_length); // length
2308     __ b(RuntimeAddress(long_copy_entry));
2309 
2310     // ObjArrayKlass
2311   __ BIND(L_objArray);
2312     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2313 
2314     Label L_plain_copy, L_checkcast_copy;
2315     //  test array classes for subtyping
2316     __ load_klass(r18, dst);
2317     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2318     __ br(Assembler::NE, L_checkcast_copy);
2319 
2320     // Identically typed arrays can be copied without element-wise checks.
2321     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2322                            rscratch2, L_failed);
2323 
2324     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2325     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2326     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2327     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2328     __ movw(count, scratch_length); // length
2329   __ BIND(L_plain_copy);
2330     __ b(RuntimeAddress(oop_copy_entry));
2331 
2332   __ BIND(L_checkcast_copy);
2333     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2334     {
2335       // Before looking at dst.length, make sure dst is also an objArray.
2336       __ ldrw(rscratch1, Address(r18, lh_offset));
2337       __ movw(rscratch2, objArray_lh);
2338       __ eorw(rscratch1, rscratch1, rscratch2);
2339       __ cbnzw(rscratch1, L_failed);
2340 
2341       // It is safe to examine both src.length and dst.length.
2342       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2343                              r18, L_failed);
2344 
2345       const Register rscratch2_dst_klass = rscratch2;
2346       __ load_klass(rscratch2_dst_klass, dst); // reload
2347 
2348       // Marshal the base address arguments now, freeing registers.
2349       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2350       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2351       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2352       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353       __ movw(count, length);           // length (reloaded)
2354       Register sco_temp = c_rarg3;      // this register is free now
2355       assert_different_registers(from, to, count, sco_temp,
2356                                  rscratch2_dst_klass, scratch_src_klass);
2357       // assert_clean_int(count, sco_temp);
2358 
2359       // Generate the type check.
2360       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2361       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2362       // assert_clean_int(sco_temp, r18);
2363       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2364 
2365       // Fetch destination element klass from the ObjArrayKlass header.
2366       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2367       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2368       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2369 
2370       // the checkcast_copy loop needs two extra arguments:
2371       assert(c_rarg3 == sco_temp, "#3 already in place");
2372       // Set up arguments for checkcast_copy_entry.
2373       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2374       __ b(RuntimeAddress(checkcast_copy_entry));
2375     }
2376 
2377   __ BIND(L_failed);
2378     __ mov(r0, -1);
2379     __ leave();   // required for proper stackwalking of RuntimeStub frame
2380     __ ret(lr);
2381 
2382     return start;
2383   }
2384 
2385   void generate_arraycopy_stubs() {
2386     address entry;
2387     address entry_jbyte_arraycopy;
2388     address entry_jshort_arraycopy;
2389     address entry_jint_arraycopy;
2390     address entry_oop_arraycopy;
2391     address entry_jlong_arraycopy;
2392     address entry_checkcast_arraycopy;
2393 
2394     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2395     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2396 
2397     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2398 
2399     //*** jbyte
2400     // Always need aligned and unaligned versions
2401     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2402                                                                                   "jbyte_disjoint_arraycopy");
2403     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2404                                                                                   &entry_jbyte_arraycopy,
2405                                                                                   "jbyte_arraycopy");
2406     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2407                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2408     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2409                                                                                   "arrayof_jbyte_arraycopy");
2410 
2411     //*** jshort
2412     // Always need aligned and unaligned versions
2413     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2414                                                                                     "jshort_disjoint_arraycopy");
2415     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2416                                                                                     &entry_jshort_arraycopy,
2417                                                                                     "jshort_arraycopy");
2418     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2419                                                                                     "arrayof_jshort_disjoint_arraycopy");
2420     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2421                                                                                     "arrayof_jshort_arraycopy");
2422 
2423     //*** jint
2424     // Aligned versions
2425     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2426                                                                                 "arrayof_jint_disjoint_arraycopy");
2427     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2428                                                                                 "arrayof_jint_arraycopy");
2429     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2430     // entry_jint_arraycopy always points to the unaligned version
2431     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2432                                                                                 "jint_disjoint_arraycopy");
2433     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2434                                                                                 &entry_jint_arraycopy,
2435                                                                                 "jint_arraycopy");
2436 
2437     //*** jlong
2438     // It is always aligned
2439     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2440                                                                                   "arrayof_jlong_disjoint_arraycopy");
2441     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2442                                                                                   "arrayof_jlong_arraycopy");
2443     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2444     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2445 
2446     //*** oops
2447     {
2448       // With compressed oops we need unaligned versions; notice that
2449       // we overwrite entry_oop_arraycopy.
2450       bool aligned = !UseCompressedOops;
2451 
2452       StubRoutines::_arrayof_oop_disjoint_arraycopy
2453         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2454                                      /*dest_uninitialized*/false);
2455       StubRoutines::_arrayof_oop_arraycopy
2456         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2457                                      /*dest_uninitialized*/false);
2458       // Aligned versions without pre-barriers
2459       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2460         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2461                                      /*dest_uninitialized*/true);
2462       StubRoutines::_arrayof_oop_arraycopy_uninit
2463         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2464                                      /*dest_uninitialized*/true);
2465     }
2466 
2467     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2468     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2469     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2470     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2471 
2472     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2473     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2474                                                                         /*dest_uninitialized*/true);
2475 
2476     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2477                                                               entry_jbyte_arraycopy,
2478                                                               entry_jshort_arraycopy,
2479                                                               entry_jint_arraycopy,
2480                                                               entry_jlong_arraycopy);
2481 
2482     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2483                                                                entry_jbyte_arraycopy,
2484                                                                entry_jshort_arraycopy,
2485                                                                entry_jint_arraycopy,
2486                                                                entry_oop_arraycopy,
2487                                                                entry_jlong_arraycopy,
2488                                                                entry_checkcast_arraycopy);
2489 
2490     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2491     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2492     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2493     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2494     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2495     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2496   }
2497 
2498   // Arguments:
2499   //
2500   // Inputs:
2501   //   c_rarg0   - source byte array address
2502   //   c_rarg1   - destination byte array address
2503   //   c_rarg2   - K (key) in little endian int array
2504   //
2505   address generate_aescrypt_encryptBlock() {
2506     __ align(CodeEntryAlignment);
2507     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2508 
2509     Label L_doLast;
2510 
2511     const Register from        = c_rarg0;  // source array address
2512     const Register to          = c_rarg1;  // destination array address
2513     const Register key         = c_rarg2;  // key array address
2514     const Register keylen      = rscratch1;
2515 
2516     address start = __ pc();
2517     __ enter();
2518 
2519     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2520 
2521     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2522 
2523     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2524     __ rev32(v1, __ T16B, v1);
2525     __ rev32(v2, __ T16B, v2);
2526     __ rev32(v3, __ T16B, v3);
2527     __ rev32(v4, __ T16B, v4);
2528     __ aese(v0, v1);
2529     __ aesmc(v0, v0);
2530     __ aese(v0, v2);
2531     __ aesmc(v0, v0);
2532     __ aese(v0, v3);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v4);
2535     __ aesmc(v0, v0);
2536 
2537     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2538     __ rev32(v1, __ T16B, v1);
2539     __ rev32(v2, __ T16B, v2);
2540     __ rev32(v3, __ T16B, v3);
2541     __ rev32(v4, __ T16B, v4);
2542     __ aese(v0, v1);
2543     __ aesmc(v0, v0);
2544     __ aese(v0, v2);
2545     __ aesmc(v0, v0);
2546     __ aese(v0, v3);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v4);
2549     __ aesmc(v0, v0);
2550 
2551     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2552     __ rev32(v1, __ T16B, v1);
2553     __ rev32(v2, __ T16B, v2);
2554 
2555     __ cmpw(keylen, 44);
2556     __ br(Assembler::EQ, L_doLast);
2557 
2558     __ aese(v0, v1);
2559     __ aesmc(v0, v0);
2560     __ aese(v0, v2);
2561     __ aesmc(v0, v0);
2562 
2563     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2564     __ rev32(v1, __ T16B, v1);
2565     __ rev32(v2, __ T16B, v2);
2566 
2567     __ cmpw(keylen, 52);
2568     __ br(Assembler::EQ, L_doLast);
2569 
2570     __ aese(v0, v1);
2571     __ aesmc(v0, v0);
2572     __ aese(v0, v2);
2573     __ aesmc(v0, v0);
2574 
2575     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2576     __ rev32(v1, __ T16B, v1);
2577     __ rev32(v2, __ T16B, v2);
2578 
2579     __ BIND(L_doLast);
2580 
2581     __ aese(v0, v1);
2582     __ aesmc(v0, v0);
2583     __ aese(v0, v2);
2584 
2585     __ ld1(v1, __ T16B, key);
2586     __ rev32(v1, __ T16B, v1);
2587     __ eor(v0, __ T16B, v0, v1);
2588 
2589     __ st1(v0, __ T16B, to);
2590 
2591     __ mov(r0, 0);
2592 
2593     __ leave();
2594     __ ret(lr);
2595 
2596     return start;
2597   }
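  // The expanded key is an int[] of 44, 52 or 60 words (AES-128/192/256); the
  // cmpw(keylen, ...) tests above use that to pick the round count.  The
  // generated sequence corresponds roughly to (illustrative sketch):
  //
  //   for each round key rk[i] except the last two:
  //     state = AESMC(AESE(state, rk[i]));   // AddRoundKey, SubBytes, ShiftRows, MixColumns
  //   state  = AESE(state, rk[n-2]);         // final round, no MixColumns
  //   output = state ^ rk[n-1];              // final key addition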
2598 
2599   // Arguments:
2600   //
2601   // Inputs:
2602   //   c_rarg0   - source byte array address
2603   //   c_rarg1   - destination byte array address
2604   //   c_rarg2   - K (key) in little endian int array
2605   //
2606   address generate_aescrypt_decryptBlock() {
2607     assert(UseAES, "need AES instruction support");
2608     __ align(CodeEntryAlignment);
2609     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2610     Label L_doLast;
2611 
2612     const Register from        = c_rarg0;  // source array address
2613     const Register to          = c_rarg1;  // destination array address
2614     const Register key         = c_rarg2;  // key array address
2615     const Register keylen      = rscratch1;
2616 
2617     address start = __ pc();
2618     __ enter(); // required for proper stackwalking of RuntimeStub frame
2619 
2620     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2621 
2622     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2623 
2624     __ ld1(v5, __ T16B, __ post(key, 16));
2625     __ rev32(v5, __ T16B, v5);
2626 
2627     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2628     __ rev32(v1, __ T16B, v1);
2629     __ rev32(v2, __ T16B, v2);
2630     __ rev32(v3, __ T16B, v3);
2631     __ rev32(v4, __ T16B, v4);
2632     __ aesd(v0, v1);
2633     __ aesimc(v0, v0);
2634     __ aesd(v0, v2);
2635     __ aesimc(v0, v0);
2636     __ aesd(v0, v3);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v4);
2639     __ aesimc(v0, v0);
2640 
2641     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2642     __ rev32(v1, __ T16B, v1);
2643     __ rev32(v2, __ T16B, v2);
2644     __ rev32(v3, __ T16B, v3);
2645     __ rev32(v4, __ T16B, v4);
2646     __ aesd(v0, v1);
2647     __ aesimc(v0, v0);
2648     __ aesd(v0, v2);
2649     __ aesimc(v0, v0);
2650     __ aesd(v0, v3);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v4);
2653     __ aesimc(v0, v0);
2654 
2655     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2656     __ rev32(v1, __ T16B, v1);
2657     __ rev32(v2, __ T16B, v2);
2658 
2659     __ cmpw(keylen, 44);
2660     __ br(Assembler::EQ, L_doLast);
2661 
2662     __ aesd(v0, v1);
2663     __ aesimc(v0, v0);
2664     __ aesd(v0, v2);
2665     __ aesimc(v0, v0);
2666 
2667     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2668     __ rev32(v1, __ T16B, v1);
2669     __ rev32(v2, __ T16B, v2);
2670 
2671     __ cmpw(keylen, 52);
2672     __ br(Assembler::EQ, L_doLast);
2673 
2674     __ aesd(v0, v1);
2675     __ aesimc(v0, v0);
2676     __ aesd(v0, v2);
2677     __ aesimc(v0, v0);
2678 
2679     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2680     __ rev32(v1, __ T16B, v1);
2681     __ rev32(v2, __ T16B, v2);
2682 
2683     __ BIND(L_doLast);
2684 
2685     __ aesd(v0, v1);
2686     __ aesimc(v0, v0);
2687     __ aesd(v0, v2);
2688 
2689     __ eor(v0, __ T16B, v0, v5);
2690 
2691     __ st1(v0, __ T16B, to);
2692 
2693     __ mov(r0, 0);
2694 
2695     __ leave();
2696     __ ret(lr);
2697 
2698     return start;
2699   }
2700 
2701   // Arguments:
2702   //
2703   // Inputs:
2704   //   c_rarg0   - source byte array address
2705   //   c_rarg1   - destination byte array address
2706   //   c_rarg2   - K (key) in little endian int array
2707   //   c_rarg3   - r vector byte array address
2708   //   c_rarg4   - input length
2709   //
2710   // Output:
2711   //   r0        - input length
2712   //
2713   address generate_cipherBlockChaining_encryptAESCrypt() {
2714     assert(UseAES, "need AES instruction support");
2715     __ align(CodeEntryAlignment);
2716     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2717 
2718     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2719 
2720     const Register from        = c_rarg0;  // source array address
2721     const Register to          = c_rarg1;  // destination array address
2722     const Register key         = c_rarg2;  // key array address
2723     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2724                                            // and left with the results of the last encryption block
2725     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2726     const Register keylen      = rscratch1;
2727 
2728     address start = __ pc();
2729 
2730       __ enter();
2731 
2732       __ subsw(rscratch2, len_reg, zr);
2733       __ br(Assembler::LE, _L_finish);
2734 
2735       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2736 
2737       __ ld1(v0, __ T16B, rvec);
2738 
2739       __ cmpw(keylen, 52);
2740       __ br(Assembler::CC, L_loadkeys_44);
2741       __ br(Assembler::EQ, L_loadkeys_52);
2742 
2743       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2744       __ rev32(v17, __ T16B, v17);
2745       __ rev32(v18, __ T16B, v18);
2746     __ BIND(L_loadkeys_52);
2747       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2748       __ rev32(v19, __ T16B, v19);
2749       __ rev32(v20, __ T16B, v20);
2750     __ BIND(L_loadkeys_44);
2751       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2752       __ rev32(v21, __ T16B, v21);
2753       __ rev32(v22, __ T16B, v22);
2754       __ rev32(v23, __ T16B, v23);
2755       __ rev32(v24, __ T16B, v24);
2756       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2757       __ rev32(v25, __ T16B, v25);
2758       __ rev32(v26, __ T16B, v26);
2759       __ rev32(v27, __ T16B, v27);
2760       __ rev32(v28, __ T16B, v28);
2761       __ ld1(v29, v30, v31, __ T16B, key);
2762       __ rev32(v29, __ T16B, v29);
2763       __ rev32(v30, __ T16B, v30);
2764       __ rev32(v31, __ T16B, v31);
2765 
2766     __ BIND(L_aes_loop);
2767       __ ld1(v1, __ T16B, __ post(from, 16));
2768       __ eor(v0, __ T16B, v0, v1);
2769 
2770       __ br(Assembler::CC, L_rounds_44);
2771       __ br(Assembler::EQ, L_rounds_52);
2772 
2773       __ aese(v0, v17); __ aesmc(v0, v0);
2774       __ aese(v0, v18); __ aesmc(v0, v0);
2775     __ BIND(L_rounds_52);
2776       __ aese(v0, v19); __ aesmc(v0, v0);
2777       __ aese(v0, v20); __ aesmc(v0, v0);
2778     __ BIND(L_rounds_44);
2779       __ aese(v0, v21); __ aesmc(v0, v0);
2780       __ aese(v0, v22); __ aesmc(v0, v0);
2781       __ aese(v0, v23); __ aesmc(v0, v0);
2782       __ aese(v0, v24); __ aesmc(v0, v0);
2783       __ aese(v0, v25); __ aesmc(v0, v0);
2784       __ aese(v0, v26); __ aesmc(v0, v0);
2785       __ aese(v0, v27); __ aesmc(v0, v0);
2786       __ aese(v0, v28); __ aesmc(v0, v0);
2787       __ aese(v0, v29); __ aesmc(v0, v0);
2788       __ aese(v0, v30);
2789       __ eor(v0, __ T16B, v0, v31);
2790 
2791       __ st1(v0, __ T16B, __ post(to, 16));
2792 
2793       __ subw(len_reg, len_reg, 16);
2794       __ cbnzw(len_reg, L_aes_loop);
2795 
2796       __ st1(v0, __ T16B, rvec);
2797 
2798     __ BIND(_L_finish);
2799       __ mov(r0, rscratch2);
2800 
2801       __ leave();
2802       __ ret(lr);
2803 
2804       return start;
2805   }
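
       // In C, approximately (a hedged sketch of the CBC-encrypt loop above;
       // AES_encrypt and xor16 are illustrative names, not functions defined
       // in this file):
       //
       // int cbc_encrypt(byte *from, byte *to, byte *key, byte *rvec, int len) {
       //   byte block[16];
       //   if (len <= 0) return len;
       //   memcpy(block, rvec, 16);              // running IV/previous ciphertext
       //   for (int i = 0; i < len; i += 16) {
       //     xor16(block, from + i);             // chain in the plaintext block
       //     AES_encrypt(block, key);            // 10/12/14 rounds by key length
       //     memcpy(to + i, block, 16);
       //   }
       //   memcpy(rvec, block, 16);              // leave last ciphertext in rvec
       //   return len;
       // }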
2806 
2807   // Arguments:
2808   //
2809   // Inputs:
2810   //   c_rarg0   - source byte array address
2811   //   c_rarg1   - destination byte array address
2812   //   c_rarg2   - K (key) in little endian int array
2813   //   c_rarg3   - r vector byte array address
2814   //   c_rarg4   - input length
2815   //
2816   // Output:
2817   //   r0       - input length
2818   //
2819   address generate_cipherBlockChaining_decryptAESCrypt() {
2820     assert(UseAES, "need AES instructions");
2821     __ align(CodeEntryAlignment);
2822     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2823 
2824     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;
2825 
2826     const Register from        = c_rarg0;  // source array address
2827     const Register to          = c_rarg1;  // destination array address
2828     const Register key         = c_rarg2;  // key array address
2829     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector, and
2830                                            // left holding the last ciphertext block
2831     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2832     const Register keylen      = rscratch1;
2833 
2834     address start = __ pc();
2835 
2836       __ enter();
2837 
2838       __ subsw(rscratch2, len_reg, zr);
2839       __ br(Assembler::LE, _L_finish);
2840 
2841       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2842 
2843       __ ld1(v2, __ T16B, rvec);
2844 
2845       __ ld1(v31, __ T16B, __ post(key, 16));
2846       __ rev32(v31, __ T16B, v31);
2847 
2848       __ cmpw(keylen, 52);
2849       __ br(Assembler::CC, L_loadkeys_44);
2850       __ br(Assembler::EQ, L_loadkeys_52);
2851 
2852       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2853       __ rev32(v17, __ T16B, v17);
2854       __ rev32(v18, __ T16B, v18);
2855     __ BIND(L_loadkeys_52);
2856       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2857       __ rev32(v19, __ T16B, v19);
2858       __ rev32(v20, __ T16B, v20);
2859     __ BIND(L_loadkeys_44);
2860       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2861       __ rev32(v21, __ T16B, v21);
2862       __ rev32(v22, __ T16B, v22);
2863       __ rev32(v23, __ T16B, v23);
2864       __ rev32(v24, __ T16B, v24);
2865       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2866       __ rev32(v25, __ T16B, v25);
2867       __ rev32(v26, __ T16B, v26);
2868       __ rev32(v27, __ T16B, v27);
2869       __ rev32(v28, __ T16B, v28);
2870       __ ld1(v29, v30, __ T16B, key);
2871       __ rev32(v29, __ T16B, v29);
2872       __ rev32(v30, __ T16B, v30);
2873 
2874     __ BIND(L_aes_loop);
2875       __ ld1(v0, __ T16B, __ post(from, 16));
2876       __ orr(v1, __ T16B, v0, v0);
2877 
2878       __ br(Assembler::CC, L_rounds_44);
2879       __ br(Assembler::EQ, L_rounds_52);
2880 
2881       __ aesd(v0, v17); __ aesimc(v0, v0);
2882       __ aesd(v0, v18); __ aesimc(v0, v0);
2883     __ BIND(L_rounds_52);
2884       __ aesd(v0, v19); __ aesimc(v0, v0);
2885       __ aesd(v0, v20); __ aesimc(v0, v0);
2886     __ BIND(L_rounds_44);
2887       __ aesd(v0, v21); __ aesimc(v0, v0);
2888       __ aesd(v0, v22); __ aesimc(v0, v0);
2889       __ aesd(v0, v23); __ aesimc(v0, v0);
2890       __ aesd(v0, v24); __ aesimc(v0, v0);
2891       __ aesd(v0, v25); __ aesimc(v0, v0);
2892       __ aesd(v0, v26); __ aesimc(v0, v0);
2893       __ aesd(v0, v27); __ aesimc(v0, v0);
2894       __ aesd(v0, v28); __ aesimc(v0, v0);
2895       __ aesd(v0, v29); __ aesimc(v0, v0);
2896       __ aesd(v0, v30);
2897       __ eor(v0, __ T16B, v0, v31);
2898       __ eor(v0, __ T16B, v0, v2);
2899 
2900       __ st1(v0, __ T16B, __ post(to, 16));
2901       __ orr(v2, __ T16B, v1, v1);
2902 
2903       __ subw(len_reg, len_reg, 16);
2904       __ cbnzw(len_reg, L_aes_loop);
2905 
2906       __ st1(v2, __ T16B, rvec);
2907 
2908     __ BIND(_L_finish);
2909       __ mov(r0, rscratch2);
2910 
2911       __ leave();
2912       __ ret(lr);
2913 
2914     return start;
2915   }
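
       // In C, approximately (a hedged sketch of the CBC-decrypt loop above;
       // AES_decrypt and xor16 are illustrative names only):
       //
       // int cbc_decrypt(byte *from, byte *to, byte *key, byte *rvec, int len) {
       //   byte prev[16], cur[16], tmp[16];
       //   if (len <= 0) return len;
       //   memcpy(prev, rvec, 16);               // running IV/previous ciphertext
       //   for (int i = 0; i < len; i += 16) {
       //     memcpy(cur, from + i, 16);          // keep ciphertext for chaining
       //     memcpy(tmp, cur, 16);
       //     AES_decrypt(tmp, key);
       //     xor16(tmp, prev);                   // undo the chaining
       //     memcpy(to + i, tmp, 16);
       //     memcpy(prev, cur, 16);
       //   }
       //   memcpy(rvec, prev, 16);               // leave last ciphertext in rvec
       //   return len;
       // }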
2916 
2917   // Arguments:
2918   //
2919   // Inputs:
2920   //   c_rarg0   - byte[]  source+offset
2921   //   c_rarg1   - int[]   SHA.state
2922   //   c_rarg2   - int     offset
2923   //   c_rarg3   - int     limit
2924   //
2925   address generate_sha1_implCompress(bool multi_block, const char *name) {
2926     __ align(CodeEntryAlignment);
2927     StubCodeMark mark(this, "StubRoutines", name);
2928     address start = __ pc();
2929 
2930     Register buf   = c_rarg0;
2931     Register state = c_rarg1;
2932     Register ofs   = c_rarg2;
2933     Register limit = c_rarg3;
2934 
2935     Label keys;
2936     Label sha1_loop;
2937 
2938     // load the keys into v0..v3
2939     __ adr(rscratch1, keys);
2940     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2941     // load 5 words state into v6, v7
2942     __ ldrq(v6, Address(state, 0));
2943     __ ldrs(v7, Address(state, 16));
2944 
2945 
2946     __ BIND(sha1_loop);
2947     // load 64 bytes of data into v16..v19
2948     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2949     __ rev32(v16, __ T16B, v16);
2950     __ rev32(v17, __ T16B, v17);
2951     __ rev32(v18, __ T16B, v18);
2952     __ rev32(v19, __ T16B, v19);
2953 
2954     // do the sha1
2955     __ addv(v4, __ T4S, v16, v0);
2956     __ orr(v20, __ T16B, v6, v6);
2957 
2958     FloatRegister d0 = v16;
2959     FloatRegister d1 = v17;
2960     FloatRegister d2 = v18;
2961     FloatRegister d3 = v19;
2962 
2963     for (int round = 0; round < 20; round++) {
2964       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2965       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2966       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2967       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2968       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2969 
2970       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2971       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2972       __ sha1h(tmp2, __ T4S, v20);
2973       if (round < 5)
2974         __ sha1c(v20, __ T4S, tmp3, tmp4);
2975       else if (round < 10 || round >= 15)
2976         __ sha1p(v20, __ T4S, tmp3, tmp4);
2977       else
2978         __ sha1m(v20, __ T4S, tmp3, tmp4);
2979       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2980 
2981       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2982     }
2983 
2984     __ addv(v7, __ T2S, v7, v21);
2985     __ addv(v6, __ T4S, v6, v20);
2986 
2987     if (multi_block) {
2988       __ add(ofs, ofs, 64);
2989       __ cmp(ofs, limit);
2990       __ br(Assembler::LE, sha1_loop);
2991       __ mov(c_rarg0, ofs); // return ofs
2992     }
2993 
2994     __ strq(v6, Address(state, 0));
2995     __ strs(v7, Address(state, 16));
2996 
2997     __ ret(lr);
2998 
2999     __ bind(keys);
3000     __ emit_int32(0x5a827999);
3001     __ emit_int32(0x6ed9eba1);
3002     __ emit_int32(0x8f1bbcdc);
3003     __ emit_int32(0xca62c1d6);
3004 
3005     return start;
3006   }
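
       // In C, approximately (a hedged sketch of the multi_block contract the
       // loop above implements; sha1_block_compress is an illustrative name):
       //
       // int sha1_implCompressMB(byte *buf, int state[5], int ofs, int limit) {
       //   do {
       //     sha1_block_compress(state, buf);   // one 64-byte block
       //     buf += 64;
       //     ofs += 64;
       //   } while (ofs <= limit);
       //   return ofs;          // the single-block variant ignores ofs/limit
       // }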
3007 
3008 
3009   // Arguments:
3010   //
3011   // Inputs:
3012   //   c_rarg0   - byte[]  source+offset
3013   //   c_rarg1   - int[]   SHA.state
3014   //   c_rarg2   - int     offset
3015   //   c_rarg3   - int     limit
3016   //
3017   address generate_sha256_implCompress(bool multi_block, const char *name) {
3018     static const uint32_t round_consts[64] = {
3019       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3020       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3021       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3022       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3023       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3024       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3025       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3026       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3027       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3028       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3029       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3030       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3031       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3032       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3033       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3034       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3035     };
3036     __ align(CodeEntryAlignment);
3037     StubCodeMark mark(this, "StubRoutines", name);
3038     address start = __ pc();
3039 
3040     Register buf   = c_rarg0;
3041     Register state = c_rarg1;
3042     Register ofs   = c_rarg2;
3043     Register limit = c_rarg3;
3044 
3045     Label sha256_loop;
3046 
3047     __ stpd(v8, v9, __ pre(sp, -32));
3048     __ stpd(v10, v11, Address(sp, 16));
3049 
3050     // dga == v0
3051     // dgb == v1
3052     // dg0 == v2
3053     // dg1 == v3
3054     // dg2 == v4
3055     // t0 == v6
3056     // t1 == v7
3057 
3058     // load 16 keys to v16..v31
3059     __ lea(rscratch1, ExternalAddress((address)round_consts));
3060     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3061     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3062     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3063     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3064 
3065     // load 8 words (256 bits) state
3066     __ ldpq(v0, v1, state);
3067 
3068     __ BIND(sha256_loop);
3069     // load 64 bytes of data into v8..v11
3070     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3071     __ rev32(v8, __ T16B, v8);
3072     __ rev32(v9, __ T16B, v9);
3073     __ rev32(v10, __ T16B, v10);
3074     __ rev32(v11, __ T16B, v11);
3075 
3076     __ addv(v6, __ T4S, v8, v16);
3077     __ orr(v2, __ T16B, v0, v0);
3078     __ orr(v3, __ T16B, v1, v1);
3079 
3080     FloatRegister d0 = v8;
3081     FloatRegister d1 = v9;
3082     FloatRegister d2 = v10;
3083     FloatRegister d3 = v11;
3084 
3085 
3086     for (int round = 0; round < 16; round++) {
3087       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3088       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3089       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3090       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3091 
3092       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3093        __ orr(v4, __ T16B, v2, v2);
3094       if (round < 15)
3095         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3096       __ sha256h(v2, __ T4S, v3, tmp2);
3097       __ sha256h2(v3, __ T4S, v4, tmp2);
3098       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3099 
3100       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3101     }
3102 
3103     __ addv(v0, __ T4S, v0, v2);
3104     __ addv(v1, __ T4S, v1, v3);
3105 
3106     if (multi_block) {
3107       __ add(ofs, ofs, 64);
3108       __ cmp(ofs, limit);
3109       __ br(Assembler::LE, sha256_loop);
3110       __ mov(c_rarg0, ofs); // return ofs
3111     }
3112 
3113     __ ldpd(v10, v11, Address(sp, 16));
3114     __ ldpd(v8, v9, __ post(sp, 32));
3115 
3116     __ stpq(v0, v1, state);
3117 
3118     __ ret(lr);
3119 
3120     return start;
3121   }
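
       // In C, approximately (a hedged sketch of the multi_block contract;
       // sha256_block_compress is an illustrative name):
       //
       // int sha256_implCompressMB(byte *buf, int state[8], int ofs, int limit) {
       //   do {
       //     sha256_block_compress(state, buf);   // one 64-byte block
       //     buf += 64;
       //     ofs += 64;
       //   } while (ofs <= limit);
       //   return ofs;          // the single-block variant ignores ofs/limit
       // }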
3122 
3123   // Safefetch stubs.
3124   void generate_safefetch(const char* name, int size, address* entry,
3125                           address* fault_pc, address* continuation_pc) {
3126     // safefetch signatures:
3127     //   int      SafeFetch32(int*      adr, int      errValue);
3128     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3129     //
3130     // arguments:
3131     //   c_rarg0 = adr
3132     //   c_rarg1 = errValue
3133     //
3134     // result:
3135     //   r0       = *adr or errValue
3136 
3137     StubCodeMark mark(this, "StubRoutines", name);
3138 
3139     // Entry point, pc or function descriptor.
3140     *entry = __ pc();
3141 
3142     // Load *adr into c_rarg1, may fault.
3143     *fault_pc = __ pc();
3144     switch (size) {
3145       case 4:
3146         // int32_t
3147         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3148         break;
3149       case 8:
3150         // int64_t
3151         __ ldr(c_rarg1, Address(c_rarg0, 0));
3152         break;
3153       default:
3154         ShouldNotReachHere();
3155     }
3156 
3157     // return errValue or *adr
3158     *continuation_pc = __ pc();
3159     __ mov(r0, c_rarg1);
3160     __ ret(lr);
3161   }
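
       // A hedged usage sketch (from C++ runtime code; -1 here is just the
       // caller-chosen error value):
       //
       //   int v = SafeFetch32((int*) maybe_bad_ptr, -1);
       //   // v is *maybe_bad_ptr, or -1 if the load faulted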
3162 
3163   /**
3164    *  Arguments:
3165    *
3166    * Inputs:
3167    *   c_rarg0   - int crc
3168    *   c_rarg1   - byte* buf
3169    *   c_rarg2   - int length
3170    *
3171    * Output:
3172    *       r0   - int crc result
3173    *
3174    * Preserves:
3175    *       r13
3176    *
3177    */
3178   address generate_updateBytesCRC32() {
3179     assert(UseCRC32Intrinsics, "what are we doing here?");
3180 
3181     __ align(CodeEntryAlignment);
3182     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3183 
3184     address start = __ pc();
3185 
3186     const Register crc   = c_rarg0;  // crc
3187     const Register buf   = c_rarg1;  // source java byte array address
3188     const Register len   = c_rarg2;  // length
3189     const Register table0 = c_rarg3; // crc_table address
3190     const Register table1 = c_rarg4;
3191     const Register table2 = c_rarg5;
3192     const Register table3 = c_rarg6;
3193     const Register tmp3 = c_rarg7;
3194 
3195     BLOCK_COMMENT("Entry:");
3196     __ enter(); // required for proper stackwalking of RuntimeStub frame
3197 
3198     __ kernel_crc32(crc, buf, len,
3199               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3200 
3201     __ leave(); // required for proper stackwalking of RuntimeStub frame
3202     __ ret(lr);
3203 
3204     return start;
3205   }
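
       // In C, approximately, the update this stub accelerates (the standard
       // table-driven CRC-32; kernel_crc32 above is a multi-table variant of
       // the same computation):
       //
       // int updateBytesCRC32(int crc, byte *buf, int len) {
       //   crc = ~crc;
       //   while (len-- > 0)
       //     crc = crc_table[(crc ^ *buf++) & 0xff] ^ ((unsigned)crc >> 8);
       //   return ~crc;
       // }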
3206 
3207   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3208                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3209                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3210     // Karatsuba multiplication performs a 128*128 -> 256-bit
3211     // multiplication in three 128-bit multiplications and a few
3212     // additions.
3213     //
3214     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3215     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3216     //
3217     // Inputs:
3218     //
3219     // A0 in a.d[0]     (subkey)
3220     // A1 in a.d[1]
3221     // (A1+A0) in a1_xor_a0.d[0]
3222     //
3223     // B0 in b.d[0]     (state)
3224     // B1 in b.d[1]
3225 
3226     __ ext(tmp1, __ T16B, b, b, 0x08);
3227     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3228     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3229     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3230     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3231 
3232     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3233     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3234     __ eor(tmp2, __ T16B, tmp2, tmp4);
3235     __ eor(tmp2, __ T16B, tmp2, tmp3);
3236 
3237     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3238     __ ins(result_hi, __ D, tmp2, 0, 1);
3239     __ ins(result_lo, __ D, tmp2, 1, 0);
3240   }
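
       // In pseudo-C, approximately (restating the formula above; '^' is
       // addition in GF(2)[z] and pmull64 is an illustrative name for the
       // 64x64->128-bit carry-less multiply done by PMULL):
       //
       //   C1:C0 = pmull64(A1, B1);
       //   D1:D0 = pmull64(A0, B0);
       //   E1:E0 = pmull64(A1 ^ A0, B1 ^ B0);
       //   result_hi:result_lo = C1 : (C0 ^ C1 ^ D1 ^ E1)
       //                            : (D1 ^ C0 ^ D0 ^ E0) : D0;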
3241 
3242   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3243                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3244     const FloatRegister t0 = result;
3245 
3246     // The GCM field polynomial f is z^128 + p(z), where p =
3247     // z^7+z^2+z+1.
3248     //
3249     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3250     //
3251     // so, given that the product we're reducing is
3252     //    a == lo + hi * z^128
3253     // substituting,
3254     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3255     //
3256     // we reduce by multiplying hi by p(z) and subtracting the result
3257     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3258     // bits we can do this with two 64-bit multiplications, lo*p and
3259     // hi*p.
3260 
3261     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3262     __ ext(t1, __ T16B, t0, z, 8);
3263     __ eor(hi, __ T16B, hi, t1);
3264     __ ext(t1, __ T16B, z, t0, 8);
3265     __ eor(lo, __ T16B, lo, t1);
3266     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3267     __ eor(result, __ T16B, lo, t0);
3268   }
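
       // In pseudo-C, approximately (hi1:hi0 and lo1:lo0 are the 64-bit
       // halves; all arithmetic is carry-less and '^' is GF(2) addition):
       //
       //   t   = pmull64(hi1, p);      // fold the top 64 bits first, since
       //   hi0 ^= t >> 64;             //   hi*p(z) can be wider than 128 bits
       //   lo  ^= t << 64;
       //   result = lo ^ pmull64(hi0, p);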
3269 
3270   /**
3271    *  Arguments:
3272    *
3273    *  Input:
3274    *    c_rarg0   - x address
3275    *    c_rarg1   - x length
3276    *    c_rarg2   - y address
3277    *    c_rarg3   - y length
3278    *    c_rarg4   - z address
3279    *    c_rarg5   - z length
3280    */
3281   address generate_multiplyToLen() {
3282     __ align(CodeEntryAlignment);
3283     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3284 
3285     address start = __ pc();
3286     const Register x     = r0;
3287     const Register xlen  = r1;
3288     const Register y     = r2;
3289     const Register ylen  = r3;
3290     const Register z     = r4;
3291     const Register zlen  = r5;
3292 
3293     const Register tmp1  = r10;
3294     const Register tmp2  = r11;
3295     const Register tmp3  = r12;
3296     const Register tmp4  = r13;
3297     const Register tmp5  = r14;
3298     const Register tmp6  = r15;
3299     const Register tmp7  = r16;
3300 
3301     BLOCK_COMMENT("Entry:");
3302     __ enter(); // required for proper stackwalking of RuntimeStub frame
3303     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3304     __ leave(); // required for proper stackwalking of RuntimeStub frame
3305     __ ret(lr);
3306 
3307     return start;
3308   }
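
       // In C, approximately, the function this stub intrinsifies
       // (java.math.BigInteger.multiplyToLen; x, y and z are big-endian
       // arrays of 32-bit limbs and zlen == xlen + ylen):
       //
       // void multiplyToLen(int x[], int xlen, int y[], int ylen, int z[], int zlen) {
       //   for (int i = 0; i < zlen; i++) z[i] = 0;
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     unsigned long carry = 0;
       //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
       //       unsigned long p = (x[i] & 0xffffffffUL) * (y[j] & 0xffffffffUL)
       //                       + (z[k] & 0xffffffffUL) + carry;
       //       z[k]  = (int) p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (int) carry;
       //   }
       // }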
3309 
3310   /**
3311    *  Arguments:
3312    *
3313    *  Input:
3314    *  c_rarg0   - current state address
3315    *  c_rarg1   - H key address
3316    *  c_rarg2   - data address
3317    *  c_rarg3   - number of blocks
3318    *
3319    *  Output:
3320    *  Updated state at c_rarg0
3321    */
3322   address generate_ghash_processBlocks() {
3323     // Bafflingly, GCM uses little-endian for the byte order, but
3324     // big-endian for the bit order.  For example, the polynomial 1 is
3325     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3326     //
3327     // So, we must either reverse the bytes in each word and do
3328     // everything big-endian or reverse the bits in each byte and do
3329     // it little-endian.  On AArch64 it's more idiomatic to reverse
3330     // the bits in each byte (we have an instruction, RBIT, to do
3331     // that) and keep the data in little-endian bit order throughout the
3332     // calculation, bit-reversing the inputs and outputs.
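
         // Per 16-byte block, the loop below computes, approximately,
         //
         //   state <- (state ^ data) * H   in GF(2^128) modulo
         //            z^128 + z^7 + z^2 + z + 1
         //
         // with the multiplication done by ghash_multiply()/ghash_reduce()
         // on the bit-reflected representation described above.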
3333 
3334     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3335     __ align(wordSize * 2);
3336     address p = __ pc();
3337     __ emit_int64(0x87);  // The low-order bits of the field
3338                           // polynomial (i.e. p = z^7+z^2+z+1)
3339                           // repeated in the low and high parts of a
3340                           // 128-bit vector
3341     __ emit_int64(0x87);
3342 
3343     __ align(CodeEntryAlignment);
3344     address start = __ pc();
3345 
3346     Register state   = c_rarg0;
3347     Register subkeyH = c_rarg1;
3348     Register data    = c_rarg2;
3349     Register blocks  = c_rarg3;
3350 
3351     FloatRegister vzr = v30;
3352     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3353 
3354     __ ldrq(v0, Address(state));
3355     __ ldrq(v1, Address(subkeyH));
3356 
3357     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3358     __ rbit(v0, __ T16B, v0);
3359     __ rev64(v1, __ T16B, v1);
3360     __ rbit(v1, __ T16B, v1);
3361 
3362     __ ldrq(v26, p);
3363 
3364     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3365     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3366 
3367     {
3368       Label L_ghash_loop;
3369       __ bind(L_ghash_loop);
3370 
3371       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3372                                                  // reversing each byte
3373       __ rbit(v2, __ T16B, v2);
3374       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3375 
3376       // Multiply state in v2 by subkey in v1
3377       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3378                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3379                      /*temps*/v6, v20, v18, v21);
3380       // Reduce v7:v5 by the field polynomial
3381       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3382 
3383       __ sub(blocks, blocks, 1);
3384       __ cbnz(blocks, L_ghash_loop);
3385     }
3386 
3387     // The bit-reversed result is at this point in v0
3388     __ rev64(v1, __ T16B, v0);
3389     __ rbit(v1, __ T16B, v1);
3390 
3391     __ st1(v1, __ T16B, state);
3392     __ ret(lr);
3393 
3394     return start;
3395   }
3396 
3397   // Continuation point for throwing of implicit exceptions that are
3398   // not handled in the current activation. Fabricates an exception
3399   // oop and initiates normal exception dispatching in this
3400   // frame. Since we need to preserve callee-saved values (currently
3401   // only for C2, but done for C1 as well) we need a callee-saved oop
3402   // map and therefore have to make these stubs into RuntimeStubs
3403   // rather than BufferBlobs.  If the compiler needs all registers to
3404   // be preserved between the fault point and the exception handler
3405   // then it must assume responsibility for that in
3406   // AbstractCompiler::continuation_for_implicit_null_exception or
3407   // continuation_for_implicit_division_by_zero_exception. All other
3408   // implicit exceptions (e.g., NullPointerException or
3409   // AbstractMethodError on entry) are either at call sites or
3410   // otherwise assume that stack unwinding will be initiated, so
3411   // caller saved registers were assumed volatile in the compiler.
3412 
3413 #undef __
3414 #define __ masm->
3415 
3416   address generate_throw_exception(const char* name,
3417                                    address runtime_entry,
3418                                    Register arg1 = noreg,
3419                                    Register arg2 = noreg) {
3420     // Information about frame layout at time of blocking runtime call.
3421     // Note that we only have to preserve callee-saved registers since
3422     // the compilers are responsible for supplying a continuation point
3423     // if they expect all registers to be preserved.
3424     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3425     enum layout {
3426       rfp_off = 0,
3427       rfp_off2,
3428       return_off,
3429       return_off2,
3430       framesize // inclusive of return address
3431     };
3432 
3433     int insts_size = 512;
3434     int locs_size  = 64;
3435 
3436     CodeBuffer code(name, insts_size, locs_size);
3437     OopMapSet* oop_maps  = new OopMapSet();
3438     MacroAssembler* masm = new MacroAssembler(&code);
3439 
3440     address start = __ pc();
3441 
3442     // This is an inlined and slightly modified version of call_VM
3443     // which has the ability to fetch the return PC out of
3444     // thread-local storage and also sets up last_Java_sp slightly
3445     // differently than the real call_VM
3446 
3447     __ enter(); // Save FP and LR before call
3448 
3449     assert(is_even(framesize/2), "sp not 16-byte aligned");
3450 
3451     // lr and fp are already in place
3452     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3453 
3454     int frame_complete = __ pc() - start;
3455 
3456     // Set up last_Java_sp and last_Java_fp
3457     address the_pc = __ pc();
3458     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3459 
3460     // Call runtime
3461     if (arg1 != noreg) {
3462       assert(arg2 != c_rarg1, "clobbered");
3463       __ mov(c_rarg1, arg1);
3464     }
3465     if (arg2 != noreg) {
3466       __ mov(c_rarg2, arg2);
3467     }
3468     __ mov(c_rarg0, rthread);
3469     BLOCK_COMMENT("call runtime_entry");
3470     __ mov(rscratch1, runtime_entry);
3471     __ blr(rscratch1);
3472 
3473     // Generate oop map
3474     OopMap* map = new OopMap(framesize, 0);
3475 
3476     oop_maps->add_gc_map(the_pc - start, map);
3477 
3478     __ reset_last_Java_frame(true);
3479     __ maybe_isb();
3480 
3481     __ leave();
3482 
3483     // check for pending exceptions
3484 #ifdef ASSERT
3485     Label L;
3486     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3487     __ cbnz(rscratch1, L);
3488     __ should_not_reach_here();
3489     __ bind(L);
3490 #endif // ASSERT
3491     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3492 
3493 
3494     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3495     RuntimeStub* stub =
3496       RuntimeStub::new_runtime_stub(name,
3497                                     &code,
3498                                     frame_complete,
3499                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3500                                     oop_maps, false);
3501     return stub->entry_point();
3502   }
3503 
3504   class MontgomeryMultiplyGenerator : public MacroAssembler {
3505 
3506     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3507       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3508 
3509     RegSet _toSave;
3510     bool _squaring;
3511 
3512   public:
3513     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3514       : MacroAssembler(as->code()), _squaring(squaring) {
3515 
3516       // Register allocation
3517 
3518       Register reg = c_rarg0;
3519       Pa_base = reg;       // Argument registers
3520       if (squaring)
3521         Pb_base = Pa_base;
3522       else
3523         Pb_base = ++reg;
3524       Pn_base = ++reg;
3525       Rlen= ++reg;
3526       inv = ++reg;
3527       Pm_base = ++reg;
3528 
3529                           // Working registers:
3530       Ra =  ++reg;        // The current digit of a, b, n, and m.
3531       Rb =  ++reg;
3532       Rm =  ++reg;
3533       Rn =  ++reg;
3534 
3535       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3536       Pb =  ++reg;
3537       Pm =  ++reg;
3538       Pn =  ++reg;
3539 
3540       t0 =  ++reg;        // Three registers which form a
3541       t1 =  ++reg;        // triple-precision accumulator.
3542       t2 =  ++reg;
3543 
3544       Ri =  ++reg;        // Inner and outer loop indexes.
3545       Rj =  ++reg;
3546 
3547       Rhi_ab = ++reg;     // Product registers: low and high parts
3548       Rlo_ab = ++reg;     // of a*b and m*n.
3549       Rhi_mn = ++reg;
3550       Rlo_mn = ++reg;
3551 
3552       // r19 and up are callee-saved.
3553       _toSave = RegSet::range(r19, reg) + Pm_base;
3554     }
3555 
3556   private:
3557     void save_regs() {
3558       push(_toSave, sp);
3559     }
3560 
3561     void restore_regs() {
3562       pop(_toSave, sp);
3563     }
3564 
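         // Invoke the given member-function 'block' exactly count times, with
         // the loop body unrolled twice; an odd count enters at the second
         // copy.  In C, approximately:
         //
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //   do { block(); odd: block(); } while ((count -= 2) > 0);
         //   end: ;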
3565     template <typename T>
3566     void unroll_2(Register count, T block) {
3567       Label loop, end, odd;
3568       tbnz(count, 0, odd);
3569       cbz(count, end);
3570       align(16);
3571       bind(loop);
3572       (this->*block)();
3573       bind(odd);
3574       (this->*block)();
3575       subs(count, count, 2);
3576       br(Assembler::GT, loop);
3577       bind(end);
3578     }
3579 
3580     template <typename T>
3581     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3582       Label loop, end, odd;
3583       tbnz(count, 0, odd);
3584       cbz(count, end);
3585       align(16);
3586       bind(loop);
3587       (this->*block)(d, s, tmp);
3588       bind(odd);
3589       (this->*block)(d, s, tmp);
3590       subs(count, count, 2);
3591       br(Assembler::GT, loop);
3592       bind(end);
3593     }
3594 
3595     void pre1(RegisterOrConstant i) {
3596       block_comment("pre1");
3597       // Pa = Pa_base;
3598       // Pb = Pb_base + i;
3599       // Pm = Pm_base;
3600       // Pn = Pn_base + i;
3601       // Ra = *Pa;
3602       // Rb = *Pb;
3603       // Rm = *Pm;
3604       // Rn = *Pn;
3605       ldr(Ra, Address(Pa_base));
3606       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3607       ldr(Rm, Address(Pm_base));
3608       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3609       lea(Pa, Address(Pa_base));
3610       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3611       lea(Pm, Address(Pm_base));
3612       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3613 
3614       // Zero the m*n result.
3615       mov(Rhi_mn, zr);
3616       mov(Rlo_mn, zr);
3617     }
3618 
3619     // The core multiply-accumulate step of a Montgomery
3620     // multiplication.  The idea is to schedule operations as a
3621     // pipeline so that instructions with long latencies (loads and
3622     // multiplies) have time to complete before their results are
3623     // used.  This most benefits in-order implementations of the
3624     // architecture but out-of-order ones also benefit.
3625     void step() {
3626       block_comment("step");
3627       // MACC(Ra, Rb, t0, t1, t2);
3628       // Ra = *++Pa;
3629       // Rb = *--Pb;
3630       umulh(Rhi_ab, Ra, Rb);
3631       mul(Rlo_ab, Ra, Rb);
3632       ldr(Ra, pre(Pa, wordSize));
3633       ldr(Rb, pre(Pb, -wordSize));
3634       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3635                                        // previous iteration.
3636       // MACC(Rm, Rn, t0, t1, t2);
3637       // Rm = *++Pm;
3638       // Rn = *--Pn;
3639       umulh(Rhi_mn, Rm, Rn);
3640       mul(Rlo_mn, Rm, Rn);
3641       ldr(Rm, pre(Pm, wordSize));
3642       ldr(Rn, pre(Pn, -wordSize));
3643       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3644     }
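
         // The MACC(A, B, t0, t1, t2) in the comments here and in the C code
         // further below is, approximately,
         //
         //   unsigned __int128 p = (unsigned __int128) A * B;
         //   add p into the t2:t1:t0 triple-precision accumulator
         //
         // i.e. a 64x64->128-bit multiply-accumulate, implemented above with
         // umulh/mul plus acc().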
3645 
3646     void post1() {
3647       block_comment("post1");
3648 
3649       // MACC(Ra, Rb, t0, t1, t2);
3650       // Ra = *++Pa;
3651       // Rb = *--Pb;
3652       umulh(Rhi_ab, Ra, Rb);
3653       mul(Rlo_ab, Ra, Rb);
3654       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3655       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3656 
3657       // *Pm = Rm = t0 * inv;
3658       mul(Rm, t0, inv);
3659       str(Rm, Address(Pm));
3660 
3661       // MACC(Rm, Rn, t0, t1, t2);
3662       // t0 = t1; t1 = t2; t2 = 0;
3663       umulh(Rhi_mn, Rm, Rn);
3664 
3665 #ifndef PRODUCT
3666       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3667       {
3668         mul(Rlo_mn, Rm, Rn);
3669         add(Rlo_mn, t0, Rlo_mn);
3670         Label ok;
3671         cbz(Rlo_mn, ok); {
3672           stop("broken Montgomery multiply");
3673         } bind(ok);
3674       }
3675 #endif
3676       // We have very carefully set things up so that
3677       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3678       // the lower half of Rm * Rn because we know the result already:
3679       // it must be -t0.  t0 + (-t0) must generate a carry iff
3680       // t0 != 0.  So, rather than do a mul and an adds we just set
3681       // the carry flag iff t0 is nonzero.
3682       //
3683       // mul(Rlo_mn, Rm, Rn);
3684       // adds(zr, t0, Rlo_mn);
3685       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3686       adcs(t0, t1, Rhi_mn);
3687       adc(t1, t2, zr);
3688       mov(t2, zr);
3689     }
3690 
3691     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3692       block_comment("pre2");
3693       // Pa = Pa_base + i-len;
3694       // Pb = Pb_base + len;
3695       // Pm = Pm_base + i-len;
3696       // Pn = Pn_base + len;
3697 
3698       if (i.is_register()) {
3699         sub(Rj, i.as_register(), len);
3700       } else {
3701         mov(Rj, i.as_constant());
3702         sub(Rj, Rj, len);
3703       }
3704       // Rj == i-len
3705 
3706       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3707       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3708       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3709       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3710 
3711       // Ra = *++Pa;
3712       // Rb = *--Pb;
3713       // Rm = *++Pm;
3714       // Rn = *--Pn;
3715       ldr(Ra, pre(Pa, wordSize));
3716       ldr(Rb, pre(Pb, -wordSize));
3717       ldr(Rm, pre(Pm, wordSize));
3718       ldr(Rn, pre(Pn, -wordSize));
3719 
3720       mov(Rhi_mn, zr);
3721       mov(Rlo_mn, zr);
3722     }
3723 
3724     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3725       block_comment("post2");
3726       if (i.is_constant()) {
3727         mov(Rj, i.as_constant()-len.as_constant());
3728       } else {
3729         sub(Rj, i.as_register(), len);
3730       }
3731 
3732       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3733 
3734       // As soon as we know the least significant digit of our result,
3735       // store it.
3736       // Pm_base[i-len] = t0;
3737       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3738 
3739       // t0 = t1; t1 = t2; t2 = 0;
3740       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3741       adc(t1, t2, zr);
3742       mov(t2, zr);
3743     }
3744 
3745     // A carry in t0 after Montgomery multiplication means that we
3746     // should subtract multiples of n from our result in m.  We'll
3747     // keep doing that until there is no carry.
3748     void normalize(RegisterOrConstant len) {
3749       block_comment("normalize");
3750       // while (t0)
3751       //   t0 = sub(Pm_base, Pn_base, t0, len);
3752       Label loop, post, again;
3753       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3754       cbz(t0, post); {
3755         bind(again); {
3756           mov(i, zr);
3757           mov(cnt, len);
3758           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3759           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3760           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3761           align(16);
3762           bind(loop); {
3763             sbcs(Rm, Rm, Rn);
3764             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3765             add(i, i, 1);
3766             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3767             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3768             sub(cnt, cnt, 1);
3769           } cbnz(cnt, loop);
3770           sbc(t0, t0, zr);
3771         } cbnz(t0, again);
3772       } bind(post);
3773     }
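
         // The sub() referred to above is, approximately,
         //
         //   unsigned long sub(unsigned long m[], unsigned long n[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long mi = m[i];
         //       m[i]   = mi - n[i] - borrow;
         //       borrow = (mi < n[i] || (mi == n[i] && borrow)) ? 1 : 0;
         //     }
         //     return t0 - borrow;
         //   }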
3774 
3775     // Move memory at s to d, reversing words.
3776     //    Increments d to end of copied memory
3777     //    Destroys tmp1, tmp2
3778     //    Preserves len
3779     //    Leaves s pointing to the address which was in d at start
3780     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3781       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3782 
3783       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3784       mov(tmp1, len);
3785       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3786       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3787     }
3788     // where
3789     void reverse1(Register d, Register s, Register tmp) {
3790       ldr(tmp, pre(s, -wordSize));
3791       ror(tmp, tmp, 32);
3792       str(tmp, post(d, wordSize));
3793     }
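
         // In C, approximately (s and d are arrays of len 64-bit words; the
         // word order is reversed and each word's 32-bit halves are swapped):
         //
         //   void reverse(unsigned long *d, unsigned long *s, int len) {
         //     for (int i = 0; i < len; i++) {
         //       unsigned long x = s[len - 1 - i];
         //       d[i] = (x << 32) | (x >> 32);
         //     }
         //   }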
3794 
3795     void step_squaring() {
3796       // An extra ACC
3797       step();
3798       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3799     }
3800 
3801     void last_squaring(RegisterOrConstant i) {
3802       Label dont;
3803       // if ((i & 1) == 0) {
3804       tbnz(i.as_register(), 0, dont); {
3805         // MACC(Ra, Rb, t0, t1, t2);
3806         // Ra = *++Pa;
3807         // Rb = *--Pb;
3808         umulh(Rhi_ab, Ra, Rb);
3809         mul(Rlo_ab, Ra, Rb);
3810         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3811       } bind(dont);
3812     }
3813 
3814     void extra_step_squaring() {
3815       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3816 
3817       // MACC(Rm, Rn, t0, t1, t2);
3818       // Rm = *++Pm;
3819       // Rn = *--Pn;
3820       umulh(Rhi_mn, Rm, Rn);
3821       mul(Rlo_mn, Rm, Rn);
3822       ldr(Rm, pre(Pm, wordSize));
3823       ldr(Rn, pre(Pn, -wordSize));
3824     }
3825 
3826     void post1_squaring() {
3827       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3828 
3829       // *Pm = Rm = t0 * inv;
3830       mul(Rm, t0, inv);
3831       str(Rm, Address(Pm));
3832 
3833       // MACC(Rm, Rn, t0, t1, t2);
3834       // t0 = t1; t1 = t2; t2 = 0;
3835       umulh(Rhi_mn, Rm, Rn);
3836 
3837 #ifndef PRODUCT
3838       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3839       {
3840         mul(Rlo_mn, Rm, Rn);
3841         add(Rlo_mn, t0, Rlo_mn);
3842         Label ok;
3843         cbz(Rlo_mn, ok); {
3844           stop("broken Montgomery multiply");
3845         } bind(ok);
3846       }
3847 #endif
3848       // We have very carefully set things up so that
3849       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3850       // the lower half of Rm * Rn because we know the result already:
3851       // it must be -t0.  t0 + (-t0) must generate a carry iff
3852       // t0 != 0.  So, rather than do a mul and an adds we just set
3853       // the carry flag iff t0 is nonzero.
3854       //
3855       // mul(Rlo_mn, Rm, Rn);
3856       // adds(zr, t0, Rlo_mn);
3857       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3858       adcs(t0, t1, Rhi_mn);
3859       adc(t1, t2, zr);
3860       mov(t2, zr);
3861     }
3862 
3863     void acc(Register Rhi, Register Rlo,
3864              Register t0, Register t1, Register t2) {
3865       adds(t0, t0, Rlo);
3866       adcs(t1, t1, Rhi);
3867       adc(t2, t2, zr);
3868     }
3869 
3870   public:
3871     /**
3872      * Fast Montgomery multiplication.  The derivation of the
3873      * algorithm is in A Cryptographic Library for the Motorola
3874      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3875      *
3876      * Arguments:
3877      *
3878      * Inputs for multiplication:
3879      *   c_rarg0   - int array elements a
3880      *   c_rarg1   - int array elements b
3881      *   c_rarg2   - int array elements n (the modulus)
3882      *   c_rarg3   - int length
3883      *   c_rarg4   - int inv
3884      *   c_rarg5   - int array elements m (the result)
3885      *
3886      * Inputs for squaring:
3887      *   c_rarg0   - int array elements a
3888      *   c_rarg1   - int array elements n (the modulus)
3889      *   c_rarg2   - int length
3890      *   c_rarg3   - int inv
3891      *   c_rarg4   - int array elements m (the result)
3892      *
3893      */
3894     address generate_multiply() {
3895       Label argh, nothing;
3896       bind(argh);
3897       stop("MontgomeryMultiply total_allocation must be <= 8192");
3898 
3899       align(CodeEntryAlignment);
3900       address entry = pc();
3901 
3902       cbzw(Rlen, nothing);
3903 
3904       enter();
3905 
3906       // Make room.
3907       cmpw(Rlen, 512);
3908       br(Assembler::HI, argh);
3909       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3910       andr(sp, Ra, -2 * wordSize);
3911 
3912       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3913 
3914       {
3915         // Copy input args, reversing as we go.  We use Ra as a
3916         // temporary variable.
3917         reverse(Ra, Pa_base, Rlen, t0, t1);
3918         if (!_squaring)
3919           reverse(Ra, Pb_base, Rlen, t0, t1);
3920         reverse(Ra, Pn_base, Rlen, t0, t1);
3921       }
3922 
3923       // Push all call-saved registers and also Pm_base which we'll need
3924       // at the end.
3925       save_regs();
3926 
3927 #ifndef PRODUCT
3928       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3929       {
3930         ldr(Rn, Address(Pn_base, 0));
3931         mul(Rlo_mn, Rn, inv);
3932         cmp(Rlo_mn, -1);
3933         Label ok;
3934         br(EQ, ok); {
3935           stop("broken inverse in Montgomery multiply");
3936         } bind(ok);
3937       }
3938 #endif
3939 
3940       mov(Pm_base, Ra);
3941 
3942       mov(t0, zr);
3943       mov(t1, zr);
3944       mov(t2, zr);
3945 
3946       block_comment("for (int i = 0; i < len; i++) {");
3947       mov(Ri, zr); {
3948         Label loop, end;
3949         cmpw(Ri, Rlen);
3950         br(Assembler::GE, end);
3951 
3952         bind(loop);
3953         pre1(Ri);
3954 
3955         block_comment("  for (j = i; j; j--) {"); {
3956           movw(Rj, Ri);
3957           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3958         } block_comment("  } // j");
3959 
3960         post1();
3961         addw(Ri, Ri, 1);
3962         cmpw(Ri, Rlen);
3963         br(Assembler::LT, loop);
3964         bind(end);
3965         block_comment("} // i");
3966       }
3967 
3968       block_comment("for (int i = len; i < 2*len; i++) {");
3969       mov(Ri, Rlen); {
3970         Label loop, end;
3971         cmpw(Ri, Rlen, Assembler::LSL, 1);
3972         br(Assembler::GE, end);
3973 
3974         bind(loop);
3975         pre2(Ri, Rlen);
3976 
3977         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3978           lslw(Rj, Rlen, 1);
3979           subw(Rj, Rj, Ri);
3980           subw(Rj, Rj, 1);
3981           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3982         } block_comment("  } // j");
3983 
3984         post2(Ri, Rlen);
3985         addw(Ri, Ri, 1);
3986         cmpw(Ri, Rlen, Assembler::LSL, 1);
3987         br(Assembler::LT, loop);
3988         bind(end);
3989       }
3990       block_comment("} // i");
3991 
3992       normalize(Rlen);
3993 
3994       mov(Ra, Pm_base);  // Save Pm_base in Ra
3995       restore_regs();  // Restore caller's Pm_base
3996 
3997       // Copy our result into caller's Pm_base
3998       reverse(Pm_base, Ra, Rlen, t0, t1);
3999 
4000       leave();
4001       bind(nothing);
4002       ret(lr);
4003 
4004       return entry;
4005     }
4006     // In C, approximately:
4007 
4008     // void
4009     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4010     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4011     //                     unsigned long inv, int len) {
4012     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4013     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4014     //   unsigned long Ra, Rb, Rn, Rm;
4015 
4016     //   int i;
4017 
4018     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4019 
4020     //   for (i = 0; i < len; i++) {
4021     //     int j;
4022 
4023     //     Pa = Pa_base;
4024     //     Pb = Pb_base + i;
4025     //     Pm = Pm_base;
4026     //     Pn = Pn_base + i;
4027 
4028     //     Ra = *Pa;
4029     //     Rb = *Pb;
4030     //     Rm = *Pm;
4031     //     Rn = *Pn;
4032 
4033     //     int iters = i;
4034     //     for (j = 0; iters--; j++) {
4035     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4036     //       MACC(Ra, Rb, t0, t1, t2);
4037     //       Ra = *++Pa;
4038     //       Rb = *--Pb;
4039     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4040     //       MACC(Rm, Rn, t0, t1, t2);
4041     //       Rm = *++Pm;
4042     //       Rn = *--Pn;
4043     //     }
4044 
4045     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4046     //     MACC(Ra, Rb, t0, t1, t2);
4047     //     *Pm = Rm = t0 * inv;
4048     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4049     //     MACC(Rm, Rn, t0, t1, t2);
4050 
4051     //     assert(t0 == 0, "broken Montgomery multiply");
4052 
4053     //     t0 = t1; t1 = t2; t2 = 0;
4054     //   }
4055 
4056     //   for (i = len; i < 2*len; i++) {
4057     //     int j;
4058 
4059     //     Pa = Pa_base + i-len;
4060     //     Pb = Pb_base + len;
4061     //     Pm = Pm_base + i-len;
4062     //     Pn = Pn_base + len;
4063 
4064     //     Ra = *++Pa;
4065     //     Rb = *--Pb;
4066     //     Rm = *++Pm;
4067     //     Rn = *--Pn;
4068 
4069     //     int iters = len*2-i-1;
4070     //     for (j = i-len+1; iters--; j++) {
4071     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4072     //       MACC(Ra, Rb, t0, t1, t2);
4073     //       Ra = *++Pa;
4074     //       Rb = *--Pb;
4075     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4076     //       MACC(Rm, Rn, t0, t1, t2);
4077     //       Rm = *++Pm;
4078     //       Rn = *--Pn;
4079     //     }
4080 
4081     //     Pm_base[i-len] = t0;
4082     //     t0 = t1; t1 = t2; t2 = 0;
4083     //   }
4084 
4085     //   while (t0)
4086     //     t0 = sub(Pm_base, Pn_base, t0, len);
4087     // }
4088 
4089     /**
4090      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4091      * multiplies than Montgomery multiplication so it should be up to
4092      * 25% faster.  However, its loop control is more complex and it
4093      * may actually run slower on some machines.
4094      *
4095      * Arguments:
4096      *
4097      * Inputs:
4098      *   c_rarg0   - int array elements a
4099      *   c_rarg1   - int array elements n (the modulus)
4100      *   c_rarg2   - int length
4101      *   c_rarg3   - int inv
4102      *   c_rarg4   - int array elements m (the result)
4103      *
4104      */
4105     address generate_square() {
4106       Label argh;
4107       bind(argh);
4108       stop("MontgomeryMultiply total_allocation must be <= 8192");
4109 
4110       align(CodeEntryAlignment);
4111       address entry = pc();
4112 
4113       enter();
4114 
4115       // Make room.
4116       cmpw(Rlen, 512);
4117       br(Assembler::HI, argh);
4118       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4119       andr(sp, Ra, -2 * wordSize);
4120 
4121       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4122 
4123       {
4124         // Copy input args, reversing as we go.  We use Ra as a
4125         // temporary variable.
4126         reverse(Ra, Pa_base, Rlen, t0, t1);
4127         reverse(Ra, Pn_base, Rlen, t0, t1);
4128       }
4129 
4130       // Push all call-saved registers and also Pm_base which we'll need
4131       // at the end.
4132       save_regs();
4133 
4134       mov(Pm_base, Ra);
4135 
4136       mov(t0, zr);
4137       mov(t1, zr);
4138       mov(t2, zr);
4139 
4140       block_comment("for (int i = 0; i < len; i++) {");
4141       mov(Ri, zr); {
4142         Label loop, end;
4143         bind(loop);
4144         cmp(Ri, Rlen);
4145         br(Assembler::GE, end);
4146 
4147         pre1(Ri);
4148 
4149         block_comment("for (j = (i+1)/2; j; j--) {"); {
4150           add(Rj, Ri, 1);
4151           lsr(Rj, Rj, 1);
4152           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4153         } block_comment("  } // j");
4154 
4155         last_squaring(Ri);
4156 
4157         block_comment("  for (j = i/2; j; j--) {"); {
4158           lsr(Rj, Ri, 1);
4159           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4160         } block_comment("  } // j");
4161 
4162         post1_squaring();
4163         add(Ri, Ri, 1);
4164         cmp(Ri, Rlen);
4165         br(Assembler::LT, loop);
4166 
4167         bind(end);
4168         block_comment("} // i");
4169       }
4170 
4171       block_comment("for (int i = len; i < 2*len; i++) {");
4172       mov(Ri, Rlen); {
4173         Label loop, end;
4174         bind(loop);
4175         cmp(Ri, Rlen, Assembler::LSL, 1);
4176         br(Assembler::GE, end);
4177 
4178         pre2(Ri, Rlen);
4179 
4180         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4181           lsl(Rj, Rlen, 1);
4182           sub(Rj, Rj, Ri);
4183           sub(Rj, Rj, 1);
4184           lsr(Rj, Rj, 1);
4185           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4186         } block_comment("  } // j");
4187 
4188         last_squaring(Ri);
4189 
4190         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4191           lsl(Rj, Rlen, 1);
4192           sub(Rj, Rj, Ri);
4193           lsr(Rj, Rj, 1);
4194           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4195         } block_comment("  } // j");
4196 
4197         post2(Ri, Rlen);
4198         add(Ri, Ri, 1);
4199         cmp(Ri, Rlen, Assembler::LSL, 1);
4200 
4201         br(Assembler::LT, loop);
4202         bind(end);
4203         block_comment("} // i");
4204       }
4205 
4206       normalize(Rlen);
4207 
4208       mov(Ra, Pm_base);  // Save Pm_base in Ra
4209       restore_regs();  // Restore caller's Pm_base
4210 
4211       // Copy our result into caller's Pm_base
4212       reverse(Pm_base, Ra, Rlen, t0, t1);
4213 
4214       leave();
4215       ret(lr);
4216 
4217       return entry;
4218     }
4219     // In C, approximately:
4220 
4221     // void
4222     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4223     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4224     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4225     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4226     //   unsigned long Ra, Rb, Rn, Rm;
4227 
4228     //   int i;
4229 
4230     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4231 
4232     //   for (i = 0; i < len; i++) {
4233     //     int j;
4234 
4235     //     Pa = Pa_base;
4236     //     Pb = Pa_base + i;
4237     //     Pm = Pm_base;
4238     //     Pn = Pn_base + i;
4239 
4240     //     Ra = *Pa;
4241     //     Rb = *Pb;
4242     //     Rm = *Pm;
4243     //     Rn = *Pn;
4244 
4245     //     int iters = (i+1)/2;
4246     //     for (j = 0; iters--; j++) {
4247     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4248     //       MACC2(Ra, Rb, t0, t1, t2);
4249     //       Ra = *++Pa;
4250     //       Rb = *--Pb;
4251     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4252     //       MACC(Rm, Rn, t0, t1, t2);
4253     //       Rm = *++Pm;
4254     //       Rn = *--Pn;
4255     //     }
4256     //     if ((i & 1) == 0) {
4257     //       assert(Ra == Pa_base[j], "must be");
4258     //       MACC(Ra, Ra, t0, t1, t2);
4259     //     }
4260     //     iters = i/2;
4261     //     assert(iters == i-j, "must be");
4262     //     for (; iters--; j++) {
4263     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4264     //       MACC(Rm, Rn, t0, t1, t2);
4265     //       Rm = *++Pm;
4266     //       Rn = *--Pn;
4267     //     }
4268 
4269     //     *Pm = Rm = t0 * inv;
4270     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4271     //     MACC(Rm, Rn, t0, t1, t2);
4272 
4273     //     assert(t0 == 0, "broken Montgomery multiply");
4274 
4275     //     t0 = t1; t1 = t2; t2 = 0;
4276     //   }
4277 
4278     //   for (i = len; i < 2*len; i++) {
4279     //     int start = i-len+1;
4280     //     int end = start + (len - start)/2;
4281     //     int j;
4282 
4283     //     Pa = Pa_base + i-len;
4284     //     Pb = Pa_base + len;
4285     //     Pm = Pm_base + i-len;
4286     //     Pn = Pn_base + len;
4287 
4288     //     Ra = *++Pa;
4289     //     Rb = *--Pb;
4290     //     Rm = *++Pm;
4291     //     Rn = *--Pn;
4292 
4293     //     int iters = (2*len-i-1)/2;
4294     //     assert(iters == end-start, "must be");
4295     //     for (j = start; iters--; j++) {
4296     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4297     //       MACC2(Ra, Rb, t0, t1, t2);
4298     //       Ra = *++Pa;
4299     //       Rb = *--Pb;
4300     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4301     //       MACC(Rm, Rn, t0, t1, t2);
4302     //       Rm = *++Pm;
4303     //       Rn = *--Pn;
4304     //     }
4305     //     if ((i & 1) == 0) {
4306     //       assert(Ra == Pa_base[j], "must be");
4307     //       MACC(Ra, Ra, t0, t1, t2);
4308     //     }
4309     //     iters =  (2*len-i)/2;
4310     //     assert(iters == len-j, "must be");
4311     //     for (; iters--; j++) {
4312     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4313     //       MACC(Rm, Rn, t0, t1, t2);
4314     //       Rm = *++Pm;
4315     //       Rn = *--Pn;
4316     //     }
4317     //     Pm_base[i-len] = t0;
4318     //     t0 = t1; t1 = t2; t2 = 0;
4319     //   }
4320 
4321     //   while (t0)
4322     //     t0 = sub(Pm_base, Pn_base, t0, len);
4323     // }
4324   };
4325 
4326   // Initialization
4327   void generate_initial() {
4328     // Generates the initial stubs and initializes the entry points
4329 
4330     // entry points that exist on all platforms.  Note: this is code
4331     // that could be shared among different platforms - however, the
4332     // benefit seems to be smaller than the disadvantage of having a
4333     // much more complicated generator structure. See also the comment
4334     // in stubRoutines.hpp.
4335 
4336     StubRoutines::_forward_exception_entry = generate_forward_exception();
4337 
4338     StubRoutines::_call_stub_entry =
4339       generate_call_stub(StubRoutines::_call_stub_return_address);
4340 
4341     // is referenced by megamorphic call
4342     StubRoutines::_catch_exception_entry = generate_catch_exception();
4343 
4344     // Build this early so it's available for the interpreter.
4345     StubRoutines::_throw_StackOverflowError_entry =
4346       generate_throw_exception("StackOverflowError throw_exception",
4347                                CAST_FROM_FN_PTR(address,
4348                                                 SharedRuntime::
4349                                                 throw_StackOverflowError));
4350     if (UseCRC32Intrinsics) {
4351       // set the table address before generating the stubs that use it
4352       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4353       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4354     }
4355   }
4356 
4357   void generate_all() {
4358     // support for verify_oop (must happen after universe_init)
4359     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4360     StubRoutines::_throw_AbstractMethodError_entry =
4361       generate_throw_exception("AbstractMethodError throw_exception",
4362                                CAST_FROM_FN_PTR(address,
4363                                                 SharedRuntime::
4364                                                 throw_AbstractMethodError));
4365 
4366     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4367       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4368                                CAST_FROM_FN_PTR(address,
4369                                                 SharedRuntime::
4370                                                 throw_IncompatibleClassChangeError));
4371 
4372     StubRoutines::_throw_NullPointerException_at_call_entry =
4373       generate_throw_exception("NullPointerException at call throw_exception",
4374                                CAST_FROM_FN_PTR(address,
4375                                                 SharedRuntime::
4376                                                 throw_NullPointerException_at_call));
4377 
4378     // arraycopy stubs used by compilers
4379     generate_arraycopy_stubs();
4380 
4381     if (UseMultiplyToLenIntrinsic) {
4382       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4383     }
4384 
4385     if (UseMontgomeryMultiplyIntrinsic) {
4386       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4387       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4388       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4389     }
4390 
4391     if (UseMontgomerySquareIntrinsic) {
4392       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4393       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4394       // We use generate_multiply() rather than generate_square()
4395       // because it's faster for the sizes of modulus we care about.
4396       StubRoutines::_montgomerySquare = g.generate_multiply();
4397     }
4398 
4399     if (UseAESIntrinsics) {
4400       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4401       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4402       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4403       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4404     }
4405 
4406     // generate GHASH intrinsics code
4407     if (UseGHASHIntrinsics) {
4408       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4409     }
4410 
4411     if (UseSHA1Intrinsics) {
4412       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4413       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4414     }
4415     if (UseSHA256Intrinsics) {
4416       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4417       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4418     }
4419 
4420     // Safefetch stubs.
4421     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4422                                                        &StubRoutines::_safefetch32_fault_pc,
4423                                                        &StubRoutines::_safefetch32_continuation_pc);
4424     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4425                                                        &StubRoutines::_safefetchN_fault_pc,
4426                                                        &StubRoutines::_safefetchN_continuation_pc);
4427   }
4428 
4429  public:
4430   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4431     if (all) {
4432       generate_all();
4433     } else {
4434       generate_initial();
4435     }
4436   }
4437 }; // end class declaration
4438 
4439 void StubGenerator_generate(CodeBuffer* code, bool all) {
4440   StubGenerator g(code, all);
4441 }