/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "code/SCCache.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
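  // For reference, C++ reaches this stub through a function pointer
  // whose shape is approximately the following -- a hedged sketch
  // matching the eight c_rarg values above, not the exact declaration
  // in stubRoutines.hpp:
  //
  //   typedef void (*CallStub)(address   call_wrapper,
  //                            intptr_t* result,
  //                            int       result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            Thread*   thread);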
  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);
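    // The two bfi calls above insert zero bits, i.e. roughly:
    //
    //   fpcr &= ~(0xFull  << 22);  // RMode (23:22), FZ (24), DN (25)
    //   fpcr &= ~(0x1Full <<  8);  // trap-enable bits (12:8)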
    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
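    // In effect, keeping sp 16-byte aligned as AArch64 requires:
    //
    //   sp = (sp - parameter_count * wordSize) & -16;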

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);
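    // The loop above is, in effect (params == c_rarg5, count == c_rarg6):
    //
    //   do { *--sp = *params++; } while (--count > 0);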

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
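    // i.e., roughly:
    //
    //   switch (result_type) {
    //     case T_OBJECT:
    //     case T_LONG:   *(jlong*)result   = r0;       break;
    //     case T_FLOAT:  *(jfloat*)result  = j_farg0;  break;
    //     case T_DOUBLE: *(jdouble*)result = j_farg0;  break;
    //     default:       *(jint*)result    = (jint)r0; break;
    //   }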
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));
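    // (a plain load/add/store: the bump is not atomic, which we assume
    // is acceptable for a diagnostic counter)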

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
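    // Each pair of emit_data64 rows below forms one 128-bit vector in
    // which lane i holds the value i: as bytes, halfwords, words and
    // doublewords, then as floats and doubles for the FP variants.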
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
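  //
  // A hedged C-level sketch of that contract, with
  // block == MacroAssembler::zero_words_block_size:
  //
  //   while (r11 >= block) {
  //     zero block words at r10;   // stp pairs, or DC ZVA when enabled
  //     r10 += block * wordSize;
  //     r11 -= block;
  //   }
  //   // caller zeroes the remaining r11 words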

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
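      // low_limit is in bytes while cnt is in words, hence the >> 3 below.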
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }
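    // Each loop iteration above clears unroll * 16 bytes; the final add
    // undoes the last subs so cnt again holds the words left to clear.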

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
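  // A hedged sketch of the protocol (count in words, direction = +1 or -1):
  //
  //   while (count >= 8) { copy 8 words; step s and d by 8 words; count -= 8; }
  //   if (count & 4) copy a 4 word subblock;
  //   if (count & 2) copy a 2 word subblock;
  //   // bit 0 of count tells the caller whether one final word remains
  //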
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset, we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.
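  //
  // Example: count == 23 bytes copies 4 + 2 + 1 = 7 bytes (bits 2..0);
  // bit 4 is ignored because the caller has already dealt with those.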

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
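  // A hedged sketch of the dispatch (sizes in bytes):
  //
  //   if (count * granularity <= 80 /* 96 with SIMD */)
  //     copy everything inline with overlapping load/store pairs;
  //   else
  //     align s to 2 words, bulk-copy 8-word blocks via the
  //     generate_copy_longs stubs, then finish with copy_memory_small.
  //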

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
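          // Worked example: count == 3 gives count/2 == 1, so we copy
          // s[0]->d[0], s[2]->d[2] (via send/dend - 1) and s[1]->d[1];
          // with count == 1 all three load/store pairs hit byte 0.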
 1366           __ lsr(count, count, 1);
 1367           __ ldrb(t0, Address(s, 0));
 1368           __ ldrb(t1, Address(send, -1));
 1369           __ ldrb(t2, Address(s, count));
 1370           __ strb(t0, Address(d, 0));
 1371           __ strb(t1, Address(dend, -1));
 1372           __ strb(t2, Address(d, count));
 1373         }
 1374         __ b(finish);
 1375       }
 1376     }
 1377 
 1378     __ bind(copy_big);
 1379     if (is_backwards) {
 1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1382     }
 1383 
 1384     // Now we've got the small case out of the way we can align the
 1385     // source address on a 2-word boundary.
 1386 
    // Here we materialize a count in r15, which is used both by
    // copy_memory_small and by the various generate_copy_longs stubs
    // that perform the 2-word-aligned bulk copy. Up until here we have
    // used t9, which aliases r15, but from here on that register
    // cannot be used as a temp register, as it holds the count.
 1391 
 1392     Label aligned;
 1393 
 1394     if (is_aligned) {
 1395       // We may have to adjust by 1 word to get s 2-word-aligned.
 1396       __ tbz(s, exact_log2(wordSize), aligned);
 1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1399       __ sub(count, count, wordSize/granularity);
 1400     } else {
 1401       if (is_backwards) {
 1402         __ andr(r15, s, 2 * wordSize - 1);
 1403       } else {
 1404         __ neg(r15, s);
 1405         __ andr(r15, r15, 2 * wordSize - 1);
 1406       }
 1407       // r15 is the byte adjustment needed to align s.
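      // e.g. copying forwards with s == 0x1003: r15 == (-0x1003) & 15 == 13,
      // so s + 13 is 16-byte aligned; copying backwards: r15 == 0x1003 & 15 == 3,
      // so s - 3 is 16-byte aligned.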
 1408       __ cbz(r15, aligned);
 1409       int shift = exact_log2(granularity);
 1410       if (shift > 0) {
 1411         __ lsr(r15, r15, shift);
 1412       }
 1413       __ sub(count, count, r15);
 1414 
 1415 #if 0
 1416       // ?? This code is only correct for a disjoint copy.  It may or
 1417       // may not make sense to use it in that case.
 1418 
 1419       // Copy the first pair; s and d may not be aligned.
 1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1422 
 1423       // Align s and d, adjust count
 1424       if (is_backwards) {
 1425         __ sub(s, s, r15);
 1426         __ sub(d, d, r15);
 1427       } else {
 1428         __ add(s, s, r15);
 1429         __ add(d, d, r15);
 1430       }
 1431 #else
 1432       copy_memory_small(decorators, type, s, d, r15, step);
 1433 #endif
 1434     }
 1435 
 1436     __ bind(aligned);
 1437 
 1438     // s is now 2-word-aligned.
 1439 
    // We have a count of units and some trailing bytes. Adjust the
    // count and do a bulk copy of words. If the shift is zero,
    // perform a register move instead to benefit from zero-latency moves.
 1443     int shift = exact_log2(wordSize/granularity);
 1444     if (shift > 0) {
 1445       __ lsr(r15, count, shift);
 1446     } else {
 1447       __ mov(r15, count);
 1448     }
 1449     if (direction == copy_forwards) {
 1450       if (type != T_OBJECT) {
 1451         __ bl(copy_f);
 1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1453         __ bl(copy_obj_uninit_f);
 1454       } else {
 1455         __ bl(copy_obj_f);
 1456       }
 1457     } else {
 1458       if (type != T_OBJECT) {
 1459         __ bl(copy_b);
 1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1461         __ bl(copy_obj_uninit_b);
 1462       } else {
 1463         __ bl(copy_obj_b);
 1464       }
 1465     }
 1466 
 1467     // And the tail.
 1468     copy_memory_small(decorators, type, s, d, count, step);
 1469 
 1470     if (granularity >= 8) __ bind(copy8);
 1471     if (granularity >= 4) __ bind(copy4);
 1472     __ bind(finish);
 1473   }
 1474 
 1475 
 1476   void clobber_registers() {
 1477 #ifdef ASSERT
 1478     RegSet clobbered
 1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1483       __ mov(*it, rscratch1);
 1484     }
 1485 #endif
 1486 
 1487   }
 1488 
 1489   // Scan over array at a for count oops, verifying each one.
 1490   // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array(int size, Register a, Register count, Register temp) {
 1492     Label loop, end;
 1493     __ mov(rscratch1, a);
 1494     __ mov(rscratch2, zr);
 1495     __ bind(loop);
 1496     __ cmp(rscratch2, count);
 1497     __ br(Assembler::HS, end);
 1498     if (size == wordSize) {
 1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ verify_oop(temp);
 1501     } else {
 1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1503       __ decode_heap_oop(temp); // calls verify_oop
 1504     }
 1505     __ add(rscratch2, rscratch2, 1);
 1506     __ b(loop);
 1507     __ bind(end);
 1508   }
 1509 
 1510   // Arguments:
 1511   //   stub_id - is used to name the stub and identify all details of
 1512   //             how to perform the copy.
 1513   //
  //   entry - if non-null, is assigned the stub's post-push entry
  //           point
 1516   //
 1517   // Inputs:
 1518   //   c_rarg0   - source array address
 1519   //   c_rarg1   - destination array address
 1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1521   //
 1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1523   // the hardware handle it.  The two dwords within qwords that span
 1524   // cache line boundaries will still be loaded and stored atomically.
 1525   //
 1526   // Side Effects: entry is set to the (post push) entry point so it
 1527   //               can be used by the corresponding conjoint copy
 1528   //               method
 1529   //
 1530   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1532     RegSet saved_reg = RegSet::of(s, d, count);
 1533     int size;
 1534     bool aligned;
 1535     bool is_oop;
 1536     bool dest_uninitialized;
 1537     switch (stub_id) {
 1538     case jbyte_disjoint_arraycopy_id:
 1539       size = sizeof(jbyte);
 1540       aligned = false;
 1541       is_oop = false;
 1542       dest_uninitialized = false;
 1543       break;
 1544     case arrayof_jbyte_disjoint_arraycopy_id:
 1545       size = sizeof(jbyte);
 1546       aligned = true;
 1547       is_oop = false;
 1548       dest_uninitialized = false;
 1549       break;
 1550     case jshort_disjoint_arraycopy_id:
 1551       size = sizeof(jshort);
 1552       aligned = false;
 1553       is_oop = false;
 1554       dest_uninitialized = false;
 1555       break;
 1556     case arrayof_jshort_disjoint_arraycopy_id:
 1557       size = sizeof(jshort);
 1558       aligned = true;
 1559       is_oop = false;
 1560       dest_uninitialized = false;
 1561       break;
 1562     case jint_disjoint_arraycopy_id:
 1563       size = sizeof(jint);
 1564       aligned = false;
 1565       is_oop = false;
 1566       dest_uninitialized = false;
 1567       break;
 1568     case arrayof_jint_disjoint_arraycopy_id:
 1569       size = sizeof(jint);
 1570       aligned = true;
 1571       is_oop = false;
 1572       dest_uninitialized = false;
 1573       break;
 1574     case jlong_disjoint_arraycopy_id:
 1575       // since this is always aligned we can (should!) use the same
 1576       // stub as for case arrayof_jlong_disjoint_arraycopy
 1577       ShouldNotReachHere();
 1578       break;
 1579     case arrayof_jlong_disjoint_arraycopy_id:
 1580       size = sizeof(jlong);
 1581       aligned = true;
 1582       is_oop = false;
 1583       dest_uninitialized = false;
 1584       break;
 1585     case oop_disjoint_arraycopy_id:
 1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1587       aligned = !UseCompressedOops;
 1588       is_oop = true;
 1589       dest_uninitialized = false;
 1590       break;
 1591     case arrayof_oop_disjoint_arraycopy_id:
 1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1593       aligned = !UseCompressedOops;
 1594       is_oop = true;
 1595       dest_uninitialized = false;
 1596       break;
 1597     case oop_disjoint_arraycopy_uninit_id:
 1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1599       aligned = !UseCompressedOops;
 1600       is_oop = true;
 1601       dest_uninitialized = true;
 1602       break;
 1603     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1605       aligned = !UseCompressedOops;
 1606       is_oop = true;
 1607       dest_uninitialized = true;
 1608       break;
 1609     default:
 1610       ShouldNotReachHere();
 1611       break;
 1612     }
 1613 
 1614     __ align(CodeEntryAlignment);
 1615     StubCodeMark mark(this, stub_id);
 1616     address start = __ pc();
 1617     __ enter();
 1618 
 1619     if (entry != nullptr) {
 1620       *entry = __ pc();
 1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1622       BLOCK_COMMENT("Entry:");
 1623     }
 1624 
 1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1626     if (dest_uninitialized) {
 1627       decorators |= IS_DEST_UNINITIALIZED;
 1628     }
 1629     if (aligned) {
 1630       decorators |= ARRAYCOPY_ALIGNED;
 1631     }
 1632 
 1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1635 
 1636     if (is_oop) {
 1637       // save regs before copy_memory
 1638       __ push(RegSet::of(d, count), sp);
 1639     }
 1640     {
 1641       // UnsafeMemoryAccess page error: continue after unsafe access
 1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1645     }
 1646 
 1647     if (is_oop) {
 1648       __ pop(RegSet::of(d, count), sp);
 1649       if (VerifyOops)
 1650         verify_oop_array(size, d, count, r16);
 1651     }
 1652 
 1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1654 
 1655     __ leave();
 1656     __ mov(r0, zr); // return 0
 1657     __ ret(lr);
 1658     return start;
 1659   }
 1660 
 1661   // Arguments:
 1662   //   stub_id - is used to name the stub and identify all details of
 1663   //             how to perform the copy.
 1664   //
  //   nooverlap_target - identifies the (post-push) entry for the
  //             corresponding disjoint copy routine, which can be
  //             jumped to if the ranges do not actually overlap
 1668   //
  //   entry - if non-null, is assigned the stub's post-push entry
  //           point
 1671   //
 1672   //
 1673   // Inputs:
 1674   //   c_rarg0   - source array address
 1675   //   c_rarg1   - destination array address
 1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1677   //
 1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1679   // the hardware handle it.  The two dwords within qwords that span
 1680   // cache line boundaries will still be loaded and stored atomically.
 1681   //
 1682   // Side Effects:
 1683   //   entry is set to the no-overlap entry point so it can be used by
 1684   //   some other conjoint copy method
 1685   //
 1686   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1688     RegSet saved_regs = RegSet::of(s, d, count);
 1689     int size;
 1690     bool aligned;
 1691     bool is_oop;
 1692     bool dest_uninitialized;
 1693     switch (stub_id) {
 1694     case jbyte_arraycopy_id:
 1695       size = sizeof(jbyte);
 1696       aligned = false;
 1697       is_oop = false;
 1698       dest_uninitialized = false;
 1699       break;
 1700     case arrayof_jbyte_arraycopy_id:
 1701       size = sizeof(jbyte);
 1702       aligned = true;
 1703       is_oop = false;
 1704       dest_uninitialized = false;
 1705       break;
 1706     case jshort_arraycopy_id:
 1707       size = sizeof(jshort);
 1708       aligned = false;
 1709       is_oop = false;
 1710       dest_uninitialized = false;
 1711       break;
 1712     case arrayof_jshort_arraycopy_id:
 1713       size = sizeof(jshort);
 1714       aligned = true;
 1715       is_oop = false;
 1716       dest_uninitialized = false;
 1717       break;
 1718     case jint_arraycopy_id:
 1719       size = sizeof(jint);
 1720       aligned = false;
 1721       is_oop = false;
 1722       dest_uninitialized = false;
 1723       break;
 1724     case arrayof_jint_arraycopy_id:
 1725       size = sizeof(jint);
 1726       aligned = true;
 1727       is_oop = false;
 1728       dest_uninitialized = false;
 1729       break;
 1730     case jlong_arraycopy_id:
 1731       // since this is always aligned we can (should!) use the same
 1732       // stub as for case arrayof_jlong_disjoint_arraycopy
 1733       ShouldNotReachHere();
 1734       break;
 1735     case arrayof_jlong_arraycopy_id:
 1736       size = sizeof(jlong);
 1737       aligned = true;
 1738       is_oop = false;
 1739       dest_uninitialized = false;
 1740       break;
 1741     case oop_arraycopy_id:
 1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1743       aligned = !UseCompressedOops;
 1744       is_oop = true;
 1745       dest_uninitialized = false;
 1746       break;
 1747     case arrayof_oop_arraycopy_id:
 1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1749       aligned = !UseCompressedOops;
 1750       is_oop = true;
 1751       dest_uninitialized = false;
 1752       break;
 1753     case oop_arraycopy_uninit_id:
 1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1755       aligned = !UseCompressedOops;
 1756       is_oop = true;
 1757       dest_uninitialized = true;
 1758       break;
 1759     case arrayof_oop_arraycopy_uninit_id:
 1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1761       aligned = !UseCompressedOops;
 1762       is_oop = true;
 1763       dest_uninitialized = true;
 1764       break;
 1765     default:
 1766       ShouldNotReachHere();
 1767     }
 1768 
 1769     StubCodeMark mark(this, stub_id);
 1770     address start = __ pc();
 1771     __ enter();
 1772 
 1773     if (entry != nullptr) {
 1774       *entry = __ pc();
 1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1776       BLOCK_COMMENT("Entry:");
 1777     }
 1778 
 1779     // use fwd copy when (d-s) above_equal (count*size)
 1780     __ sub(rscratch1, d, s);
 1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1782     __ br(Assembler::HS, nooverlap_target);
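    // The difference is computed in 64 bits and compared unsigned, so
    // d < s wraps to a large value and also takes the forward path;
    // copying forwards is always safe when the destination precedes
    // the source.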
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // original elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (entry != nullptr) {
 1916       *entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
    // Empty array:  Nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_orig = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
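    // e.g. if 3 oops were copied before the failing element, K == 3
    // and r0 == ~3 == -4; the caller recovers K as ~r0.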
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
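    // The low bits of (s | d | count) capture the coarsest alignment
    // common to both addresses and the byte count.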
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
    // 'from', 'to' and 'count' must be set in this order, since they
    // alias 'src', 'src_pos' and 'dst' respectively.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
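    //   elsize == 0b00 -> byte copy
    //   elsize == 0b01 -> short copy
    //   elsize == 0b10 -> int copy
    //   elsize == 0b11 -> long copy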
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubGenStubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // source array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
 2471 
 2472     // Align source address at 8 bytes address boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
          // One-byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
          // Two-byte misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
          // Align to 8 bytes; we know we are at least 4-byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
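    // e.g. a byte fill with value 0xAB now has value == 0xABABABABABABABAB;
    // short and int fills are widened the same way.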
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
    // Fewer than 8 bytes remain; fill them with a single 8-byte store.
    // Note that the total length is no less than 8 bytes.
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
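      // This store ends exactly at the end of the region; since the
      // total length is at least 8 bytes, it may rewrite up to 7
      // already-filled bytes but never underruns the start.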
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
    // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_data_cache_writeback() {
 2571     const Register line        = c_rarg0;  // address of line to write back
 2572 
 2573     __ align(CodeEntryAlignment);
 2574 
 2575     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2576     StubCodeMark mark(this, stub_id);
 2577 
 2578     address start = __ pc();
 2579     __ enter();
 2580     __ cache_wb(Address(line, 0));
 2581     __ leave();
 2582     __ ret(lr);
 2583 
 2584     return start;
 2585   }
 2586 
 2587   address generate_data_cache_writeback_sync() {
 2588     const Register is_pre     = c_rarg0;  // pre or post sync
 2589 
 2590     __ align(CodeEntryAlignment);
 2591 
 2592     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2593     StubCodeMark mark(this, stub_id);
 2594 
    // pre wbsync is a no-op
    // post wbsync translates to a memory barrier (the AArch64
    // analogue of an x86 sfence)
 2597 
 2598     Label skip;
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cbnz(is_pre, skip);
 2602     __ cache_wbsync(false);
 2603     __ bind(skip);
 2604     __ leave();
 2605     __ ret(lr);
 2606 
 2607     return start;
 2608   }
 2609 
 2610   void generate_arraycopy_stubs() {
 2611     address entry;
 2612     address entry_jbyte_arraycopy;
 2613     address entry_jshort_arraycopy;
 2614     address entry_jint_arraycopy;
 2615     address entry_oop_arraycopy;
 2616     address entry_jlong_arraycopy;
 2617     address entry_checkcast_arraycopy;
 2618 
 2619     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2620     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2621 
 2622     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2623     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2624 
 2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2626     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2627 
 2628     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2629 
 2630     //*** jbyte
 2631     // Always need aligned and unaligned versions
 2632     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2633     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2634     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2635     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2636 
 2637     //*** jshort
 2638     // Always need aligned and unaligned versions
 2639     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2640     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2641     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2642     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2643 
 2644     //*** jint
 2645     // Aligned versions
 2646     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2647     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2648     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2649     // entry_jint_arraycopy always points to the unaligned version
 2650     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2651     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2652 
 2653     //*** jlong
 2654     // It is always aligned
 2655     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2657     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2658     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2659 
 2660     //*** oops
 2661     {
 2662       // With compressed oops we need unaligned versions; notice that
 2663       // we overwrite entry_oop_arraycopy.
 2664       bool aligned = !UseCompressedOops;
 2665 
 2666       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2667         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2668       StubRoutines::_arrayof_oop_arraycopy
 2669         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2670       // Aligned versions without pre-barriers
 2671       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2672         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2673       StubRoutines::_arrayof_oop_arraycopy_uninit
 2674         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2675     }
 2676 
 2677     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2678     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2679     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2680     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2681 
 2682     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2683     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2684 
 2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2686                                                               entry_jshort_arraycopy,
 2687                                                               entry_jint_arraycopy,
 2688                                                               entry_jlong_arraycopy);
 2689 
 2690     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2691                                                                entry_jshort_arraycopy,
 2692                                                                entry_jint_arraycopy,
 2693                                                                entry_oop_arraycopy,
 2694                                                                entry_jlong_arraycopy,
 2695                                                                entry_checkcast_arraycopy);
 2696 
 2697     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2698     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2699     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2700     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2701     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2702     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2703   }
 2704 
 2705   void generate_math_stubs() { Unimplemented(); }
 2706 
 2707   // Arguments:
 2708   //
 2709   // Inputs:
 2710   //   c_rarg0   - source byte array address
 2711   //   c_rarg1   - destination byte array address
 2712   //   c_rarg2   - K (key) in little endian int array
 2713   //
 2714   address generate_aescrypt_encryptBlock() {
 2715     __ align(CodeEntryAlignment);
 2716     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2717     StubCodeMark mark(this, stub_id);
 2718 
 2719     const Register from        = c_rarg0;  // source array address
 2720     const Register to          = c_rarg1;  // destination array address
 2721     const Register key         = c_rarg2;  // key array address
 2722     const Register keylen      = rscratch1;
 2723 
 2724     address start = __ pc();
 2725     __ enter();
 2726 
 2727     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
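    // 'key' points at the first element of the int[] key schedule, so
    // the array length field sits at a negative offset from it.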
 2728 
 2729     __ aesenc_loadkeys(key, keylen);
 2730     __ aesecb_encrypt(from, to, keylen);
 2731 
 2732     __ mov(r0, 0);
 2733 
 2734     __ leave();
 2735     __ ret(lr);
 2736 
 2737     return start;
 2738   }
 2739 
 2740   // Arguments:
 2741   //
 2742   // Inputs:
 2743   //   c_rarg0   - source byte array address
 2744   //   c_rarg1   - destination byte array address
 2745   //   c_rarg2   - K (key) in little endian int array
 2746   //
 2747   address generate_aescrypt_decryptBlock() {
 2748     assert(UseAES, "need AES cryptographic extension support");
 2749     __ align(CodeEntryAlignment);
 2750     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2751     StubCodeMark mark(this, stub_id);
 2753 
 2754     const Register from        = c_rarg0;  // source array address
 2755     const Register to          = c_rarg1;  // destination array address
 2756     const Register key         = c_rarg2;  // key array address
 2757     const Register keylen      = rscratch1;
 2758 
 2759     address start = __ pc();
 2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2761 
 2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2763 
 2764     __ aesecb_decrypt(from, to, key, keylen);
 2765 
 2766     __ mov(r0, 0);
 2767 
 2768     __ leave();
 2769     __ ret(lr);
 2770 
 2771     return start;
 2772   }
 2773 
 2774   // Arguments:
 2775   //
 2776   // Inputs:
 2777   //   c_rarg0   - source byte array address
 2778   //   c_rarg1   - destination byte array address
 2779   //   c_rarg2   - K (key) in little endian int array
 2780   //   c_rarg3   - r vector byte array address
 2781   //   c_rarg4   - input length
 2782   //
 2783   // Output:
 2784   //   x0        - input length
 2785   //
 2786   address generate_cipherBlockChaining_encryptAESCrypt() {
 2787     assert(UseAES, "need AES cryptographic extension support");
 2788     __ align(CodeEntryAlignment);
 2789     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2790     StubCodeMark mark(this, stub_id);
 2791 
 2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2793 
 2794     const Register from        = c_rarg0;  // source array address
 2795     const Register to          = c_rarg1;  // destination array address
 2796     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r vector: initialized from the IV array and
                                           // left holding the last encrypted block
 2799     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2800     const Register keylen      = rscratch1;
 2801 
 2802     address start = __ pc();
 2803 
 2804       __ enter();
 2805 
      __ movw(rscratch2, len_reg);  // save input length for the return value
 2807 
 2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2809 
 2810       __ ld1(v0, __ T16B, rvec);
 2811 
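      // The AES key schedule holds 4*(rounds+1) int words: 44 for AES-128,
      // 52 for AES-192 and 60 for AES-256, so keylen below 52 selects the
      // 10-round path, keylen == 52 the 12-round path, and keylen above 52
      // the 14-round path.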
 2812       __ cmpw(keylen, 52);
 2813       __ br(Assembler::CC, L_loadkeys_44);
 2814       __ br(Assembler::EQ, L_loadkeys_52);
 2815 
 2816       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2817       __ rev32(v17, __ T16B, v17);
 2818       __ rev32(v18, __ T16B, v18);
 2819     __ BIND(L_loadkeys_52);
 2820       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2821       __ rev32(v19, __ T16B, v19);
 2822       __ rev32(v20, __ T16B, v20);
 2823     __ BIND(L_loadkeys_44);
 2824       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2825       __ rev32(v21, __ T16B, v21);
 2826       __ rev32(v22, __ T16B, v22);
 2827       __ rev32(v23, __ T16B, v23);
 2828       __ rev32(v24, __ T16B, v24);
 2829       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2830       __ rev32(v25, __ T16B, v25);
 2831       __ rev32(v26, __ T16B, v26);
 2832       __ rev32(v27, __ T16B, v27);
 2833       __ rev32(v28, __ T16B, v28);
 2834       __ ld1(v29, v30, v31, __ T16B, key);
 2835       __ rev32(v29, __ T16B, v29);
 2836       __ rev32(v30, __ T16B, v30);
 2837       __ rev32(v31, __ T16B, v31);
 2838 
 2839     __ BIND(L_aes_loop);
 2840       __ ld1(v1, __ T16B, __ post(from, 16));
 2841       __ eor(v0, __ T16B, v0, v1);
 2842 
 2843       __ br(Assembler::CC, L_rounds_44);
 2844       __ br(Assembler::EQ, L_rounds_52);
 2845 
 2846       __ aese(v0, v17); __ aesmc(v0, v0);
 2847       __ aese(v0, v18); __ aesmc(v0, v0);
 2848     __ BIND(L_rounds_52);
 2849       __ aese(v0, v19); __ aesmc(v0, v0);
 2850       __ aese(v0, v20); __ aesmc(v0, v0);
 2851     __ BIND(L_rounds_44);
 2852       __ aese(v0, v21); __ aesmc(v0, v0);
 2853       __ aese(v0, v22); __ aesmc(v0, v0);
 2854       __ aese(v0, v23); __ aesmc(v0, v0);
 2855       __ aese(v0, v24); __ aesmc(v0, v0);
 2856       __ aese(v0, v25); __ aesmc(v0, v0);
 2857       __ aese(v0, v26); __ aesmc(v0, v0);
 2858       __ aese(v0, v27); __ aesmc(v0, v0);
 2859       __ aese(v0, v28); __ aesmc(v0, v0);
 2860       __ aese(v0, v29); __ aesmc(v0, v0);
 2861       __ aese(v0, v30);
 2862       __ eor(v0, __ T16B, v0, v31);
 2863 
 2864       __ st1(v0, __ T16B, __ post(to, 16));
 2865 
 2866       __ subw(len_reg, len_reg, 16);
 2867       __ cbnzw(len_reg, L_aes_loop);
 2868 
 2869       __ st1(v0, __ T16B, rvec);
 2870 
 2871       __ mov(r0, rscratch2);
 2872 
 2873       __ leave();
 2874       __ ret(lr);
 2875 
    return start;
 2877   }
 2878 
 2879   // Arguments:
 2880   //
 2881   // Inputs:
 2882   //   c_rarg0   - source byte array address
 2883   //   c_rarg1   - destination byte array address
 2884   //   c_rarg2   - K (key) in little endian int array
 2885   //   c_rarg3   - r vector byte array address
 2886   //   c_rarg4   - input length
 2887   //
 2888   // Output:
 2889   //   r0        - input length
 2890   //
 2891   address generate_cipherBlockChaining_decryptAESCrypt() {
 2892     assert(UseAES, "need AES cryptographic extension support");
 2893     __ align(CodeEntryAlignment);
 2894     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2895     StubCodeMark mark(this, stub_id);
 2896 
 2897     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2898 
 2899     const Register from        = c_rarg0;  // source array address
 2900     const Register to          = c_rarg1;  // destination array address
 2901     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r vector: initialized from the IV array and
                                           // left holding the last input ciphertext block
 2904     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2905     const Register keylen      = rscratch1;
 2906 
 2907     address start = __ pc();
 2908 
 2909       __ enter();
 2910 
      __ movw(rscratch2, len_reg);  // save input length for the return value
 2912 
 2913       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2914 
 2915       __ ld1(v2, __ T16B, rvec);
 2916 
 2917       __ ld1(v31, __ T16B, __ post(key, 16));
 2918       __ rev32(v31, __ T16B, v31);
 2919 
 2920       __ cmpw(keylen, 52);
 2921       __ br(Assembler::CC, L_loadkeys_44);
 2922       __ br(Assembler::EQ, L_loadkeys_52);
 2923 
 2924       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2925       __ rev32(v17, __ T16B, v17);
 2926       __ rev32(v18, __ T16B, v18);
 2927     __ BIND(L_loadkeys_52);
 2928       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2929       __ rev32(v19, __ T16B, v19);
 2930       __ rev32(v20, __ T16B, v20);
 2931     __ BIND(L_loadkeys_44);
 2932       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2933       __ rev32(v21, __ T16B, v21);
 2934       __ rev32(v22, __ T16B, v22);
 2935       __ rev32(v23, __ T16B, v23);
 2936       __ rev32(v24, __ T16B, v24);
 2937       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2938       __ rev32(v25, __ T16B, v25);
 2939       __ rev32(v26, __ T16B, v26);
 2940       __ rev32(v27, __ T16B, v27);
 2941       __ rev32(v28, __ T16B, v28);
 2942       __ ld1(v29, v30, __ T16B, key);
 2943       __ rev32(v29, __ T16B, v29);
 2944       __ rev32(v30, __ T16B, v30);
 2945 
 2946     __ BIND(L_aes_loop);
 2947       __ ld1(v0, __ T16B, __ post(from, 16));
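      // Keep a copy of the ciphertext block: CBC decryption XORs each
      // decrypted block with the previous ciphertext block, so v1 is
      // copied into v2 (the chaining value) after the store below.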
 2948       __ orr(v1, __ T16B, v0, v0);
 2949 
 2950       __ br(Assembler::CC, L_rounds_44);
 2951       __ br(Assembler::EQ, L_rounds_52);
 2952 
 2953       __ aesd(v0, v17); __ aesimc(v0, v0);
 2954       __ aesd(v0, v18); __ aesimc(v0, v0);
 2955     __ BIND(L_rounds_52);
 2956       __ aesd(v0, v19); __ aesimc(v0, v0);
 2957       __ aesd(v0, v20); __ aesimc(v0, v0);
 2958     __ BIND(L_rounds_44);
 2959       __ aesd(v0, v21); __ aesimc(v0, v0);
 2960       __ aesd(v0, v22); __ aesimc(v0, v0);
 2961       __ aesd(v0, v23); __ aesimc(v0, v0);
 2962       __ aesd(v0, v24); __ aesimc(v0, v0);
 2963       __ aesd(v0, v25); __ aesimc(v0, v0);
 2964       __ aesd(v0, v26); __ aesimc(v0, v0);
 2965       __ aesd(v0, v27); __ aesimc(v0, v0);
 2966       __ aesd(v0, v28); __ aesimc(v0, v0);
 2967       __ aesd(v0, v29); __ aesimc(v0, v0);
 2968       __ aesd(v0, v30);
 2969       __ eor(v0, __ T16B, v0, v31);
 2970       __ eor(v0, __ T16B, v0, v2);
 2971 
 2972       __ st1(v0, __ T16B, __ post(to, 16));
 2973       __ orr(v2, __ T16B, v1, v1);
 2974 
 2975       __ subw(len_reg, len_reg, 16);
 2976       __ cbnzw(len_reg, L_aes_loop);
 2977 
 2978       __ st1(v2, __ T16B, rvec);
 2979 
 2980       __ mov(r0, rscratch2);
 2981 
 2982       __ leave();
 2983       __ ret(lr);
 2984 
 2985     return start;
 2986   }
 2987 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit addend, preserved) and inc (the 64-bit
  // increment, preserved; its lower dword must be zero).
  // The least-significant 64-bit word lives in the upper dword of each vector.
  // Output: result
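  //
  // A worked example of the carry trick: if the LSD lane (the upper dword)
  // holds 0xFFFFFFFFFFFFFFFF and inc is 1, addv wraps that lane to 0.
  // cm(HI) then writes all-ones (-1) into the lane where inc > result,
  // ext rotates that -1 from the LSD lane into the MSD lane (and the
  // always-zero MSD comparison result into the LSD lane), and subv
  // subtracts -1 from the MSD, which propagates the carry.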
 2993   void be_add_128_64(FloatRegister result, FloatRegister in,
 2994                      FloatRegister inc, FloatRegister tmp) {
 2995     assert_different_registers(result, tmp, inc);
 2996 
 2997     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 2998                                            // input
 2999     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3000     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3001                                            // MSD == 0 (must be!) to LSD
 3002     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3003   }
 3004 
 3005   // CTR AES crypt.
 3006   // Arguments:
 3007   //
 3008   // Inputs:
 3009   //   c_rarg0   - source byte array address
 3010   //   c_rarg1   - destination byte array address
 3011   //   c_rarg2   - K (key) in little endian int array
 3012   //   c_rarg3   - counter vector byte array address
 3013   //   c_rarg4   - input length
 3014   //   c_rarg5   - saved encryptedCounter start
 3015   //   c_rarg6   - saved used length
 3016   //
 3017   // Output:
 3018   //   r0       - input length
 3019   //
 3020   address generate_counterMode_AESCrypt() {
 3021     const Register in = c_rarg0;
 3022     const Register out = c_rarg1;
 3023     const Register key = c_rarg2;
 3024     const Register counter = c_rarg3;
 3025     const Register saved_len = c_rarg4, len = r10;
 3026     const Register saved_encrypted_ctr = c_rarg5;
 3027     const Register used_ptr = c_rarg6, used = r12;
 3028 
 3029     const Register offset = r7;
 3030     const Register keylen = r11;
 3031 
 3032     const unsigned char block_size = 16;
 3033     const int bulk_width = 4;
 3034     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3035     // performance with larger data sizes, but it also means that the
 3036     // fast path isn't used until you have at least 8 blocks, and up
 3037     // to 127 bytes of data will be executed on the slow path. For
 3038     // that reason, and also so as not to blow away too much icache, 4
 3039     // blocks seems like a sensible compromise.
 3040 
 3041     // Algorithm:
 3042     //
 3043     //    if (len == 0) {
 3044     //        goto DONE;
 3045     //    }
 3046     //    int result = len;
 3047     //    do {
 3048     //        if (used >= blockSize) {
 3049     //            if (len >= bulk_width * blockSize) {
 3050     //                CTR_large_block();
 3051     //                if (len == 0)
 3052     //                    goto DONE;
 3053     //            }
 3054     //            for (;;) {
 3055     //                16ByteVector v0 = counter;
 3056     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3057     //                used = 0;
 3058     //                if (len < blockSize)
 3059     //                    break;    /* goto NEXT */
 3060     //                16ByteVector v1 = load16Bytes(in, offset);
 3061     //                v1 = v1 ^ encryptedCounter;
 3062     //                store16Bytes(out, offset);
 3063     //                used = blockSize;
 3064     //                offset += blockSize;
 3065     //                len -= blockSize;
 3066     //                if (len == 0)
 3067     //                    goto DONE;
 3068     //            }
 3069     //        }
 3070     //      NEXT:
 3071     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3072     //        len--;
 3073     //    } while (len != 0);
 3074     //  DONE:
 3075     //    return result;
 3076     //
 3077     // CTR_large_block()
 3078     //    Wide bulk encryption of whole blocks.
 3079 
 3080     __ align(CodeEntryAlignment);
 3081     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3082     StubCodeMark mark(this, stub_id);
 3083     const address start = __ pc();
 3084     __ enter();
 3085 
 3086     Label DONE, CTR_large_block, large_block_return;
 3087     __ ldrw(used, Address(used_ptr));
 3088     __ cbzw(saved_len, DONE);
 3089 
 3090     __ mov(len, saved_len);
 3091     __ mov(offset, 0);
 3092 
 3093     // Compute #rounds for AES based on the length of the key array
 3094     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3095 
 3096     __ aesenc_loadkeys(key, keylen);
 3097 
 3098     {
 3099       Label L_CTR_loop, NEXT;
 3100 
 3101       __ bind(L_CTR_loop);
 3102 
 3103       __ cmp(used, block_size);
 3104       __ br(__ LO, NEXT);
 3105 
 3106       // Maybe we have a lot of data
 3107       __ subsw(rscratch1, len, bulk_width * block_size);
 3108       __ br(__ HS, CTR_large_block);
 3109       __ BIND(large_block_return);
 3110       __ cbzw(len, DONE);
 3111 
 3112       // Setup the counter
 3113       __ movi(v4, __ T4S, 0);
 3114       __ movi(v5, __ T4S, 1);
 3115       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3116 
 3117       // 128-bit big-endian increment
 3118       __ ld1(v0, __ T16B, counter);
 3119       __ rev64(v16, __ T16B, v0);
 3120       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3121       __ rev64(v16, __ T16B, v16);
 3122       __ st1(v16, __ T16B, counter);
 3123       // Previous counter value is in v0
 3124       // v4 contains { 0, 1 }
 3125 
 3126       {
 3127         // We have fewer than bulk_width blocks of data left. Encrypt
 3128         // them one by one until there is less than a full block
 3129         // remaining, being careful to save both the encrypted counter
 3130         // and the counter.
 3131 
 3132         Label inner_loop;
 3133         __ bind(inner_loop);
 3134         // Counter to encrypt is in v0
 3135         __ aesecb_encrypt(noreg, noreg, keylen);
 3136         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3137 
 3138         // Do we have a remaining full block?
 3139 
 3140         __ mov(used, 0);
 3141         __ cmp(len, block_size);
 3142         __ br(__ LO, NEXT);
 3143 
 3144         // Yes, we have a full block
 3145         __ ldrq(v1, Address(in, offset));
 3146         __ eor(v1, __ T16B, v1, v0);
 3147         __ strq(v1, Address(out, offset));
 3148         __ mov(used, block_size);
 3149         __ add(offset, offset, block_size);
 3150 
 3151         __ subw(len, len, block_size);
 3152         __ cbzw(len, DONE);
 3153 
 3154         // Increment the counter, store it back
 3155         __ orr(v0, __ T16B, v16, v16);
 3156         __ rev64(v16, __ T16B, v16);
 3157         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3158         __ rev64(v16, __ T16B, v16);
 3159         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3160 
 3161         __ b(inner_loop);
 3162       }
 3163 
 3164       __ BIND(NEXT);
 3165 
 3166       // Encrypt a single byte, and loop.
 3167       // We expect this to be a rare event.
 3168       __ ldrb(rscratch1, Address(in, offset));
 3169       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3170       __ eor(rscratch1, rscratch1, rscratch2);
 3171       __ strb(rscratch1, Address(out, offset));
 3172       __ add(offset, offset, 1);
 3173       __ add(used, used, 1);
      __ subw(len, len, 1);
 3175       __ cbnzw(len, L_CTR_loop);
 3176     }
 3177 
 3178     __ bind(DONE);
 3179     __ strw(used, Address(used_ptr));
 3180     __ mov(r0, saved_len);
 3181 
 3182     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3183     __ ret(lr);
 3184 
 3185     // Bulk encryption
 3186 
    __ BIND(CTR_large_block);
 3188     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3189 
 3190     if (bulk_width == 8) {
 3191       __ sub(sp, sp, 4 * 16);
 3192       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3193     }
 3194     __ sub(sp, sp, 4 * 16);
 3195     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3196     RegSet saved_regs = (RegSet::of(in, out, offset)
 3197                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3198     __ push(saved_regs, sp);
 3199     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3200     __ add(in, in, offset);
 3201     __ add(out, out, offset);
 3202 
 3203     // Keys should already be loaded into the correct registers
 3204 
 3205     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3206     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3207 
 3208     // AES/CTR loop
 3209     {
 3210       Label L_CTR_loop;
 3211       __ BIND(L_CTR_loop);
 3212 
 3213       // Setup the counters
 3214       __ movi(v8, __ T4S, 0);
 3215       __ movi(v9, __ T4S, 1);
 3216       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3217 
 3218       for (int i = 0; i < bulk_width; i++) {
 3219         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3220         __ rev64(v0_ofs, __ T16B, v16);
 3221         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3222       }
 3223 
 3224       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3225 
 3226       // Encrypt the counters
 3227       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3228 
 3229       if (bulk_width == 8) {
 3230         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3231       }
 3232 
 3233       // XOR the encrypted counters with the inputs
 3234       for (int i = 0; i < bulk_width; i++) {
 3235         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3236         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3237         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3238       }
 3239 
 3240       // Write the encrypted data
 3241       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3242       if (bulk_width == 8) {
 3243         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3244       }
 3245 
 3246       __ subw(len, len, 16 * bulk_width);
 3247       __ cbnzw(len, L_CTR_loop);
 3248     }
 3249 
 3250     // Save the counter back where it goes
 3251     __ rev64(v16, __ T16B, v16);
 3252     __ st1(v16, __ T16B, counter);
 3253 
 3254     __ pop(saved_regs, sp);
 3255 
 3256     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3257     if (bulk_width == 8) {
 3258       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3259     }
 3260 
 3261     __ andr(rscratch1, len, -16 * bulk_width);
 3262     __ sub(len, len, rscratch1);
 3263     __ add(offset, offset, rscratch1);
 3264     __ mov(used, 16);
 3265     __ strw(used, Address(used_ptr));
 3266     __ b(large_block_return);
 3267 
 3268     return start;
 3269   }
 3270 
 3271   // Vector AES Galois Counter Mode implementation. Parameters:
 3272   //
 3273   // in = c_rarg0
 3274   // len = c_rarg1
 3275   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3276   // out = c_rarg3
 3277   // key = c_rarg4
 3278   // state = c_rarg5 - GHASH.state
 3279   // subkeyHtbl = c_rarg6 - powers of H
 3280   // counter = c_rarg7 - 16 bytes of CTR
 3281   // return - number of processed bytes
 3282   address generate_galoisCounterMode_AESCrypt() {
 3283     address ghash_polynomial = __ pc();
 3284     __ emit_int64(0x87);  // The low-order bits of the field
 3285                           // polynomial (i.e. p = z^7+z^2+z+1)
 3286                           // repeated in the low and high parts of a
 3287                           // 128-bit vector
 3288     __ emit_int64(0x87);
 3289 
 3290     __ align(CodeEntryAlignment);
 3291     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3292     StubCodeMark mark(this, stub_id);
 3293     address start = __ pc();
 3294     __ enter();
 3295 
 3296     const Register in = c_rarg0;
 3297     const Register len = c_rarg1;
 3298     const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;  // holds 16 bytes of CTR, updated with
                                       // the incremented counter at the end
 3308 
 3309     const Register keylen = r10;
 3310     // Save state before entering routine
 3311     __ sub(sp, sp, 4 * 16);
 3312     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3313     __ sub(sp, sp, 4 * 16);
 3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3315 
 3317     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3318     __ str(len, __ pre(sp, -2 * wordSize));
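    // Only whole multiples of 8 blocks (128 bytes) are processed here; the
    // caller is presumably left to handle any remaining tail using the
    // byte count returned in r0.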
 3319 
 3320     Label DONE;
 3321     __ cbz(len, DONE);
 3322 
 3323     // Compute #rounds for AES based on the length of the key array
 3324     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3325 
 3326     __ aesenc_loadkeys(key, keylen);
 3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3328     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3329 
 3330     // AES/CTR loop
 3331     {
 3332       Label L_CTR_loop;
 3333       __ BIND(L_CTR_loop);
 3334 
 3335       // Setup the counters
 3336       __ movi(v8, __ T4S, 0);
 3337       __ movi(v9, __ T4S, 1);
 3338       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3339 
 3340       assert(v0->encoding() < v8->encoding(), "");
 3341       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3342         FloatRegister f = as_FloatRegister(i);
 3343         __ rev32(f, __ T16B, v16);
 3344         __ addv(v16, __ T4S, v16, v8);
 3345       }
 3346 
 3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3348 
 3349       // Encrypt the counters
 3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3351 
 3352       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3353 
 3354       // XOR the encrypted counters with the inputs
 3355       for (int i = 0; i < 8; i++) {
 3356         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3357         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3358         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3359       }
 3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3361       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3362 
 3363       __ subw(len, len, 16 * 8);
 3364       __ cbnzw(len, L_CTR_loop);
 3365     }
 3366 
 3367     __ rev32(v16, __ T16B, v16);
 3368     __ st1(v16, __ T16B, counter);
 3369 
 3370     __ ldr(len, Address(sp));
 3371     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3372 
 3373     // GHASH/CTR loop
 3374     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3375                                 len, /*unrolls*/4);
 3376 
 3377 #ifdef ASSERT
 3378     { Label L;
 3379       __ cmp(len, (unsigned char)0);
 3380       __ br(Assembler::EQ, L);
 3381       __ stop("stubGenerator: abort");
 3382       __ bind(L);
 3383   }
 3384 #endif
 3385 
    __ bind(DONE);
 3387     // Return the number of bytes processed
 3388     __ ldr(r0, __ post(sp, 2 * wordSize));
 3389 
 3390     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3391     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3392 
 3393     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3394     __ ret(lr);
    return start;
 3396   }
 3397 
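  // Caches one 64-byte MD5 input block in eight 64-bit registers so that
  // the sixteen 4-byte message words can be extracted with ubfx instead
  // of being re-loaded from memory in every round.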
 3398   class Cached64Bytes {
 3399   private:
 3400     MacroAssembler *_masm;
 3401     Register _regs[8];
 3402 
 3403   public:
 3404     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3406       auto it = rs.begin();
 3407       for (auto &r: _regs) {
 3408         r = *it;
 3409         ++it;
 3410       }
 3411     }
 3412 
 3413     void gen_loads(Register base) {
 3414       for (int i = 0; i < 8; i += 2) {
 3415         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3416       }
 3417     }
 3418 
 3419     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3420     void extract_u32(Register dest, int i) {
 3421       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3422     }
 3423   };
 3424 
 3425   // Utility routines for md5.
 3426   // Clobbers r10 and r11.
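  // Each helper implements one MD5 step, a = b + rol(a + f(b,c,d) + x[k] + t, s),
  // with the round function f computed branchlessly:
  //   FF: F(b,c,d) = (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d
  //   GG: G(b,c,d) = (b & d) | (c & ~d); the two terms are masked by d and
  //       ~d and therefore bitwise disjoint, so addition can replace the OR
  //   HH: H(b,c,d) = b ^ c ^ d
  //   II: I(b,c,d) = c ^ (b | ~d)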
 3427   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3428               int k, int s, int t) {
 3429     Register rscratch3 = r10;
 3430     Register rscratch4 = r11;
 3431 
 3432     __ eorw(rscratch3, r3, r4);
 3433     __ movw(rscratch2, t);
 3434     __ andw(rscratch3, rscratch3, r2);
 3435     __ addw(rscratch4, r1, rscratch2);
 3436     reg_cache.extract_u32(rscratch1, k);
 3437     __ eorw(rscratch3, rscratch3, r4);
 3438     __ addw(rscratch4, rscratch4, rscratch1);
 3439     __ addw(rscratch3, rscratch3, rscratch4);
 3440     __ rorw(rscratch2, rscratch3, 32 - s);
 3441     __ addw(r1, rscratch2, r2);
 3442   }
 3443 
 3444   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3445               int k, int s, int t) {
 3446     Register rscratch3 = r10;
 3447     Register rscratch4 = r11;
 3448 
 3449     reg_cache.extract_u32(rscratch1, k);
 3450     __ movw(rscratch2, t);
 3451     __ addw(rscratch4, r1, rscratch2);
 3452     __ addw(rscratch4, rscratch4, rscratch1);
 3453     __ bicw(rscratch2, r3, r4);
 3454     __ andw(rscratch3, r2, r4);
 3455     __ addw(rscratch2, rscratch2, rscratch4);
 3456     __ addw(rscratch2, rscratch2, rscratch3);
 3457     __ rorw(rscratch2, rscratch2, 32 - s);
 3458     __ addw(r1, rscratch2, r2);
 3459   }
 3460 
 3461   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3462               int k, int s, int t) {
 3463     Register rscratch3 = r10;
 3464     Register rscratch4 = r11;
 3465 
 3466     __ eorw(rscratch3, r3, r4);
 3467     __ movw(rscratch2, t);
 3468     __ addw(rscratch4, r1, rscratch2);
 3469     reg_cache.extract_u32(rscratch1, k);
 3470     __ eorw(rscratch3, rscratch3, r2);
 3471     __ addw(rscratch4, rscratch4, rscratch1);
 3472     __ addw(rscratch3, rscratch3, rscratch4);
 3473     __ rorw(rscratch2, rscratch3, 32 - s);
 3474     __ addw(r1, rscratch2, r2);
 3475   }
 3476 
 3477   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3478               int k, int s, int t) {
 3479     Register rscratch3 = r10;
 3480     Register rscratch4 = r11;
 3481 
 3482     __ movw(rscratch3, t);
 3483     __ ornw(rscratch2, r2, r4);
 3484     __ addw(rscratch4, r1, rscratch3);
 3485     reg_cache.extract_u32(rscratch1, k);
 3486     __ eorw(rscratch3, rscratch2, r3);
 3487     __ addw(rscratch4, rscratch4, rscratch1);
 3488     __ addw(rscratch3, rscratch3, rscratch4);
 3489     __ rorw(rscratch2, rscratch3, 32 - s);
 3490     __ addw(r1, rscratch2, r2);
 3491   }
 3492 
 3493   // Arguments:
 3494   //
 3495   // Inputs:
 3496   //   c_rarg0   - byte[]  source+offset
 3497   //   c_rarg1   - int[]   SHA.state
 3498   //   c_rarg2   - int     offset
 3499   //   c_rarg3   - int     limit
 3500   //
 3501   address generate_md5_implCompress(StubGenStubId stub_id) {
 3502     bool multi_block;
 3503     switch (stub_id) {
 3504     case md5_implCompress_id:
 3505       multi_block = false;
 3506       break;
 3507     case md5_implCompressMB_id:
 3508       multi_block = true;
 3509       break;
 3510     default:
 3511       ShouldNotReachHere();
 3512     }
 3513     __ align(CodeEntryAlignment);
 3514 
 3515     StubCodeMark mark(this, stub_id);
 3516     address start = __ pc();
 3517 
 3518     Register buf       = c_rarg0;
 3519     Register state     = c_rarg1;
 3520     Register ofs       = c_rarg2;
 3521     Register limit     = c_rarg3;
 3522     Register a         = r4;
 3523     Register b         = r5;
 3524     Register c         = r6;
 3525     Register d         = r7;
 3526     Register rscratch3 = r10;
 3527     Register rscratch4 = r11;
 3528 
 3529     Register state_regs[2] = { r12, r13 };
 3530     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3531     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3532 
 3533     __ push(saved_regs, sp);
 3534 
 3535     __ ldp(state_regs[0], state_regs[1], Address(state));
 3536     __ ubfx(a, state_regs[0],  0, 32);
 3537     __ ubfx(b, state_regs[0], 32, 32);
 3538     __ ubfx(c, state_regs[1],  0, 32);
 3539     __ ubfx(d, state_regs[1], 32, 32);
 3540 
 3541     Label md5_loop;
 3542     __ BIND(md5_loop);
 3543 
 3544     reg_cache.gen_loads(buf);
 3545 
 3546     // Round 1
 3547     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3548     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3549     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3550     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3551     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3552     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3553     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3554     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3555     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3556     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3557     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3558     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3559     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3560     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3561     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3562     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3563 
 3564     // Round 2
 3565     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3566     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3567     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3568     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3569     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3570     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3571     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3572     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3573     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3574     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3575     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3576     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3577     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3578     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3579     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3580     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3581 
 3582     // Round 3
 3583     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3584     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3585     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3586     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3587     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3588     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3589     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3590     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3591     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3592     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3593     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3594     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3595     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3596     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3597     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3598     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3599 
 3600     // Round 4
 3601     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3602     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3603     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3604     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3605     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3606     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3607     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3608     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3609     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3610     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3611     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3612     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3613     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3614     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3615     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3616     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3617 
 3618     __ addw(a, state_regs[0], a);
 3619     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3620     __ addw(b, rscratch2, b);
 3621     __ addw(c, state_regs[1], c);
 3622     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3623     __ addw(d, rscratch4, d);
 3624 
 3625     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3626     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3627 
 3628     if (multi_block) {
 3629       __ add(buf, buf, 64);
 3630       __ add(ofs, ofs, 64);
 3631       __ cmp(ofs, limit);
 3632       __ br(Assembler::LE, md5_loop);
 3633       __ mov(c_rarg0, ofs); // return ofs
 3634     }
 3635 
 3636     // write hash values back in the correct order
 3637     __ stp(state_regs[0], state_regs[1], Address(state));
 3638 
 3639     __ pop(saved_regs, sp);
 3640 
 3641     __ ret(lr);
 3642 
 3643     return start;
 3644   }
 3645 
 3646   // Arguments:
 3647   //
 3648   // Inputs:
 3649   //   c_rarg0   - byte[]  source+offset
 3650   //   c_rarg1   - int[]   SHA.state
 3651   //   c_rarg2   - int     offset
 3652   //   c_rarg3   - int     limit
 3653   //
 3654   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3655     bool multi_block;
 3656     switch (stub_id) {
 3657     case sha1_implCompress_id:
 3658       multi_block = false;
 3659       break;
 3660     case sha1_implCompressMB_id:
 3661       multi_block = true;
 3662       break;
 3663     default:
 3664       ShouldNotReachHere();
 3665     }
 3666 
 3667     __ align(CodeEntryAlignment);
 3668 
 3669     StubCodeMark mark(this, stub_id);
 3670     address start = __ pc();
 3671 
 3672     Register buf   = c_rarg0;
 3673     Register state = c_rarg1;
 3674     Register ofs   = c_rarg2;
 3675     Register limit = c_rarg3;
 3676 
 3677     Label keys;
 3678     Label sha1_loop;
 3679 
 3680     // load the keys into v0..v3
 3681     __ adr(rscratch1, keys);
 3682     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

 3688     __ BIND(sha1_loop);
 3689     // load 64 bytes of data into v16..v19
 3690     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3691     __ rev32(v16, __ T16B, v16);
 3692     __ rev32(v17, __ T16B, v17);
 3693     __ rev32(v18, __ T16B, v18);
 3694     __ rev32(v19, __ T16B, v19);
 3695 
 3696     // do the sha1
 3697     __ addv(v4, __ T4S, v16, v0);
 3698     __ orr(v20, __ T16B, v6, v6);
 3699 
 3700     FloatRegister d0 = v16;
 3701     FloatRegister d1 = v17;
 3702     FloatRegister d2 = v18;
 3703     FloatRegister d3 = v19;
 3704 
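    // Each iteration below performs four of the 80 SHA-1 rounds: loop
    // rounds 0..4 use sha1c (Ch, SHA-1 rounds 0..19), rounds 10..14 use
    // sha1m (Maj, SHA-1 rounds 40..59), and the rest use sha1p (parity);
    // sha1su0/sha1su1 extend the message schedule four words at a time.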
 3705     for (int round = 0; round < 20; round++) {
 3706       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3707       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3708       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3709       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3710       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3711 
 3712       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3713       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3714       __ sha1h(tmp2, __ T4S, v20);
 3715       if (round < 5)
 3716         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3717       else if (round < 10 || round >= 15)
 3718         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3719       else
 3720         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3721       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3722 
 3723       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3724     }
 3725 
 3726     __ addv(v7, __ T2S, v7, v21);
 3727     __ addv(v6, __ T4S, v6, v20);
 3728 
 3729     if (multi_block) {
 3730       __ add(ofs, ofs, 64);
 3731       __ cmp(ofs, limit);
 3732       __ br(Assembler::LE, sha1_loop);
 3733       __ mov(c_rarg0, ofs); // return ofs
 3734     }
 3735 
 3736     __ strq(v6, Address(state, 0));
 3737     __ strs(v7, Address(state, 16));
 3738 
 3739     __ ret(lr);
 3740 
 3741     __ bind(keys);
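    // SHA-1 round constants, one per 20-round group (K0..K3)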
 3742     __ emit_int32(0x5a827999);
 3743     __ emit_int32(0x6ed9eba1);
 3744     __ emit_int32(0x8f1bbcdc);
 3745     __ emit_int32(0xca62c1d6);
 3746 
 3747     return start;
 3748   }
 3749 
 3750 
 3751   // Arguments:
 3752   //
 3753   // Inputs:
 3754   //   c_rarg0   - byte[]  source+offset
 3755   //   c_rarg1   - int[]   SHA.state
 3756   //   c_rarg2   - int     offset
 3757   //   c_rarg3   - int     limit
 3758   //
 3759   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3760     bool multi_block;
 3761     switch (stub_id) {
 3762     case sha256_implCompress_id:
 3763       multi_block = false;
 3764       break;
 3765     case sha256_implCompressMB_id:
 3766       multi_block = true;
 3767       break;
 3768     default:
 3769       ShouldNotReachHere();
 3770     }
 3771 
 3772     static const uint32_t round_consts[64] = {
 3773       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3774       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3775       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3776       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3777       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3778       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3779       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3780       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3781       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3782       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3783       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3784       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3785       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3786       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3787       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3788       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3789     };
 3790 
 3791     __ align(CodeEntryAlignment);
 3792 
 3793     StubCodeMark mark(this, stub_id);
 3794     address start = __ pc();
 3795 
 3796     Register buf   = c_rarg0;
 3797     Register state = c_rarg1;
 3798     Register ofs   = c_rarg2;
 3799     Register limit = c_rarg3;
 3800 
    Label sha256_loop;
 3802 
 3803     __ stpd(v8, v9, __ pre(sp, -32));
 3804     __ stpd(v10, v11, Address(sp, 16));
 3805 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
 3813 
 3814     // load 16 keys to v16..v31
 3815     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3816     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3817     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3818     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3819     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3820 
    // load the 8-word (256-bit) state
 3822     __ ldpq(v0, v1, state);
 3823 
    __ BIND(sha256_loop);
 3825     // load 64 bytes of data into v8..v11
 3826     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3827     __ rev32(v8, __ T16B, v8);
 3828     __ rev32(v9, __ T16B, v9);
 3829     __ rev32(v10, __ T16B, v10);
 3830     __ rev32(v11, __ T16B, v11);
 3831 
 3832     __ addv(v6, __ T4S, v8, v16);
 3833     __ orr(v2, __ T16B, v0, v0);
 3834     __ orr(v3, __ T16B, v1, v1);
 3835 
 3836     FloatRegister d0 = v8;
 3837     FloatRegister d1 = v9;
 3838     FloatRegister d2 = v10;
    FloatRegister d3 = v11;

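    // Each of the 16 iterations below performs four of the 64 SHA-256
    // rounds: sha256h/sha256h2 update the two state halves using the
    // constants pre-added in v16..v31, while sha256su0/sha256su1 extend
    // the message schedule for rounds 16..63.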
 3842     for (int round = 0; round < 16; round++) {
 3843       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3844       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3845       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3846       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3847 
 3848       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
 3850       if (round < 15)
 3851         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3852       __ sha256h(v2, __ T4S, v3, tmp2);
 3853       __ sha256h2(v3, __ T4S, v4, tmp2);
 3854       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3855 
 3856       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3857     }
 3858 
 3859     __ addv(v0, __ T4S, v0, v2);
 3860     __ addv(v1, __ T4S, v1, v3);
 3861 
 3862     if (multi_block) {
 3863       __ add(ofs, ofs, 64);
 3864       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
 3866       __ mov(c_rarg0, ofs); // return ofs
 3867     }
 3868 
 3869     __ ldpd(v10, v11, Address(sp, 16));
 3870     __ ldpd(v8, v9, __ post(sp, 32));
 3871 
 3872     __ stpq(v0, v1, state);
 3873 
 3874     __ ret(lr);
 3875 
 3876     return start;
 3877   }
 3878 
 3879   // Double rounds for sha512.
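  // Each call performs two of the 80 SHA-512 rounds, so one block takes
  // 40 calls. vrc0 holds the current pair of round constants and vrc1
  // receives the next pair (four pairs are preloaded, hence no reload is
  // needed once dr >= 36); sha512su0/sha512su1 extend the message
  // schedule while dr < 32, after which all 80 schedule words exist.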
 3880   void sha512_dround(int dr,
 3881                      FloatRegister vi0, FloatRegister vi1,
 3882                      FloatRegister vi2, FloatRegister vi3,
 3883                      FloatRegister vi4, FloatRegister vrc0,
 3884                      FloatRegister vrc1, FloatRegister vin0,
 3885                      FloatRegister vin1, FloatRegister vin2,
 3886                      FloatRegister vin3, FloatRegister vin4) {
 3887       if (dr < 36) {
 3888         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3889       }
 3890       __ addv(v5, __ T2D, vrc0, vin0);
 3891       __ ext(v6, __ T16B, vi2, vi3, 8);
 3892       __ ext(v5, __ T16B, v5, v5, 8);
 3893       __ ext(v7, __ T16B, vi1, vi2, 8);
 3894       __ addv(vi3, __ T2D, vi3, v5);
 3895       if (dr < 32) {
 3896         __ ext(v5, __ T16B, vin3, vin4, 8);
 3897         __ sha512su0(vin0, __ T2D, vin1);
 3898       }
 3899       __ sha512h(vi3, __ T2D, v6, v7);
 3900       if (dr < 32) {
 3901         __ sha512su1(vin0, __ T2D, vin2, v5);
 3902       }
 3903       __ addv(vi4, __ T2D, vi1, vi3);
 3904       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3905   }
 3906 
 3907   // Arguments:
 3908   //
 3909   // Inputs:
 3910   //   c_rarg0   - byte[]  source+offset
 3911   //   c_rarg1   - int[]   SHA.state
 3912   //   c_rarg2   - int     offset
 3913   //   c_rarg3   - int     limit
 3914   //
 3915   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3916     bool multi_block;
 3917     switch (stub_id) {
 3918     case sha512_implCompress_id:
 3919       multi_block = false;
 3920       break;
 3921     case sha512_implCompressMB_id:
 3922       multi_block = true;
 3923       break;
 3924     default:
 3925       ShouldNotReachHere();
 3926     }
 3927 
 3928     static const uint64_t round_consts[80] = {
 3929       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3930       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3931       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3932       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3933       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3934       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3935       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3936       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3937       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3938       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3939       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3940       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3941       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3942       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3943       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3944       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3945       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3946       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3947       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3948       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3949       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3950       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3951       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3952       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3953       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3954       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3955       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3956     };
 3957 
 3958     __ align(CodeEntryAlignment);
 3959 
 3960     StubCodeMark mark(this, stub_id);
 3961     address start = __ pc();
 3962 
 3963     Register buf   = c_rarg0;
 3964     Register state = c_rarg1;
 3965     Register ofs   = c_rarg2;
 3966     Register limit = c_rarg3;
 3967 
 3968     __ stpd(v8, v9, __ pre(sp, -64));
 3969     __ stpd(v10, v11, Address(sp, 16));
 3970     __ stpd(v12, v13, Address(sp, 32));
 3971     __ stpd(v14, v15, Address(sp, 48));
 3972 
 3973     Label sha512_loop;
 3974 
 3975     // load state
 3976     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3977 
 3978     // load first 4 round constants
 3979     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3980     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 3981 
 3982     __ BIND(sha512_loop);
 3983     // load 128B of data into v12..v19
 3984     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 3985     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 3986     __ rev64(v12, __ T16B, v12);
 3987     __ rev64(v13, __ T16B, v13);
 3988     __ rev64(v14, __ T16B, v14);
 3989     __ rev64(v15, __ T16B, v15);
 3990     __ rev64(v16, __ T16B, v16);
 3991     __ rev64(v17, __ T16B, v17);
 3992     __ rev64(v18, __ T16B, v18);
 3993     __ rev64(v19, __ T16B, v19);
 3994 
 3995     __ mov(rscratch2, rscratch1);
 3996 
 3997     __ mov(v0, __ T16B, v8);
 3998     __ mov(v1, __ T16B, v9);
 3999     __ mov(v2, __ T16B, v10);
 4000     __ mov(v3, __ T16B, v11);
 4001 
 4002     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4003     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4004     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4005     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4006     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4007     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4008     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4009     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4010     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4011     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4012     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4013     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4014     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4015     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4016     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4017     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4018     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4019     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4020     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4021     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4022     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4023     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4024     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4025     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4026     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4027     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4028     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4029     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4030     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4031     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4032     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4033     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4034     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4035     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4036     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4037     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4038     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4039     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4040     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4041     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4042 
 4043     __ addv(v8, __ T2D, v8, v0);
 4044     __ addv(v9, __ T2D, v9, v1);
 4045     __ addv(v10, __ T2D, v10, v2);
 4046     __ addv(v11, __ T2D, v11, v3);
 4047 
 4048     if (multi_block) {
 4049       __ add(ofs, ofs, 128);
 4050       __ cmp(ofs, limit);
 4051       __ br(Assembler::LE, sha512_loop);
 4052       __ mov(c_rarg0, ofs); // return ofs
 4053     }
 4054 
 4055     __ st1(v8, v9, v10, v11, __ T2D, state);
 4056 
 4057     __ ldpd(v14, v15, Address(sp, 48));
 4058     __ ldpd(v12, v13, Address(sp, 32));
 4059     __ ldpd(v10, v11, Address(sp, 16));
 4060     __ ldpd(v8, v9, __ post(sp, 64));
 4061 
 4062     __ ret(lr);
 4063 
 4064     return start;
 4065   }
 4066 
 4067   // Execute one round of keccak of two computations in parallel.
 4068   // One of the states should be loaded into the lower halves of
 4069   // the vector registers v0-v24, the other should be loaded into
 4070   // the upper halves of those registers. The ld1r instruction loads
 4071   // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, one need only load the lower halves.
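  // The instruction selection maps directly onto the five Keccak step
  // functions: eor3/rax1 compute theta (the column parities c0..c4 and
  // the d values), xar performs rho and pi in a single rotate-and-xor,
  // bcax computes chi (a ^ (~b & c)), and the final eor with v31 is iota.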
 4077   void keccak_round(Register rscratch1) {
 4078   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4081   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4082   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4083   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4084   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4085   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4086   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4087   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4088 
 4089   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4090   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4091   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4092   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4093   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4094 
 4095   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4096   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4098   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4099   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4100   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4101   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4102   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4103   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4104   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4105   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4106   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4107   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4108   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4109   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4110   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4111   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4112   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4113   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4114   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4115   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4116   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4117   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4118   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4119   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4120 
  __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4122   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4123   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4124   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4125   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4126 
 4127   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4128 
 4129   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4130   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4131   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4132   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4133   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4134 
 4135   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4136   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4137   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4138   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4139   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4140 
 4141   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4142   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4143   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4144   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4145   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4146 
 4147   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4148   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4149   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4150   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4151   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4152 
 4153   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4154   }
 4155 
 4156   // Arguments:
 4157   //
 4158   // Inputs:
 4159   //   c_rarg0   - byte[]  source+offset
 4160   //   c_rarg1   - byte[]  SHA.state
 4161   //   c_rarg2   - int     block_size
 4162   //   c_rarg3   - int     offset
 4163   //   c_rarg4   - int     limit
 4164   //
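        // The block_size argument is the SHA-3 rate in bytes and selects the
        // variant (as tested bit-by-bit in the code below):
        //   72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256 or SHAKE256,
        //   144 = SHA3-224, 168 = SHAKE128
        //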
 4165   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4166     bool multi_block;
 4167     switch (stub_id) {
 4168     case sha3_implCompress_id:
 4169       multi_block = false;
 4170       break;
 4171     case sha3_implCompressMB_id:
 4172       multi_block = true;
 4173       break;
 4174     default:
 4175       ShouldNotReachHere();
 4176     }
 4177 
 4178     static const uint64_t round_consts[24] = {
 4179       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4180       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4181       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4182       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4183       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4184       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4185       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4186       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4187     };
 4188 
 4189     __ align(CodeEntryAlignment);
 4190 
 4191     StubCodeMark mark(this, stub_id);
 4192     address start = __ pc();
 4193 
 4194     Register buf           = c_rarg0;
 4195     Register state         = c_rarg1;
 4196     Register block_size    = c_rarg2;
 4197     Register ofs           = c_rarg3;
 4198     Register limit         = c_rarg4;
 4199 
 4200     Label sha3_loop, rounds24_loop;
 4201     Label sha3_512_or_sha3_384, shake128;
 4202 
 4203     __ stpd(v8, v9, __ pre(sp, -64));
 4204     __ stpd(v10, v11, Address(sp, 16));
 4205     __ stpd(v12, v13, Address(sp, 32));
 4206     __ stpd(v14, v15, Address(sp, 48));
 4207 
 4208     // load state
 4209     __ add(rscratch1, state, 32);
 4210     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4211     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4212     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4213     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4214     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4215     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4216     __ ld1(v24, __ T1D, rscratch1);
 4217 
 4218     __ BIND(sha3_loop);
 4219 
 4220     // 24 keccak rounds
 4221     __ movw(rscratch2, 24);
 4222 
 4223     // load round_constants base
 4224     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4225 
 4226     // load input
 4227     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4228     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4229     __ eor(v0, __ T8B, v0, v25);
 4230     __ eor(v1, __ T8B, v1, v26);
 4231     __ eor(v2, __ T8B, v2, v27);
 4232     __ eor(v3, __ T8B, v3, v28);
 4233     __ eor(v4, __ T8B, v4, v29);
 4234     __ eor(v5, __ T8B, v5, v30);
 4235     __ eor(v6, __ T8B, v6, v31);
 4236 
 4237     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4238     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4239 
 4240     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4241     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4242     __ eor(v7, __ T8B, v7, v25);
 4243     __ eor(v8, __ T8B, v8, v26);
 4244     __ eor(v9, __ T8B, v9, v27);
 4245     __ eor(v10, __ T8B, v10, v28);
 4246     __ eor(v11, __ T8B, v11, v29);
 4247     __ eor(v12, __ T8B, v12, v30);
 4248     __ eor(v13, __ T8B, v13, v31);
 4249 
 4250     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4251     __ eor(v14, __ T8B, v14, v25);
 4252     __ eor(v15, __ T8B, v15, v26);
 4253     __ eor(v16, __ T8B, v16, v27);
 4254 
 4255     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4256     __ andw(c_rarg5, block_size, 48);
 4257     __ cbzw(c_rarg5, rounds24_loop);
 4258 
 4259     __ tbnz(block_size, 5, shake128);
 4260     // block_size == 144, bit5 == 0, SHA3-224
 4261     __ ldrd(v28, __ post(buf, 8));
 4262     __ eor(v17, __ T8B, v17, v28);
 4263     __ b(rounds24_loop);
 4264 
 4265     __ BIND(shake128);
 4266     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4267     __ eor(v17, __ T8B, v17, v28);
 4268     __ eor(v18, __ T8B, v18, v29);
 4269     __ eor(v19, __ T8B, v19, v30);
 4270     __ eor(v20, __ T8B, v20, v31);
 4271     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4272 
 4273     __ BIND(sha3_512_or_sha3_384);
 4274     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4275     __ eor(v7, __ T8B, v7, v25);
 4276     __ eor(v8, __ T8B, v8, v26);
 4277     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4278 
 4279     // SHA3-384
 4280     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4281     __ eor(v9,  __ T8B, v9,  v27);
 4282     __ eor(v10, __ T8B, v10, v28);
 4283     __ eor(v11, __ T8B, v11, v29);
 4284     __ eor(v12, __ T8B, v12, v30);
 4285 
 4286     __ BIND(rounds24_loop);
 4287     __ subw(rscratch2, rscratch2, 1);
 4288 
 4289     keccak_round(rscratch1);
 4290 
 4291     __ cbnzw(rscratch2, rounds24_loop);
 4292 
 4293     if (multi_block) {
 4294       __ add(ofs, ofs, block_size);
 4295       __ cmp(ofs, limit);
 4296       __ br(Assembler::LE, sha3_loop);
 4297       __ mov(c_rarg0, ofs); // return ofs
 4298     }
 4299 
 4300     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4301     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4302     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4303     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4304     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4305     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4306     __ st1(v24, __ T1D, state);
 4307 
 4308     // restore callee-saved registers
 4309     __ ldpd(v14, v15, Address(sp, 48));
 4310     __ ldpd(v12, v13, Address(sp, 32));
 4311     __ ldpd(v10, v11, Address(sp, 16));
 4312     __ ldpd(v8, v9, __ post(sp, 64));
 4313 
 4314     __ ret(lr);
 4315 
 4316     return start;
 4317   }
 4318 
 4319   // Inputs:
 4320   //   c_rarg0   - long[]  state0
 4321   //   c_rarg1   - long[]  state1
 4322   address generate_double_keccak() {
 4323     static const uint64_t round_consts[24] = {
 4324       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4325       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4326       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4327       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4328       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4329       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4330       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4331       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4332     };
 4333 
 4334     // Implements the double_keccak() method of the
 4335     // sun.security.provider.SHA3Parallel class.
 4336     __ align(CodeEntryAlignment);
 4337     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4338     address start = __ pc();
 4339     __ enter();
 4340 
 4341     Register state0        = c_rarg0;
 4342     Register state1        = c_rarg1;
 4343 
 4344     Label rounds24_loop;
 4345 
 4346     // save callee-saved registers
 4347     __ stpd(v8, v9, __ pre(sp, -64));
 4348     __ stpd(v10, v11, Address(sp, 16));
 4349     __ stpd(v12, v13, Address(sp, 32));
 4350     __ stpd(v14, v15, Address(sp, 48));
 4351 
 4352     // load states
 4353     __ add(rscratch1, state0, 32);
 4354     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4355     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4356     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4357     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4358     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4359     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4360     __ ld1(v24, __ D, 0, rscratch1);
 4361     __ add(rscratch1, state1, 32);
 4362     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4363     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4364     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4365     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4366     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4367     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4368     __ ld1(v24, __ D, 1, rscratch1);
 4369 
 4370     // 24 keccak rounds
 4371     __ movw(rscratch2, 24);
 4372 
 4373     // load round_constants base
 4374     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4375 
 4376     __ BIND(rounds24_loop);
 4377     __ subw(rscratch2, rscratch2, 1);
 4378     keccak_round(rscratch1);
 4379     __ cbnzw(rscratch2, rounds24_loop);
 4380 
 4381     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4382     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4383     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4384     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4385     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4386     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4387     __ st1(v24, __ D, 0, state0);
 4388     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4389     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4390     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4391     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4392     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4393     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4394     __ st1(v24, __ D, 1, state1);
 4395 
 4396     // restore callee-saved vector registers
 4397     __ ldpd(v14, v15, Address(sp, 48));
 4398     __ ldpd(v12, v13, Address(sp, 32));
 4399     __ ldpd(v10, v11, Address(sp, 16));
 4400     __ ldpd(v8, v9, __ post(sp, 64));
 4401 
 4402     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4403     __ mov(r0, zr); // return 0
 4404     __ ret(lr);
 4405 
 4406     return start;
 4407   }
 4408 
 4409   /**
 4410    *  Arguments:
 4411    *
 4412    * Inputs:
 4413    *   c_rarg0   - int crc
 4414    *   c_rarg1   - byte* buf
 4415    *   c_rarg2   - int length
 4416    *
 4417    * Output:
 4418    *       r0    - int crc result
 4419    */
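        // A typical Java-side use that is intrinsified down to this stub
        // (an illustrative sketch, not part of this file):
        //   java.util.zip.CRC32 crc = new java.util.zip.CRC32();
        //   crc.update(buf, off, len);      // backed by updateBytesCRC32
        //   long checksum = crc.getValue();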
 4420   address generate_updateBytesCRC32() {
 4421     assert(UseCRC32Intrinsics, "what are we doing here?");
 4422 
 4423     __ align(CodeEntryAlignment);
 4424     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4425     StubCodeMark mark(this, stub_id);
 4426 
 4427     address start = __ pc();
 4428 
 4429     const Register crc   = c_rarg0;  // crc
 4430     const Register buf   = c_rarg1;  // source java byte array address
 4431     const Register len   = c_rarg2;  // length
 4432     const Register table0 = c_rarg3; // crc_table address
 4433     const Register table1 = c_rarg4;
 4434     const Register table2 = c_rarg5;
 4435     const Register table3 = c_rarg6;
 4436     const Register tmp3 = c_rarg7;
 4437 
 4438     BLOCK_COMMENT("Entry:");
 4439     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4440 
 4441     __ kernel_crc32(crc, buf, len,
 4442               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4443 
 4444     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4445     __ ret(lr);
 4446 
 4447     return start;
 4448   }
 4449 
 4450   // ChaCha20 block function.  This version parallelizes 4 quarter
 4451   // round operations at a time.  It uses 16 SIMD registers to
 4452   // produce 4 blocks of key stream.
 4453   //
 4454   // state (int[16]) = c_rarg0
 4455   // keystream (byte[256]) = c_rarg1
 4456   // return - number of bytes of keystream (always 256)
 4457   //
 4458   // In this approach, we load the 512-bit start state sequentially into
 4459   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4460   // state, with each successive set of 4 vectors having a +1 added into
 4461   // the first 32-bit lane of the 4th vector in that group (the counter).
 4462   // By doing this, we can perform the block function on 4 512-bit blocks
 4463   // within one run of this intrinsic.
 4464   // The alignment of the data across the 4-vector group is such that at
 4465   // the start it is already aligned for the first round of each two-round
 4466   // loop iteration.  In other words, the corresponding lanes of each vector
 4467   // will contain the values needed for that quarter round operation (e.g.
 4468   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4469   // In between each full round, a lane shift must occur.  Within a loop
 4470   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4471   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4472   // is effectively a diagonal orientation in columnar form.  After the
 4473   // second full round, those registers are left-rotated again, this time
 4474   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4475   // After all 10 iterations, the original state is added to each 4-vector
 4476   // working state along with the add mask, and the 4 vector groups are
 4477   // sequentially written to the memory dedicated for the output key stream.
 4478   //
 4479   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4480   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4481   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
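        //
        // For reference, a scalar sketch of the ChaCha20 quarter round on the
        // 32-bit words a, b, c, d (cc20_quarter_round below applies the same
        // sequence to all lanes of a vector at once):
        //   a += b; d ^= a; d = rol(d, 16);
        //   c += d; b ^= c; b = rol(b, 12);
        //   a += b; d ^= a; d = rol(d, 8);    // done via the lrot8Tbl lookup
        //   c += d; b ^= c; b = rol(b, 7);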
 4482   address generate_chacha20Block_qrpar() {
 4483     Label L_Q_twoRounds, L_Q_cc20_const;
 4484     // The constant data is broken into two 128-bit segments to be loaded
 4485     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4486     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
 4487     // The second 128 bits are a table constant used for 8-bit left rotations
 4488     // on 32-bit lanes within a SIMD register.
 4489     __ BIND(L_Q_cc20_const);
 4490     __ emit_int64(0x0000000000000001UL);
 4491     __ emit_int64(0x0000000000000000UL);
 4492     __ emit_int64(0x0605040702010003UL);
 4493     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4494 
 4495     __ align(CodeEntryAlignment);
 4496     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4497     StubCodeMark mark(this, stub_id);
 4498     address start = __ pc();
 4499     __ enter();
 4500 
 4501     const Register state = c_rarg0;
 4502     const Register keystream = c_rarg1;
 4503     const Register loopCtr = r10;
 4504     const Register tmpAddr = r11;
 4505 
 4506     const FloatRegister aState = v0;
 4507     const FloatRegister bState = v1;
 4508     const FloatRegister cState = v2;
 4509     const FloatRegister dState = v3;
 4510     const FloatRegister a1Vec = v4;
 4511     const FloatRegister b1Vec = v5;
 4512     const FloatRegister c1Vec = v6;
 4513     const FloatRegister d1Vec = v7;
 4514     // Skip the callee-saved registers v8 - v15
 4515     const FloatRegister a2Vec = v16;
 4516     const FloatRegister b2Vec = v17;
 4517     const FloatRegister c2Vec = v18;
 4518     const FloatRegister d2Vec = v19;
 4519     const FloatRegister a3Vec = v20;
 4520     const FloatRegister b3Vec = v21;
 4521     const FloatRegister c3Vec = v22;
 4522     const FloatRegister d3Vec = v23;
 4523     const FloatRegister a4Vec = v24;
 4524     const FloatRegister b4Vec = v25;
 4525     const FloatRegister c4Vec = v26;
 4526     const FloatRegister d4Vec = v27;
 4527     const FloatRegister scratch = v28;
 4528     const FloatRegister addMask = v29;
 4529     const FloatRegister lrot8Tbl = v30;
 4530 
 4531     // Load the initial state in the first 4 quadword registers,
 4532     // then copy the initial state into the next 4 quadword registers
 4533     // that will be used for the working state.
 4534     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4535 
 4536     // Load the index register for 2 constant 128-bit data fields.
 4537     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4538     // the 8-bit left rotation.
 4539     __ adr(tmpAddr, L_Q_cc20_const);
 4540     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
 4541 
 4542     __ mov(a1Vec, __ T16B, aState);
 4543     __ mov(b1Vec, __ T16B, bState);
 4544     __ mov(c1Vec, __ T16B, cState);
 4545     __ mov(d1Vec, __ T16B, dState);
 4546 
 4547     __ mov(a2Vec, __ T16B, aState);
 4548     __ mov(b2Vec, __ T16B, bState);
 4549     __ mov(c2Vec, __ T16B, cState);
 4550     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4551 
 4552     __ mov(a3Vec, __ T16B, aState);
 4553     __ mov(b3Vec, __ T16B, bState);
 4554     __ mov(c3Vec, __ T16B, cState);
 4555     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4556 
 4557     __ mov(a4Vec, __ T16B, aState);
 4558     __ mov(b4Vec, __ T16B, bState);
 4559     __ mov(c4Vec, __ T16B, cState);
 4560     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4561 
 4562     // Set up the 10 iteration loop
 4563     __ mov(loopCtr, 10);
 4564     __ BIND(L_Q_twoRounds);
 4565 
 4566     // The first set of operations on the vectors covers the first 4 quarter
 4567     // round operations:
 4568     //  Qround(state, 0, 4, 8,12)
 4569     //  Qround(state, 1, 5, 9,13)
 4570     //  Qround(state, 2, 6,10,14)
 4571     //  Qround(state, 3, 7,11,15)
 4572     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4573     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4574     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4575     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4576 
 4577     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4578     // diagonals. The a1Vec does not need to change orientation.
 4579     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4580     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4581     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4582     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4583 
 4584     // The second set of operations on the vectors covers the second 4 quarter
 4585     // round operations, now acting on the diagonals:
 4586     //  Qround(state, 0, 5,10,15)
 4587     //  Qround(state, 1, 6,11,12)
 4588     //  Qround(state, 2, 7, 8,13)
 4589     //  Qround(state, 3, 4, 9,14)
 4590     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4591     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4592     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4593     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4594 
 4595     // Before we start the next iteration, we need to perform shuffles
 4596     // on the b/c/d vectors to move them back to columnar organizations
 4597     // from their current diagonal orientation.
 4598     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4599     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4600     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4601     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4602 
 4603     // Decrement and iterate
 4604     __ sub(loopCtr, loopCtr, 1);
 4605     __ cbnz(loopCtr, L_Q_twoRounds);
 4606 
 4607     // Once the counter reaches zero, we fall out of the loop
 4608     // and need to add the initial state back into the working state
 4609     // represented by the a/b/c/d1Vec registers.  This is destructive
 4610     // on the dState register, but we will no longer need it.
 4611     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4612     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4613     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4614     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4615 
 4616     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4617     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4618     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4619     __ addv(dState, __ T4S, dState, addMask);
 4620     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4621 
 4622     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4623     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4624     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4625     __ addv(dState, __ T4S, dState, addMask);
 4626     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4627 
 4628     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4629     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4630     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4631     __ addv(dState, __ T4S, dState, addMask);
 4632     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4633 
 4634     // Write the final state back to the result buffer
 4635     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4636     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4637     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4638     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4639 
 4640     __ mov(r0, 256);             // Return length of output keystream
 4641     __ leave();
 4642     __ ret(lr);
 4643 
 4644     return start;
 4645   }
 4646 
 4647   void dilithium_load16zetas(int o0, Register zetas) {
 4648     __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32));
 4649     __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32));
 4651   }
 4652 
 4653   void dilithium_load32zetas(Register zetas) {
 4654     dilithium_load16zetas(16, zetas);
 4655     dilithium_load16zetas(20, zetas);
 4656   }
 4657 
 4658   // 2x16 32-bit Montgomery multiplications in parallel
 4659   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4660   // Here MONT_R_BITS is 32, so the right shift by it is implicit.
 4661   // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in
 4662   // (all 32-bit chunks of) vector registers v30 and v31, resp.
 4663   // The inputs are b[i]s in v0-v7 and c[i]s in v16-v23 and
 4664   // the results are a[i]s in v16-v23, four 32-bit values in each register,
 4665   // and we compute a_i = b_i * c_i * 2^-32 mod MONT_Q for all i.
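        //
        // In scalar form, each Montgomery multiplication below is (a sketch
        // restating the inline comments):
        //   aHigh = hi32(2 * b * c);      // sqdmulh
        //   aLow  = lo32(b * c);          // mulv
        //   m     = lo32(aLow * qInv);    // mulv by v30
        //   n     = hi32(2 * m * q);      // sqdmulh by v31
        //   a     = (aHigh - n) / 2;      // shsubv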
 4666   void dilithium_montmul32(bool by_constant) {
 4667     FloatRegister vr0 = by_constant ? v29 : v0;
 4668     FloatRegister vr1 = by_constant ? v29 : v1;
 4669     FloatRegister vr2 = by_constant ? v29 : v2;
 4670     FloatRegister vr3 = by_constant ? v29 : v3;
 4671     FloatRegister vr4 = by_constant ? v29 : v4;
 4672     FloatRegister vr5 = by_constant ? v29 : v5;
 4673     FloatRegister vr6 = by_constant ? v29 : v6;
 4674     FloatRegister vr7 = by_constant ? v29 : v7;
 4675 
 4676     __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c)
 4677     __ mulv(v16, __ T4S, vr0, v16);    // aLow = lo32(b * c)
 4678     __ sqdmulh(v25, __ T4S, vr1, v17);
 4679     __ mulv(v17, __ T4S, vr1, v17);
 4680     __ sqdmulh(v26, __ T4S, vr2, v18);
 4681     __ mulv(v18, __ T4S, vr2, v18);
 4682     __ sqdmulh(v27, __ T4S, vr3, v19);
 4683     __ mulv(v19, __ T4S, vr3, v19);
 4684 
 4685     __ mulv(v16, __ T4S, v16, v30);     // m = aLow * qinv
 4686     __ mulv(v17, __ T4S, v17, v30);
 4687     __ mulv(v18, __ T4S, v18, v30);
 4688     __ mulv(v19, __ T4S, v19, v30);
 4689 
 4690     __ sqdmulh(v16, __ T4S, v16, v31);  // n = hi32(2 * m * q)
 4691     __ sqdmulh(v17, __ T4S, v17, v31);
 4692     __ sqdmulh(v18, __ T4S, v18, v31);
 4693     __ sqdmulh(v19, __ T4S, v19, v31);
 4694 
 4695     __ shsubv(v16, __ T4S, v24, v16);   // a = (aHigh - n) / 2
 4696     __ shsubv(v17, __ T4S, v25, v17);
 4697     __ shsubv(v18, __ T4S, v26, v18);
 4698     __ shsubv(v19, __ T4S, v27, v19);
 4699 
 4700     __ sqdmulh(v24, __ T4S, vr4, v20);
 4701     __ mulv(v20, __ T4S, vr4, v20);
 4702     __ sqdmulh(v25, __ T4S, vr5, v21);
 4703     __ mulv(v21, __ T4S, vr5, v21);
 4704     __ sqdmulh(v26, __ T4S, vr6, v22);
 4705     __ mulv(v22, __ T4S, vr6, v22);
 4706     __ sqdmulh(v27, __ T4S, vr7, v23);
 4707     __ mulv(v23, __ T4S, vr7, v23);
 4708 
 4709     __ mulv(v20, __ T4S, v20, v30);
 4710     __ mulv(v21, __ T4S, v21, v30);
 4711     __ mulv(v22, __ T4S, v22, v30);
 4712     __ mulv(v23, __ T4S, v23, v30);
 4713 
 4714     __ sqdmulh(v20, __ T4S, v20, v31);
 4715     __ sqdmulh(v21, __ T4S, v21, v31);
 4716     __ sqdmulh(v22, __ T4S, v22, v31);
 4717     __ sqdmulh(v23, __ T4S, v23, v31);
 4718 
 4719     __ shsubv(v20, __ T4S, v24, v20);
 4720     __ shsubv(v21, __ T4S, v25, v21);
 4721     __ shsubv(v22, __ T4S, v26, v22);
 4722     __ shsubv(v23, __ T4S, v27, v23);
 4723   }
 4724 
 4725   // Do the additions and subtractions used in the NTT algorithm.
 4726   // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava()
 4727   void dilithium_add_sub32() {
 4728     __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp;
 4729     __ addv(v25, __ T4S, v1, v17);
 4730     __ addv(v26, __ T4S, v2, v18);
 4731     __ addv(v27, __ T4S, v3, v19);
 4732     __ addv(v28, __ T4S, v4, v20);
 4733     __ addv(v29, __ T4S, v5, v21);
 4734     __ addv(v30, __ T4S, v6, v22);
 4735     __ addv(v31, __ T4S, v7, v23);
 4736 
 4737     __ subv(v0, __ T4S, v0, v16);  // coeffs[j + l] = coeffs[j] - tmp;
 4738     __ subv(v1, __ T4S, v1, v17);
 4739     __ subv(v2, __ T4S, v2, v18);
 4740     __ subv(v3, __ T4S, v3, v19);
 4741     __ subv(v4, __ T4S, v4, v20);
 4742     __ subv(v5, __ T4S, v5, v21);
 4743     __ subv(v6, __ T4S, v6, v22);
 4744     __ subv(v7, __ T4S, v7, v23);
 4745   }
 4746 
 4747   // Do the same computation that
 4748   // dilithium_montmul32() and dilithium_add_sub32() does,
 4749   // except for only 4x4 32-bit vector elements and with
 4750   // different register usage.
 4751   void dilithium_montmul_sub_add16() {
 4752     __ sqdmulh(v24, __ T4S, v1, v16);
 4753     __ mulv(v16, __ T4S, v1, v16);
 4754     __ sqdmulh(v25, __ T4S, v3, v17);
 4755     __ mulv(v17, __ T4S, v3, v17);
 4756     __ sqdmulh(v26, __ T4S, v5, v18);
 4757     __ mulv(v18, __ T4S, v5, v18);
 4758     __ sqdmulh(v27, __ T4S, v7, v19);
 4759     __ mulv(v19, __ T4S, v7, v19);
 4760 
 4761     __ mulv(v16, __ T4S, v16, v30);
 4762     __ mulv(v17, __ T4S, v17, v30);
 4763     __ mulv(v18, __ T4S, v18, v30);
 4764     __ mulv(v19, __ T4S, v19, v30);
 4765 
 4766     __ sqdmulh(v16, __ T4S, v16, v31);
 4767     __ sqdmulh(v17, __ T4S, v17, v31);
 4768     __ sqdmulh(v18, __ T4S, v18, v31);
 4769     __ sqdmulh(v19, __ T4S, v19, v31);
 4770 
 4771     __ shsubv(v16, __ T4S, v24, v16);
 4772     __ shsubv(v17, __ T4S, v25, v17);
 4773     __ shsubv(v18, __ T4S, v26, v18);
 4774     __ shsubv(v19, __ T4S, v27, v19);
 4775 
 4776     __ subv(v1, __ T4S, v0, v16);
 4777     __ subv(v3, __ T4S, v2, v17);
 4778     __ subv(v5, __ T4S, v4, v18);
 4779     __ subv(v7, __ T4S, v6, v19);
 4780 
 4781     __ addv(v0, __ T4S, v0, v16);
 4782     __ addv(v2, __ T4S, v2, v17);
 4783     __ addv(v4, __ T4S, v4, v18);
 4784     __ addv(v6, __ T4S, v6, v19);
 4785   }
 4786 
 4787   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4788   // in the Java implementation come in sequences of at least 8, so we
 4789   // can use ldpq to collect the corresponding data into pairs of vector
 4790   // registers.
 4791   // We collect the coefficients corresponding to the 'j+l' indexes into
 4792   // the vector registers v0-v7, the zetas into the vector registers v16-v23,
 4793   // then we do the (Montgomery) multiplications by the zetas in parallel
 4794   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4795   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4796   // v0-v7 and finally save the results back to the coeffs array.
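        //
        // Per pair of indices this implements the butterfly from the Java
        // loop body (a sketch):
        //   int tmp = montMul(zetas[m], coeffs[j + l]);
        //   coeffs[j + l] = coeffs[j] - tmp;
        //   coeffs[j]     = coeffs[j] + tmp;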
 4797   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4798     const Register coeffs, const Register zetas) {
 4799     int c1 = 0;
 4800     int c2 = 512;
 4801     int startIncr;
 4802     int incr1 = 32;
 4803     int incr2 = 64;
 4804     int incr3 = 96;
 4805 
 4806     for (int level = 0; level < 5; level++) {
 4807       int c1Start = c1;
 4808       int c2Start = c2;
 4809       if (level == 3) {
 4810         incr1 = 32;
 4811         incr2 = 128;
 4812         incr3 = 160;
 4813       } else if (level == 4) {
 4814         incr1 = 64;
 4815         incr2 = 128;
 4816         incr3 = 192;
 4817       }
 4818 
 4819       for (int i = 0; i < 4; i++) {
 4820         __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
 4821         __ ldpq(v0, v1, Address(coeffs, c2Start));
 4822         __ ldpq(v2, v3, Address(coeffs, c2Start + incr1));
 4823         __ ldpq(v4, v5, Address(coeffs, c2Start + incr2));
 4824         __ ldpq(v6, v7, Address(coeffs, c2Start + incr3));
 4825         dilithium_load32zetas(zetas);
 4826         dilithium_montmul32(false);
 4827         __ ldpq(v0, v1, Address(coeffs, c1Start));
 4828         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 4829         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 4830         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 4831         dilithium_add_sub32();
 4832         __ stpq(v24, v25, Address(coeffs, c1Start));
 4833         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 4834         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 4835         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 4836         __ stpq(v0, v1, Address(coeffs, c2Start));
 4837         __ stpq(v2, v3, Address(coeffs, c2Start + incr1));
 4838         __ stpq(v4, v5, Address(coeffs, c2Start + incr2));
 4839         __ stpq(v6, v7, Address(coeffs, c2Start + incr3));
 4840 
 4841         int k = 4 * level + i;
 4842 
 4843         if (k > 7) {
 4844           startIncr = 256;
 4845         } else if (k == 5) {
 4846           startIncr = 384;
 4847         } else {
 4848           startIncr = 128;
 4849         }
 4850 
 4851         c1Start += startIncr;
 4852         c2Start += startIncr;
 4853       }
 4854 
 4855       c2 /= 2;
 4856     }
 4857   }
 4858 
 4859   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 4860   // Implements the method
 4861   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 4862   // of the Java class sun.security.provider.ML_DSA
 4863   //
 4864   // coeffs (int[256]) = c_rarg0
 4865   // zetas (int[256]) = c_rarg1
 4866   address generate_dilithiumAlmostNtt() {
 4867 
 4868     __ align(CodeEntryAlignment);
 4869     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 4870     StubCodeMark mark(this, stub_id);
 4871     address start = __ pc();
 4872     __ enter();
 4873 
 4874     const Register coeffs = c_rarg0;
 4875     const Register zetas = c_rarg1;
 4876 
 4877     const Register tmpAddr = r9;
 4878     const Register dilithiumConsts = r10;
 4879     const Register result = r11;
 4880 
 4881     __ add(result, coeffs, 0);
 4882     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 4883 
 4884     // Each level represents one iteration of the outer for loop of the Java version
 4885 
 4886     // level 0-4
 4887     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 4888 
 4889     // level 5
 4890     for (int i = 0; i < 1024; i += 256) {
 4891       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4892       __ ldr(v0, __ Q, Address(coeffs, i + 16));
 4893       __ ldr(v1, __ Q, Address(coeffs, i + 48));
 4894       __ ldr(v2, __ Q, Address(coeffs, i + 80));
 4895       __ ldr(v3, __ Q, Address(coeffs, i + 112));
 4896       __ ldr(v4, __ Q, Address(coeffs, i + 144));
 4897       __ ldr(v5, __ Q, Address(coeffs, i + 176));
 4898       __ ldr(v6, __ Q, Address(coeffs, i + 208));
 4899       __ ldr(v7, __ Q, Address(coeffs, i + 240));
 4900       dilithium_load32zetas(zetas);
 4901       dilithium_montmul32(false);
 4902       __ ldr(v0, __ Q, Address(coeffs, i));
 4903       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 4904       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 4905       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 4906       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 4907       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 4908       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 4909       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 4910       dilithium_add_sub32();
 4911       __ str(v24, __ Q, Address(coeffs, i));
 4912       __ str(v25, __ Q, Address(coeffs, i + 32));
 4913       __ str(v26, __ Q, Address(coeffs, i + 64));
 4914       __ str(v27, __ Q, Address(coeffs, i + 96));
 4915       __ str(v28, __ Q, Address(coeffs, i + 128));
 4916       __ str(v29, __ Q, Address(coeffs, i + 160));
 4917       __ str(v30, __ Q, Address(coeffs, i + 192));
 4918       __ str(v31, __ Q, Address(coeffs, i + 224));
 4919       __ str(v0, __ Q, Address(coeffs, i + 16));
 4920       __ str(v1, __ Q, Address(coeffs, i + 48));
 4921       __ str(v2, __ Q, Address(coeffs, i + 80));
 4922       __ str(v3, __ Q, Address(coeffs, i + 112));
 4923       __ str(v4, __ Q, Address(coeffs, i + 144));
 4924       __ str(v5, __ Q, Address(coeffs, i + 176));
 4925       __ str(v6, __ Q, Address(coeffs, i + 208));
 4926       __ str(v7, __ Q, Address(coeffs, i + 240));
 4927     }
 4928 
 4929     // level 6
 4930     for (int i = 0; i < 1024; i += 128) {
 4931       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4932       __ add(tmpAddr, coeffs, i);
 4933       __ ld2(v0, v1, __ T2D, tmpAddr);
 4934       __ add(tmpAddr, coeffs, i + 32);
 4935       __ ld2(v2, v3, __ T2D, tmpAddr);
 4936       __ add(tmpAddr, coeffs, i + 64);
 4937       __ ld2(v4, v5, __ T2D, tmpAddr);
 4938       __ add(tmpAddr, coeffs, i + 96);
 4939       __ ld2(v6, v7, __ T2D, tmpAddr);
 4940       dilithium_load16zetas(16, zetas);
 4941       dilithium_montmul_sub_add16();
 4942       __ add(tmpAddr, coeffs, i);
 4943       __ st2(v0, v1, __ T2D, tmpAddr);
 4944       __ add(tmpAddr, coeffs, i + 32);
 4945       __ st2(v2, v3, __ T2D, tmpAddr);
 4946       __ add(tmpAddr, coeffs, i + 64);
 4947       __ st2(v4, v5, __ T2D, tmpAddr);
 4948       __ add(tmpAddr, coeffs, i + 96);
 4949       __ st2(v6, v7, __ T2D, tmpAddr);
 4950     }
 4951 
 4952     // level 7
 4953     for (int i = 0; i < 1024; i += 128) {
 4954       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 4955       __ add(tmpAddr, coeffs, i);
 4956       __ ld2(v0, v1, __ T4S, tmpAddr);
 4957       __ add(tmpAddr, coeffs, i + 32);
 4958       __ ld2(v2, v3, __ T4S, tmpAddr);
 4959       __ add(tmpAddr, coeffs, i + 64);
 4960       __ ld2(v4, v5, __ T4S, tmpAddr);
 4961       __ add(tmpAddr, coeffs, i + 96);
 4962       __ ld2(v6, v7, __ T4S, tmpAddr);
 4963       dilithium_load16zetas(16, zetas);
 4964       dilithium_montmul_sub_add16();
 4965       __ add(tmpAddr, coeffs, i);
 4966       __ st2(v0, v1, __ T4S, tmpAddr);
 4967       __ add(tmpAddr, coeffs, i + 32);
 4968       __ st2(v2, v3, __ T4S, tmpAddr);
 4969       __ add(tmpAddr, coeffs, i + 64);
 4970       __ st2(v4, v5, __ T4S, tmpAddr);
 4971       __ add(tmpAddr, coeffs, i + 96);
 4972       __ st2(v6, v7, __ T4S, tmpAddr);
 4973     }
 4974     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4975     __ mov(r0, zr); // return 0
 4976     __ ret(lr);
 4977 
 4978     return start;
 4979 
 4980   }
 4981 
 4982   // Do the computations that can be found in the body of the loop in
 4983   // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava()
 4984   // for 16 coefficients in parallel:
 4985   // tmp = coeffs[j];
 4986   // coeffs[j] = (tmp + coeffs[j + l]);
 4987   // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
 4988   // coeffs[j]s are loaded in v0, v2, v4 and v6,
 4989   // coeffs[j + l]s in v1, v3, v5 and v7,
 4990   // the corresponding zetas in v16, v17, v18 and v19.
 4991   void dilithium_sub_add_montmul16() {
 4992     __ subv(v20, __ T4S, v0, v1);
 4993     __ subv(v21, __ T4S, v2, v3);
 4994     __ subv(v22, __ T4S, v4, v5);
 4995     __ subv(v23, __ T4S, v6, v7);
 4996 
 4997     __ addv(v0, __ T4S, v0, v1);
 4998     __ addv(v2, __ T4S, v2, v3);
 4999     __ addv(v4, __ T4S, v4, v5);
 5000     __ addv(v6, __ T4S, v6, v7);
 5001 
 5002     __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c)
 5003     __ mulv(v1, __ T4S, v20, v16);     // aLow = lo32(b * c)
 5004     __ sqdmulh(v25, __ T4S, v21, v17);
 5005     __ mulv(v3, __ T4S, v21, v17);
 5006     __ sqdmulh(v26, __ T4S, v22, v18);
 5007     __ mulv(v5, __ T4S, v22, v18);
 5008     __ sqdmulh(v27, __ T4S, v23, v19);
 5009     __ mulv(v7, __ T4S, v23, v19);
 5010 
 5011     __ mulv(v1, __ T4S, v1, v30);      // m = aLow * qinv
 5012     __ mulv(v3, __ T4S, v3, v30);
 5013     __ mulv(v5, __ T4S, v5, v30);
 5014     __ mulv(v7, __ T4S, v7, v30);
 5015 
 5016     __ sqdmulh(v1, __ T4S, v1, v31);  // n = hi32(2 * m * q)
 5017     __ sqdmulh(v3, __ T4S, v3, v31);
 5018     __ sqdmulh(v5, __ T4S, v5, v31);
 5019     __ sqdmulh(v7, __ T4S, v7, v31);
 5020 
 5021     __ shsubv(v1, __ T4S, v24, v1);  // a = (aHigh - n) / 2
 5022     __ shsubv(v3, __ T4S, v25, v3);
 5023     __ shsubv(v5, __ T4S, v26, v5);
 5024     __ shsubv(v7, __ T4S, v27, v7);
 5025   }
 5026 
 5027   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5028   // in the Java implementation come in sequences of at least 8, so we
 5029   // can use ldpq to collect the corresponding data into pairs of vector
 5030   // registers.
 5031   // We collect the coefficients that correspond to the 'j's into v0-v7,
 5032   // the coefficients that correspond to the 'j+l's into v16-v23, then
 5033   // do the additions into v24-v31 and the subtractions into v0-v7, then
 5034   // save the results of the additions, load the zetas into v16-v23,
 5035   // do the (Montgomery) multiplications by the zetas in parallel into v16-v23,
 5036   // and finally save the results back to the coeffs array.
 5037   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5038     const Register coeffs, const Register zetas) {
 5039     int c1 = 0;
 5040     int c2 = 32;
 5041     int startIncr;
 5042     int incr1;
 5043     int incr2;
 5044     int incr3;
 5045 
 5046     for (int level = 3; level < 8; level++) {
 5047       int c1Start = c1;
 5048       int c2Start = c2;
 5049       if (level == 3) {
 5050         incr1 = 64;
 5051         incr2 = 128;
 5052         incr3 = 192;
 5053       } else if (level == 4) {
 5054         incr1 = 32;
 5055         incr2 = 128;
 5056         incr3 = 160;
 5057       } else {
 5058         incr1 = 32;
 5059         incr2 = 64;
 5060         incr3 = 96;
 5061       }
 5062 
 5063       for (int i = 0; i < 4; i++) {
 5064         __ ldpq(v0, v1, Address(coeffs, c1Start));
 5065         __ ldpq(v2, v3, Address(coeffs, c1Start + incr1));
 5066         __ ldpq(v4, v5, Address(coeffs, c1Start + incr2));
 5067         __ ldpq(v6, v7, Address(coeffs, c1Start + incr3));
 5068         __ ldpq(v16, v17, Address(coeffs, c2Start));
 5069         __ ldpq(v18, v19, Address(coeffs, c2Start + incr1));
 5070         __ ldpq(v20, v21, Address(coeffs, c2Start + incr2));
 5071         __ ldpq(v22, v23, Address(coeffs, c2Start + incr3));
 5072         dilithium_add_sub32();
 5073         __ stpq(v24, v25, Address(coeffs, c1Start));
 5074         __ stpq(v26, v27, Address(coeffs, c1Start + incr1));
 5075         __ stpq(v28, v29, Address(coeffs, c1Start + incr2));
 5076         __ stpq(v30, v31, Address(coeffs, c1Start + incr3));
 5077         __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5078         dilithium_load32zetas(zetas);
 5079         dilithium_montmul32(false);
 5080         __ stpq(v16, v17, Address(coeffs, c2Start));
 5081         __ stpq(v18, v19, Address(coeffs, c2Start + incr1));
 5082         __ stpq(v20, v21, Address(coeffs, c2Start + incr2));
 5083         __ stpq(v22, v23, Address(coeffs, c2Start + incr3));
 5084 
 5085         int k = 4 * level + i;
 5086 
 5087         if (k < 24) {
 5088           startIncr = 256;
 5089         } else if (k == 25) {
 5090           startIncr = 384;
 5091         } else {
 5092           startIncr = 128;
 5093         }
 5094 
 5095         c1Start += startIncr;
 5096         c2Start += startIncr;
 5097       }
 5098 
 5099       c2 *= 2;
 5100     }
 5101   }
 5102 
 5103   // Dilithium Inverse NTT function except for the final mod Q division by 2^256.
 5104   // Implements the method
 5105   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5106   // the sun.security.provider.ML_DSA class.
 5107   //
 5108   // coeffs (int[256]) = c_rarg0
 5109   // zetas (int[256]) = c_rarg1
 5110   address generate_dilithiumAlmostInverseNtt() {
 5111 
 5112     __ align(CodeEntryAlignment);
 5113     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5114     StubCodeMark mark(this, stub_id);
 5115     address start = __ pc();
 5116     __ enter();
 5117 
 5118     const Register coeffs = c_rarg0;
 5119     const Register zetas = c_rarg1;
 5120 
 5121     const Register tmpAddr = r9;
 5122     const Register dilithiumConsts = r10;
 5123     const Register result = r11;
 5124 
 5125     __ add(result, coeffs, 0);
 5126     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5127 
 5128     // Each level represents one iteration of the outer for loop of the Java version
 5129     // level 0
 5130     for (int i = 0; i < 1024; i += 128) {
 5131       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5132       __ add(tmpAddr, coeffs, i);
 5133       __ ld2(v0, v1, __ T4S, tmpAddr);
 5134       __ add(tmpAddr, coeffs, i + 32);
 5135       __ ld2(v2, v3, __ T4S, tmpAddr);
 5136       __ add(tmpAddr, coeffs, i + 64);
 5137       __ ld2(v4, v5, __ T4S, tmpAddr);
 5138       __ add(tmpAddr, coeffs, i + 96);
 5139       __ ld2(v6, v7, __ T4S, tmpAddr);
 5140       dilithium_load16zetas(16, zetas);
 5141       dilithium_sub_add_montmul16();
 5142       __ add(tmpAddr, coeffs, i);
 5143       __ st2(v0, v1, __ T4S, tmpAddr);
 5144       __ add(tmpAddr, coeffs, i + 32);
 5145       __ st2(v2, v3, __ T4S, tmpAddr);
 5146       __ add(tmpAddr, coeffs, i + 64);
 5147       __ st2(v4, v5, __ T4S, tmpAddr);
 5148       __ add(tmpAddr, coeffs, i + 96);
 5149       __ st2(v6, v7, __ T4S, tmpAddr);
 5150     }
 5151 
 5152     // level 1
 5153     for (int i = 0; i < 1024; i += 128) {
 5154       __ add(tmpAddr, coeffs, i);
 5155       __ ld2(v0, v1, __ T2D, tmpAddr);
 5156       __ add(tmpAddr, coeffs, i + 32);
 5157       __ ld2(v2, v3, __ T2D, tmpAddr);
 5158       __ add(tmpAddr, coeffs, i + 64);
 5159       __ ld2(v4, v5, __ T2D, tmpAddr);
 5160       __ add(tmpAddr, coeffs, i + 96);
 5161       __ ld2(v6, v7, __ T2D, tmpAddr);
 5162       dilithium_load16zetas(16, zetas);
 5163       dilithium_sub_add_montmul16();
 5164       __ add(tmpAddr, coeffs, i);
 5165       __ st2(v0, v1, __ T2D, tmpAddr);
 5166       __ add(tmpAddr, coeffs, i + 32);
 5167       __ st2(v2, v3, __ T2D, tmpAddr);
 5168       __ add(tmpAddr, coeffs, i + 64);
 5169       __ st2(v4, v5, __ T2D, tmpAddr);
 5170       __ add(tmpAddr, coeffs, i + 96);
 5171       __ st2(v6, v7, __ T2D, tmpAddr);
 5172     }
 5173 
 5174     // level 2
 5175     for (int i = 0; i < 1024; i += 256) {
 5176       __ ldr(v0, __ Q, Address(coeffs, i));
 5177       __ ldr(v1, __ Q, Address(coeffs, i + 32));
 5178       __ ldr(v2, __ Q, Address(coeffs, i + 64));
 5179       __ ldr(v3, __ Q, Address(coeffs, i + 96));
 5180       __ ldr(v4, __ Q, Address(coeffs, i + 128));
 5181       __ ldr(v5, __ Q, Address(coeffs, i + 160));
 5182       __ ldr(v6, __ Q, Address(coeffs, i + 192));
 5183       __ ldr(v7, __ Q, Address(coeffs, i + 224));
 5184       __ ldr(v16, __ Q, Address(coeffs, i + 16));
 5185       __ ldr(v17, __ Q, Address(coeffs, i + 48));
 5186       __ ldr(v18, __ Q, Address(coeffs, i + 80));
 5187       __ ldr(v19, __ Q, Address(coeffs, i + 112));
 5188       __ ldr(v20, __ Q, Address(coeffs, i + 144));
 5189       __ ldr(v21, __ Q, Address(coeffs, i + 176));
 5190       __ ldr(v22, __ Q, Address(coeffs, i + 208));
 5191       __ ldr(v23, __ Q, Address(coeffs, i + 240));
 5192       dilithium_add_sub32();
 5193       __ str(v24, __ Q, Address(coeffs, i));
 5194       __ str(v25, __ Q, Address(coeffs, i + 32));
 5195       __ str(v26, __ Q, Address(coeffs, i + 64));
 5196       __ str(v27, __ Q, Address(coeffs, i + 96));
 5197       __ str(v28, __ Q, Address(coeffs, i + 128));
 5198       __ str(v29, __ Q, Address(coeffs, i + 160));
 5199       __ str(v30, __ Q, Address(coeffs, i + 192));
 5200       __ str(v31, __ Q, Address(coeffs, i + 224));
 5201       dilithium_load32zetas(zetas);
 5202       __ ldpq(v30, v31, Address(dilithiumConsts, 0));  // qInv, q
 5203       dilithium_montmul32(false);
 5204       __ str(v16, __ Q, Address(coeffs, i + 16));
 5205       __ str(v17, __ Q, Address(coeffs, i + 48));
 5206       __ str(v18, __ Q, Address(coeffs, i + 80));
 5207       __ str(v19, __ Q, Address(coeffs, i + 112));
 5208       __ str(v20, __ Q, Address(coeffs, i + 144));
 5209       __ str(v21, __ Q, Address(coeffs, i + 176));
 5210       __ str(v22, __ Q, Address(coeffs, i + 208));
 5211       __ str(v23, __ Q, Address(coeffs, i + 240));
 5212     }
 5213 
 5214     // level 3-7
 5215     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5216 
 5217     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5218     __ mov(r0, zr); // return 0
 5219     __ ret(lr);
 5220 
 5221     return start;
 5222 
 5223   }
 5224 
 5225   // Dilithium multiply polynomials in the NTT domain.
 5226   // Straightforward implementation of the method
 5227   // static int implDilithiumNttMult(
 5228   //              int[] result, int[] ntta, int[] nttb) {} of
 5229   // the sun.security.provider.ML_DSA class.
 5230   //
 5231   // result (int[256]) = c_rarg0
 5232   // poly1 (int[256]) = c_rarg1
 5233   // poly2 (int[256]) = c_rarg2
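        //
        // Per coefficient this computes (a sketch; the second, by-constant
        // multiplication by rSquare compensates for the two implicit 2^-32
        // Montgomery factors):
        //   result[i] = montMul(montMul(poly1[i], poly2[i]), rSquare)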
 5234   address generate_dilithiumNttMult() {
 5235 
 5236     __ align(CodeEntryAlignment);
 5237     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5238     StubCodeMark mark(this, stub_id);
 5239     address start = __ pc();
 5240     __ enter();
 5241 
 5242     Label L_loop;
 5243 
 5244     const Register result = c_rarg0;
 5245     const Register poly1 = c_rarg1;
 5246     const Register poly2 = c_rarg2;
 5247 
 5248     const Register dilithiumConsts = r10;
 5249     const Register len = r11;
 5250 
 5251     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5252 
 5253     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5254     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 5255 
 5256     __ mov(len, zr);
 5257     __ add(len, len, 1024);
 5258 
 5259     __ BIND(L_loop);
 5260 
 5261     __ ldpq(v0, v1, __ post(poly1, 32));
 5262     __ ldpq(v2, v3, __ post(poly1, 32));
 5263     __ ldpq(v4, v5, __ post(poly1, 32));
 5264     __ ldpq(v6, v7, __ post(poly1, 32));
 5265     __ ldpq(v16, v17, __ post(poly2, 32));
 5266     __ ldpq(v18, v19, __ post(poly2, 32));
 5267     __ ldpq(v20, v21, __ post(poly2, 32));
 5268     __ ldpq(v22, v23, __ post(poly2, 32));
 5269     dilithium_montmul32(false);
 5270     dilithium_montmul32(true);
 5271     __ stpq(v16, v17, __ post(result, 32));
 5272     __ stpq(v18, v19, __ post(result, 32));
 5273     __ stpq(v20, v21, __ post(result, 32));
 5274     __ stpq(v22, v23, __ post(result, 32));
 5275 
 5276     __ sub(len, len, 128);
 5277     __ cmp(len, (u1)128);
 5278     __ br(Assembler::GE, L_loop);
 5279 
 5280     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5281     __ mov(r0, zr); // return 0
 5282     __ ret(lr);
 5283 
 5284     return start;
 5285 
 5286   }
 5287 
 5288   // Dilithium Montgomery multiply an array by a constant.
 5289   // A straightforward implementation of the method
 5290   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 5291   // of the sun.security.provider.ML_DSA class
 5292   //
 5293   // coeffs (int[256]) = c_rarg0
 5294   // constant (int) = c_rarg1
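        //
        // Per coefficient this computes (a sketch):
        //   coeffs[i] = montMul(coeffs[i], constant)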
 5295   address generate_dilithiumMontMulByConstant() {
 5296 
 5297     __ align(CodeEntryAlignment);
 5298     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5299     StubCodeMark mark(this, stub_id);
 5300     address start = __ pc();
 5301     __ enter();
 5302 
 5303     Label L_loop;
 5304 
 5305     const Register coeffs = c_rarg0;
 5306     const Register constant = c_rarg1;
 5307 
 5308     const Register dilithiumConsts = r10;
 5309     const Register result = r11;
 5310     const Register len = r12;
 5311 
 5312     __ add(result, coeffs, 0);
 5313     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5314 
 5315     __ ldpq(v30, v31, Address(dilithiumConsts, 0));   // qInv, q
 5316     __ dup(v29, __ T4S, constant);
 5317     __ mov(len, zr);
 5318     __ add(len, len, 1024);
 5319 
 5320     __ BIND(L_loop);
 5321 
 5322     __ ldpq(v16, v17, __ post(coeffs, 32));
 5323     __ ldpq(v18, v19, __ post(coeffs, 32));
 5324     __ ldpq(v20, v21, __ post(coeffs, 32));
 5325     __ ldpq(v22, v23, __ post(coeffs, 32));
 5326     dilithium_montmul32(true);
 5327     __ stpq(v16, v17, __ post(result, 32));
 5328     __ stpq(v18, v19, __ post(result, 32));
 5329     __ stpq(v20, v21, __ post(result, 32));
 5330     __ stpq(v22, v23, __ post(result, 32));
 5331 
 5332     __ sub(len, len, 128);
 5333     __ cmp(len, (u1)128);
 5334     __ br(Assembler::GE, L_loop);
 5335 
 5336     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5337     __ mov(r0, zr); // return 0
 5338     __ ret(lr);
 5339 
 5340     return start;
 5341   }
 5342 
 5343   // Dilithium decompose poly.
 5344   // Implements the method
 5345   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 5346   // of the sun.security.provider.ML_DSA class
 5347   //
 5348   // input (int[256]) = c_rarg0
 5349   // lowPart (int[256]) = c_rarg1
 5350   // highPart (int[256]) = c_rarg2
 5351   // twoGamma2  (int) = c_rarg3
 5352   // multiplier (int) = c_rarg4
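        //
        // Per coefficient, the loop below computes (a sketch assembled from the
        // step-by-step comments in the loop body; all arithmetic is on int):
        //   rplus = r mod q, reduced into [0, q);
        //   quotient = (rplus * multiplier) >> 22, then corrected by the two
        //     mask steps so that r0 = rplus - quotient * twoGamma2 lies in
        //     (-gamma2, gamma2];
        //   if (rplus - r0 == q - 1) { r1 = 0; r0 -= 1; } else { r1 = quotient; }
        //   lowPart[m] = r0; highPart[m] = r1;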
 5353   address generate_dilithiumDecomposePoly() {
 5354 
 5355     __ align(CodeEntryAlignment);
 5356     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5357     StubCodeMark mark(this, stub_id);
 5358     address start = __ pc();
 5359     __ enter();
 5360 
 5361     Label L_loop;
 5362 
 5363     const Register input = c_rarg0;
 5364     const Register lowPart = c_rarg1;
 5365     const Register highPart = c_rarg2;
 5366     const Register twoGamma2 = c_rarg3;
 5367     const Register multiplier = c_rarg4;
 5368 
 5369     const Register len = r9;
 5370     const Register dilithiumConsts = r10;
 5371     const Register tmp = r11;
 5372 
 5373     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5374 
 5375     // save callee-saved registers
 5376     __ stpd(v8, v9, __ pre(sp, -64));
 5377     __ stpd(v10, v11, Address(sp, 16));
 5378     __ stpd(v12, v13, Address(sp, 32));
 5379     __ stpd(v14, v15, Address(sp, 48));
 5380 
 5382     __ mov(tmp, zr);
 5383     __ add(tmp, tmp, 1);
 5384     __ dup(v25, __ T4S, tmp); // 1
 5385     __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q
 5386     __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5387     __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2
 5388     __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5389     __ subv(v26, __ T4S, v30, v25); // q - 1
 5390     __ sshr(v27, __ T4S, v28, 1); // gamma2
 5391 
 5392     __ mov(len, zr);
 5393     __ add(len, len, 1024);
 5394 
 5395     __ BIND(L_loop);
 5396 
 5397     __ ld4(v0, v1, v2, v3, __ T4S, __ post(input, 64));
 5398 
 5399     // rplus in v0
 5400     //  rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q;
 5401     __ addv(v4, __ T4S, v0, v31);
 5402     __ addv(v5, __ T4S, v1, v31);
 5403     __ addv(v6, __ T4S, v2, v31);
 5404     __ addv(v7, __ T4S, v3, v31);
 5405 
 5406     __ sshr(v4, __ T4S, v4, 23);
 5407     __ sshr(v5, __ T4S, v5, 23);
 5408     __ sshr(v6, __ T4S, v6, 23);
 5409     __ sshr(v7, __ T4S, v7, 23);
 5410 
 5411     __ mulv(v4, __ T4S, v4, v30);
 5412     __ mulv(v5, __ T4S, v5, v30);
 5413     __ mulv(v6, __ T4S, v6, v30);
 5414     __ mulv(v7, __ T4S, v7, v30);
 5415 
 5416     __ subv(v0, __ T4S, v0, v4);
 5417     __ subv(v1, __ T4S, v1, v5);
 5418     __ subv(v2, __ T4S, v2, v6);
 5419     __ subv(v3, __ T4S, v3, v7);
 5420 
 5421     // rplus in v0
 5422     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5423     __ sshr(v4, __ T4S, v0, 31);
 5424     __ sshr(v5, __ T4S, v1, 31);
 5425     __ sshr(v6, __ T4S, v2, 31);
 5426     __ sshr(v7, __ T4S, v3, 31);
 5427 
 5428     __ andr(v4, __ T16B, v4, v30);
 5429     __ andr(v5, __ T16B, v5, v30);
 5430     __ andr(v6, __ T16B, v6, v30);
 5431     __ andr(v7, __ T16B, v7, v30);
 5432 
 5433     __ addv(v0, __ T4S, v0, v4);
 5434     __ addv(v1, __ T4S, v1, v5);
 5435     __ addv(v2, __ T4S, v2, v6);
 5436     __ addv(v3, __ T4S, v3, v7);
 5437 
 5438     // rplus in v0
 5439     // int quotient = (rplus * multiplier) >> 22;
 5440     __ mulv(v4, __ T4S, v0, v29);
 5441     __ mulv(v5, __ T4S, v1, v29);
 5442     __ mulv(v6, __ T4S, v2, v29);
 5443     __ mulv(v7, __ T4S, v3, v29);
 5444 
 5445     __ sshr(v4, __ T4S, v4, 22);
 5446     __ sshr(v5, __ T4S, v5, 22);
 5447     __ sshr(v6, __ T4S, v6, 22);
 5448     __ sshr(v7, __ T4S, v7, 22);
 5449 
 5450     // quotient in v4
 5451     // int r0 = rplus - quotient * twoGamma2;
 5452     __ mulv(v8, __ T4S, v4, v28);
 5453     __ mulv(v9, __ T4S, v5, v28);
 5454     __ mulv(v10, __ T4S, v6, v28);
 5455     __ mulv(v11, __ T4S, v7, v28);
 5456 
 5457     __ subv(v8, __ T4S, v0, v8);
 5458     __ subv(v9, __ T4S, v1, v9);
 5459     __ subv(v10, __ T4S, v2, v10);
 5460     __ subv(v11, __ T4S, v3, v11);
 5461 
 5462     // r0 in v8
 5463     // int mask = (twoGamma2 - r0) >> 22;
 5464     __ subv(v12, __ T4S, v28, v8);
 5465     __ subv(v13, __ T4S, v28, v9);
 5466     __ subv(v14, __ T4S, v28, v10);
 5467     __ subv(v15, __ T4S, v28, v11);
 5468 
 5469     __ sshr(v12, __ T4S, v12, 22);
 5470     __ sshr(v13, __ T4S, v13, 22);
 5471     __ sshr(v14, __ T4S, v14, 22);
 5472     __ sshr(v15, __ T4S, v15, 22);
 5473 
 5474     // mask in v12
 5475     // r0 -= (mask & twoGamma2);
 5476     __ andr(v16, __ T16B, v12, v28);
 5477     __ andr(v17, __ T16B, v13, v28);
 5478     __ andr(v18, __ T16B, v14, v28);
 5479     __ andr(v19, __ T16B, v15, v28);
 5480 
 5481     __ subv(v8, __ T4S, v8, v16);
 5482     __ subv(v9, __ T4S, v9, v17);
 5483     __ subv(v10, __ T4S, v10, v18);
 5484     __ subv(v11, __ T4S, v11, v19);
 5485 
 5486     // r0 in v8
 5487     //  quotient += (mask & 1);
 5488     __ andr(v16, __ T16B, v12, v25);
 5489     __ andr(v17, __ T16B, v13, v25);
 5490     __ andr(v18, __ T16B, v14, v25);
 5491     __ andr(v19, __ T16B, v15, v25);
 5492 
 5493     __ addv(v4, __ T4S, v4, v16);
 5494     __ addv(v5, __ T4S, v5, v17);
 5495     __ addv(v6, __ T4S, v6, v18);
 5496     __ addv(v7, __ T4S, v7, v19);
 5497 
 5498     // mask = (twoGamma2 / 2 - r0) >> 31;
 5499     __ subv(v12, __ T4S, v27, v8);
 5500     __ subv(v13, __ T4S, v27, v9);
 5501     __ subv(v14, __ T4S, v27, v10);
 5502     __ subv(v15, __ T4S, v27, v11);
 5503 
 5504     __ sshr(v12, __ T4S, v12, 31);
 5505     __ sshr(v13, __ T4S, v13, 31);
 5506     __ sshr(v14, __ T4S, v14, 31);
 5507     __ sshr(v15, __ T4S, v15, 31);
 5508 
 5509     // r0 -= (mask & twoGamma2);
 5510     __ andr(v16, __ T16B, v12, v28);
 5511     __ andr(v17, __ T16B, v13, v28);
 5512     __ andr(v18, __ T16B, v14, v28);
 5513     __ andr(v19, __ T16B, v15, v28);
 5514 
 5515     __ subv(v8, __ T4S, v8, v16);
 5516     __ subv(v9, __ T4S, v9, v17);
 5517     __ subv(v10, __ T4S, v10, v18);
 5518     __ subv(v11, __ T4S, v11, v19);
 5519 
 5520     // quotient += (mask & 1);
 5521     __ andr(v16, __ T16B, v12, v25);
 5522     __ andr(v17, __ T16B, v13, v25);
 5523     __ andr(v18, __ T16B, v14, v25);
 5524     __ andr(v19, __ T16B, v15, v25);
 5525 
 5526     __ addv(v4, __ T4S, v4, v16);
 5527     __ addv(v5, __ T4S, v5, v17);
 5528     __ addv(v6, __ T4S, v6, v18);
 5529     __ addv(v7, __ T4S, v7, v19);
 5530 
 5531     // int r1 = rplus - r0 - (dilithium_q - 1);
 5532     __ subv(v16, __ T4S, v0, v8);
 5533     __ subv(v17, __ T4S, v1, v9);
 5534     __ subv(v18, __ T4S, v2, v10);
 5535     __ subv(v19, __ T4S, v3, v11);
 5536 
 5537     __ subv(v16, __ T4S, v16, v26);
 5538     __ subv(v17, __ T4S, v17, v26);
 5539     __ subv(v18, __ T4S, v18, v26);
 5540     __ subv(v19, __ T4S, v19, v26);
 5541 
 5542     // r1 in v16
 5543     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5544     __ negr(v20, __ T4S, v16);
 5545     __ negr(v21, __ T4S, v17);
 5546     __ negr(v22, __ T4S, v18);
 5547     __ negr(v23, __ T4S, v19);
 5548 
 5549     __ orr(v16, __ T16B, v16, v20);
 5550     __ orr(v17, __ T16B, v17, v21);
 5551     __ orr(v18, __ T16B, v18, v22);
 5552     __ orr(v19, __ T16B, v19, v23);
 5553 
 5554     __ sshr(v0, __ T4S, v16, 31);
 5555     __ sshr(v1, __ T4S, v17, 31);
 5556     __ sshr(v2, __ T4S, v18, 31);
 5557     __ sshr(v3, __ T4S, v19, 31);
 5558 
 5559     // r1 in v0
 5560     // r0 += ~r1;
 5561     __ notr(v20, __ T16B, v0);
 5562     __ notr(v21, __ T16B, v1);
 5563     __ notr(v22, __ T16B, v2);
 5564     __ notr(v23, __ T16B, v3);
 5565 
 5566     __ addv(v8, __ T4S, v8, v20);
 5567     __ addv(v9, __ T4S, v9, v21);
 5568     __ addv(v10, __ T4S, v10, v22);
 5569     __ addv(v11, __ T4S, v11, v23);
 5570 
 5571     // r0 in v8
 5572     // r1 = r1 & quotient;
 5573     __ andr(v0, __ T16B, v4, v0);
 5574     __ andr(v1, __ T16B, v5, v1);
 5575     __ andr(v2, __ T16B, v6, v2);
 5576     __ andr(v3, __ T16B, v7, v3);
 5577 
 5578     // r1 in v0
 5579     // lowPart[m] = r0;
 5580     // highPart[m] = r1;
 5581     __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64));
 5582     __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64));
 5583 
 5584 
 5585     __ sub(len, len, 64);
 5586     __ cmp(len, (u1)64);
 5587     __ br(Assembler::GE, L_loop);
 5588 
 5589     // restore callee-saved vector registers
 5590     __ ldpd(v14, v15, Address(sp, 48));
 5591     __ ldpd(v12, v13, Address(sp, 32));
 5592     __ ldpd(v10, v11, Address(sp, 16));
 5593     __ ldpd(v8, v9, __ post(sp, 64));
 5594 
 5595     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5596     __ mov(r0, zr); // return 0
 5597     __ ret(lr);
 5598 
 5599     return start;
 5600   }
 5601 
 5602   /**
 5603    *  Arguments:
 5604    *
 5605    * Inputs:
 5606    *   c_rarg0   - int crc
 5607    *   c_rarg1   - byte* buf
 5608    *   c_rarg2   - int length
 5609    *   c_rarg3   - int* table
 5610    *
 5611    * Output:
 5612    *       r0   - int crc result
 5613    */
 5614   address generate_updateBytesCRC32C() {
 5615     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5616 
 5617     __ align(CodeEntryAlignment);
 5618     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5619     StubCodeMark mark(this, stub_id);
 5620 
 5621     address start = __ pc();
 5622 
 5623     const Register crc   = c_rarg0;  // crc
 5624     const Register buf   = c_rarg1;  // source java byte array address
 5625     const Register len   = c_rarg2;  // length
 5626     const Register table0 = c_rarg3; // crc_table address
 5627     const Register table1 = c_rarg4;
 5628     const Register table2 = c_rarg5;
 5629     const Register table3 = c_rarg6;
 5630     const Register tmp3 = c_rarg7;
 5631 
 5632     BLOCK_COMMENT("Entry:");
 5633     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5634 
 5635     __ kernel_crc32c(crc, buf, len,
 5636               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5637 
 5638     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5639     __ ret(lr);
 5640 
 5641     return start;
 5642   }
 5643 
 5644   /**
 5645    *  Arguments:
 5646    *
 5647    *  Inputs:
 5648    *   c_rarg0   - int   adler
 5649    *   c_rarg1   - byte* buff
 5650    *   c_rarg2   - int   len
 5651    *
 5652    * Output:
 5653    *   c_rarg0   - int adler result
 5654    */
 5655   address generate_updateBytesAdler32() {
 5656     __ align(CodeEntryAlignment);
 5657     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5658     StubCodeMark mark(this, stub_id);
 5659     address start = __ pc();
 5660 
 5661     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5662 
 5663     // Aliases
 5664     Register adler  = c_rarg0;
 5665     Register s1     = c_rarg0;
 5666     Register s2     = c_rarg3;
 5667     Register buff   = c_rarg1;
 5668     Register len    = c_rarg2;
 5669     Register nmax  = r4;
 5670     Register base  = r5;
 5671     Register count = r6;
 5672     Register temp0 = rscratch1;
 5673     Register temp1 = rscratch2;
 5674     FloatRegister vbytes = v0;
 5675     FloatRegister vs1acc = v1;
 5676     FloatRegister vs2acc = v2;
 5677     FloatRegister vtable = v3;
 5678 
 5679     // Max number of bytes we can process before having to take the mod
 5680     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 5681     uint64_t BASE = 0xfff1;
 5682     uint64_t NMAX = 0x15B0;
 5683 
 5684     __ mov(base, BASE);
 5685     __ mov(nmax, NMAX);
 5686 
 5687     // Load accumulation coefficients for the upper 16 bits
 5688     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5689     __ ld1(vtable, __ T16B, Address(temp0));
 5690 
 5691     // s1 is initialized to the lower 16 bits of adler
 5692     // s2 is initialized to the upper 16 bits of adler
 5693     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5694     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5695 
 5696     // The pipelined loop needs at least 16 elements per iteration.
 5697     // The loop checks this itself, but it is more efficient to skip straight to the cleanup loop.
 5698     __ cmp(len, (u1)16);
 5699     __ br(Assembler::HS, L_nmax);
 5700     __ cbz(len, L_combine);
 5701 
 5702     __ bind(L_simple_by1_loop);
 5703     __ ldrb(temp0, Address(__ post(buff, 1)));
 5704     __ add(s1, s1, temp0);
 5705     __ add(s2, s2, s1);
 5706     __ subs(len, len, 1);
 5707     __ br(Assembler::HI, L_simple_by1_loop);
 5708 
 5709     // s1 = s1 % BASE
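          // Here s1 <= 0xffff + 15 * 0xff < 2 * BASE, so a single conditional
          // subtract suffices.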
 5710     __ subs(temp0, s1, base);
 5711     __ csel(s1, temp0, s1, Assembler::HS);
 5712 
 5713     // s2 = s2 % BASE
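          // Fold the upper 16 bits into the lower ones: since 2^16 mod BASE == 15,
          // s mod BASE == ((s >> 16) * 15 + (s & 0xffff)) mod BASE, and
          // (s >> 16) * 15 is computed as ((s >> 16) << 4) - (s >> 16) below.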
 5714     __ lsr(temp0, s2, 16);
 5715     __ lsl(temp1, temp0, 4);
 5716     __ sub(temp1, temp1, temp0);
 5717     __ add(s2, temp1, s2, ext::uxth);
 5718 
 5719     __ subs(temp0, s2, base);
 5720     __ csel(s2, temp0, s2, Assembler::HS);
 5721 
 5722     __ b(L_combine);
 5723 
 5724     __ bind(L_nmax);
 5725     __ subs(len, len, nmax);
 5726     __ sub(count, nmax, 16);
 5727     __ br(Assembler::LO, L_by16);
 5728 
 5729     __ bind(L_nmax_loop);
 5730 
 5731     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5732                                       vbytes, vs1acc, vs2acc, vtable);
 5733 
 5734     __ subs(count, count, 16);
 5735     __ br(Assembler::HS, L_nmax_loop);
 5736 
 5737     // s1 = s1 % BASE
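          // The folding trick (2^16 == 15 mod BASE, see above) is applied twice:
          // after NMAX bytes s1 can be close to 2^21, so a single fold can still
          // leave a value wider than 16 bits.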
 5738     __ lsr(temp0, s1, 16);
 5739     __ lsl(temp1, temp0, 4);
 5740     __ sub(temp1, temp1, temp0);
 5741     __ add(temp1, temp1, s1, ext::uxth);
 5742 
 5743     __ lsr(temp0, temp1, 16);
 5744     __ lsl(s1, temp0, 4);
 5745     __ sub(s1, s1, temp0);
 5746     __ add(s1, s1, temp1, ext::uxth);
 5747 
 5748     __ subs(temp0, s1, base);
 5749     __ csel(s1, temp0, s1, Assembler::HS);
 5750 
 5751     // s2 = s2 % BASE
 5752     __ lsr(temp0, s2, 16);
 5753     __ lsl(temp1, temp0, 4);
 5754     __ sub(temp1, temp1, temp0);
 5755     __ add(temp1, temp1, s2, ext::uxth);
 5756 
 5757     __ lsr(temp0, temp1, 16);
 5758     __ lsl(s2, temp0, 4);
 5759     __ sub(s2, s2, temp0);
 5760     __ add(s2, s2, temp1, ext::uxth);
 5761 
 5762     __ subs(temp0, s2, base);
 5763     __ csel(s2, temp0, s2, Assembler::HS);
 5764 
 5765     __ subs(len, len, nmax);
 5766     __ sub(count, nmax, 16);
 5767     __ br(Assembler::HS, L_nmax_loop);
 5768 
 5769     __ bind(L_by16);
 5770     __ adds(len, len, count);
 5771     __ br(Assembler::LO, L_by1);
 5772 
 5773     __ bind(L_by16_loop);
 5774 
 5775     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5776                                       vbytes, vs1acc, vs2acc, vtable);
 5777 
 5778     __ subs(len, len, 16);
 5779     __ br(Assembler::HS, L_by16_loop);
 5780 
 5781     __ bind(L_by1);
 5782     __ adds(len, len, 15);
 5783     __ br(Assembler::LO, L_do_mod);
 5784 
 5785     __ bind(L_by1_loop);
 5786     __ ldrb(temp0, Address(__ post(buff, 1)));
 5787     __ add(s1, temp0, s1);
 5788     __ add(s2, s2, s1);
 5789     __ subs(len, len, 1);
 5790     __ br(Assembler::HS, L_by1_loop);
 5791 
 5792     __ bind(L_do_mod);
 5793     // s1 = s1 % BASE
 5794     __ lsr(temp0, s1, 16);
 5795     __ lsl(temp1, temp0, 4);
 5796     __ sub(temp1, temp1, temp0);
 5797     __ add(temp1, temp1, s1, ext::uxth);
 5798 
 5799     __ lsr(temp0, temp1, 16);
 5800     __ lsl(s1, temp0, 4);
 5801     __ sub(s1, s1, temp0);
 5802     __ add(s1, s1, temp1, ext::uxth);
 5803 
 5804     __ subs(temp0, s1, base);
 5805     __ csel(s1, temp0, s1, Assembler::HS);
 5806 
 5807     // s2 = s2 % BASE
 5808     __ lsr(temp0, s2, 16);
 5809     __ lsl(temp1, temp0, 4);
 5810     __ sub(temp1, temp1, temp0);
 5811     __ add(temp1, temp1, s2, ext::uxth);
 5812 
 5813     __ lsr(temp0, temp1, 16);
 5814     __ lsl(s2, temp0, 4);
 5815     __ sub(s2, s2, temp0);
 5816     __ add(s2, s2, temp1, ext::uxth);
 5817 
 5818     __ subs(temp0, s2, base);
 5819     __ csel(s2, temp0, s2, Assembler::HS);
 5820 
 5821     // Combine lower bits and higher bits
 5822     __ bind(L_combine);
 5823     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5824 
 5825     __ ret(lr);
 5826 
 5827     return start;
 5828   }
 5829 
 5830   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5831           Register temp0, Register temp1, FloatRegister vbytes,
 5832           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5833     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5834     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5835     // In non-vectorized code, we update s1 and s2 as:
 5836     //   s1 <- s1 + b1
 5837     //   s2 <- s2 + s1
 5838     //   s1 <- s1 + b2
 5839     //   s2 <- s2 + s1
 5840     //   ...
 5841     //   s1 <- s1 + b16
 5842     //   s2 <- s2 + s1
 5843     // Putting above assignments together, we have:
 5844     //   s1_new = s1 + b1 + b2 + ... + b16
 5845     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5846     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5847     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 5848     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5849 
 5850     // s2 = s2 + s1 * 16
 5851     __ add(s2, s2, s1, Assembler::LSL, 4);
 5852 
 5853     // vs1acc = b1 + b2 + b3 + ... + b16
 5854     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5855     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5856     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5857     __ uaddlv(vs1acc, __ T16B, vbytes);
 5858     __ uaddlv(vs2acc, __ T8H, vs2acc);
 5859 
 5860     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5861     __ fmovd(temp0, vs1acc);
 5862     __ fmovd(temp1, vs2acc);
 5863     __ add(s1, s1, temp0);
 5864     __ add(s2, s2, temp1);
 5865   }
 5866 
 5867   /**
 5868    *  Arguments:
 5869    *
 5870    *  Input:
 5871    *    c_rarg0   - x address
 5872    *    c_rarg1   - x length
 5873    *    c_rarg2   - y address
 5874    *    c_rarg3   - y length
 5875    *    c_rarg4   - z address
 5876    */
 5877   address generate_multiplyToLen() {
 5878     __ align(CodeEntryAlignment);
 5879     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5880     StubCodeMark mark(this, stub_id);
 5881 
 5882     address start = __ pc();
 5883  
 5884     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 5885       return start;
 5886     }
 5887     const Register x     = r0;
 5888     const Register xlen  = r1;
 5889     const Register y     = r2;
 5890     const Register ylen  = r3;
 5891     const Register z     = r4;
 5892 
 5893     const Register tmp0  = r5;
 5894     const Register tmp1  = r10;
 5895     const Register tmp2  = r11;
 5896     const Register tmp3  = r12;
 5897     const Register tmp4  = r13;
 5898     const Register tmp5  = r14;
 5899     const Register tmp6  = r15;
 5900     const Register tmp7  = r16;
 5901 
 5902     BLOCK_COMMENT("Entry:");
 5903     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5904     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5905     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5906     __ ret(lr);
 5907 
 5908     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 5909     return start;
 5910   }
 5911 
 5912   address generate_squareToLen() {
 5913     // The squareToLen algorithm for sizes 1..127, described in the Java code,
 5914     // works faster than multiply_to_len on some CPUs and slower on others, but
 5915     // multiply_to_len shows slightly better results overall.
 5916     __ align(CodeEntryAlignment);
 5917     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5918     StubCodeMark mark(this, stub_id);
 5919     address start = __ pc();
 5920 
 5921     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 5922       return start;
 5923     }
 5924     const Register x     = r0;
 5925     const Register xlen  = r1;
 5926     const Register z     = r2;
 5927     const Register y     = r4; // == x
 5928     const Register ylen  = r5; // == xlen
 5929 
 5930     const Register tmp0  = r3;
 5931     const Register tmp1  = r10;
 5932     const Register tmp2  = r11;
 5933     const Register tmp3  = r12;
 5934     const Register tmp4  = r13;
 5935     const Register tmp5  = r14;
 5936     const Register tmp6  = r15;
 5937     const Register tmp7  = r16;
 5938 
 5939     RegSet spilled_regs = RegSet::of(y, ylen);
 5940     BLOCK_COMMENT("Entry:");
 5941     __ enter();
 5942     __ push(spilled_regs, sp);
 5943     __ mov(y, x);
 5944     __ mov(ylen, xlen);
 5945     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5946     __ pop(spilled_regs, sp);
 5947     __ leave();
 5948     __ ret(lr);
 5949 
 5950     SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 5951     return start;
 5952   }
 5953 
 5954   address generate_mulAdd() {
 5955     __ align(CodeEntryAlignment);
 5956     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5957     StubCodeMark mark(this, stub_id);
 5958 
 5959     address start = __ pc();
 5960 
 5961     if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 5962       return start;
 5963     }
 5964     const Register out     = r0;
 5965     const Register in      = r1;
 5966     const Register offset  = r2;
 5967     const Register len     = r3;
 5968     const Register k       = r4;
 5969 
 5970     BLOCK_COMMENT("Entry:");
 5971     __ enter();
 5972     __ mul_add(out, in, offset, len, k);
 5973     __ leave();
 5974     __ ret(lr);
 5975 
 5976     SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 5977     return start;
 5978   }
 5979 
 5980   // Arguments:
 5981   //
 5982   // Input:
 5983   //   c_rarg0   - newArr address
 5984   //   c_rarg1   - oldArr address
 5985   //   c_rarg2   - newIdx
 5986   //   c_rarg3   - shiftCount
 5987   //   c_rarg4   - numIter
 5988   //
 5989   address generate_bigIntegerRightShift() {
 5990     __ align(CodeEntryAlignment);
 5991     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 5992     StubCodeMark mark(this, stub_id);
 5993     address start = __ pc();
 5994 
 5995     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 5996 
 5997     Register newArr        = c_rarg0;
 5998     Register oldArr        = c_rarg1;
 5999     Register newIdx        = c_rarg2;
 6000     Register shiftCount    = c_rarg3;
 6001     Register numIter       = c_rarg4;
 6002     Register idx           = numIter;
 6003 
 6004     Register newArrCur     = rscratch1;
 6005     Register shiftRevCount = rscratch2;
 6006     Register oldArrCur     = r13;
 6007     Register oldArrNext    = r14;
 6008 
 6009     FloatRegister oldElem0        = v0;
 6010     FloatRegister oldElem1        = v1;
 6011     FloatRegister newElem         = v2;
 6012     FloatRegister shiftVCount     = v3;
 6013     FloatRegister shiftVRevCount  = v4;
 6014 
 6015     __ cbz(idx, Exit);
 6016 
 6017     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6018 
 6019     // left shift count
 6020     __ movw(shiftRevCount, 32);
 6021     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6022 
 6023     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6024     __ cmp(numIter, (u1)4);
 6025     __ br(Assembler::LT, ShiftThree);
 6026 
 6027     __ dup(shiftVCount,    __ T4S, shiftCount);
 6028     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6029     __ negr(shiftVCount,   __ T4S, shiftVCount);
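          // ushl shifts each element left by a signed, per-element count;
          // negating the count turns it into a logical right shift.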
 6030 
 6031     __ BIND(ShiftSIMDLoop);
 6032 
 6033     // Calculate the load addresses
 6034     __ sub(idx, idx, 4);
 6035     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6036     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6037     __ add(oldArrCur,  oldArrNext, 4);
 6038 
 6039     // Load 4 words and process
 6040     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6041     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6042     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6043     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6044     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6045     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6046 
 6047     __ cmp(idx, (u1)4);
 6048     __ br(Assembler::LT, ShiftTwoLoop);
 6049     __ b(ShiftSIMDLoop);
 6050 
 6051     __ BIND(ShiftTwoLoop);
 6052     __ cbz(idx, Exit);
 6053     __ cmp(idx, (u1)1);
 6054     __ br(Assembler::EQ, ShiftOne);
 6055 
 6056     // Calculate the load addresses
 6057     __ sub(idx, idx, 2);
 6058     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6059     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6060     __ add(oldArrCur,  oldArrNext, 4);
 6061 
 6062     // Load 2 words and process
 6063     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6064     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6065     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6066     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6067     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6068     __ st1(newElem,   __ T2S, Address(newArrCur));
 6069     __ b(ShiftTwoLoop);
 6070 
 6071     __ BIND(ShiftThree);
 6072     __ tbz(idx, 1, ShiftOne);
 6073     __ tbz(idx, 0, ShiftTwo);
 6074     __ ldrw(r10,  Address(oldArr, 12));
 6075     __ ldrw(r11,  Address(oldArr, 8));
 6076     __ lsrvw(r10, r10, shiftCount);
 6077     __ lslvw(r11, r11, shiftRevCount);
 6078     __ orrw(r12,  r10, r11);
 6079     __ strw(r12,  Address(newArr, 8));
 6080 
 6081     __ BIND(ShiftTwo);
 6082     __ ldrw(r10,  Address(oldArr, 8));
 6083     __ ldrw(r11,  Address(oldArr, 4));
 6084     __ lsrvw(r10, r10, shiftCount);
 6085     __ lslvw(r11, r11, shiftRevCount);
 6086     __ orrw(r12,  r10, r11);
 6087     __ strw(r12,  Address(newArr, 4));
 6088 
 6089     __ BIND(ShiftOne);
 6090     __ ldrw(r10,  Address(oldArr, 4));
 6091     __ ldrw(r11,  Address(oldArr));
 6092     __ lsrvw(r10, r10, shiftCount);
 6093     __ lslvw(r11, r11, shiftRevCount);
 6094     __ orrw(r12,  r10, r11);
 6095     __ strw(r12,  Address(newArr));
 6096 
 6097     __ BIND(Exit);
 6098     __ ret(lr);
 6099 
 6100     return start;
 6101   }
 6102 
 6103   // Arguments:
 6104   //
 6105   // Input:
 6106   //   c_rarg0   - newArr address
 6107   //   c_rarg1   - oldArr address
 6108   //   c_rarg2   - newIdx
 6109   //   c_rarg3   - shiftCount
 6110   //   c_rarg4   - numIter
 6111   //
 6112   address generate_bigIntegerLeftShift() {
 6113     __ align(CodeEntryAlignment);
 6114     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6115     StubCodeMark mark(this, stub_id);
 6116     address start = __ pc();
 6117 
 6118     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6119 
 6120     Register newArr        = c_rarg0;
 6121     Register oldArr        = c_rarg1;
 6122     Register newIdx        = c_rarg2;
 6123     Register shiftCount    = c_rarg3;
 6124     Register numIter       = c_rarg4;
 6125 
 6126     Register shiftRevCount = rscratch1;
 6127     Register oldArrNext    = rscratch2;
 6128 
 6129     FloatRegister oldElem0        = v0;
 6130     FloatRegister oldElem1        = v1;
 6131     FloatRegister newElem         = v2;
 6132     FloatRegister shiftVCount     = v3;
 6133     FloatRegister shiftVRevCount  = v4;
 6134 
 6135     __ cbz(numIter, Exit);
 6136 
 6137     __ add(oldArrNext, oldArr, 4);
 6138     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6139 
 6140     // right shift count
 6141     __ movw(shiftRevCount, 32);
 6142     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6143 
 6144     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6145     __ cmp(numIter, (u1)4);
 6146     __ br(Assembler::LT, ShiftThree);
 6147 
 6148     __ dup(shiftVCount,     __ T4S, shiftCount);
 6149     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6150     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
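          // ushl with the negated count right-shifts the neighboring word to
          // supply the carried-in low bits.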
 6151 
 6152     __ BIND(ShiftSIMDLoop);
 6153 
 6154     // load 4 words and process
 6155     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6156     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6157     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6158     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6159     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6160     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6161     __ sub(numIter,   numIter, 4);
 6162 
 6163     __ cmp(numIter, (u1)4);
 6164     __ br(Assembler::LT, ShiftTwoLoop);
 6165     __ b(ShiftSIMDLoop);
 6166 
 6167     __ BIND(ShiftTwoLoop);
 6168     __ cbz(numIter, Exit);
 6169     __ cmp(numIter, (u1)1);
 6170     __ br(Assembler::EQ, ShiftOne);
 6171 
 6172     // load 2 words and process
 6173     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6174     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6175     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6176     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6177     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6178     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6179     __ sub(numIter,   numIter, 2);
 6180     __ b(ShiftTwoLoop);
 6181 
 6182     __ BIND(ShiftThree);
 6183     __ ldrw(r10,  __ post(oldArr, 4));
 6184     __ ldrw(r11,  __ post(oldArrNext, 4));
 6185     __ lslvw(r10, r10, shiftCount);
 6186     __ lsrvw(r11, r11, shiftRevCount);
 6187     __ orrw(r12,  r10, r11);
 6188     __ strw(r12,  __ post(newArr, 4));
 6189     __ tbz(numIter, 1, Exit);
 6190     __ tbz(numIter, 0, ShiftOne);
 6191 
 6192     __ BIND(ShiftTwo);
 6193     __ ldrw(r10,  __ post(oldArr, 4));
 6194     __ ldrw(r11,  __ post(oldArrNext, 4));
 6195     __ lslvw(r10, r10, shiftCount);
 6196     __ lsrvw(r11, r11, shiftRevCount);
 6197     __ orrw(r12,  r10, r11);
 6198     __ strw(r12,  __ post(newArr, 4));
 6199 
 6200     __ BIND(ShiftOne);
 6201     __ ldrw(r10,  Address(oldArr));
 6202     __ ldrw(r11,  Address(oldArrNext));
 6203     __ lslvw(r10, r10, shiftCount);
 6204     __ lsrvw(r11, r11, shiftRevCount);
 6205     __ orrw(r12,  r10, r11);
 6206     __ strw(r12,  Address(newArr));
 6207 
 6208     __ BIND(Exit);
 6209     __ ret(lr);
 6210 
 6211     return start;
 6212   }
 6213 
 6214   address generate_count_positives(address &count_positives_long) {
 6215     const u1 large_loop_size = 64;
 6216     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
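          // One 0x80 per byte: AND-ing with this mask tests each byte's sign bit,
          // so a non-zero result means some byte is negative.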
 6217     int dcache_line = VM_Version::dcache_line_size();
 6218 
 6219     Register ary1 = r1, len = r2, result = r0;
 6220 
 6221     __ align(CodeEntryAlignment);
 6222 
 6223     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6224     StubCodeMark mark(this, stub_id);
 6225 
 6226     address entry = __ pc();
 6227 
 6228     __ enter();
 6229     // precondition: a copy of len is already in result
 6230     // __ mov(result, len);
 6231 
 6232   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6233         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6234 
 6235   __ cmp(len, (u1)15);
 6236   __ br(Assembler::GT, LEN_OVER_15);
 6237   // The only case where execution falls into this code is when the pointer is
 6238   // near the end of a memory page and we have to avoid reading the next page.
 6239   __ add(ary1, ary1, len);
 6240   __ subs(len, len, 8);
 6241   __ br(Assembler::GT, LEN_OVER_8);
 6242   __ ldr(rscratch2, Address(ary1, -8));
 6243   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6244   __ lsrv(rscratch2, rscratch2, rscratch1);
 6245   __ tst(rscratch2, UPPER_BIT_MASK);
 6246   __ csel(result, zr, result, Assembler::NE);
 6247   __ leave();
 6248   __ ret(lr);
 6249   __ bind(LEN_OVER_8);
 6250   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 6251   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 6252   __ tst(rscratch2, UPPER_BIT_MASK);
 6253   __ br(Assembler::NE, RET_NO_POP);
 6254   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6255   __ lsrv(rscratch1, rscratch1, rscratch2);
 6256   __ tst(rscratch1, UPPER_BIT_MASK);
 6257   __ bind(RET_NO_POP);
 6258   __ csel(result, zr, result, Assembler::NE);
 6259   __ leave();
 6260   __ ret(lr);
 6261 
 6262   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6263   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6264 
 6265   count_positives_long = __ pc(); // 2nd entry point
 6266 
 6267   __ enter();
 6268 
 6269   __ bind(LEN_OVER_15);
 6270     __ push(spilled_regs, sp);
 6271     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6272     __ cbz(rscratch2, ALIGNED);
 6273     __ ldp(tmp6, tmp1, Address(ary1));
 6274     __ mov(tmp5, 16);
 6275     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6276     __ add(ary1, ary1, rscratch1);
 6277     __ orr(tmp6, tmp6, tmp1);
 6278     __ tst(tmp6, UPPER_BIT_MASK);
 6279     __ br(Assembler::NE, RET_ADJUST);
 6280     __ sub(len, len, rscratch1);
 6281 
 6282   __ bind(ALIGNED);
 6283     __ cmp(len, large_loop_size);
 6284     __ br(Assembler::LT, CHECK_16);
 6285     // Perform a 16-byte load with an early return in the pre-loop to handle the
 6286     // case when an initially aligned large array has negative values in its first
 6287     // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst
 6288     // case, which is slower. Cases with negative bytes further ahead are barely
 6289     // affected; in fact they get faster due to the early loads and the fewer
 6290     // instructions and branches in LARGE_LOOP.
 6291     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6292     __ sub(len, len, 16);
 6293     __ orr(tmp6, tmp6, tmp1);
 6294     __ tst(tmp6, UPPER_BIT_MASK);
 6295     __ br(Assembler::NE, RET_ADJUST_16);
 6296     __ cmp(len, large_loop_size);
 6297     __ br(Assembler::LT, CHECK_16);
 6298 
 6299     if (SoftwarePrefetchHintDistance >= 0
 6300         && SoftwarePrefetchHintDistance >= dcache_line) {
 6301       // initial prefetch
 6302       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6303     }
 6304   __ bind(LARGE_LOOP);
 6305     if (SoftwarePrefetchHintDistance >= 0) {
 6306       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6307     }
 6308     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
 6309     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
 6310     // it is better to generate 7 orr(...) + 1 andr(...) + 1 cbnz(...), which saves
 6311     // 3 instructions and has fewer branches. The trade-off is that early return is
 6312     // disabled, so all 64 bytes are loaded and checked every time.
 6313     __ ldp(tmp2, tmp3, Address(ary1));
 6314     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6315     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6316     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6317     __ add(ary1, ary1, large_loop_size);
 6318     __ sub(len, len, large_loop_size);
 6319     __ orr(tmp2, tmp2, tmp3);
 6320     __ orr(tmp4, tmp4, tmp5);
 6321     __ orr(rscratch1, rscratch1, rscratch2);
 6322     __ orr(tmp6, tmp6, tmp1);
 6323     __ orr(tmp2, tmp2, tmp4);
 6324     __ orr(rscratch1, rscratch1, tmp6);
 6325     __ orr(tmp2, tmp2, rscratch1);
 6326     __ tst(tmp2, UPPER_BIT_MASK);
 6327     __ br(Assembler::NE, RET_ADJUST_LONG);
 6328     __ cmp(len, large_loop_size);
 6329     __ br(Assembler::GE, LARGE_LOOP);
 6330 
 6331   __ bind(CHECK_16); // small 16-byte load pre-loop
 6332     __ cmp(len, (u1)16);
 6333     __ br(Assembler::LT, POST_LOOP16);
 6334 
 6335   __ bind(LOOP16); // small 16-byte load loop
 6336     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6337     __ sub(len, len, 16);
 6338     __ orr(tmp2, tmp2, tmp3);
 6339     __ tst(tmp2, UPPER_BIT_MASK);
 6340     __ br(Assembler::NE, RET_ADJUST_16);
 6341     __ cmp(len, (u1)16);
 6342     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6343 
 6344   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6345     __ cmp(len, (u1)8);
 6346     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6347     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6348     __ tst(tmp3, UPPER_BIT_MASK);
 6349     __ br(Assembler::NE, RET_ADJUST);
 6350     __ sub(len, len, 8);
 6351 
 6352   __ bind(POST_LOOP16_LOAD_TAIL);
 6353     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6354     __ ldr(tmp1, Address(ary1));
 6355     __ mov(tmp2, 64);
 6356     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6357     __ lslv(tmp1, tmp1, tmp4);
 6358     __ tst(tmp1, UPPER_BIT_MASK);
 6359     __ br(Assembler::NE, RET_ADJUST);
 6360     // Fallthrough
 6361 
 6362   __ bind(RET_LEN);
 6363     __ pop(spilled_regs, sp);
 6364     __ leave();
 6365     __ ret(lr);
 6366 
 6367     // The difference result - len is the count of bytes guaranteed to be
 6368     // positive.
 6369 
 6370   __ bind(RET_ADJUST_LONG);
 6371     __ add(len, len, (u1)(large_loop_size - 16));
 6372   __ bind(RET_ADJUST_16);
 6373     __ add(len, len, 16);
 6374   __ bind(RET_ADJUST);
 6375     __ pop(spilled_regs, sp);
 6376     __ leave();
 6377     __ sub(result, result, len);
 6378     __ ret(lr);
 6379 
 6380     return entry;
 6381   }
 6382 
 6383   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6384         bool usePrefetch, Label &NOT_EQUAL) {
 6385     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6386         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6387         tmp7 = r12, tmp8 = r13;
 6388     Label LOOP;
 6389 
 6390     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6391     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6392     __ bind(LOOP);
 6393     if (usePrefetch) {
 6394       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6395       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6396     }
 6397     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6398     __ eor(tmp1, tmp1, tmp2);
 6399     __ eor(tmp3, tmp3, tmp4);
 6400     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6401     __ orr(tmp1, tmp1, tmp3);
 6402     __ cbnz(tmp1, NOT_EQUAL);
 6403     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6404     __ eor(tmp5, tmp5, tmp6);
 6405     __ eor(tmp7, tmp7, tmp8);
 6406     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6407     __ orr(tmp5, tmp5, tmp7);
 6408     __ cbnz(tmp5, NOT_EQUAL);
 6409     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6410     __ eor(tmp1, tmp1, tmp2);
 6411     __ eor(tmp3, tmp3, tmp4);
 6412     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6413     __ orr(tmp1, tmp1, tmp3);
 6414     __ cbnz(tmp1, NOT_EQUAL);
 6415     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6416     __ eor(tmp5, tmp5, tmp6);
 6417     __ sub(cnt1, cnt1, 8 * wordSize);
 6418     __ eor(tmp7, tmp7, tmp8);
 6419     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6420     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 6421     // cmp) because subs allows an unlimited range of immediate operands.
 6422     __ subs(tmp6, cnt1, loopThreshold);
 6423     __ orr(tmp5, tmp5, tmp7);
 6424     __ cbnz(tmp5, NOT_EQUAL);
 6425     __ br(__ GE, LOOP);
 6426     // post-loop
 6427     __ eor(tmp1, tmp1, tmp2);
 6428     __ eor(tmp3, tmp3, tmp4);
 6429     __ orr(tmp1, tmp1, tmp3);
 6430     __ sub(cnt1, cnt1, 2 * wordSize);
 6431     __ cbnz(tmp1, NOT_EQUAL);
 6432   }
 6433 
 6434   void generate_large_array_equals_loop_simd(int loopThreshold,
 6435         bool usePrefetch, Label &NOT_EQUAL) {
 6436     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6437         tmp2 = rscratch2;
 6438     Label LOOP;
 6439 
 6440     __ bind(LOOP);
 6441     if (usePrefetch) {
 6442       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6443       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6444     }
 6445     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6446     __ sub(cnt1, cnt1, 8 * wordSize);
 6447     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6448     __ subs(tmp1, cnt1, loopThreshold);
 6449     __ eor(v0, __ T16B, v0, v4);
 6450     __ eor(v1, __ T16B, v1, v5);
 6451     __ eor(v2, __ T16B, v2, v6);
 6452     __ eor(v3, __ T16B, v3, v7);
 6453     __ orr(v0, __ T16B, v0, v1);
 6454     __ orr(v1, __ T16B, v2, v3);
 6455     __ orr(v0, __ T16B, v0, v1);
 6456     __ umov(tmp1, v0, __ D, 0);
 6457     __ umov(tmp2, v0, __ D, 1);
 6458     __ orr(tmp1, tmp1, tmp2);
 6459     __ cbnz(tmp1, NOT_EQUAL);
 6460     __ br(__ GE, LOOP);
 6461   }
 6462 
 6463   // a1 = r1 - array1 address
 6464   // a2 = r2 - array2 address
 6465   // result = r0 - return value. Already contains "false"
 6466   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6467   // r3-r5 are reserved temporary registers
 6468   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 6469   address generate_large_array_equals() {
 6470     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6471         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6472         tmp7 = r12, tmp8 = r13;
 6473     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6474         SMALL_LOOP, POST_LOOP;
 6475     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 6476     // loop threshold chosen so that at least 32 prefetched bytes are actually used
 6477     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6478     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6479     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6480     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6481         tmp5, tmp6, tmp7, tmp8);
 6482 
 6483     __ align(CodeEntryAlignment);
 6484 
 6485     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6486     StubCodeMark mark(this, stub_id);
 6487 
 6488     address entry = __ pc();
 6489     __ enter();
 6490     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6491     // also advance pointers to use post-increment instead of pre-increment
 6492     __ add(a1, a1, wordSize);
 6493     __ add(a2, a2, wordSize);
 6494     if (AvoidUnalignedAccesses) {
 6495       // Both implementations (SIMD/non-SIMD) use relatively wide load
 6496       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
 6497       // time) on some CPUs when the address is not at least 16-byte aligned.
 6498       // Arrays are currently 8-byte aligned, so, if needed, do an additional
 6499       // 8-byte load for the first address to make it 16-byte aligned.
 6500       Label ALIGNED16;
 6501       __ tbz(a1, 3, ALIGNED16);
 6502       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6503       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6504       __ sub(cnt1, cnt1, wordSize);
 6505       __ eor(tmp1, tmp1, tmp2);
 6506       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6507       __ bind(ALIGNED16);
 6508     }
 6509     if (UseSIMDForArrayEquals) {
 6510       if (SoftwarePrefetchHintDistance >= 0) {
 6511         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6512         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6513         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6514             /* prfm = */ true, NOT_EQUAL);
 6515         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6516         __ br(__ LT, TAIL);
 6517       }
 6518       __ bind(NO_PREFETCH_LARGE_LOOP);
 6519       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6520           /* prfm = */ false, NOT_EQUAL);
 6521     } else {
 6522       __ push(spilled_regs, sp);
 6523       if (SoftwarePrefetchHintDistance >= 0) {
 6524         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6525         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6526         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6527             /* prfm = */ true, NOT_EQUAL);
 6528         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6529         __ br(__ LT, TAIL);
 6530       }
 6531       __ bind(NO_PREFETCH_LARGE_LOOP);
 6532       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6533           /* prfm = */ false, NOT_EQUAL);
 6534     }
 6535     __ bind(TAIL);
 6536       __ cbz(cnt1, EQUAL);
 6537       __ subs(cnt1, cnt1, wordSize);
 6538       __ br(__ LE, POST_LOOP);
 6539     __ bind(SMALL_LOOP);
 6540       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6541       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6542       __ subs(cnt1, cnt1, wordSize);
 6543       __ eor(tmp1, tmp1, tmp2);
 6544       __ cbnz(tmp1, NOT_EQUAL);
 6545       __ br(__ GT, SMALL_LOOP);
 6546     __ bind(POST_LOOP);
 6547       __ ldr(tmp1, Address(a1, cnt1));
 6548       __ ldr(tmp2, Address(a2, cnt1));
 6549       __ eor(tmp1, tmp1, tmp2);
 6550       __ cbnz(tmp1, NOT_EQUAL);
 6551     __ bind(EQUAL);
 6552       __ mov(result, true);
 6553     __ bind(NOT_EQUAL);
 6554       if (!UseSIMDForArrayEquals) {
 6555         __ pop(spilled_regs, sp);
 6556       }
 6557     __ bind(NOT_EQUAL_NO_POP);
 6558     __ leave();
 6559     __ ret(lr);
 6560     return entry;
 6561   }
 6562 
 6563   // result = r0 - return value. Contains initial hashcode value on entry.
 6564   // ary = r1 - array address
 6565   // cnt = r2 - elements count
 6566   // Clobbers: v0-v13, rscratch1, rscratch2
 6567   address generate_large_arrays_hashcode(BasicType eltype) {
 6568     const Register result = r0, ary = r1, cnt = r2;
 6569     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6570     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6571     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6572     const FloatRegister vpowm = v13;
 6573 
 6574     ARRAYS_HASHCODE_REGISTERS;
 6575 
 6576     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6577 
 6578     unsigned int vf; // vectorization factor
 6579     bool multiply_by_halves;
 6580     Assembler::SIMD_Arrangement load_arrangement;
 6581     switch (eltype) {
 6582     case T_BOOLEAN:
 6583     case T_BYTE:
 6584       load_arrangement = Assembler::T8B;
 6585       multiply_by_halves = true;
 6586       vf = 8;
 6587       break;
 6588     case T_CHAR:
 6589     case T_SHORT:
 6590       load_arrangement = Assembler::T8H;
 6591       multiply_by_halves = true;
 6592       vf = 8;
 6593       break;
 6594     case T_INT:
 6595       load_arrangement = Assembler::T4S;
 6596       multiply_by_halves = false;
 6597       vf = 4;
 6598       break;
 6599     default:
 6600       ShouldNotReachHere();
 6601     }
 6602 
 6603     // Unroll factor
 6604     const unsigned uf = 4;
 6605 
 6606     // Effective vectorization factor
 6607     const unsigned evf = vf * uf;
 6608 
 6609     __ align(CodeEntryAlignment);
 6610 
 6611     StubGenStubId stub_id;
 6612     switch (eltype) {
 6613     case T_BOOLEAN:
 6614       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6615       break;
 6616     case T_BYTE:
 6617       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6618       break;
 6619     case T_CHAR:
 6620       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6621       break;
 6622     case T_SHORT:
 6623       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6624       break;
 6625     case T_INT:
 6626       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6627       break;
 6628     default:
 6629       stub_id = StubGenStubId::NO_STUBID;
 6630       ShouldNotReachHere();
 6631     };
 6632 
 6633     StubCodeMark mark(this, stub_id);
 6634 
 6635     address entry = __ pc();
 6636     __ enter();
 6637 
 6638     // Put the 0th through 3rd powers of 31 together into a single SIMD register. The register
 6639     // will be used in the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here
 6640     // and the register's value shouldn't change throughout both loops.
 6641     __ movw(rscratch1, intpow(31U, 3));
 6642     __ mov(vpow, Assembler::S, 0, rscratch1);
 6643     __ movw(rscratch1, intpow(31U, 2));
 6644     __ mov(vpow, Assembler::S, 1, rscratch1);
 6645     __ movw(rscratch1, intpow(31U, 1));
 6646     __ mov(vpow, Assembler::S, 2, rscratch1);
 6647     __ movw(rscratch1, intpow(31U, 0));
 6648     __ mov(vpow, Assembler::S, 3, rscratch1);
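          // Scheme: each T4S lane of vmul0 accumulates every 4th term of the hash
          // polynomial h = a0*31^(n-1) + a1*31^(n-2) + ... + a(n-1); the epilogues
          // multiply the lanes by vpow = <31^3, 31^2, 31, 1> and sum them to
          // recombine the four partial sums into the scalar hash.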
 6649 
 6650     __ mov(vmul0, Assembler::T16B, 0);
 6651     __ mov(vmul0, Assembler::S, 3, result);
 6652 
 6653     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6654     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6655 
 6656     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6657     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6658 
 6659     // SMALL LOOP
 6660     __ bind(SMALL_LOOP);
 6661 
 6662     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6663     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6664     __ subsw(rscratch2, rscratch2, vf);
 6665 
 6666     if (load_arrangement == Assembler::T8B) {
 6667       // Extend 8B to 8H to be able to use vector multiply
 6668       // instructions
 6669       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6670       if (is_signed_subword_type(eltype)) {
 6671         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6672       } else {
 6673         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6674       }
 6675     }
 6676 
 6677     switch (load_arrangement) {
 6678     case Assembler::T4S:
 6679       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6680       break;
 6681     case Assembler::T8B:
 6682     case Assembler::T8H:
 6683       assert(is_subword_type(eltype), "subword type expected");
 6684       if (is_signed_subword_type(eltype)) {
 6685         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6686       } else {
 6687         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6688       }
 6689       break;
 6690     default:
 6691       __ should_not_reach_here();
 6692     }
 6693 
 6694     // Process the upper half of a vector
 6695     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6696       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6697       if (is_signed_subword_type(eltype)) {
 6698         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6699       } else {
 6700         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6701       }
 6702     }
 6703 
 6704     __ br(Assembler::HI, SMALL_LOOP);
 6705 
 6706     // SMALL LOOP'S EPILOGUE
 6707     __ lsr(rscratch2, cnt, exact_log2(evf));
 6708     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6709 
 6710     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6711     __ addv(vmul0, Assembler::T4S, vmul0);
 6712     __ umov(result, vmul0, Assembler::S, 0);
 6713 
 6714     // TAIL
 6715     __ bind(TAIL);
 6716 
 6717     // The andr computes cnt % vf. The subtract, shifted by 3, branches past
 6718     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
 6719     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 6720     __ andr(rscratch2, cnt, vf - 1);
 6721     __ bind(TAIL_SHORTCUT);
 6722     __ adr(rscratch1, BR_BASE);
 6723     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6724     __ movw(rscratch2, 0x1f);
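          // rscratch2 = 0x1f = 31, the multiplier for the scalar madd chain below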
 6725     __ br(rscratch1);
 6726 
 6727     for (size_t i = 0; i < vf - 1; ++i) {
 6728       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6729                                    eltype);
 6730       __ maddw(result, result, rscratch2, rscratch1);
 6731     }
 6732     __ bind(BR_BASE);
 6733 
 6734     __ leave();
 6735     __ ret(lr);
 6736 
 6737     // LARGE LOOP
 6738     __ bind(LARGE_LOOP_PREHEADER);
 6739 
 6740     __ lsr(rscratch2, cnt, exact_log2(evf));
 6741 
 6742     if (multiply_by_halves) {
 6743       // 31^4 - multiplier between lower and upper parts of a register
 6744       __ movw(rscratch1, intpow(31U, vf / 2));
 6745       __ mov(vpowm, Assembler::S, 1, rscratch1);
 6746       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6747       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6748       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6749     } else {
 6750       // 31^16
 6751       __ movw(rscratch1, intpow(31U, evf));
 6752       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6753     }
 6754 
 6755     __ mov(vmul3, Assembler::T16B, 0);
 6756     __ mov(vmul2, Assembler::T16B, 0);
 6757     __ mov(vmul1, Assembler::T16B, 0);
 6758 
 6759     __ bind(LARGE_LOOP);
 6760 
 6761     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6762     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6763     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6764     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6765 
 6766     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6767            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6768 
 6769     if (load_arrangement == Assembler::T8B) {
 6770       // Extend 8B to 8H to be able to use vector multiply
 6771       // instructions
 6772       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6773       if (is_signed_subword_type(eltype)) {
 6774         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6775         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6776         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6777         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6778       } else {
 6779         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6780         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6781         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6782         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6783       }
 6784     }
 6785 
 6786     switch (load_arrangement) {
 6787     case Assembler::T4S:
 6788       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6789       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6790       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6791       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6792       break;
 6793     case Assembler::T8B:
 6794     case Assembler::T8H:
 6795       assert(is_subword_type(eltype), "subword type expected");
 6796       if (is_signed_subword_type(eltype)) {
 6797         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6798         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6799         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6800         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6801       } else {
 6802         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6803         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6804         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6805         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6806       }
 6807       break;
 6808     default:
 6809       __ should_not_reach_here();
 6810     }
 6811 
 6812     // Process the upper half of a vector
 6813     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6814       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6815       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6816       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6817       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6818       if (is_signed_subword_type(eltype)) {
 6819         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6820         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6821         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6822         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6823       } else {
 6824         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6825         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6826         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6827         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6828       }
 6829     }
 6830 
 6831     __ subsw(rscratch2, rscratch2, 1);
 6832     __ br(Assembler::HI, LARGE_LOOP);
 6833 
 6834     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6835     __ addv(vmul3, Assembler::T4S, vmul3);
 6836     __ umov(result, vmul3, Assembler::S, 0);
 6837 
 6838     __ mov(rscratch2, intpow(31U, vf));
 6839 
 6840     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6841     __ addv(vmul2, Assembler::T4S, vmul2);
 6842     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6843     __ maddw(result, result, rscratch2, rscratch1);
 6844 
 6845     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6846     __ addv(vmul1, Assembler::T4S, vmul1);
 6847     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6848     __ maddw(result, result, rscratch2, rscratch1);
 6849 
 6850     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6851     __ addv(vmul0, Assembler::T4S, vmul0);
 6852     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6853     __ maddw(result, result, rscratch2, rscratch1);
 6854 
 6855     __ andr(rscratch2, cnt, vf - 1);
 6856     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6857 
 6858     __ leave();
 6859     __ ret(lr);
 6860 
 6861     return entry;
 6862   }
 6863 
 6864   address generate_dsin_dcos(bool isCos) {
 6865     __ align(CodeEntryAlignment);
 6866     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6867     StubCodeMark mark(this, stub_id);
 6868     address start = __ pc();
 6869     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6870         (address)StubRoutines::aarch64::_two_over_pi,
 6871         (address)StubRoutines::aarch64::_pio2,
 6872         (address)StubRoutines::aarch64::_dsin_coef,
 6873         (address)StubRoutines::aarch64::_dcos_coef);
 6874     return start;
 6875   }
 6876 
 6877   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
 6878   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6879       Label &DIFF2) {
 6880     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6881     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 6882 
 6883     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6884     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6885     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 6886     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 6887 
 6888     __ fmovd(tmpL, vtmp3);
 6889     __ eor(rscratch2, tmp3, tmpL);
 6890     __ cbnz(rscratch2, DIFF2);
 6891 
 6892     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6893     __ umov(tmpL, vtmp3, __ D, 1);
 6894     __ eor(rscratch2, tmpU, tmpL);
 6895     __ cbnz(rscratch2, DIFF1);
 6896 
 6897     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6898     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6899     __ fmovd(tmpL, vtmp);
 6900     __ eor(rscratch2, tmp3, tmpL);
 6901     __ cbnz(rscratch2, DIFF2);
 6902 
 6903     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6904     __ umov(tmpL, vtmp, __ D, 1);
 6905     __ eor(rscratch2, tmpU, tmpL);
 6906     __ cbnz(rscratch2, DIFF1);
 6907   }
 6908 
 6909   // r0  = result
 6910   // r1  = str1
 6911   // r2  = cnt1
 6912   // r3  = str2
 6913   // r4  = cnt2
 6914   // r10 = tmp1
 6915   // r11 = tmp2
 6916   address generate_compare_long_string_different_encoding(bool isLU) {
 6917     __ align(CodeEntryAlignment);
 6918     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6919     StubCodeMark mark(this, stub_id);
 6920     address entry = __ pc();
 6921     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6922         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6923         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6924     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6925         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6926     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6927     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6928 
 6929     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6930 
 6931     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
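          // vtmpZ = 0; zip1/zip2 with it zero-extends Latin1 bytes to UTF-16 chars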
 6932     // cnt2 == number of characters left to compare
 6933     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 6934     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6935     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6936     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6937     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 6938     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 6939     __ eor(rscratch2, tmp1, tmp2);
 6940     __ mov(rscratch1, tmp2);
 6941     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6942     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6943              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6944     __ push(spilled_regs, sp);
 6945     __ mov(tmp2, isLU ? str1 : str2); // init the pointer for the next L load
 6946     __ mov(cnt1, isLU ? str2 : str1); // init the pointer for the next U load
 6947 
 6948     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6949 
 6950     if (SoftwarePrefetchHintDistance >= 0) {
 6951       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6952       __ br(__ LT, NO_PREFETCH);
 6953       __ bind(LARGE_LOOP_PREFETCH);
 6954         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6955         __ mov(tmp4, 2);
 6956         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6957         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6958           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6959           __ subs(tmp4, tmp4, 1);
 6960           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6961           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6962           __ mov(tmp4, 2);
 6963         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6964           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6965           __ subs(tmp4, tmp4, 1);
 6966           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6967           __ sub(cnt2, cnt2, 64);
 6968           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6969           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6970     }
 6971     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 6972     __ bind(NO_PREFETCH);
 6973     __ subs(cnt2, cnt2, 16);
 6974     __ br(__ LT, TAIL);
 6975     __ align(OptoLoopAlignment);
 6976     __ bind(SMALL_LOOP); // smaller loop
 6977       __ subs(cnt2, cnt2, 16);
 6978       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6979       __ br(__ GE, SMALL_LOOP);
 6980       __ cmn(cnt2, (u1)16);
 6981       __ br(__ EQ, LOAD_LAST);
 6982     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6983       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6984       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6985       __ ldr(tmp3, Address(cnt1, -8));
 6986       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 6987       __ b(LOAD_LAST);
 6988     __ bind(DIFF2);
 6989       __ mov(tmpU, tmp3);
 6990     __ bind(DIFF1);
 6991       __ pop(spilled_regs, sp);
 6992       __ b(CALCULATE_DIFFERENCE);
 6993     __ bind(LOAD_LAST);
 6994       // The last 4 UTF-16 characters were already pre-loaded into tmp3 by
 6995       // compare_string_16_x_LU, so there is no need to load them again.
 6996       __ mov(tmpU, tmp3);
 6997       __ pop(spilled_regs, sp);
 6998 
 6999       // tmp2 now points to the last 4 Latin1 characters
 7000       __ ldrs(vtmp, Address(tmp2));
 7001       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 7002       __ fmovd(tmpL, vtmp);
 7003 
 7004       __ eor(rscratch2, tmpU, tmpL);
 7005       __ cbz(rscratch2, DONE);
 7006 
 7007     // Find the first different characters in the longwords and
 7008     // compute their difference.
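          // An illustrative scalar sketch (diff is a hypothetical name for
          // tmp1 ^ rscratch1, already held in rscratch2 here):
          //   shift  = clz(rev(diff)) & ~15;  // bit offset of the first differing char
          //   result = (uint16_t)(tmp1 >> shift) - (uint16_t)(rscratch1 >> shift);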
 7009     __ bind(CALCULATE_DIFFERENCE);
 7010       __ rev(rscratch2, rscratch2);
 7011       __ clz(rscratch2, rscratch2);
 7012       __ andr(rscratch2, rscratch2, -16);
 7013       __ lsrv(tmp1, tmp1, rscratch2);
 7014       __ uxthw(tmp1, tmp1);
 7015       __ lsrv(rscratch1, rscratch1, rscratch2);
 7016       __ uxthw(rscratch1, rscratch1);
 7017       __ subw(result, tmp1, rscratch1);
 7018     __ bind(DONE);
 7019       __ ret(lr);
 7020     return entry;
 7021   }
 7022 
 7023   // r0 = input (float16)
 7024   // v0 = result (float)
 7025   // v1 = temporary float register
 7026   address generate_float16ToFloat() {
 7027     __ align(CodeEntryAlignment);
 7028     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7029     StubCodeMark mark(this, stub_id);
 7030     address entry = __ pc();
 7031     BLOCK_COMMENT("Entry:");
 7032     __ flt16_to_flt(v0, r0, v1);
 7033     __ ret(lr);
 7034     return entry;
 7035   }
 7036 
 7037   // v0 = input (float)
 7038   // r0 = result (float16)
 7039   // v1 = temporary float register
 7040   address generate_floatToFloat16() {
 7041     __ align(CodeEntryAlignment);
 7042     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7043     StubCodeMark mark(this, stub_id);
 7044     address entry = __ pc();
 7045     BLOCK_COMMENT("Entry:");
 7046     __ flt_to_flt16(r0, v0, v1);
 7047     __ ret(lr);
 7048     return entry;
 7049   }
 7050 
 7051   address generate_method_entry_barrier() {
 7052     __ align(CodeEntryAlignment);
 7053     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7054     StubCodeMark mark(this, stub_id);
 7055 
 7056     Label deoptimize_label;
 7057 
 7058     address start = __ pc();
 7059 
 7060     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7061 
 7062     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7063       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 7064       // We can get here despite the nmethod being good, if we have not
 7065       // yet applied our cross modification fence (or data fence).
 7066       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7067       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7068       __ ldrw(rscratch2, rscratch2);
 7069       __ strw(rscratch2, thread_epoch_addr);
 7070       __ isb();
 7071       __ membar(__ LoadLoad);
 7072     }
 7073 
 7074     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7075 
 7076     __ enter();
 7077     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7078 
 7079     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 7080 
 7081     __ push_call_clobbered_registers();
 7082 
 7083     __ mov(c_rarg0, rscratch2);
 7084     __ call_VM_leaf
 7085          (CAST_FROM_FN_PTR
 7086           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7087 
 7088     __ reset_last_Java_frame(true);
 7089 
 7090     __ mov(rscratch1, r0);
 7091 
 7092     __ pop_call_clobbered_registers();
 7093 
 7094     __ cbnz(rscratch1, deoptimize_label);
 7095 
 7096     __ leave();
 7097     __ ret(lr);
 7098 
 7099     __ BIND(deoptimize_label);
 7100 
 7101     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7102     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7103 
 7104     __ mov(sp, rscratch1);
 7105     __ br(rscratch2);
 7106 
 7107     return start;
 7108   }
 7109 
 7110   // r0  = result
 7111   // r1  = str1
 7112   // r2  = cnt1
 7113   // r3  = str2
 7114   // r4  = cnt2
 7115   // r10 = tmp1
 7116   // r11 = tmp2
 7117   address generate_compare_long_string_same_encoding(bool isLL) {
 7118     __ align(CodeEntryAlignment);
 7119     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7120     StubCodeMark mark(this, stub_id);
 7121     address entry = __ pc();
 7122     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7123         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7124 
 7125     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7126 
 7127     // exit the large loop when fewer than 64 bytes are left to read or we're
 7128     // about to prefetch memory beyond the array border
 7129     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
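          // e.g., with SoftwarePrefetchHintDistance == 192, this threshold is
          // 192 chars for LL and 96 chars for UU, i.e. 192 bytes either way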
 7130 
 7131     // 8 bytes from each string were pre-loaded before jumping to this stub, so do the comparison directly
 7132     __ eor(rscratch2, tmp1, tmp2);
 7133     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7134 
 7135     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 7136     // update pointers, because of previous read
 7137     __ add(str1, str1, wordSize);
 7138     __ add(str2, str2, wordSize);
 7139     if (SoftwarePrefetchHintDistance >= 0) {
 7140       __ align(OptoLoopAlignment);
 7141       __ bind(LARGE_LOOP_PREFETCH);
 7142         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7143         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7144 
 7145         for (int i = 0; i < 4; i++) {
 7146           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7147           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7148           __ cmp(tmp1, tmp2);
 7149           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7150           __ br(Assembler::NE, DIFF);
 7151         }
 7152         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7153         __ add(str1, str1, 64);
 7154         __ add(str2, str2, 64);
 7155         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7156         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7157         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7158     }
 7159 
 7160     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7161     __ br(Assembler::LE, LESS16);
 7162     __ align(OptoLoopAlignment);
 7163     __ bind(LOOP_COMPARE16);
 7164       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7165       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7166       __ cmp(tmp1, tmp2);
 7167       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7168       __ br(Assembler::NE, DIFF);
 7169       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7170       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7171       __ br(Assembler::LT, LESS16);
 7172 
 7173       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7174       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7175       __ cmp(tmp1, tmp2);
 7176       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7177       __ br(Assembler::NE, DIFF);
 7178       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7179       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7180       __ br(Assembler::GE, LOOP_COMPARE16);
 7181       __ cbz(cnt2, LENGTH_DIFF);
 7182 
 7183     __ bind(LESS16);
 7184       // compare 8 bytes at a time
 7185       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7186       __ br(Assembler::LE, LESS8);
 7187       __ ldr(tmp1, Address(__ post(str1, 8)));
 7188       __ ldr(tmp2, Address(__ post(str2, 8)));
 7189       __ eor(rscratch2, tmp1, tmp2);
 7190       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7191       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7192 
 7193     __ bind(LESS8); // directly load last 8 bytes
 7194       if (!isLL) {
 7195         __ add(cnt2, cnt2, cnt2);
 7196       }
 7197       __ ldr(tmp1, Address(str1, cnt2));
 7198       __ ldr(tmp2, Address(str2, cnt2));
 7199       __ eor(rscratch2, tmp1, tmp2);
 7200       __ cbz(rscratch2, LENGTH_DIFF);
 7201       __ b(CAL_DIFFERENCE);
 7202 
 7203     __ bind(DIFF);
 7204       __ cmp(tmp1, tmp2);
 7205       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7206       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7207       // reuse rscratch2 register for the result of eor instruction
 7208       __ eor(rscratch2, tmp1, tmp2);
 7209 
 7210     __ bind(CAL_DIFFERENCE);
 7211       __ rev(rscratch2, rscratch2);
 7212       __ clz(rscratch2, rscratch2);
 7213       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7214       __ lsrv(tmp1, tmp1, rscratch2);
 7215       __ lsrv(tmp2, tmp2, rscratch2);
 7216       if (isLL) {
 7217         __ uxtbw(tmp1, tmp1);
 7218         __ uxtbw(tmp2, tmp2);
 7219       } else {
 7220         __ uxthw(tmp1, tmp1);
 7221         __ uxthw(tmp2, tmp2);
 7222       }
 7223       __ subw(result, tmp1, tmp2);
 7224 
 7225     __ bind(LENGTH_DIFF);
 7226       __ ret(lr);
 7227     return entry;
 7228   }
 7229 
 7230   enum string_compare_mode {
 7231     LL,
 7232     LU,
 7233     UL,
 7234     UU,
 7235   };
 7236 
 7237   // The following registers are declared in aarch64.ad
 7238   // r0  = result
 7239   // r1  = str1
 7240   // r2  = cnt1
 7241   // r3  = str2
 7242   // r4  = cnt2
 7243   // r10 = tmp1
 7244   // r11 = tmp2
 7245   // z0  = ztmp1
 7246   // z1  = ztmp2
 7247   // p0  = pgtmp1
 7248   // p1  = pgtmp2
 7249   address generate_compare_long_string_sve(string_compare_mode mode) {
 7250     StubGenStubId stub_id;
 7251     switch (mode) {
 7252       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7253       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7254       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7255       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7256       default: ShouldNotReachHere();
 7257     }
 7258 
 7259     __ align(CodeEntryAlignment);
 7260     address entry = __ pc();
 7261     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7262              tmp1 = r10, tmp2 = r11;
 7263 
 7264     Label LOOP, DONE, MISMATCH;
 7265     Register vec_len = tmp1;
 7266     Register idx = tmp2;
 7267     // The minimum of the string lengths has been stored in cnt2.
 7268     Register cnt = cnt2;
 7269     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7270     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7271 
 7272 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 7273     switch (mode) {                                                            \
 7274       case LL:                                                                 \
 7275         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7276         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7277         break;                                                                 \
 7278       case LU:                                                                 \
 7279         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7280         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7281         break;                                                                 \
 7282       case UL:                                                                 \
 7283         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7284         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7285         break;                                                                 \
 7286       case UU:                                                                 \
 7287         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7288         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7289         break;                                                                 \
 7290       default:                                                                 \
 7291         ShouldNotReachHere();                                                  \
 7292     }
 7293 
 7294     StubCodeMark mark(this, stub_id);
 7295 
 7296     __ mov(idx, 0);
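          // sve_whilelt activates the first min(cnt - idx, #lanes) predicate
          // lanes, so the same LOAD_PAIR path handles full vectors and the
          // final partial vector alike.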
 7297     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7298 
 7299     if (mode == LL) {
 7300       __ sve_cntb(vec_len);
 7301     } else {
 7302       __ sve_cnth(vec_len);
 7303     }
 7304 
 7305     __ sub(rscratch1, cnt, vec_len);
 7306 
 7307     __ bind(LOOP);
 7308 
 7309       // main loop
 7310       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7311       __ add(idx, idx, vec_len);
 7312       // Compare strings.
 7313       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7314       __ br(__ NE, MISMATCH);
 7315       __ cmp(idx, rscratch1);
 7316       __ br(__ LT, LOOP);
 7317 
 7318     // post loop, last iteration
 7319     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7320 
 7321     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7322     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7323     __ br(__ EQ, DONE);
 7324 
 7325     __ bind(MISMATCH);
 7326 
 7327     // Narrow the predicate to the lanes before the first mismatch.
 7328     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7329     // Extract the first different characters of each string.
 7330     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7331     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7332 
 7333     // Compute the difference of the first different characters.
 7334     __ sub(result, rscratch1, rscratch2);
 7335 
 7336     __ bind(DONE);
 7337     __ ret(lr);
 7338 #undef LOAD_PAIR
 7339     return entry;
 7340   }
 7341 
 7342   void generate_compare_long_strings() {
 7343     if (UseSVE == 0) {
 7344       StubRoutines::aarch64::_compare_long_string_LL
 7345           = generate_compare_long_string_same_encoding(true);
 7346       StubRoutines::aarch64::_compare_long_string_UU
 7347           = generate_compare_long_string_same_encoding(false);
 7348       StubRoutines::aarch64::_compare_long_string_LU
 7349           = generate_compare_long_string_different_encoding(true);
 7350       StubRoutines::aarch64::_compare_long_string_UL
 7351           = generate_compare_long_string_different_encoding(false);
 7352     } else {
 7353       StubRoutines::aarch64::_compare_long_string_LL
 7354           = generate_compare_long_string_sve(LL);
 7355       StubRoutines::aarch64::_compare_long_string_UU
 7356           = generate_compare_long_string_sve(UU);
 7357       StubRoutines::aarch64::_compare_long_string_LU
 7358           = generate_compare_long_string_sve(LU);
 7359       StubRoutines::aarch64::_compare_long_string_UL
 7360           = generate_compare_long_string_sve(UL);
 7361     }
 7362   }
 7363 
 7364   // R0 = result
 7365   // R1 = str2
 7366   // R2 = cnt1
 7367   // R3 = str1
 7368   // R4 = cnt2
 7369   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7370   //
 7371   // This generic linear code uses a few additional ideas which make it faster:
 7372   // 1) we can safely keep at least the 1st register of the pattern (since
 7373   // length >= 8) in order to skip the initial load (helps on systems with 1 ld pipeline)
 7374   // 2) we can use a "fast" algorithm to find a single character: search for the
 7375   // first symbol with fewer branches (1 branch per loaded register instead
 7376   // of a branch per symbol), which is where constants like
 7377   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
 7378   // 3) after loading and analyzing the 1st register of the source string, it
 7379   // can be used to search for every occurrence of the 1st character, saving a few
 7380   // loads in comparison with a "simpler-but-slower" implementation
 7381   // 4) in order to avoid lots of push/pop operations, the code below heavily
 7382   // re-uses/re-initializes/compresses register values, which makes the code
 7383   // larger and a bit less readable; however, most of the extra operations are
 7384   // issued during loads or branches, so the penalty is minimal
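        //
        // For reference, a scalar C sketch of the "zero byte in a word" test used
        // below (illustrative only; word_has_zero_byte is a hypothetical name):
        //
        //   bool word_has_zero_byte(uint64_t x) {
        //     return ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL) != 0;
        //   }
        //
        // XOR-ing each loaded word with "first" (the first pattern character
        // replicated into every byte/halfword) turns "find the first character"
        // into "find a zero byte/halfword"; the sub + orr + bics sequence below
        // computes exactly this test, giving one branch per loaded register.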
 7385   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7386     StubGenStubId stub_id;
 7387     if (str1_isL) {
 7388       if (str2_isL) {
 7389         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7390       } else {
 7391         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7392       }
 7393     } else {
 7394       if (str2_isL) {
 7395         ShouldNotReachHere();
 7396       } else {
 7397         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7398       }
 7399     }
 7400     __ align(CodeEntryAlignment);
 7401     StubCodeMark mark(this, stub_id);
 7402     address entry = __ pc();
 7403 
 7404     int str1_chr_size = str1_isL ? 1 : 2;
 7405     int str2_chr_size = str2_isL ? 1 : 2;
 7406     int str1_chr_shift = str1_isL ? 0 : 1;
 7407     int str2_chr_shift = str2_isL ? 0 : 1;
 7408     bool isL = str1_isL && str2_isL;
 7409     // parameters
 7410     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7411     // temporary registers
 7412     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7413     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7414     // redefinitions
 7415     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7416 
 7417     __ push(spilled_regs, sp);
 7418     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7419         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7420         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7421         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7422         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7423         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 7424     // Read whole register from str1. It is safe, because length >=8 here
 7425     __ ldr(ch1, Address(str1));
 7426     // Read whole register from str2. It is safe, because length >=8 here
 7427     __ ldr(ch2, Address(str2));
 7428     __ sub(cnt2, cnt2, cnt1);
 7429     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7430     if (str1_isL != str2_isL) {
 7431       __ eor(v0, __ T16B, v0, v0);
 7432     }
 7433     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7434     __ mul(first, first, tmp1);
 7435     // check if we have fewer than 1 register's worth of characters to check
 7436     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7437     if (str1_isL != str2_isL) {
 7438       __ fmovd(v1, ch1);
 7439     }
 7440     __ br(__ LE, L_SMALL);
 7441     __ eor(ch2, first, ch2);
 7442     if (str1_isL != str2_isL) {
 7443       __ zip1(v1, __ T16B, v1, v0);
 7444     }
 7445     __ sub(tmp2, ch2, tmp1);
 7446     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7447     __ bics(tmp2, tmp2, ch2);
 7448     if (str1_isL != str2_isL) {
 7449       __ fmovd(ch1, v1);
 7450     }
 7451     __ br(__ NE, L_HAS_ZERO);
 7452     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7453     __ add(result, result, wordSize/str2_chr_size);
 7454     __ add(str2, str2, wordSize);
 7455     __ br(__ LT, L_POST_LOOP);
 7456     __ BIND(L_LOOP);
 7457       __ ldr(ch2, Address(str2));
 7458       __ eor(ch2, first, ch2);
 7459       __ sub(tmp2, ch2, tmp1);
 7460       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7461       __ bics(tmp2, tmp2, ch2);
 7462       __ br(__ NE, L_HAS_ZERO);
 7463     __ BIND(L_LOOP_PROCEED);
 7464       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7465       __ add(str2, str2, wordSize);
 7466       __ add(result, result, wordSize/str2_chr_size);
 7467       __ br(__ GE, L_LOOP);
 7468     __ BIND(L_POST_LOOP);
 7469       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7470       __ br(__ LE, NOMATCH);
 7471       __ ldr(ch2, Address(str2));
 7472       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7473       __ eor(ch2, first, ch2);
 7474       __ sub(tmp2, ch2, tmp1);
 7475       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7476       __ mov(tmp4, -1); // all bits set
 7477       __ b(L_SMALL_PROCEED);
 7478     __ align(OptoLoopAlignment);
 7479     __ BIND(L_SMALL);
 7480       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7481       __ eor(ch2, first, ch2);
 7482       if (str1_isL != str2_isL) {
 7483         __ zip1(v1, __ T16B, v1, v0);
 7484       }
 7485       __ sub(tmp2, ch2, tmp1);
 7486       __ mov(tmp4, -1); // all bits set
 7487       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7488       if (str1_isL != str2_isL) {
 7489         __ fmovd(ch1, v1); // move converted 4 symbols
 7490       }
 7491     __ BIND(L_SMALL_PROCEED);
 7492       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
 7493       __ bic(tmp2, tmp2, ch2);
 7494       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7495       __ rbit(tmp2, tmp2);
 7496       __ br(__ EQ, NOMATCH);
 7497     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 7498       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7499       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7500       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7501       if (str2_isL) { // LL
 7502         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7503         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7504         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7505         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7506         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7507       } else {
 7508         __ mov(ch2, 0xE); // mask to round the byte offset down to an even value
 7509         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7510         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7511         __ lslv(tmp2, tmp2, tmp4);
 7512         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7513         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7514         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7515         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7516       }
 7517       __ cmp(ch1, ch2);
 7518       __ mov(tmp4, wordSize/str2_chr_size);
 7519       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7520     __ BIND(L_SMALL_CMP_LOOP);
 7521       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7522                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7523       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7524                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7525       __ add(tmp4, tmp4, 1);
 7526       __ cmp(tmp4, cnt1);
 7527       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7528       __ cmp(first, ch2);
 7529       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7530     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7531       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7532       __ clz(tmp4, tmp2);
 7533       __ add(result, result, 1); // advance index
 7534       __ add(str2, str2, str2_chr_size); // advance pointer
 7535       __ b(L_SMALL_HAS_ZERO_LOOP);
 7536     __ align(OptoLoopAlignment);
 7537     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7538       __ cmp(first, ch2);
 7539       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7540       __ b(DONE);
 7541     __ align(OptoLoopAlignment);
 7542     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7543       if (str2_isL) { // LL
 7544         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7545         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7546         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7547         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7548         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7549       } else {
 7550         __ mov(ch2, 0xE); // mask to round the byte offset down to an even value
 7551         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7552         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7553         __ lslv(tmp2, tmp2, tmp4);
 7554         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7555         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7556         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7557         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7558       }
 7559       __ cmp(ch1, ch2);
 7560       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7561       __ b(DONE);
 7562     __ align(OptoLoopAlignment);
 7563     __ BIND(L_HAS_ZERO);
 7564       __ rbit(tmp2, tmp2);
 7565       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7566       // Now, compress the counters (cnt2 and cnt1) into one register. This is
 7567       // fine because both counters are 32-bit and are not changed in this
 7568       // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
 7569       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7570       __ sub(result, result, 1);
 7571     __ BIND(L_HAS_ZERO_LOOP);
 7572       __ mov(cnt1, wordSize/str2_chr_size);
 7573       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7574       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7575       if (str2_isL) {
 7576         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7577         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7578         __ lslv(tmp2, tmp2, tmp4);
 7579         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7580         __ add(tmp4, tmp4, 1);
 7581         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7582         __ lsl(tmp2, tmp2, 1);
 7583         __ mov(tmp4, wordSize/str2_chr_size);
 7584       } else {
 7585         __ mov(ch2, 0xE);
 7586         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7587         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7588         __ lslv(tmp2, tmp2, tmp4);
 7589         __ add(tmp4, tmp4, 1);
 7590         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7591         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7592         __ lsl(tmp2, tmp2, 1);
 7593         __ mov(tmp4, wordSize/str2_chr_size);
 7594         __ sub(str2, str2, str2_chr_size);
 7595       }
 7596       __ cmp(ch1, ch2);
 7597       __ mov(tmp4, wordSize/str2_chr_size);
 7598       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7599     __ BIND(L_CMP_LOOP);
 7600       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7601                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7602       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7603                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7604       __ add(tmp4, tmp4, 1);
 7605       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7606       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7607       __ cmp(cnt1, ch2);
 7608       __ br(__ EQ, L_CMP_LOOP);
 7609     __ BIND(L_CMP_LOOP_NOMATCH);
 7610       // we did not match here
 7611       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7612       __ clz(tmp4, tmp2);
 7613       __ add(str2, str2, str2_chr_size); // advance pointer
 7614       __ b(L_HAS_ZERO_LOOP);
 7615     __ align(OptoLoopAlignment);
 7616     __ BIND(L_CMP_LOOP_LAST_CMP);
 7617       __ cmp(cnt1, ch2);
 7618       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7619       __ b(DONE);
 7620     __ align(OptoLoopAlignment);
 7621     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7622       if (str2_isL) {
 7623         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7624         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7625         __ lslv(tmp2, tmp2, tmp4);
 7626         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7627         __ add(tmp4, tmp4, 1);
 7628         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7629         __ lsl(tmp2, tmp2, 1);
 7630       } else {
 7631         __ mov(ch2, 0xE);
 7632         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7633         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7634         __ lslv(tmp2, tmp2, tmp4);
 7635         __ add(tmp4, tmp4, 1);
 7636         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7637         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7638         __ lsl(tmp2, tmp2, 1);
 7639         __ sub(str2, str2, str2_chr_size);
 7640       }
 7641       __ cmp(ch1, ch2);
 7642       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7643       __ b(DONE);
 7644     __ align(OptoLoopAlignment);
 7645     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 7646       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
 7647       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
 7648       // so result was increased by at most wordSize/str2_chr_size - 1 and the
 7649       // respective high bits weren't changed. L_LOOP_PROCEED will increase
 7650       // result by the number of analyzed characters, so we can just reset the
 7651       // lower bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
 7652       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
 7653       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
 7654       // index of the last analyzed substring inside the current octet, so str2
 7655       // is at the respective start address; we need to advance it to the next octet.
 7656       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7657       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7658       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7659       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7660       __ movw(cnt2, cnt2);
 7661       __ b(L_LOOP_PROCEED);
 7662     __ align(OptoLoopAlignment);
 7663     __ BIND(NOMATCH);
 7664       __ mov(result, -1);
 7665     __ BIND(DONE);
 7666       __ pop(spilled_regs, sp);
 7667       __ ret(lr);
 7668     return entry;
 7669   }
 7670 
 7671   void generate_string_indexof_stubs() {
 7672     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7673     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7674     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7675   }
 7676 
 7677   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7678       FloatRegister src1, FloatRegister src2) {
 7679     Register dst = r1;
 7680     __ zip1(v1, __ T16B, src1, v0);
 7681     __ zip2(v2, __ T16B, src1, v0);
 7682     if (generatePrfm) {
 7683       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7684     }
 7685     __ zip1(v3, __ T16B, src2, v0);
 7686     __ zip2(v4, __ T16B, src2, v0);
 7687     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7688   }
 7689 
 7690   // R0 = src
 7691   // R1 = dst
 7692   // R2 = len
 7693   // R3 = len >> 3
 7694   // V0 = 0
 7695   // v1 = loaded 8 bytes
 7696   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 7697   address generate_large_byte_array_inflate() {
 7698     __ align(CodeEntryAlignment);
 7699     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7700     StubCodeMark mark(this, stub_id);
 7701     address entry = __ pc();
 7702     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7703     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7704     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7705 
 7706     // do one more 8-byte read so that the address is 16-byte aligned in most
 7707     // cases, which also lets us use a single store instruction below
 7708     __ ldrd(v2, __ post(src, 8));
 7709     __ sub(octetCounter, octetCounter, 2);
 7710     __ zip1(v1, __ T16B, v1, v0);
 7711     __ zip1(v2, __ T16B, v2, v0);
 7712     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7713     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7714     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7715     __ br(__ LE, LOOP_START);
 7716     __ b(LOOP_PRFM_START);
 7717     __ bind(LOOP_PRFM);
 7718       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7719     __ bind(LOOP_PRFM_START);
 7720       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7721       __ sub(octetCounter, octetCounter, 8);
 7722       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7723       inflate_and_store_2_fp_registers(true, v3, v4);
 7724       inflate_and_store_2_fp_registers(true, v5, v6);
 7725       __ br(__ GT, LOOP_PRFM);
 7726       __ cmp(octetCounter, (u1)8);
 7727       __ br(__ LT, DONE);
 7728     __ bind(LOOP);
 7729       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7730       __ bind(LOOP_START);
 7731       __ sub(octetCounter, octetCounter, 8);
 7732       __ cmp(octetCounter, (u1)8);
 7733       inflate_and_store_2_fp_registers(false, v3, v4);
 7734       inflate_and_store_2_fp_registers(false, v5, v6);
 7735       __ br(__ GE, LOOP);
 7736     __ bind(DONE);
 7737       __ ret(lr);
 7738     return entry;
 7739   }
 7740 
 7741   /**
 7742    *  Arguments:
 7743    *
 7744    *  Input:
 7745    *  c_rarg0   - current state address
 7746    *  c_rarg1   - H key address
 7747    *  c_rarg2   - data address
 7748    *  c_rarg3   - number of blocks
 7749    *
 7750    *  Output:
 7751    *  Updated state at c_rarg0
 7752    */
 7753   address generate_ghash_processBlocks() {
 7754     // Bafflingly, GCM uses little-endian for the byte order, but
 7755     // big-endian for the bit order.  For example, the polynomial 1 is
 7756     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7757     //
 7758     // So, we must either reverse the bytes in each word and do
 7759     // everything big-endian or reverse the bits in each byte and do
 7760     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7761     // the bits in each byte (we have an instruction, RBIT, to do
 7762     // that) and keep the data in little-endian bit order through the
 7763     // calculation, bit-reversing the inputs and outputs.
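          //
          // For illustration only (rbit8 is a hypothetical scalar helper), the
          // per-byte bit reversal that the vector RBIT instruction performs is:
          //
          //   uint8_t rbit8(uint8_t b) {
          //     b = (b & 0xF0) >> 4 | (b & 0x0F) << 4;
          //     b = (b & 0xCC) >> 2 | (b & 0x33) << 2;
          //     b = (b & 0xAA) >> 1 | (b & 0x55) << 1;
          //     return b;
          //   }
          //
          // Applied to each byte of the 16-byte string 80 00 ... 00 above, it
          // yields 01 00 ... 00, i.e. the polynomial 1 in little-endian bit order.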
 7764 
 7765     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7766     StubCodeMark mark(this, stub_id);
 7767     __ align(wordSize * 2);
 7768     address p = __ pc();
 7769     __ emit_int64(0x87);  // The low-order bits of the field
 7770                           // polynomial (i.e. p = z^7+z^2+z+1)
 7771                           // repeated in the low and high parts of a
 7772                           // 128-bit vector
 7773     __ emit_int64(0x87);
 7774 
 7775     __ align(CodeEntryAlignment);
 7776     address start = __ pc();
 7777 
 7778     Register state   = c_rarg0;
 7779     Register subkeyH = c_rarg1;
 7780     Register data    = c_rarg2;
 7781     Register blocks  = c_rarg3;
 7782 
 7783     FloatRegister vzr = v30;
 7784     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7785 
 7786     __ ldrq(v24, p);    // The field polynomial
 7787 
 7788     __ ldrq(v0, Address(state));
 7789     __ ldrq(v1, Address(subkeyH));
 7790 
 7791     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7792     __ rbit(v0, __ T16B, v0);
 7793     __ rev64(v1, __ T16B, v1);
 7794     __ rbit(v1, __ T16B, v1);
 7795 
 7796     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 7797     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7798 
 7799     {
 7800       Label L_ghash_loop;
 7801       __ bind(L_ghash_loop);
 7802 
 7803       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7804                                                  // reversing each byte
 7805       __ rbit(v2, __ T16B, v2);
 7806       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7807 
 7808       // Multiply state in v2 by subkey in v1
 7809       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7810                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7811                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7812       // Reduce v7:v5 by the field polynomial
 7813       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7814 
 7815       __ sub(blocks, blocks, 1);
 7816       __ cbnz(blocks, L_ghash_loop);
 7817     }
 7818 
 7819     // The bit-reversed result is at this point in v0
 7820     __ rev64(v0, __ T16B, v0);
 7821     __ rbit(v0, __ T16B, v0);
 7822 
 7823     __ st1(v0, __ T16B, state);
 7824     __ ret(lr);
 7825 
 7826     return start;
 7827   }
 7828 
 7829   address generate_ghash_processBlocks_wide() {
 7830     address small = generate_ghash_processBlocks();
 7831 
 7832     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7833     StubCodeMark mark(this, stub_id);
 7834     __ align(wordSize * 2);
 7835     address p = __ pc();
 7836     __ emit_int64(0x87);  // The low-order bits of the field
 7837                           // polynomial (i.e. p = z^7+z^2+z+1)
 7838                           // repeated in the low and high parts of a
 7839                           // 128-bit vector
 7840     __ emit_int64(0x87);
 7841 
 7842     __ align(CodeEntryAlignment);
 7843     address start = __ pc();
 7844 
 7845     Register state   = c_rarg0;
 7846     Register subkeyH = c_rarg1;
 7847     Register data    = c_rarg2;
 7848     Register blocks  = c_rarg3;
 7849 
 7850     const int unroll = 4;
 7851 
 7852     __ cmp(blocks, (unsigned char)(unroll * 2));
 7853     __ br(__ LT, small);
 7854 
 7855     if (unroll > 1) {
 7856       // Save the callee-saved SIMD registers (v8..v15) used by the routine
 7857       __ sub(sp, sp, 4 * 16);
 7858       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7859       __ sub(sp, sp, 4 * 16);
 7860       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7861     }
 7862 
 7863     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7864 
 7865     if (unroll > 1) {
 7866       // And restore state
 7867       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7868       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7869     }
 7870 
 7871     __ cmp(blocks, (unsigned char)0);
 7872     __ br(__ GT, small);
 7873 
 7874     __ ret(lr);
 7875 
 7876     return start;
 7877   }
 7878 
 7879   void generate_base64_encode_simdround(Register src, Register dst,
 7880         FloatRegister codec, u8 size) {
 7881 
 7882     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7883     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7884     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7885 
 7886     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7887 
 7888     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
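          // Repack three 8-bit lanes (aaaaaabb bbbbcccc ccdddddd) into four
          // 6-bit indices (00aaaaaa 00bbbbbb 00cccccc 00dddddd); the shl/ushr
          // pairs below clear the top two bits of each intermediate result.
          // Each index then selects one output byte from the codec table.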
 7889 
 7890     __ ushr(ind0, arrangement, in0,  2);
 7891 
 7892     __ ushr(ind1, arrangement, in1,  2);
 7893     __ shl(in0,   arrangement, in0,  6);
 7894     __ orr(ind1,  arrangement, ind1, in0);
 7895     __ ushr(ind1, arrangement, ind1, 2);
 7896 
 7897     __ ushr(ind2, arrangement, in2,  4);
 7898     __ shl(in1,   arrangement, in1,  4);
 7899     __ orr(ind2,  arrangement, in1,  ind2);
 7900     __ ushr(ind2, arrangement, ind2, 2);
 7901 
 7902     __ shl(ind3,  arrangement, in2,  2);
 7903     __ ushr(ind3, arrangement, ind3, 2);
 7904 
 7905     __ tbl(out0,  arrangement, codec,  4, ind0);
 7906     __ tbl(out1,  arrangement, codec,  4, ind1);
 7907     __ tbl(out2,  arrangement, codec,  4, ind2);
 7908     __ tbl(out3,  arrangement, codec,  4, ind3);
 7909 
 7910     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7911   }
 7912 
 7913    /**
 7914    *  Arguments:
 7915    *
 7916    *  Input:
 7917    *  c_rarg0   - src_start
 7918    *  c_rarg1   - src_offset
 7919    *  c_rarg2   - src_length
 7920    *  c_rarg3   - dest_start
 7921    *  c_rarg4   - dest_offset
 7922    *  c_rarg5   - isURL
 7923    *
 7924    */
 7925   address generate_base64_encodeBlock() {
 7926 
 7927     static const char toBase64[64] = {
 7928       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7929       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7930       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7931       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7932       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7933     };
 7934 
 7935     static const char toBase64URL[64] = {
 7936       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7937       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7938       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7939       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7940       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7941     };
 7942 
 7943     __ align(CodeEntryAlignment);
 7944     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7945     StubCodeMark mark(this, stub_id);
 7946     address start = __ pc();
 7947 
 7948     Register src   = c_rarg0;  // source array
 7949     Register soff  = c_rarg1;  // source start offset
 7950     Register send  = c_rarg2;  // source end offset
 7951     Register dst   = c_rarg3;  // dest array
 7952     Register doff  = c_rarg4;  // position for writing to dest array
 7953     Register isURL = c_rarg5;  // Base64 or URL character set
 7954 
 7955     // c_rarg6 and c_rarg7 are free to use as temps
 7956     Register codec  = c_rarg6;
 7957     Register length = c_rarg7;
 7958 
 7959     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7960 
 7961     __ add(src, src, soff);
 7962     __ add(dst, dst, doff);
 7963     __ sub(length, send, soff);
 7964 
 7965     // load the codec base address
 7966     __ lea(codec, ExternalAddress((address) toBase64));
 7967     __ cbz(isURL, ProcessData);
 7968     __ lea(codec, ExternalAddress((address) toBase64URL));
 7969 
 7970     __ BIND(ProcessData);
 7971 
 7972     // too short to form a SIMD loop; fall back to the 3-byte scalar loop
 7973     __ cmp(length, (u1)24);
 7974     __ br(Assembler::LT, Process3B);
 7975 
 7976     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7977 
 7978     __ BIND(Process48B);
 7979     __ cmp(length, (u1)48);
 7980     __ br(Assembler::LT, Process24B);
 7981     generate_base64_encode_simdround(src, dst, v0, 16);
 7982     __ sub(length, length, 48);
 7983     __ b(Process48B);
 7984 
 7985     __ BIND(Process24B);
 7986     __ cmp(length, (u1)24);
 7987     __ br(Assembler::LT, SIMDExit);
 7988     generate_base64_encode_simdround(src, dst, v0, 8);
 7989     __ sub(length, length, 24);
 7990 
 7991     __ BIND(SIMDExit);
 7992     __ cbz(length, Exit);
 7993 
 7994     __ BIND(Process3B);
 7995     //  3 src bytes, 24 bits
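          // Illustrative C sketch of this scalar round (b0..b2 are hypothetical
          // names for the three bytes loaded into r10..r12 below):
          //   uint32_t bits = (b0 << 16) | (b1 << 8) | b2;
          //   dst[0] = codec[(bits >> 18) & 0x3f];
          //   dst[1] = codec[(bits >> 12) & 0x3f];
          //   dst[2] = codec[(bits >>  6) & 0x3f];
          //   dst[3] = codec[ bits        & 0x3f];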
 7996     __ ldrb(r10, __ post(src, 1));
 7997     __ ldrb(r11, __ post(src, 1));
 7998     __ ldrb(r12, __ post(src, 1));
 7999     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 8000     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 8001     // codec index
 8002     __ ubfmw(r15, r12, 18, 23);
 8003     __ ubfmw(r14, r12, 12, 17);
 8004     __ ubfmw(r13, r12, 6,  11);
 8005     __ andw(r12,  r12, 63);
 8006     // get the code based on the codec
 8007     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 8008     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 8009     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 8010     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 8011     __ strb(r15, __ post(dst, 1));
 8012     __ strb(r14, __ post(dst, 1));
 8013     __ strb(r13, __ post(dst, 1));
 8014     __ strb(r12, __ post(dst, 1));
 8015     __ sub(length, length, 3);
 8016     __ cbnz(length, Process3B);
 8017 
 8018     __ BIND(Exit);
 8019     __ ret(lr);
 8020 
 8021     return start;
 8022   }
 8023 
 8024   void generate_base64_decode_simdround(Register src, Register dst,
 8025         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8026 
 8027     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8028     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8029 
 8030     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8031     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8032 
 8033     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8034 
 8035     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8036 
 8037     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8038 
 8039     // we need an unsigned saturating subtract to make sure that all input values
 8040     // in the range [0, 63] map to index 0 in the higher-half lookup
 8041     __ uqsubv(decH0, __ T16B, in0, v27);
 8042     __ uqsubv(decH1, __ T16B, in1, v27);
 8043     __ uqsubv(decH2, __ T16B, in2, v27);
 8044     __ uqsubv(decH3, __ T16B, in3, v27);
 8045 
 8046     // lower half lookup
 8047     __ tbl(decL0, arrangement, codecL, 4, in0);
 8048     __ tbl(decL1, arrangement, codecL, 4, in1);
 8049     __ tbl(decL2, arrangement, codecL, 4, in2);
 8050     __ tbl(decL3, arrangement, codecL, 4, in3);
 8051 
 8052     // higher half lookup
 8053     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8054     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8055     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8056     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8057 
 8058     // combine lower and higher
 8059     __ orr(decL0, arrangement, decL0, decH0);
 8060     __ orr(decL1, arrangement, decL1, decH1);
 8061     __ orr(decL2, arrangement, decL2, decH2);
 8062     __ orr(decL3, arrangement, decL3, decH3);
 8063 
 8064     // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
 8065     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8066     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8067     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8068     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8069     __ orr(in0, arrangement, decH0, decH1);
 8070     __ orr(in1, arrangement, decH2, decH3);
 8071     __ orr(in2, arrangement, in0,   in1);
 8072     __ umaxv(in3, arrangement, in2);
 8073     __ umov(rscratch2, in3, __ B, 0);
 8074 
 8075     // get the data to output
 8076     __ shl(out0,  arrangement, decL0, 2);
 8077     __ ushr(out1, arrangement, decL1, 4);
 8078     __ orr(out0,  arrangement, out0,  out1);
 8079     __ shl(out1,  arrangement, decL1, 4);
 8080     __ ushr(out2, arrangement, decL2, 2);
 8081     __ orr(out1,  arrangement, out1,  out2);
 8082     __ shl(out2,  arrangement, decL2, 6);
 8083     __ orr(out2,  arrangement, out2,  decL3);
 8084 
 8085     __ cbz(rscratch2, NoIllegalData);
 8086 
 8087     // handle illegal input
 8088     __ umov(r10, in2, __ D, 0);
 8089     if (size == 16) {
 8090       __ cbnz(r10, ErrorInLowerHalf);
 8091 
 8092       // illegal input is in higher half, store the lower half now.
 8093       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8094 
 8095       __ umov(r10, in2,  __ D, 1);
 8096       __ umov(r11, out0, __ D, 1);
 8097       __ umov(r12, out1, __ D, 1);
 8098       __ umov(r13, out2, __ D, 1);
 8099       __ b(StoreLegalData);
 8100 
 8101       __ BIND(ErrorInLowerHalf);
 8102     }
 8103     __ umov(r11, out0, __ D, 0);
 8104     __ umov(r12, out1, __ D, 0);
 8105     __ umov(r13, out2, __ D, 0);
 8106 
 8107     __ BIND(StoreLegalData);
 8108     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8109     __ strb(r11, __ post(dst, 1));
 8110     __ strb(r12, __ post(dst, 1));
 8111     __ strb(r13, __ post(dst, 1));
 8112     __ lsr(r10, r10, 8);
 8113     __ lsr(r11, r11, 8);
 8114     __ lsr(r12, r12, 8);
 8115     __ lsr(r13, r13, 8);
 8116     __ b(StoreLegalData);
 8117 
 8118     __ BIND(NoIllegalData);
 8119     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8120   }
 8121 
 8122 
 8123   /**
 8124    *  Arguments:
 8125    *
 8126    *  Input:
 8127    *  c_rarg0   - src_start
 8128    *  c_rarg1   - src_offset
 8129    *  c_rarg2   - src_length
 8130    *  c_rarg3   - dest_start
 8131    *  c_rarg4   - dest_offset
 8132    *  c_rarg5   - isURL
 8133    *  c_rarg6   - isMIME
 8134    *
 8135    */
 8136   address generate_base64_decodeBlock() {
 8137 
 8138     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 8139     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
 8140     // titled "Base64 decoding".
 8141 
 8142     // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
 8143     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
 8144     // That is, java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
 8145     static const uint8_t fromBase64ForNoSIMD[256] = {
 8146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8148       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8149        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8150       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8151        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8152       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8153        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8156       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8157       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8158       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8159       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8160       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8162     };
 8163 
 8164     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8167       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8168        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8169       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8170        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8171       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8172        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8176       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8177       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8178       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8179       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8180       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8181     };
 8182 
 8183     // A legal base64 code value is in the range [0, 127]. We need two lookups
 8184     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
 8185     // lookup uses tbl; out-of-range indices are set to 0 in the destination. The 2nd
 8186     // table vector lookup uses tbx; out-of-range indices are unchanged in the
 8187     // destination. Input [64..126] is mapped to index [65, 127] in the second lookup.
 8188     // The value at index 64 is set to 0, so that we know we have already gotten the
 8189     // decoded data with the 1st lookup.
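          // An illustrative scalar sketch of the two-lookup combine (tabL/tabH
          // are hypothetical names for the low/high table halves; idx is the
          // uqsub result computed in generate_base64_decode_simdround):
          //   lo  = (in  < 64) ? tabL[in]  : 0;    // tbl: out-of-range lanes -> 0
          //   hi  = (idx < 64) ? tabH[idx] : idx;  // tbx: out-of-range lanes kept
          //   dec = lo | hi;                       // 255 marks illegal input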
 8190     static const uint8_t fromBase64ForSIMD[128] = {
 8191       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8192       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8193       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8194        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8195         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8196        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8197       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8198        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8199     };
 8200 
 8201     static const uint8_t fromBase64URLForSIMD[128] = {
 8202       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8203       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8204       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8205        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8206         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8207        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8208        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8209        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8210     };
 8211 
 8212     __ align(CodeEntryAlignment);
 8213     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8214     StubCodeMark mark(this, stub_id);
 8215     address start = __ pc();
 8216 
 8217     Register src    = c_rarg0;  // source array
 8218     Register soff   = c_rarg1;  // source start offset
 8219     Register send   = c_rarg2;  // source end offset
 8220     Register dst    = c_rarg3;  // dest array
 8221     Register doff   = c_rarg4;  // position for writing to dest array
 8222     Register isURL  = c_rarg5;  // Base64 or URL character set
 8223     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8224 
 8225     Register length = send;    // reuse send as length of source data to process
 8226 
 8227     Register simd_codec   = c_rarg6;
 8228     Register nosimd_codec = c_rarg7;
 8229 
 8230     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8231 
 8232     __ enter();
 8233 
 8234     __ add(src, src, soff);
 8235     __ add(dst, dst, doff);
 8236 
 8237     __ mov(doff, dst);
 8238 
 8239     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);  // clear the low two bits: round length down to a multiple of 4
 8241 
 8242     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8243     __ cbz(isURL, ProcessData);
 8244     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8245 
 8246     __ BIND(ProcessData);
 8247     __ mov(rscratch1, length);
 8248     __ cmp(length, (u1)144); // 144 = 80 + 64
 8249     __ br(Assembler::LT, Process4B);
 8250 
 8251     // In the MIME case, the line length cannot be more than 76
 8252     // bytes (see RFC 2045). This is too short a block for SIMD
 8253     // to be worthwhile, so we use non-SIMD here.
 8254     __ movw(rscratch1, 79);
 8255 
 8256     __ BIND(Process4B);
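    // Each iteration consumes 4 code bytes and emits 3 data bytes.
    // In C, approximately (d0..d3 name the looked-up 6-bit values):
    //   d0..d3 = codec[src[0..3]];     // 255 flags an illegal input byte
    //   dst[0] = d0 << 2 | d1 >> 4;
    //   dst[1] = d1 << 4 | d2 >> 2;
    //   dst[2] = d2 << 6 | d3;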
 8257     __ ldrw(r14, __ post(src, 4));
 8258     __ ubfxw(r10, r14, 0,  8);
 8259     __ ubfxw(r11, r14, 8,  8);
 8260     __ ubfxw(r12, r14, 16, 8);
 8261     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded 6-bit values
 8263     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8264     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8265     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8266     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8267     // error detection, 255u indicates an illegal input
 8268     __ orrw(r14, r10, r11);
 8269     __ orrw(r15, r12, r13);
 8270     __ orrw(r14, r14, r15);
 8271     __ tbnz(r14, 7, Exit);
 8272     // recover the data
 8273     __ lslw(r14, r10, 10);
 8274     __ bfiw(r14, r11, 4, 6);
 8275     __ bfmw(r14, r12, 2, 5);
 8276     __ rev16w(r14, r14);
 8277     __ bfiw(r13, r12, 6, 2);
 8278     __ strh(r14, __ post(dst, 2));
 8279     __ strb(r13, __ post(dst, 1));
 8280     // non-simd loop
 8281     __ subsw(rscratch1, rscratch1, 4);
 8282     __ br(Assembler::GT, Process4B);
 8283 
    // if exiting from the 80-byte pre-processing above (rscratch1 was
    // initialized to 79), rscratch1 == -1; otherwise (rscratch1 started
    // as a multiple of 4), rscratch1 == 0.
 8286     __ cbzw(rscratch1, Exit);
 8287     __ sub(length, length, 80);
 8288 
 8289     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8290     __ cbz(isURL, SIMDEnter);
 8291     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8292 
 8293     __ BIND(SIMDEnter);
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));  // 1st half of the 128-byte table
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));      // 2nd half
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);  // broadcast 63 (0x3f) into every byte of v27
 8298 
 8299     __ BIND(Process64B);
 8300     __ cmp(length, (u1)64);
 8301     __ br(Assembler::LT, Process32B);
 8302     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8303     __ sub(length, length, 64);
 8304     __ b(Process64B);
 8305 
 8306     __ BIND(Process32B);
 8307     __ cmp(length, (u1)32);
 8308     __ br(Assembler::LT, SIMDExit);
 8309     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8310     __ sub(length, length, 32);
 8311     __ b(Process32B);
 8312 
 8313     __ BIND(SIMDExit);
 8314     __ cbz(length, Exit);
 8315     __ movw(rscratch1, length);
 8316     __ b(Process4B);
 8317 
 8318     __ BIND(Exit);
 8319     __ sub(c_rarg0, dst, doff);
 8320 
 8321     __ leave();
 8322     __ ret(lr);
 8323 
 8324     return start;
 8325   }
 8326 
 8327   // Support for spin waits.
 8328   address generate_spin_wait() {
 8329     __ align(CodeEntryAlignment);
 8330     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8331     StubCodeMark mark(this, stub_id);
 8332     address start = __ pc();
 8333 
 8334     __ spin_wait();
 8335     __ ret(lr);
 8336 
 8337     return start;
 8338   }
 8339 
 8340   void generate_lookup_secondary_supers_table_stub() {
 8341     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8342     StubCodeMark mark(this, stub_id);
 8343 
 8344     const Register
 8345       r_super_klass  = r0,
 8346       r_array_base   = r1,
 8347       r_array_length = r2,
 8348       r_array_index  = r3,
 8349       r_sub_klass    = r4,
 8350       r_bitmap       = rscratch2,
 8351       result         = r5;
 8352     const FloatRegister
 8353       vtemp          = v0;
 8354 
 8355     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8356       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8357       Label L_success;
 8358       __ enter();
 8359       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8360                                              r_array_base, r_array_length, r_array_index,
 8361                                              vtemp, result, slot,
 8362                                              /*stub_is_near*/true);
 8363       __ leave();
 8364       __ ret(lr);
 8365     }
 8366   }
 8367 
 8368   // Slow path implementation for UseSecondarySupersTable.
 8369   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8370     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8371     StubCodeMark mark(this, stub_id);
 8372 
 8373     address start = __ pc();
 8374     const Register
 8375       r_super_klass  = r0,        // argument
 8376       r_array_base   = r1,        // argument
 8377       temp1          = r2,        // temp
 8378       r_array_index  = r3,        // argument
 8379       r_bitmap       = rscratch2, // argument
 8380       result         = r5;        // argument
 8381 
 8382     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8383     __ ret(lr);
 8384 
 8385     return start;
 8386   }
 8387 
 8388 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8389 
 8390   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8391   //
 8392   // If LSE is in use, generate LSE versions of all the stubs. The
 8393   // non-LSE versions are in atomic_aarch64.S.
 8394 
 8395   // class AtomicStubMark records the entry point of a stub and the
 8396   // stub pointer which will point to it. The stub pointer is set to
 8397   // the entry point when ~AtomicStubMark() is called, which must be
 8398   // after ICache::invalidate_range. This ensures safe publication of
 8399   // the generated code.
 8400   class AtomicStubMark {
 8401     address _entry_point;
 8402     aarch64_atomic_stub_t *_stub;
 8403     MacroAssembler *_masm;
 8404   public:
 8405     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8406       _masm = masm;
 8407       __ align(32);
 8408       _entry_point = __ pc();
 8409       _stub = stub;
 8410     }
 8411     ~AtomicStubMark() {
 8412       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8413     }
 8414   };
 8415 
 8416   // NB: For memory_order_conservative we need a trailing membar after
 8417   // LSE atomic operations but not a leading membar.
 8418   //
 8419   // We don't need a leading membar because a clause in the Arm ARM
 8420   // says:
 8421   //
 8422   //   Barrier-ordered-before
 8423   //
 8424   //   Barrier instructions order prior Memory effects before subsequent
 8425   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 8429   //   instruction with both Acquire and Release semantics.
 8430   //
 8431   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8432   // and Release semantics, therefore we don't need a leading
 8433   // barrier. However, there is no corresponding Barrier-ordered-after
 8434   // relationship, therefore we need a trailing membar to prevent a
 8435   // later store or load from being reordered with the store in an
 8436   // atomic instruction.
 8437   //
 8438   // This was checked by using the herd7 consistency model simulator
 8439   // (http://diy.inria.fr/) with this test case:
 8440   //
 8441   // AArch64 LseCas
 8442   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8443   // P0 | P1;
 8444   // LDR W4, [X2] | MOV W3, #0;
 8445   // DMB LD       | MOV W4, #1;
 8446   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8447   //              | DMB ISH;
 8448   //              | STR W4, [X2];
 8449   // exists
 8450   // (0:X3=0 /\ 0:X4=1)
 8451   //
 8452   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8453   // with the store to x in P1. Without the DMB in P1 this may happen.
 8454   //
 8455   // At the time of writing we don't know of any AArch64 hardware that
 8456   // reorders stores in this way, but the Reference Manual permits it.
 8457 
 8458   void gen_cas_entry(Assembler::operand_size size,
 8459                      atomic_memory_order order) {
 8460     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8461       exchange_val = c_rarg2;
 8462     bool acquire, release;
 8463     switch (order) {
 8464       case memory_order_relaxed:
 8465         acquire = false;
 8466         release = false;
 8467         break;
 8468       case memory_order_release:
 8469         acquire = false;
 8470         release = true;
 8471         break;
 8472       default:
 8473         acquire = true;
 8474         release = true;
 8475         break;
 8476     }
 8477     __ mov(prev, compare_val);
 8478     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8479     if (order == memory_order_conservative) {
 8480       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8481     }
 8482     if (size == Assembler::xword) {
 8483       __ mov(r0, prev);
 8484     } else {
 8485       __ movw(r0, prev);
 8486     }
 8487     __ ret(lr);
 8488   }
 8489 
 8490   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8491     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8492     // If not relaxed, then default to conservative.  Relaxed is the only
 8493     // case we use enough to be worth specializing.
 8494     if (order == memory_order_relaxed) {
 8495       __ ldadd(size, incr, prev, addr);
 8496     } else {
 8497       __ ldaddal(size, incr, prev, addr);
 8498       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8499     }
 8500     if (size == Assembler::xword) {
 8501       __ mov(r0, prev);
 8502     } else {
 8503       __ movw(r0, prev);
 8504     }
 8505     __ ret(lr);
 8506   }
 8507 
 8508   void gen_swpal_entry(Assembler::operand_size size) {
 8509     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8510     __ swpal(size, incr, prev, addr);
 8511     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8512     if (size == Assembler::xword) {
 8513       __ mov(r0, prev);
 8514     } else {
 8515       __ movw(r0, prev);
 8516     }
 8517     __ ret(lr);
 8518   }
 8519 
 8520   void generate_atomic_entry_points() {
    if (!UseLSE) {
 8522       return;
 8523     }
 8524     __ align(CodeEntryAlignment);
 8525     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8526     StubCodeMark mark(this, stub_id);
 8527     address first_entry = __ pc();
 8528 
 8529     // ADD, memory_order_conservative
 8530     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8531     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8532     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8533     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8534 
 8535     // ADD, memory_order_relaxed
 8536     AtomicStubMark mark_fetch_add_4_relaxed
 8537       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8538     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8539     AtomicStubMark mark_fetch_add_8_relaxed
 8540       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8541     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8542 
 8543     // XCHG, memory_order_conservative
 8544     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8545     gen_swpal_entry(Assembler::word);
 8546     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8547     gen_swpal_entry(Assembler::xword);
 8548 
 8549     // CAS, memory_order_conservative
 8550     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8551     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8552     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8553     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8554     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8555     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8556 
 8557     // CAS, memory_order_relaxed
 8558     AtomicStubMark mark_cmpxchg_1_relaxed
 8559       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8560     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8561     AtomicStubMark mark_cmpxchg_4_relaxed
 8562       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8563     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8564     AtomicStubMark mark_cmpxchg_8_relaxed
 8565       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8566     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8567 
 8568     AtomicStubMark mark_cmpxchg_4_release
 8569       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8570     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8571     AtomicStubMark mark_cmpxchg_8_release
 8572       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8573     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8574 
 8575     AtomicStubMark mark_cmpxchg_4_seq_cst
 8576       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8577     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8578     AtomicStubMark mark_cmpxchg_8_seq_cst
 8579       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8580     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8581 
 8582     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8583   }
 8584 #endif // LINUX
 8585 
 8586   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8587     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8588     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8589 
 8590     address start = __ pc();
 8591 
 8592     if (return_barrier) {
 8593       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8594       __ mov(sp, rscratch1);
 8595     }
 8596     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8597 
 8598     if (return_barrier) {
 8599       // preserve possible return value from a method returning to the return barrier
 8600       __ fmovd(rscratch1, v0);
 8601       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8602     }
 8603 
 8604     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8605     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8606     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8607 
 8608     if (return_barrier) {
 8609       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8610       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8611       __ fmovd(v0, rscratch1);
 8612     }
 8613     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8614 
 8615 
 8616     Label thaw_success;
 8617     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8618     __ cbnz(rscratch2, thaw_success);
 8619     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8620     __ br(rscratch1);
 8621     __ bind(thaw_success);
 8622 
 8623     // make room for the thawed frames
 8624     __ sub(rscratch1, sp, rscratch2);
 8625     __ andr(rscratch1, rscratch1, -16); // align
 8626     __ mov(sp, rscratch1);
 8627 
 8628     if (return_barrier) {
 8629       // save original return value -- again
 8630       __ fmovd(rscratch1, v0);
 8631       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8632     }
 8633 
 8634     // If we want, we can templatize thaw by kind, and have three different entries
 8635     __ movw(c_rarg1, (uint32_t)kind);
 8636 
 8637     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8638     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8639 
 8640     if (return_barrier) {
 8641       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8642       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8643       __ fmovd(v0, rscratch1);
 8644     } else {
 8645       __ mov(r0, zr); // return 0 (success) from doYield
 8646     }
 8647 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
 8649     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8650     __ mov(rfp, sp);
 8651 
 8652     if (return_barrier_exception) {
 8653       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8654       __ authenticate_return_address(c_rarg1);
 8655       __ verify_oop(r0);
 8656       // save return value containing the exception oop in callee-saved R19
 8657       __ mov(r19, r0);
 8658 
 8659       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8660 
 8661       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8662       // __ reinitialize_ptrue();
 8663 
 8664       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8665 
 8666       __ mov(r1, r0); // the exception handler
 8667       __ mov(r0, r19); // restore return value containing the exception oop
 8668       __ verify_oop(r0);
 8669 
 8670       __ leave();
 8671       __ mov(r3, lr);
 8672       __ br(r1); // the exception handler
 8673     } else {
 8674       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8675       __ leave();
 8676       __ ret(lr);
 8677     }
 8678 
 8679     return start;
 8680   }
 8681 
 8682   address generate_cont_thaw() {
 8683     if (!Continuations::enabled()) return nullptr;
 8684 
 8685     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8686     StubCodeMark mark(this, stub_id);
 8687     address start = __ pc();
 8688     generate_cont_thaw(Continuation::thaw_top);
 8689     return start;
 8690   }
 8691 
 8692   address generate_cont_returnBarrier() {
 8693     if (!Continuations::enabled()) return nullptr;
 8694 
 8695     // TODO: will probably need multiple return barriers depending on return type
 8696     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8697     StubCodeMark mark(this, stub_id);
 8698     address start = __ pc();
 8699 
 8700     generate_cont_thaw(Continuation::thaw_return_barrier);
 8701 
 8702     return start;
 8703   }
 8704 
 8705   address generate_cont_returnBarrier_exception() {
 8706     if (!Continuations::enabled()) return nullptr;
 8707 
 8708     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8709     StubCodeMark mark(this, stub_id);
 8710     address start = __ pc();
 8711 
 8712     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8713 
 8714     return start;
 8715   }
 8716 
 8717   address generate_cont_preempt_stub() {
 8718     if (!Continuations::enabled()) return nullptr;
 8719     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8720     StubCodeMark mark(this, stub_id);
 8721     address start = __ pc();
 8722 
 8723     __ reset_last_Java_frame(true);
 8724 
 8725     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8726     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8727     __ mov(sp, rscratch2);
 8728 
 8729     Label preemption_cancelled;
 8730     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8731     __ cbnz(rscratch1, preemption_cancelled);
 8732 
 8733     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8734     SharedRuntime::continuation_enter_cleanup(_masm);
 8735     __ leave();
 8736     __ ret(lr);
 8737 
 8738     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8739     __ bind(preemption_cancelled);
 8740     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8741     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8742     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8743     __ ldr(rscratch1, Address(rscratch1));
 8744     __ br(rscratch1);
 8745 
 8746     return start;
 8747   }
 8748 
 8749   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8750   // are represented as long[5], with BITS_PER_LIMB = 26.
 8751   // Pack five 26-bit limbs into three 64-bit registers.
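  // In C, approximately:
  //   dest0 = src[0] | src[1] << 26 | src[2] << 52;    // low 64 bits
  //   dest1 = src[2] >> 12 | src[3] << 14 | src[4] << 40;
  //   dest2 = src[4] >> 24;                            // top 2 bits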
 8752   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8753     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8754     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8755     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8756     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8757 
 8758     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8759     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8760     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8761     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8762 
 8763     if (dest2->is_valid()) {
 8764       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8765     } else {
 8766 #ifdef ASSERT
 8767       Label OK;
 8768       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8769       __ br(__ EQ, OK);
 8770       __ stop("high bits of Poly1305 integer should be zero");
 8771       __ should_not_reach_here();
 8772       __ bind(OK);
 8773 #endif
 8774     }
 8775   }
 8776 
 8777   // As above, but return only a 128-bit integer, packed into two
 8778   // 64-bit registers.
 8779   void pack_26(Register dest0, Register dest1, Register src) {
 8780     pack_26(dest0, dest1, noreg, src);
 8781   }
 8782 
 8783   // Multiply and multiply-accumulate unsigned 64-bit registers.
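  // wide_mul:  (prod_hi:prod_lo)  = n * m   -- a full 128-bit product,
  //            with umulh supplying the high 64 bits.
  // wide_madd: (sum_hi:sum_lo)   += n * m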
 8784   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8785     __ mul(prod_lo, n, m);
 8786     __ umulh(prod_hi, n, m);
 8787   }
 8788   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8789     wide_mul(rscratch1, rscratch2, n, m);
 8790     __ adds(sum_lo, sum_lo, rscratch1);
 8791     __ adc(sum_hi, sum_hi, rscratch2);
 8792   }
 8793 
 8794   // Poly1305, RFC 7539
 8795 
 8796   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8797   // description of the tricks used to simplify and accelerate this
 8798   // computation.
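  //
  // In C, approximately, each full 16-byte block updates the accumulator as
  //   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
  // where 2^128 is the trailing pad bit from RFC 7539. The reduction is
  // done lazily, using 2^130 == 5 (mod 2^130 - 5): bits of the accumulator
  // at 2^130 and above fold back into the low end multiplied by 5.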
 8799 
 8800   address generate_poly1305_processBlocks() {
 8801     __ align(CodeEntryAlignment);
 8802     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8803     StubCodeMark mark(this, stub_id);
 8804     address start = __ pc();
 8805     Label here;
 8806     __ enter();
 8807     RegSet callee_saved = RegSet::range(r19, r28);
 8808     __ push(callee_saved, sp);
 8809 
 8810     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8811 
 8812     // Arguments
 8813     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8814 
 8815     // R_n is the 128-bit randomly-generated key, packed into two
 8816     // registers.  The caller passes this key to us as long[5], with
 8817     // BITS_PER_LIMB = 26.
 8818     const Register R_0 = *++regs, R_1 = *++regs;
 8819     pack_26(R_0, R_1, r_start);
 8820 
 8821     // RR_n is (R_n >> 2) * 5
 8822     const Register RR_0 = *++regs, RR_1 = *++regs;
 8823     __ lsr(RR_0, R_0, 2);
 8824     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8825     __ lsr(RR_1, R_1, 2);
 8826     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8827 
 8828     // U_n is the current checksum
 8829     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8830     pack_26(U_0, U_1, U_2, acc_start);
 8831 
 8832     static constexpr int BLOCK_LENGTH = 16;
 8833     Label DONE, LOOP;
 8834 
 8835     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8836     __ br(Assembler::LT, DONE); {
 8837       __ bind(LOOP);
 8838 
 8839       // S_n is to be the sum of U_n and the next block of data
 8840       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8841       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8842       __ adds(S_0, U_0, S_0);
 8843       __ adcs(S_1, U_1, S_1);
 8844       __ adc(S_2, U_2, zr);
 8845       __ add(S_2, S_2, 1);
 8846 
 8847       const Register U_0HI = *++regs, U_1HI = *++regs;
 8848 
 8849       // NB: this logic depends on some of the special properties of
 8850       // Poly1305 keys. In particular, because we know that the top
 8851       // four bits of R_0 and R_1 are zero, we can add together
 8852       // partial products without any risk of needing to propagate a
 8853       // carry out.
 8854       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8855       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8856       __ andr(U_2, R_0, 3);
 8857       __ mul(U_2, S_2, U_2);
 8858 
 8859       // Recycle registers S_0, S_1, S_2
 8860       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8861 
 8862       // Partial reduction mod 2**130 - 5
 8863       __ adds(U_1, U_0HI, U_1);
 8864       __ adc(U_2, U_1HI, U_2);
 8865       // Sum now in U_2:U_1:U_0.
 8866       // Dead: U_0HI, U_1HI.
 8867       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8868 
 8869       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
 8870 
 8871       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8872       __ lsr(rscratch1, U_2, 2);
 8873       __ andr(U_2, U_2, (u8)3);
 8874       __ adds(U_0, U_0, rscratch1);
 8875       __ adcs(U_1, U_1, zr);
 8876       __ adc(U_2, U_2, zr);
 8877       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8878       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8879       __ adcs(U_1, U_1, zr);
 8880       __ adc(U_2, U_2, zr);
 8881 
 8882       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8883       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);  // i.e. GE: loop while length >= BLOCK_LENGTH
 8885     }
 8886 
 8887     // Further reduce modulo 2^130 - 5
 8888     __ lsr(rscratch1, U_2, 2);
 8889     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
 8890     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
 8891     __ adcs(U_1, U_1, zr);
 8892     __ andr(U_2, U_2, (u1)3);
 8893     __ adc(U_2, U_2, zr);
 8894 
 8895     // Unpack the sum into five 26-bit limbs and write to memory.
 8896     __ ubfiz(rscratch1, U_0, 0, 26);
 8897     __ ubfx(rscratch2, U_0, 26, 26);
 8898     __ stp(rscratch1, rscratch2, Address(acc_start));
 8899     __ ubfx(rscratch1, U_0, 52, 12);
 8900     __ bfi(rscratch1, U_1, 12, 14);
 8901     __ ubfx(rscratch2, U_1, 14, 26);
 8902     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8903     __ ubfx(rscratch1, U_1, 40, 24);
 8904     __ bfi(rscratch1, U_2, 24, 3);
 8905     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8906 
 8907     __ bind(DONE);
 8908     __ pop(callee_saved, sp);
 8909     __ leave();
 8910     __ ret(lr);
 8911 
 8912     return start;
 8913   }
 8914 
 8915   // exception handler for upcall stubs
 8916   address generate_upcall_stub_exception_handler() {
 8917     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8918     StubCodeMark mark(this, stub_id);
 8919     address start = __ pc();
 8920 
 8921     // Native caller has no idea how to handle exceptions,
 8922     // so we just crash here. Up to callee to catch exceptions.
 8923     __ verify_oop(r0);
 8924     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8925     __ blr(rscratch1);
 8926     __ should_not_reach_here();
 8927 
 8928     return start;
 8929   }
 8930 
 8931   // load Method* target of MethodHandle
 8932   // j_rarg0 = jobject receiver
 8933   // rmethod = result
 8934   address generate_upcall_stub_load_target() {
 8935     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8936     StubCodeMark mark(this, stub_id);
 8937     address start = __ pc();
 8938 
 8939     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
 8941     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8942     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8943     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8944     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8945                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8946                       noreg, noreg);
 8947     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8948 
 8949     __ ret(lr);
 8950 
 8951     return start;
 8952   }
 8953 
 8954 #undef __
 8955 #define __ masm->
 8956 
 8957   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8958 
 8959     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8960       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8961 
 8962     RegSet _toSave;
 8963     bool _squaring;
 8964 
 8965   public:
 8966     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8967       : MacroAssembler(as->code()), _squaring(squaring) {
 8968 
 8969       // Register allocation
 8970 
 8971       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8972       Pa_base = *regs;       // Argument registers
 8973       if (squaring)
 8974         Pb_base = Pa_base;
 8975       else
 8976         Pb_base = *++regs;
 8977       Pn_base = *++regs;
      Rlen    = *++regs;
 8979       inv = *++regs;
 8980       Pm_base = *++regs;
 8981 
 8982                           // Working registers:
 8983       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8984       Rb =  *++regs;
 8985       Rm =  *++regs;
 8986       Rn =  *++regs;
 8987 
 8988       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 8989       Pb =  *++regs;
 8990       Pm =  *++regs;
 8991       Pn =  *++regs;
 8992 
 8993       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
 8995       t2 =  *++regs;
 8996 
 8997       Ri =  *++regs;        // Inner and outer loop indexes.
 8998       Rj =  *++regs;
 8999 
 9000       Rhi_ab = *++regs;     // Product registers: low and high parts
 9001       Rlo_ab = *++regs;     // of a*b and m*n.
 9002       Rhi_mn = *++regs;
 9003       Rlo_mn = *++regs;
 9004 
 9005       // r19 and up are callee-saved.
 9006       _toSave = RegSet::range(r19, *regs) + Pm_base;
 9007     }
 9008 
 9009   private:
 9010     void save_regs() {
 9011       push(_toSave, sp);
 9012     }
 9013 
 9014     void restore_regs() {
 9015       pop(_toSave, sp);
 9016     }
 9017 
 9018     template <typename T>
 9019     void unroll_2(Register count, T block) {
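      // Calls the member function `block` exactly `count` times, two calls
      // per loop iteration; an odd count branches straight into the second
      // copy (label `odd`) so the countdown by 2 still comes out right.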
 9020       Label loop, end, odd;
 9021       tbnz(count, 0, odd);
 9022       cbz(count, end);
 9023       align(16);
 9024       bind(loop);
 9025       (this->*block)();
 9026       bind(odd);
 9027       (this->*block)();
 9028       subs(count, count, 2);
 9029       br(Assembler::GT, loop);
 9030       bind(end);
 9031     }
 9032 
 9033     template <typename T>
 9034     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9035       Label loop, end, odd;
 9036       tbnz(count, 0, odd);
 9037       cbz(count, end);
 9038       align(16);
 9039       bind(loop);
 9040       (this->*block)(d, s, tmp);
 9041       bind(odd);
 9042       (this->*block)(d, s, tmp);
 9043       subs(count, count, 2);
 9044       br(Assembler::GT, loop);
 9045       bind(end);
 9046     }
 9047 
 9048     void pre1(RegisterOrConstant i) {
 9049       block_comment("pre1");
 9050       // Pa = Pa_base;
 9051       // Pb = Pb_base + i;
 9052       // Pm = Pm_base;
 9053       // Pn = Pn_base + i;
 9054       // Ra = *Pa;
 9055       // Rb = *Pb;
 9056       // Rm = *Pm;
 9057       // Rn = *Pn;
 9058       ldr(Ra, Address(Pa_base));
 9059       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9060       ldr(Rm, Address(Pm_base));
 9061       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9062       lea(Pa, Address(Pa_base));
 9063       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9064       lea(Pm, Address(Pm_base));
 9065       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9066 
 9067       // Zero the m*n result.
 9068       mov(Rhi_mn, zr);
 9069       mov(Rlo_mn, zr);
 9070     }
 9071 
 9072     // The core multiply-accumulate step of a Montgomery
 9073     // multiplication.  The idea is to schedule operations as a
 9074     // pipeline so that instructions with long latencies (loads and
 9075     // multiplies) have time to complete before their results are
 9076     // used.  This most benefits in-order implementations of the
 9077     // architecture but out-of-order ones also benefit.
 9078     void step() {
 9079       block_comment("step");
 9080       // MACC(Ra, Rb, t0, t1, t2);
 9081       // Ra = *++Pa;
 9082       // Rb = *--Pb;
 9083       umulh(Rhi_ab, Ra, Rb);
 9084       mul(Rlo_ab, Ra, Rb);
 9085       ldr(Ra, pre(Pa, wordSize));
 9086       ldr(Rb, pre(Pb, -wordSize));
 9087       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9088                                        // previous iteration.
 9089       // MACC(Rm, Rn, t0, t1, t2);
 9090       // Rm = *++Pm;
 9091       // Rn = *--Pn;
 9092       umulh(Rhi_mn, Rm, Rn);
 9093       mul(Rlo_mn, Rm, Rn);
 9094       ldr(Rm, pre(Pm, wordSize));
 9095       ldr(Rn, pre(Pn, -wordSize));
 9096       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9097     }
 9098 
 9099     void post1() {
 9100       block_comment("post1");
 9101 
 9102       // MACC(Ra, Rb, t0, t1, t2);
 9103       // Ra = *++Pa;
 9104       // Rb = *--Pb;
 9105       umulh(Rhi_ab, Ra, Rb);
 9106       mul(Rlo_ab, Ra, Rb);
 9107       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9108       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9109 
 9110       // *Pm = Rm = t0 * inv;
 9111       mul(Rm, t0, inv);
 9112       str(Rm, Address(Pm));
 9113 
 9114       // MACC(Rm, Rn, t0, t1, t2);
 9115       // t0 = t1; t1 = t2; t2 = 0;
 9116       umulh(Rhi_mn, Rm, Rn);
 9117 
 9118 #ifndef PRODUCT
 9119       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9120       {
 9121         mul(Rlo_mn, Rm, Rn);
 9122         add(Rlo_mn, t0, Rlo_mn);
 9123         Label ok;
 9124         cbz(Rlo_mn, ok); {
 9125           stop("broken Montgomery multiply");
 9126         } bind(ok);
 9127       }
 9128 #endif
 9129       // We have very carefully set things up so that
 9130       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9131       // the lower half of Rm * Rn because we know the result already:
 9132       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9133       // t0 != 0.  So, rather than do a mul and an adds we just set
 9134       // the carry flag iff t0 is nonzero.
 9135       //
 9136       // mul(Rlo_mn, Rm, Rn);
 9137       // adds(zr, t0, Rlo_mn);
 9138       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9139       adcs(t0, t1, Rhi_mn);
 9140       adc(t1, t2, zr);
 9141       mov(t2, zr);
 9142     }
 9143 
 9144     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9145       block_comment("pre2");
 9146       // Pa = Pa_base + i-len;
 9147       // Pb = Pb_base + len;
 9148       // Pm = Pm_base + i-len;
 9149       // Pn = Pn_base + len;
 9150 
 9151       if (i.is_register()) {
 9152         sub(Rj, i.as_register(), len);
 9153       } else {
 9154         mov(Rj, i.as_constant());
 9155         sub(Rj, Rj, len);
 9156       }
 9157       // Rj == i-len
 9158 
 9159       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9160       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9161       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9162       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9163 
 9164       // Ra = *++Pa;
 9165       // Rb = *--Pb;
 9166       // Rm = *++Pm;
 9167       // Rn = *--Pn;
 9168       ldr(Ra, pre(Pa, wordSize));
 9169       ldr(Rb, pre(Pb, -wordSize));
 9170       ldr(Rm, pre(Pm, wordSize));
 9171       ldr(Rn, pre(Pn, -wordSize));
 9172 
 9173       mov(Rhi_mn, zr);
 9174       mov(Rlo_mn, zr);
 9175     }
 9176 
 9177     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9178       block_comment("post2");
 9179       if (i.is_constant()) {
 9180         mov(Rj, i.as_constant()-len.as_constant());
 9181       } else {
 9182         sub(Rj, i.as_register(), len);
 9183       }
 9184 
 9185       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9186 
 9187       // As soon as we know the least significant digit of our result,
 9188       // store it.
 9189       // Pm_base[i-len] = t0;
 9190       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9191 
 9192       // t0 = t1; t1 = t2; t2 = 0;
 9193       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9194       adc(t1, t2, zr);
 9195       mov(t2, zr);
 9196     }
 9197 
 9198     // A carry in t0 after Montgomery multiplication means that we
 9199     // should subtract multiples of n from our result in m.  We'll
 9200     // keep doing that until there is no carry.
 9201     void normalize(RegisterOrConstant len) {
 9202       block_comment("normalize");
 9203       // while (t0)
 9204       //   t0 = sub(Pm_base, Pn_base, t0, len);
 9205       Label loop, post, again;
 9206       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9207       cbz(t0, post); {
 9208         bind(again); {
 9209           mov(i, zr);
 9210           mov(cnt, len);
 9211           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9212           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9213           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9214           align(16);
 9215           bind(loop); {
 9216             sbcs(Rm, Rm, Rn);
 9217             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9218             add(i, i, 1);
 9219             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9220             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9221             sub(cnt, cnt, 1);
 9222           } cbnz(cnt, loop);
 9223           sbc(t0, t0, zr);
 9224         } cbnz(t0, again);
 9225       } bind(post);
 9226     }
 9227 
 9228     // Move memory at s to d, reversing words.
 9229     //    Increments d to end of copied memory
 9230     //    Destroys tmp1, tmp2
 9231     //    Preserves len
 9232     //    Leaves s pointing to the address which was in d at start
 9233     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9234       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9235       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9236 
 9237       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9238       mov(tmp1, len);
 9239       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9240       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9241     }
 9242     // where
 9243     void reverse1(Register d, Register s, Register tmp) {
 9244       ldr(tmp, pre(s, -wordSize));
 9245       ror(tmp, tmp, 32);
 9246       str(tmp, post(d, wordSize));
 9247     }
 9248 
 9249     void step_squaring() {
 9250       // An extra ACC
 9251       step();
 9252       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9253     }
 9254 
 9255     void last_squaring(RegisterOrConstant i) {
 9256       Label dont;
 9257       // if ((i & 1) == 0) {
 9258       tbnz(i.as_register(), 0, dont); {
 9259         // MACC(Ra, Rb, t0, t1, t2);
 9260         // Ra = *++Pa;
 9261         // Rb = *--Pb;
 9262         umulh(Rhi_ab, Ra, Rb);
 9263         mul(Rlo_ab, Ra, Rb);
 9264         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9265       } bind(dont);
 9266     }
 9267 
 9268     void extra_step_squaring() {
 9269       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9270 
 9271       // MACC(Rm, Rn, t0, t1, t2);
 9272       // Rm = *++Pm;
 9273       // Rn = *--Pn;
 9274       umulh(Rhi_mn, Rm, Rn);
 9275       mul(Rlo_mn, Rm, Rn);
 9276       ldr(Rm, pre(Pm, wordSize));
 9277       ldr(Rn, pre(Pn, -wordSize));
 9278     }
 9279 
 9280     void post1_squaring() {
 9281       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9282 
 9283       // *Pm = Rm = t0 * inv;
 9284       mul(Rm, t0, inv);
 9285       str(Rm, Address(Pm));
 9286 
 9287       // MACC(Rm, Rn, t0, t1, t2);
 9288       // t0 = t1; t1 = t2; t2 = 0;
 9289       umulh(Rhi_mn, Rm, Rn);
 9290 
 9291 #ifndef PRODUCT
 9292       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9293       {
 9294         mul(Rlo_mn, Rm, Rn);
 9295         add(Rlo_mn, t0, Rlo_mn);
 9296         Label ok;
 9297         cbz(Rlo_mn, ok); {
 9298           stop("broken Montgomery multiply");
 9299         } bind(ok);
 9300       }
 9301 #endif
 9302       // We have very carefully set things up so that
 9303       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9304       // the lower half of Rm * Rn because we know the result already:
 9305       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9306       // t0 != 0.  So, rather than do a mul and an adds we just set
 9307       // the carry flag iff t0 is nonzero.
 9308       //
 9309       // mul(Rlo_mn, Rm, Rn);
 9310       // adds(zr, t0, Rlo_mn);
 9311       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9312       adcs(t0, t1, Rhi_mn);
 9313       adc(t1, t2, zr);
 9314       mov(t2, zr);
 9315     }
 9316 
 9317     void acc(Register Rhi, Register Rlo,
 9318              Register t0, Register t1, Register t2) {
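      // Triple-precision accumulate: (t2:t1:t0) += (Rhi:Rlo).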
 9319       adds(t0, t0, Rlo);
 9320       adcs(t1, t1, Rhi);
 9321       adc(t2, t2, zr);
 9322     }
 9323 
 9324   public:
 9325     /**
 9326      * Fast Montgomery multiplication.  The derivation of the
 9327      * algorithm is in A Cryptographic Library for the Motorola
 9328      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9329      *
 9330      * Arguments:
 9331      *
 9332      * Inputs for multiplication:
 9333      *   c_rarg0   - int array elements a
 9334      *   c_rarg1   - int array elements b
 9335      *   c_rarg2   - int array elements n (the modulus)
 9336      *   c_rarg3   - int length
 9337      *   c_rarg4   - int inv
 9338      *   c_rarg5   - int array elements m (the result)
 9339      *
 9340      * Inputs for squaring:
 9341      *   c_rarg0   - int array elements a
 9342      *   c_rarg1   - int array elements n (the modulus)
 9343      *   c_rarg2   - int length
 9344      *   c_rarg3   - int inv
 9345      *   c_rarg4   - int array elements m (the result)
 9346      *
 9347      */
 9348     address generate_multiply() {
 9349       Label argh, nothing;
 9350       bind(argh);
 9351       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9352 
 9353       align(CodeEntryAlignment);
 9354       address entry = pc();
 9355 
 9356       cbzw(Rlen, nothing);
 9357 
 9358       enter();
 9359 
      // Make room: reserve 4 * Rlen * sizeof(jint) bytes of scratch space on
      // the stack (Rlen <= 512 keeps the total allocation within 8192 bytes)
      // and re-align sp to 16 bytes.
 9361       cmpw(Rlen, 512);
 9362       br(Assembler::HI, argh);
 9363       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9364       andr(sp, Ra, -2 * wordSize);
 9365 
 9366       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9367 
 9368       {
 9369         // Copy input args, reversing as we go.  We use Ra as a
 9370         // temporary variable.
 9371         reverse(Ra, Pa_base, Rlen, t0, t1);
 9372         if (!_squaring)
 9373           reverse(Ra, Pb_base, Rlen, t0, t1);
 9374         reverse(Ra, Pn_base, Rlen, t0, t1);
 9375       }
 9376 
 9377       // Push all call-saved registers and also Pm_base which we'll need
 9378       // at the end.
 9379       save_regs();
 9380 
 9381 #ifndef PRODUCT
 9382       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9383       {
 9384         ldr(Rn, Address(Pn_base, 0));
 9385         mul(Rlo_mn, Rn, inv);
 9386         subs(zr, Rlo_mn, -1);
 9387         Label ok;
 9388         br(EQ, ok); {
 9389           stop("broken inverse in Montgomery multiply");
 9390         } bind(ok);
 9391       }
 9392 #endif
 9393 
 9394       mov(Pm_base, Ra);
 9395 
 9396       mov(t0, zr);
 9397       mov(t1, zr);
 9398       mov(t2, zr);
 9399 
 9400       block_comment("for (int i = 0; i < len; i++) {");
 9401       mov(Ri, zr); {
 9402         Label loop, end;
 9403         cmpw(Ri, Rlen);
 9404         br(Assembler::GE, end);
 9405 
 9406         bind(loop);
 9407         pre1(Ri);
 9408 
 9409         block_comment("  for (j = i; j; j--) {"); {
 9410           movw(Rj, Ri);
 9411           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9412         } block_comment("  } // j");
 9413 
 9414         post1();
 9415         addw(Ri, Ri, 1);
 9416         cmpw(Ri, Rlen);
 9417         br(Assembler::LT, loop);
 9418         bind(end);
 9419         block_comment("} // i");
 9420       }
 9421 
 9422       block_comment("for (int i = len; i < 2*len; i++) {");
 9423       mov(Ri, Rlen); {
 9424         Label loop, end;
 9425         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9426         br(Assembler::GE, end);
 9427 
 9428         bind(loop);
 9429         pre2(Ri, Rlen);
 9430 
 9431         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9432           lslw(Rj, Rlen, 1);
 9433           subw(Rj, Rj, Ri);
 9434           subw(Rj, Rj, 1);
 9435           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9436         } block_comment("  } // j");
 9437 
 9438         post2(Ri, Rlen);
 9439         addw(Ri, Ri, 1);
 9440         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9441         br(Assembler::LT, loop);
 9442         bind(end);
 9443       }
 9444       block_comment("} // i");
 9445 
 9446       normalize(Rlen);
 9447 
 9448       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9449       restore_regs();  // Restore caller's Pm_base
 9450 
 9451       // Copy our result into caller's Pm_base
 9452       reverse(Pm_base, Ra, Rlen, t0, t1);
 9453 
 9454       leave();
 9455       bind(nothing);
 9456       ret(lr);
 9457 
 9458       return entry;
 9459     }
 9460     // In C, approximately:
 9461 
 9462     // void
 9463     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9464     //                     julong Pn_base[], julong Pm_base[],
 9465     //                     julong inv, int len) {
 9466     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9467     //   julong *Pa, *Pb, *Pn, *Pm;
 9468     //   julong Ra, Rb, Rn, Rm;
 9469 
 9470     //   int i;
 9471 
 9472     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9473 
 9474     //   for (i = 0; i < len; i++) {
 9475     //     int j;
 9476 
 9477     //     Pa = Pa_base;
 9478     //     Pb = Pb_base + i;
 9479     //     Pm = Pm_base;
 9480     //     Pn = Pn_base + i;
 9481 
 9482     //     Ra = *Pa;
 9483     //     Rb = *Pb;
 9484     //     Rm = *Pm;
 9485     //     Rn = *Pn;
 9486 
 9487     //     int iters = i;
 9488     //     for (j = 0; iters--; j++) {
 9489     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9490     //       MACC(Ra, Rb, t0, t1, t2);
 9491     //       Ra = *++Pa;
 9492     //       Rb = *--Pb;
 9493     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9494     //       MACC(Rm, Rn, t0, t1, t2);
 9495     //       Rm = *++Pm;
 9496     //       Rn = *--Pn;
 9497     //     }
 9498 
 9499     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9500     //     MACC(Ra, Rb, t0, t1, t2);
 9501     //     *Pm = Rm = t0 * inv;
 9502     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9503     //     MACC(Rm, Rn, t0, t1, t2);
 9504 
 9505     //     assert(t0 == 0, "broken Montgomery multiply");
 9506 
 9507     //     t0 = t1; t1 = t2; t2 = 0;
 9508     //   }
 9509 
 9510     //   for (i = len; i < 2*len; i++) {
 9511     //     int j;
 9512 
 9513     //     Pa = Pa_base + i-len;
 9514     //     Pb = Pb_base + len;
 9515     //     Pm = Pm_base + i-len;
 9516     //     Pn = Pn_base + len;
 9517 
 9518     //     Ra = *++Pa;
 9519     //     Rb = *--Pb;
 9520     //     Rm = *++Pm;
 9521     //     Rn = *--Pn;
 9522 
 9523     //     int iters = len*2-i-1;
 9524     //     for (j = i-len+1; iters--; j++) {
 9525     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9526     //       MACC(Ra, Rb, t0, t1, t2);
 9527     //       Ra = *++Pa;
 9528     //       Rb = *--Pb;
 9529     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9530     //       MACC(Rm, Rn, t0, t1, t2);
 9531     //       Rm = *++Pm;
 9532     //       Rn = *--Pn;
 9533     //     }
 9534 
 9535     //     Pm_base[i-len] = t0;
 9536     //     t0 = t1; t1 = t2; t2 = 0;
 9537     //   }
 9538 
 9539     //   while (t0)
 9540     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9541     // }
 9542 
 9543     /**
 9544      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9545      * multiplies than Montgomery multiplication so it should be up to
 9546      * 25% faster.  However, its loop control is more complex and it
 9547      * may actually run slower on some machines.
 9548      *
 9549      * Arguments:
 9550      *
 9551      * Inputs:
 9552      *   c_rarg0   - int array elements a
 9553      *   c_rarg1   - int array elements n (the modulus)
 9554      *   c_rarg2   - int length
 9555      *   c_rarg3   - int inv
 9556      *   c_rarg4   - int array elements m (the result)
 9557      *
 9558      */
 9559     address generate_square() {
 9560       Label argh;
 9561       bind(argh);
 9562       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9563 
 9564       align(CodeEntryAlignment);
 9565       address entry = pc();
 9566 
 9567       enter();
 9568 
      // Make room: reserve 4 * Rlen * sizeof(jint) bytes of scratch space on
      // the stack (Rlen <= 512 keeps the total allocation within 8192 bytes)
      // and re-align sp to 16 bytes.
 9570       cmpw(Rlen, 512);
 9571       br(Assembler::HI, argh);
 9572       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9573       andr(sp, Ra, -2 * wordSize);
 9574 
 9575       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9576 
 9577       {
 9578         // Copy input args, reversing as we go.  We use Ra as a
 9579         // temporary variable.
 9580         reverse(Ra, Pa_base, Rlen, t0, t1);
 9581         reverse(Ra, Pn_base, Rlen, t0, t1);
 9582       }
 9583 
 9584       // Push all call-saved registers and also Pm_base which we'll need
 9585       // at the end.
 9586       save_regs();
 9587 
 9588       mov(Pm_base, Ra);
 9589 
 9590       mov(t0, zr);
 9591       mov(t1, zr);
 9592       mov(t2, zr);
 9593 
 9594       block_comment("for (int i = 0; i < len; i++) {");
 9595       mov(Ri, zr); {
 9596         Label loop, end;
 9597         bind(loop);
 9598         cmp(Ri, Rlen);
 9599         br(Assembler::GE, end);
 9600 
 9601         pre1(Ri);
 9602 
 9603         block_comment("for (j = (i+1)/2; j; j--) {"); {
 9604           add(Rj, Ri, 1);
 9605           lsr(Rj, Rj, 1);
 9606           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9607         } block_comment("  } // j");
 9608 
 9609         last_squaring(Ri);
 9610 
 9611         block_comment("  for (j = i/2; j; j--) {"); {
 9612           lsr(Rj, Ri, 1);
 9613           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9614         } block_comment("  } // j");
 9615 
 9616         post1_squaring();
 9617         add(Ri, Ri, 1);
 9618         cmp(Ri, Rlen);
 9619         br(Assembler::LT, loop);
 9620 
 9621         bind(end);
 9622         block_comment("} // i");
 9623       }
 9624 
 9625       block_comment("for (int i = len; i < 2*len; i++) {");
 9626       mov(Ri, Rlen); {
 9627         Label loop, end;
 9628         bind(loop);
 9629         cmp(Ri, Rlen, Assembler::LSL, 1);
 9630         br(Assembler::GE, end);
 9631 
 9632         pre2(Ri, Rlen);
 9633 
 9634         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9635           lsl(Rj, Rlen, 1);
 9636           sub(Rj, Rj, Ri);
 9637           sub(Rj, Rj, 1);
 9638           lsr(Rj, Rj, 1);
 9639           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9640         } block_comment("  } // j");
 9641 
 9642         last_squaring(Ri);
 9643 
 9644         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9645           lsl(Rj, Rlen, 1);
 9646           sub(Rj, Rj, Ri);
 9647           lsr(Rj, Rj, 1);
 9648           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9649         } block_comment("  } // j");
 9650 
 9651         post2(Ri, Rlen);
 9652         add(Ri, Ri, 1);
 9653         cmp(Ri, Rlen, Assembler::LSL, 1);
 9654 
 9655         br(Assembler::LT, loop);
 9656         bind(end);
 9657         block_comment("} // i");
 9658       }
 9659 
 9660       normalize(Rlen);
 9661 
 9662       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9663       restore_regs();  // Restore caller's Pm_base
 9664 
 9665       // Copy our result into caller's Pm_base
 9666       reverse(Pm_base, Ra, Rlen, t0, t1);
 9667 
 9668       leave();
 9669       ret(lr);
 9670 
 9671       return entry;
 9672     }
 9673     // In C, approximately:
 9674 
 9675     // void
 9676     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9677     //                   julong Pm_base[], julong inv, int len) {
 9678     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9679     //   julong *Pa, *Pb, *Pn, *Pm;
 9680     //   julong Ra, Rb, Rn, Rm;
 9681 
 9682     //   int i;
 9683 
 9684     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9685 
 9686     //   for (i = 0; i < len; i++) {
 9687     //     int j;
 9688 
 9689     //     Pa = Pa_base;
 9690     //     Pb = Pa_base + i;
 9691     //     Pm = Pm_base;
 9692     //     Pn = Pn_base + i;
 9693 
 9694     //     Ra = *Pa;
 9695     //     Rb = *Pb;
 9696     //     Rm = *Pm;
 9697     //     Rn = *Pn;
 9698 
 9699     //     int iters = (i+1)/2;
 9700     //     for (j = 0; iters--; j++) {
 9701     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9702     //       MACC2(Ra, Rb, t0, t1, t2);
 9703     //       Ra = *++Pa;
 9704     //       Rb = *--Pb;
 9705     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9706     //       MACC(Rm, Rn, t0, t1, t2);
 9707     //       Rm = *++Pm;
 9708     //       Rn = *--Pn;
 9709     //     }
 9710     //     if ((i & 1) == 0) {
 9711     //       assert(Ra == Pa_base[j], "must be");
 9712     //       MACC(Ra, Ra, t0, t1, t2);
 9713     //     }
 9714     //     iters = i/2;
 9715     //     assert(iters == i-j, "must be");
 9716     //     for (; iters--; j++) {
 9717     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9718     //       MACC(Rm, Rn, t0, t1, t2);
 9719     //       Rm = *++Pm;
 9720     //       Rn = *--Pn;
 9721     //     }
 9722 
 9723     //     *Pm = Rm = t0 * inv;
 9724     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9725     //     MACC(Rm, Rn, t0, t1, t2);
 9726 
 9727     //     assert(t0 == 0, "broken Montgomery multiply");
 9728 
 9729     //     t0 = t1; t1 = t2; t2 = 0;
 9730     //   }
 9731 
 9732     //   for (i = len; i < 2*len; i++) {
 9733     //     int start = i-len+1;
 9734     //     int end = start + (len - start)/2;
 9735     //     int j;
 9736 
 9737     //     Pa = Pa_base + i-len;
 9738     //     Pb = Pa_base + len;
 9739     //     Pm = Pm_base + i-len;
 9740     //     Pn = Pn_base + len;
 9741 
 9742     //     Ra = *++Pa;
 9743     //     Rb = *--Pb;
 9744     //     Rm = *++Pm;
 9745     //     Rn = *--Pn;
 9746 
 9747     //     int iters = (2*len-i-1)/2;
 9748     //     assert(iters == end-start, "must be");
 9749     //     for (j = start; iters--; j++) {
 9750     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9751     //       MACC2(Ra, Rb, t0, t1, t2);
 9752     //       Ra = *++Pa;
 9753     //       Rb = *--Pb;
 9754     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9755     //       MACC(Rm, Rn, t0, t1, t2);
 9756     //       Rm = *++Pm;
 9757     //       Rn = *--Pn;
 9758     //     }
 9759     //     if ((i & 1) == 0) {
 9760     //       assert(Ra == Pa_base[j], "must be");
 9761     //       MACC(Ra, Ra, t0, t1, t2);
 9762     //     }
    //     iters = (2*len-i)/2;
 9764     //     assert(iters == len-j, "must be");
 9765     //     for (; iters--; j++) {
 9766     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9767     //       MACC(Rm, Rn, t0, t1, t2);
 9768     //       Rm = *++Pm;
 9769     //       Rn = *--Pn;
 9770     //     }
 9771     //     Pm_base[i-len] = t0;
 9772     //     t0 = t1; t1 = t2; t2 = 0;
 9773     //   }
 9774 
 9775     //   while (t0)
 9776     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9777     // }
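
    // MACC and MACC2 accumulate one 64x64->128-bit product into the
    // t0:t1:t2 triple-precision accumulator once or twice respectively;
    // MACC2 is what lets the squaring code fold the symmetric cross
    // terms a[i]*a[j] and a[j]*a[i] into a single multiply. A sketch of
    // the intended semantics (an assumption for illustration; the
    // generated code uses mul/umulh with an adds/adcs/adc carry chain):

    // #define MACC(A, B, T0, T1, T2) do {                     \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B); \
    //     julong lo = (julong)p, hi = (julong)(p >> 64);      \
    //     T0 += lo; julong c = (T0 < lo);                     \
    //     julong s = hi + c; c = (s < hi);                    \
    //     T1 += s; c += (T1 < s);                             \
    //     T2 += c;                                            \
    //   } while (0)

    // #define MACC2(A, B, T0, T1, T2) \
    //   do { MACC(A, B, T0, T1, T2); MACC(A, B, T0, T1, T2); } while (0)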
 9778   };
 9779 
 9780   void generate_vector_math_stubs() {
 9781     // Get native vector math stub routine addresses
 9782     void* libsleef = nullptr;
 9783     char ebuf[1024];
 9784     char dll_name[JVM_MAXPATHLEN];
 9785     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9786       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9787     }
 9788     if (libsleef == nullptr) {
      log_info(library)("Failed to load native vector math library: %s", ebuf);
 9790       return;
 9791     }
 9792     // Method naming convention
 9793     //   All the methods are named as <OP><T><N>_<U><suffix>
 9794     //   Where:
 9795     //     <OP>     is the operation name, e.g. sin
    //     <T>      indicates the element type:
    //              "f/d" for vector float/double operations
    //     <N>      is the number of elements in the vector
    //              "2/4" for neon and "x" for sve
    //     <U>      is the precision level
    //              "u10/u05" represents 1.0/0.5 ULP error bounds
    //              We use "u10" for all operations by default, but for
    //              those functions that do not have u10 support we use "u05" instead
    //     <suffix> indicates neon/sve
    //              "sve/advsimd" for sve/neon implementations
    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions,
    //          and cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
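    //     For example (illustrative; assuming "log" appears in
    //     VectorSupport::mathname[]), VECTOR_OP_LOG would resolve to
    //     "logf4_u10advsimd" for a 128-bit NEON float vector and to
    //     "logdx_u10sve" for a scalable SVE double vector.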
 9808     //
 9809     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9810 
 9811     // Math vector stubs implemented with SVE for scalable vector size.
 9812     if (UseSVE > 0) {
 9813       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9814         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
        // Skip "tanh" because of a performance regression.
 9816         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9817           continue;
 9818         }
 9819 
        // The native library does not provide a u10 version of "hypot".
 9821         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9822 
 9823         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9824         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9825 
 9826         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9827         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9828       }
 9829     }
 9830 
 9831     // Math vector stubs implemented with NEON for 64/128 bits vector size.
 9832     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9833       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
      // Skip "tanh" because of a performance regression.
 9835       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9836         continue;
 9837       }
 9838 
      // The native library does not provide a u10 version of "hypot".
 9840       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9841 
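      // Note: the f4 entry is also used for the 64-bit float vector size
      // below, presumably because the library provides no 2-lane float
      // variant; only the lower lanes of the result are consumed.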
 9842       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9843       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
 9844 
 9845       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9846       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9847 
 9848       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9849       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9850     }
 9851   }
 9852 
 9853   // Initialization
 9854   void generate_initial_stubs() {
    // Generate the initial stubs and initialize their entry points.
 9856 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among platforms; however, the benefit seems
    // to be smaller than the disadvantage of having a much more
    // complicated generator structure. See also the comment in
    // stubRoutines.hpp.
 9862 
 9863     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9864 
 9865     StubRoutines::_call_stub_entry =
 9866       generate_call_stub(StubRoutines::_call_stub_return_address);
 9867 
    // Referenced by megamorphic calls.
 9869     StubRoutines::_catch_exception_entry = generate_catch_exception();
 9870 
 9871     // Initialize table for copy memory (arraycopy) check.
 9872     if (UnsafeMemoryAccess::_table == nullptr) {
 9873       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
 9874     }
 9875 
 9876     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
 9878       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
 9879       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 9880     }
 9881 
 9882     if (UseCRC32CIntrinsics) {
 9883       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
 9884     }
 9885 
 9886     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
 9887       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
 9888     }
 9889 
 9890     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
 9891       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
 9892     }
 9893 
 9894     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
 9895         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
 9896       StubRoutines::_hf2f = generate_float16ToFloat();
 9897       StubRoutines::_f2hf = generate_floatToFloat16();
 9898     }
 9899   }
 9900 
 9901   void generate_continuation_stubs() {
 9902     // Continuation stubs:
 9903     StubRoutines::_cont_thaw          = generate_cont_thaw();
 9904     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
 9905     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
 9906     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
 9907   }
 9908 
 9909   void generate_final_stubs() {
 9910     // support for verify_oop (must happen after universe_init)
 9911     if (VerifyOops) {
 9912       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
 9913     }
 9914 
 9915     // arraycopy stubs used by compilers
 9916     generate_arraycopy_stubs();
 9917 
 9918     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
 9919 
 9920     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
 9921 
 9922     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
 9923     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
 9924 
 9925 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9926 
 9927     generate_atomic_entry_points();
 9928 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 9930 
 9931 #ifdef COMPILER2
 9932     if (UseSecondarySupersTable) {
 9933       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
 9935         generate_lookup_secondary_supers_table_stub();
 9936       }
 9937     }
 9938 #endif
 9939 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
 9941   }
 9942 
 9943   void generate_compiler_stubs() {
 9944 #if COMPILER2_OR_JVMCI
 9945 
 9946     if (UseSVE == 0) {
 9947       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
 9948     }
 9949 
 9950     // array equals stub for large arrays.
 9951     if (!UseSimpleArrayEquals) {
 9952       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
 9953     }
 9954 
 9955     // arrays_hascode stub for large arrays.
 9956     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
 9957     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
 9958     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
 9959     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
 9960     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
 9961 
 9962     // byte_array_inflate stub for large arrays.
 9963     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
 9964 
 9965     // countPositives stub for large arrays.
 9966     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
 9967 
 9968     generate_compare_long_strings();
 9969 
 9970     generate_string_indexof_stubs();
 9971 
 9972 #ifdef COMPILER2
 9973     if (UseMultiplyToLenIntrinsic) {
 9974       StubRoutines::_multiplyToLen = generate_multiplyToLen();
 9975     }
 9976 
 9977     if (UseSquareToLenIntrinsic) {
 9978       StubRoutines::_squareToLen = generate_squareToLen();
 9979     }
 9980 
 9981     if (UseMulAddIntrinsic) {
 9982       StubRoutines::_mulAdd = generate_mulAdd();
 9983     }
 9984 
 9985     if (UseSIMDForBigIntegerShiftIntrinsics) {
 9986       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
 9987       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
 9988     }
 9989 
 9990     if (UseMontgomeryMultiplyIntrinsic) {
 9991       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
 9992       StubCodeMark mark(this, stub_id);
 9993       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
 9994       StubRoutines::_montgomeryMultiply = g.generate_multiply();
 9995     }
 9996 
 9997     if (UseMontgomerySquareIntrinsic) {
 9998       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
 9999       StubCodeMark mark(this, stub_id);
10000       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
10001       // We use generate_multiply() rather than generate_square()
10002       // because it's faster for the sizes of modulus we care about.
10003       StubRoutines::_montgomerySquare = g.generate_multiply();
10004     }
10005 
10006     generate_vector_math_stubs();
10007 
10008 #endif // COMPILER2
10009 
10010     if (UseChaCha20Intrinsics) {
10011       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10012     }
10013 
10014     if (UseDilithiumIntrinsics) {
10015       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10016       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10017       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10018       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10019       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10020     }
10021 
10022     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10025     }
10026 
10027     // data cache line writeback
10028     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10029     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10030 
10031     if (UseAESIntrinsics) {
10032       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10033       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10034       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10035       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10036       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10037     }
10038     if (UseGHASHIntrinsics) {
10039       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10040       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10041     }
10042     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10043       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10044     }
10045 
10046     if (UseMD5Intrinsics) {
10047       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10048       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10049     }
10050     if (UseSHA1Intrinsics) {
10051       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10052       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10053     }
10054     if (UseSHA256Intrinsics) {
10055       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10056       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10057     }
10058     if (UseSHA512Intrinsics) {
10059       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10060       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10061     }
10062     if (UseSHA3Intrinsics) {
10063       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10064       StubRoutines::_double_keccak         = generate_double_keccak();
10065       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10066     }
10067 
10068     if (UsePoly1305Intrinsics) {
10069       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10070     }
10071 
10072     // generate Adler32 intrinsics code
10073     if (UseAdler32Intrinsics) {
10074       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10075     }
10076 
10077 #endif // COMPILER2_OR_JVMCI
10078   }
10079 
10080  public:
10081   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10082     switch(blob_id) {
10083     case initial_id:
10084       generate_initial_stubs();
10085       break;
    case continuation_id:
10087       generate_continuation_stubs();
10088       break;
10089     case compiler_id:
10090       generate_compiler_stubs();
10091       break;
10092     case final_id:
10093       generate_final_stubs();
10094       break;
10095     default:
10096       fatal("unexpected blob id: %d", blob_id);
10097       break;
10098     };
10099   }
10100 }; // end class declaration
10101 
10102 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10103   StubGenerator g(code, blob_id);
10104 }
10105 
10106 
10107 #if defined (LINUX)
10108 
10109 // Define pointers to atomic stubs and initialize them to point to the
10110 // code in atomic_aarch64.S.
10111 
10112 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10113   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10114     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10115   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10116     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
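
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to a declaration
// of aarch64_atomic_fetch_add_4_default_impl (the assembly implementation
// in atomic_aarch64.S) plus a stub pointer aarch64_atomic_fetch_add_4_impl
// initialized to it; the _relaxed/_release/_seq_cst variants splice the
// memory-order suffix into both names.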
10117 
10118 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10119 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10120 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10121 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10122 DEFAULT_ATOMIC_OP(xchg, 4, )
10123 DEFAULT_ATOMIC_OP(xchg, 8, )
10124 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10125 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10126 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10127 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10128 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10129 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10130 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10131 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10132 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10133 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10134 
10135 #undef DEFAULT_ATOMIC_OP
10136 
10137 #endif // LINUX