/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "code/SCCache.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -27 [ (unused)             ]
  // -26 [ saved v15            ]
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
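
  // n.b. only every other saved-register slot needs a named offset
  // here: the registers are stored in pairs below, so, for example, a
  // single stpd(v9, v8, d9_save) fills both the d9_off slot (-20) and
  // the adjacent v8 slot (-19), and likewise for the r* pairs.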

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);
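
    // For reference (Arm ARM, FPCR): the first bfi clears bits 25..22
    // -- DN (default NaN), FZ (flush-to-zero) and RMode, where
    // RMode == 0b00 selects Round to Nearest -- and the second clears
    // the trap-enable bits IXE, UFE, OFE, DZE and IOE (bits 12..8),
    // so FP exceptions set status flags rather than trap.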

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
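    // n.b. parameters will be pushed via esp (the copy of the old sp
    // made above) while the machine sp is parked below the parameter
    // area; masking with -2 * wordSize (-16) keeps sp at the 16-byte
    // alignment the AArch64 ABI requires.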

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);
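
    // In effect the loop above is (a sketch; the parameter pointer
    // and count arrive in c_rarg5 and c_rarg6):
    //
    //   do {
    //     push(*params++);        // copy the next parameter word
    //   } while (--count > 0);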

    __ BIND(parameters_done);

    // call Java entry -- passing the Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
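
  // For orientation: the VM reaches this stub through a function
  // pointer whose C signature mirrors the c_rarg0..c_rarg7 list
  // documented above -- roughly the following (see
  // StubRoutines::call_stub and JavaCalls::call_helper for the
  // authoritative declaration):
  //
  //   typedef void (*CallStub)(address   call_wrapper,
  //                            intptr_t* result,
  //                            int       result_type,   // BasicType
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       parameter_size,
  //                            Thread*   thread);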

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure the ZVA length is divisible by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that using DC ZVA is still
      // worthwhile after the alignment adjustment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }
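
    // Loop accounting above, by way of example: if
    // MacroAssembler::zero_words_block_size were 8, unroll would be 4
    // and each pass of the loop would clear 4 * 2 == 8 words. The
    // leading subs sends cnt negative once fewer than one block's
    // worth of words remains, and the trailing add restores the true
    // residual count (0 <= cnt < zero_words_block_size) for the
    // caller to finish.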

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };
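
  // For a plain (non-oop) copy the helper above collapses to ordinary
  // loads and stores -- copy_load_at_16(t0, t1, src) is essentially a
  // single ldp t0, t1, src -- while for oop copies the barrier-set
  // assembler may expand each access with GC bookkeeping, which is
  // why the gct*/gcvt* temporaries are threaded through the
  // constructor.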

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);
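
    // Count accounting: 8 words are already held in registers, so
    // after subtracting 16 a non-negative count means at least 8 more
    // words remain and we can take the store/load loop below. Worked
    // example for count == 24: preload 8, subs -> 8; first pass
    // stores 8 and loads 8, subs -> 0 (still HS); second pass stores
    // 8 and loads 8, subs -> -8 (LO); the drain stores the final 8,
    // for 24 words in all.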

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // the source address is even (16-byte) aligned, the target only
      // odd (8-byte) aligned
      //
      // When forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). When backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // When forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

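    // Either way, the tests work because bits 3..0 of count (in units
    // of the granularity) encode the tail exactly. A worked example
    // for granularity == 1 (a byte copy), count == 13 == 0b1101:
    // bit 3 set -> copy one 8-byte word, bit 2 set -> one 4-byte int,
    // bit 1 clear -> no short, bit 0 set -> one final byte;
    // 8 + 4 + 1 == 13.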
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't
    // matter because we always load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
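        // A worked check of the trick above (count/2 in terms of the
        // original count): count == 1 -> count/2 == 0, so all three
        // byte pairs hit offset 0; count == 2 -> count/2 == 1, so
        // bytes 0 and 1 are covered (byte 1 twice); count == 3 ->
        // count/2 == 1, so bytes 0, 1 and 2 are each copied once.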
 1374         __ b(finish);
 1375       }
 1376     }
 1377 
 1378     __ bind(copy_big);
 1379     if (is_backwards) {
 1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1382     }
 1383 
 1384     // Now we've got the small case out of the way we can align the
 1385     // source address on a 2-word boundary.
 1386 
 1387     // Here we will materialize a count in r15, which is used by copy_memory_small
 1388     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1389     // Up until here we have used t9, which aliases r15, but from here on that
 1390     // register cannot be used as a temp register, as it contains the count.
 1391 
 1392     Label aligned;
 1393 
 1394     if (is_aligned) {
 1395       // We may have to adjust by 1 word to get s 2-word-aligned.
 1396       __ tbz(s, exact_log2(wordSize), aligned);
 1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1399       __ sub(count, count, wordSize/granularity);
 1400     } else {
 1401       if (is_backwards) {
 1402         __ andr(r15, s, 2 * wordSize - 1);
 1403       } else {
 1404         __ neg(r15, s);
 1405         __ andr(r15, r15, 2 * wordSize - 1);
 1406       }
 1407       // r15 is the byte adjustment needed to align s.
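            // For example (hypothetical address), a forward copy with s == 0x1009
            // gives r15 == 7, the bytes to the next 16-byte boundary; the shift
            // below converts that to an element count.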
 1408       __ cbz(r15, aligned);
 1409       int shift = exact_log2(granularity);
 1410       if (shift > 0) {
 1411         __ lsr(r15, r15, shift);
 1412       }
 1413       __ sub(count, count, r15);
 1414 
 1415 #if 0
 1416       // ?? This code is only correct for a disjoint copy.  It may or
 1417       // may not make sense to use it in that case.
 1418 
 1419       // Copy the first pair; s and d may not be aligned.
 1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1422 
 1423       // Align s and d, adjust count
 1424       if (is_backwards) {
 1425         __ sub(s, s, r15);
 1426         __ sub(d, d, r15);
 1427       } else {
 1428         __ add(s, s, r15);
 1429         __ add(d, d, r15);
 1430       }
 1431 #else
 1432       copy_memory_small(decorators, type, s, d, r15, step);
 1433 #endif
 1434     }
 1435 
 1436     __ bind(aligned);
 1437 
 1438     // s is now 2-word-aligned.
 1439 
 1440     // We have a count of units and some trailing bytes. Adjust the
 1441     // count and do a bulk copy of words. If the shift is zero
 1442     // perform a move instead to benefit from zero latency moves.
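          // For example, a jshort copy has granularity 2, so shift == 2 and
          // r15 = count >> 2 is the number of 8-byte words to copy in bulk.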
 1443     int shift = exact_log2(wordSize/granularity);
 1444     if (shift > 0) {
 1445       __ lsr(r15, count, shift);
 1446     } else {
 1447       __ mov(r15, count);
 1448     }
 1449     if (direction == copy_forwards) {
 1450       if (type != T_OBJECT) {
 1451         __ bl(copy_f);
 1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1453         __ bl(copy_obj_uninit_f);
 1454       } else {
 1455         __ bl(copy_obj_f);
 1456       }
 1457     } else {
 1458       if (type != T_OBJECT) {
 1459         __ bl(copy_b);
 1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1461         __ bl(copy_obj_uninit_b);
 1462       } else {
 1463         __ bl(copy_obj_b);
 1464       }
 1465     }
 1466 
 1467     // And the tail.
 1468     copy_memory_small(decorators, type, s, d, count, step);
 1469 
 1470     if (granularity >= 8) __ bind(copy8);
 1471     if (granularity >= 4) __ bind(copy4);
 1472     __ bind(finish);
 1473   }
 1474 
 1475 
 1476   void clobber_registers() {
 1477 #ifdef ASSERT
 1478     RegSet clobbered
 1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1483       __ mov(*it, rscratch1);
 1484     }
 1485 #endif
 1486 
 1487   }
 1488 
 1489   // Scan over array at a for count oops, verifying each one.
 1490   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1491   void verify_oop_array(int size, Register a, Register count, Register temp) {
 1492     Label loop, end;
 1493     __ mov(rscratch1, a);
 1494     __ mov(rscratch2, zr);
 1495     __ bind(loop);
 1496     __ cmp(rscratch2, count);
 1497     __ br(Assembler::HS, end);
 1498     if (size == wordSize) {
 1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ verify_oop(temp);
 1501     } else {
 1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1503       __ decode_heap_oop(temp); // calls verify_oop
 1504     }
 1505     __ add(rscratch2, rscratch2, 1);
 1506     __ b(loop);
 1507     __ bind(end);
 1508   }
 1509 
 1510   // Arguments:
 1511   //   stub_id - is used to name the stub and identify all details of
 1512   //             how to perform the copy.
 1513   //
 1514   //   entry - is assigned to the stub's post push entry point unless
 1515   //           it is null
 1516   //
 1517   // Inputs:
 1518   //   c_rarg0   - source array address
 1519   //   c_rarg1   - destination array address
 1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1521   //
 1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1523   // the hardware handle it.  The two dwords within qwords that span
 1524   // cache line boundaries will still be loaded and stored atomically.
 1525   //
 1526   // Side Effects: entry is set to the (post push) entry point so it
 1527   //               can be used by the corresponding conjoint copy
 1528   //               method
 1529   //
 1530   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1532     RegSet saved_reg = RegSet::of(s, d, count);
 1533     int size;
 1534     bool aligned;
 1535     bool is_oop;
 1536     bool dest_uninitialized;
 1537     switch (stub_id) {
 1538     case jbyte_disjoint_arraycopy_id:
 1539       size = sizeof(jbyte);
 1540       aligned = false;
 1541       is_oop = false;
 1542       dest_uninitialized = false;
 1543       break;
 1544     case arrayof_jbyte_disjoint_arraycopy_id:
 1545       size = sizeof(jbyte);
 1546       aligned = true;
 1547       is_oop = false;
 1548       dest_uninitialized = false;
 1549       break;
 1550     case jshort_disjoint_arraycopy_id:
 1551       size = sizeof(jshort);
 1552       aligned = false;
 1553       is_oop = false;
 1554       dest_uninitialized = false;
 1555       break;
 1556     case arrayof_jshort_disjoint_arraycopy_id:
 1557       size = sizeof(jshort);
 1558       aligned = true;
 1559       is_oop = false;
 1560       dest_uninitialized = false;
 1561       break;
 1562     case jint_disjoint_arraycopy_id:
 1563       size = sizeof(jint);
 1564       aligned = false;
 1565       is_oop = false;
 1566       dest_uninitialized = false;
 1567       break;
 1568     case arrayof_jint_disjoint_arraycopy_id:
 1569       size = sizeof(jint);
 1570       aligned = true;
 1571       is_oop = false;
 1572       dest_uninitialized = false;
 1573       break;
 1574     case jlong_disjoint_arraycopy_id:
 1575       // since this is always aligned we can (should!) use the same
 1576       // stub as for case arrayof_jlong_disjoint_arraycopy
 1577       ShouldNotReachHere();
 1578       break;
 1579     case arrayof_jlong_disjoint_arraycopy_id:
 1580       size = sizeof(jlong);
 1581       aligned = true;
 1582       is_oop = false;
 1583       dest_uninitialized = false;
 1584       break;
 1585     case oop_disjoint_arraycopy_id:
 1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1587       aligned = !UseCompressedOops;
 1588       is_oop = true;
 1589       dest_uninitialized = false;
 1590       break;
 1591     case arrayof_oop_disjoint_arraycopy_id:
 1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1593       aligned = !UseCompressedOops;
 1594       is_oop = true;
 1595       dest_uninitialized = false;
 1596       break;
 1597     case oop_disjoint_arraycopy_uninit_id:
 1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1599       aligned = !UseCompressedOops;
 1600       is_oop = true;
 1601       dest_uninitialized = true;
 1602       break;
 1603     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1605       aligned = !UseCompressedOops;
 1606       is_oop = true;
 1607       dest_uninitialized = true;
 1608       break;
 1609     default:
 1610       ShouldNotReachHere();
 1611       break;
 1612     }
 1613 
 1614     __ align(CodeEntryAlignment);
 1615     StubCodeMark mark(this, stub_id);
 1616     address start = __ pc();
 1617     __ enter();
 1618 
 1619     if (entry != nullptr) {
 1620       *entry = __ pc();
 1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1622       BLOCK_COMMENT("Entry:");
 1623     }
 1624 
 1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1626     if (dest_uninitialized) {
 1627       decorators |= IS_DEST_UNINITIALIZED;
 1628     }
 1629     if (aligned) {
 1630       decorators |= ARRAYCOPY_ALIGNED;
 1631     }
 1632 
 1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1635 
 1636     if (is_oop) {
 1637       // save regs before copy_memory
 1638       __ push(RegSet::of(d, count), sp);
 1639     }
 1640     {
 1641       // UnsafeMemoryAccess page error: continue after unsafe access
 1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1645     }
 1646 
 1647     if (is_oop) {
 1648       __ pop(RegSet::of(d, count), sp);
 1649       if (VerifyOops)
 1650         verify_oop_array(size, d, count, r16);
 1651     }
 1652 
 1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1654 
 1655     __ leave();
 1656     __ mov(r0, zr); // return 0
 1657     __ ret(lr);
 1658     return start;
 1659   }
 1660 
 1661   // Arguments:
 1662   //   stub_id - is used to name the stub and identify all details of
 1663   //             how to perform the copy.
 1664   //
 1665   //   nooverlap_target - identifies the (post push) entry for the
 1666   //             corresponding disjoint copy routine which can be
 1667   //             jumped to if the ranges do not actually overlap
 1668   //
 1669   //   entry - is assigned to the stub's post push entry point unless
 1670   //           it is null
 1671   //
 1673   // Inputs:
 1674   //   c_rarg0   - source array address
 1675   //   c_rarg1   - destination array address
 1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1677   //
 1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1679   // the hardware handle it.  The two dwords within qwords that span
 1680   // cache line boundaries will still be loaded and stored atomically.
 1681   //
 1682   // Side Effects:
 1683   //   entry is set to the no-overlap entry point so it can be used by
 1684   //   some other conjoint copy method
 1685   //
 1686   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1688     RegSet saved_regs = RegSet::of(s, d, count);
 1689     int size;
 1690     bool aligned;
 1691     bool is_oop;
 1692     bool dest_uninitialized;
 1693     switch (stub_id) {
 1694     case jbyte_arraycopy_id:
 1695       size = sizeof(jbyte);
 1696       aligned = false;
 1697       is_oop = false;
 1698       dest_uninitialized = false;
 1699       break;
 1700     case arrayof_jbyte_arraycopy_id:
 1701       size = sizeof(jbyte);
 1702       aligned = true;
 1703       is_oop = false;
 1704       dest_uninitialized = false;
 1705       break;
 1706     case jshort_arraycopy_id:
 1707       size = sizeof(jshort);
 1708       aligned = false;
 1709       is_oop = false;
 1710       dest_uninitialized = false;
 1711       break;
 1712     case arrayof_jshort_arraycopy_id:
 1713       size = sizeof(jshort);
 1714       aligned = true;
 1715       is_oop = false;
 1716       dest_uninitialized = false;
 1717       break;
 1718     case jint_arraycopy_id:
 1719       size = sizeof(jint);
 1720       aligned = false;
 1721       is_oop = false;
 1722       dest_uninitialized = false;
 1723       break;
 1724     case arrayof_jint_arraycopy_id:
 1725       size = sizeof(jint);
 1726       aligned = true;
 1727       is_oop = false;
 1728       dest_uninitialized = false;
 1729       break;
 1730     case jlong_arraycopy_id:
 1731       // since this is always aligned we can (should!) use the same
 1732       // stub as for case arrayof_jlong_arraycopy
 1733       ShouldNotReachHere();
 1734       break;
 1735     case arrayof_jlong_arraycopy_id:
 1736       size = sizeof(jlong);
 1737       aligned = true;
 1738       is_oop = false;
 1739       dest_uninitialized = false;
 1740       break;
 1741     case oop_arraycopy_id:
 1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1743       aligned = !UseCompressedOops;
 1744       is_oop = true;
 1745       dest_uninitialized = false;
 1746       break;
 1747     case arrayof_oop_arraycopy_id:
 1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1749       aligned = !UseCompressedOops;
 1750       is_oop = true;
 1751       dest_uninitialized = false;
 1752       break;
 1753     case oop_arraycopy_uninit_id:
 1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1755       aligned = !UseCompressedOops;
 1756       is_oop = true;
 1757       dest_uninitialized = true;
 1758       break;
 1759     case arrayof_oop_arraycopy_uninit_id:
 1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1761       aligned = !UseCompressedOops;
 1762       is_oop = true;
 1763       dest_uninitialized = true;
 1764       break;
 1765     default:
 1766       ShouldNotReachHere();
 1767     }
 1768 
 1769     StubCodeMark mark(this, stub_id);
 1770     address start = __ pc();
 1771     __ enter();
 1772 
 1773     if (entry != nullptr) {
 1774       *entry = __ pc();
 1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1776       BLOCK_COMMENT("Entry:");
 1777     }
 1778 
 1779     // use fwd copy when (d-s) above_equal (count*size)
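          // The unsigned comparison also covers d below s: the subtraction wraps
          // to a large unsigned value, so those cases take the disjoint stub too.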
 1780     __ sub(rscratch1, d, s);
 1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1782     __ br(Assembler::HS, nooverlap_target);
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
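        //  (-1^K is bitwise NOT, i.e. -K-1, so the caller recovers K as ~r0.)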
 1854   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
 1881     const Register count_save  = r21;       // orig elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (entry != nullptr) {
 1916       *entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
 1920     // Empty array: nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_orig = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
 2022                               Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
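          // OR-ing s, d and count leaves a low bit clear only if it is clear in
          // all three, so the tests below select the largest common alignment.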
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
 2268     // 'from', 'to', 'count' registers should be set in such order
 2269     // since they are the same as 'src', 'src_pos', 'dst'.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
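          //   elsize bits (1,0): 00 -> byte, 01 -> short, 10 -> int, 11 -> long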
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubGenStubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp: number of 8-byte words to fill
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
 2471 
 2472     // Align source address at 8 bytes address boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
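          // The single 8-byte store below ends exactly at the fill end; any
          // bytes it rewrites were already set by the bulk loop above.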
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
 2537     // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_data_cache_writeback() {
 2571     const Register line        = c_rarg0;  // address of line to write back
 2572 
 2573     __ align(CodeEntryAlignment);
 2574 
 2575     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2576     StubCodeMark mark(this, stub_id);
 2577 
 2578     address start = __ pc();
 2579     __ enter();
 2580     __ cache_wb(Address(line, 0));
 2581     __ leave();
 2582     __ ret(lr);
 2583 
 2584     return start;
 2585   }
 2586 
 2587   address generate_data_cache_writeback_sync() {
 2588     const Register is_pre     = c_rarg0;  // pre or post sync
 2589 
 2590     __ align(CodeEntryAlignment);
 2591 
 2592     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2593     StubCodeMark mark(this, stub_id);
 2594 
 2595     // pre wbsync is a no-op
 2596     // post wbsync requires a memory barrier (the x86 analogue is an sfence)
 2597 
 2598     Label skip;
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cbnz(is_pre, skip);
 2602     __ cache_wbsync(false);
 2603     __ bind(skip);
 2604     __ leave();
 2605     __ ret(lr);
 2606 
 2607     return start;
 2608   }
 2609 
 2610   void generate_arraycopy_stubs() {
 2611     address entry;
 2612     address entry_jbyte_arraycopy;
 2613     address entry_jshort_arraycopy;
 2614     address entry_jint_arraycopy;
 2615     address entry_oop_arraycopy;
 2616     address entry_jlong_arraycopy;
 2617     address entry_checkcast_arraycopy;
 2618 
 2619     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2620     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2621 
 2622     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2623     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2624 
 2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2626     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2627 
 2628     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2629 
 2630     //*** jbyte
 2631     // Always need aligned and unaligned versions
 2632     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2633     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2634     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2635     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2636 
 2637     //*** jshort
 2638     // Always need aligned and unaligned versions
 2639     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2640     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2641     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2642     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2643 
 2644     //*** jint
 2645     // Aligned versions
 2646     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2647     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2648     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2649     // entry_jint_arraycopy always points to the unaligned version
 2650     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2651     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2652 
 2653     //*** jlong
 2654     // It is always aligned
 2655     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2657     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2658     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2659 
 2660     //*** oops
 2661     {
 2662       // With compressed oops we need unaligned versions; notice that
 2663       // we overwrite entry_oop_arraycopy.
 2664       bool aligned = !UseCompressedOops;
 2665 
 2666       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2667         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2668       StubRoutines::_arrayof_oop_arraycopy
 2669         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2670       // Aligned versions without pre-barriers
 2671       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2672         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2673       StubRoutines::_arrayof_oop_arraycopy_uninit
 2674         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2675     }
 2676 
 2677     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2678     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2679     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2680     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2681 
 2682     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2683     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2684 
 2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2686                                                               entry_jshort_arraycopy,
 2687                                                               entry_jint_arraycopy,
 2688                                                               entry_jlong_arraycopy);
 2689 
 2690     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2691                                                                entry_jshort_arraycopy,
 2692                                                                entry_jint_arraycopy,
 2693                                                                entry_oop_arraycopy,
 2694                                                                entry_jlong_arraycopy,
 2695                                                                entry_checkcast_arraycopy);
 2696 
 2697     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2698     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2699     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2700     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2701     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2702     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2703   }
 2704 
 2705   void generate_math_stubs() { Unimplemented(); }
 2706 
 2707   // Arguments:
 2708   //
 2709   // Inputs:
 2710   //   c_rarg0   - source byte array address
 2711   //   c_rarg1   - destination byte array address
 2712   //   c_rarg2   - K (key) in little endian int array
 2713   //
 2714   address generate_aescrypt_encryptBlock() {
 2715     __ align(CodeEntryAlignment);
 2716     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2717     StubCodeMark mark(this, stub_id);
 2718 
 2719     const Register from        = c_rarg0;  // source array address
 2720     const Register to          = c_rarg1;  // destination array address
 2721     const Register key         = c_rarg2;  // key array address
 2722     const Register keylen      = rscratch1;
 2723 
 2724     address start = __ pc();
 2725     __ enter();
 2726 
 2727     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2728 
 2729     __ aesenc_loadkeys(key, keylen);
 2730     __ aesecb_encrypt(from, to, keylen);
 2731 
 2732     __ mov(r0, 0);
 2733 
 2734     __ leave();
 2735     __ ret(lr);
 2736 
 2737     return start;
 2738   }
 2739 
 2740   // Arguments:
 2741   //
 2742   // Inputs:
 2743   //   c_rarg0   - source byte array address
 2744   //   c_rarg1   - destination byte array address
 2745   //   c_rarg2   - K (key) in little endian int array
 2746   //
 2747   address generate_aescrypt_decryptBlock() {
 2748     assert(UseAES, "need AES cryptographic extension support");
 2749     __ align(CodeEntryAlignment);
 2750     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2751     StubCodeMark mark(this, stub_id);
 2753 
 2754     const Register from        = c_rarg0;  // source array address
 2755     const Register to          = c_rarg1;  // destination array address
 2756     const Register key         = c_rarg2;  // key array address
 2757     const Register keylen      = rscratch1;
 2758 
 2759     address start = __ pc();
 2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2761 
 2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2763 
 2764     __ aesecb_decrypt(from, to, key, keylen);
 2765 
 2766     __ mov(r0, 0);
 2767 
 2768     __ leave();
 2769     __ ret(lr);
 2770 
 2771     return start;
 2772   }
 2773 
 2774   // Arguments:
 2775   //
 2776   // Inputs:
 2777   //   c_rarg0   - source byte array address
 2778   //   c_rarg1   - destination byte array address
 2779   //   c_rarg2   - K (key) in little endian int array
 2780   //   c_rarg3   - r vector byte array address
 2781   //   c_rarg4   - input length
 2782   //
 2783   // Output:
2784   //   r0        - input length
 2785   //
 2786   address generate_cipherBlockChaining_encryptAESCrypt() {
 2787     assert(UseAES, "need AES cryptographic extension support");
 2788     __ align(CodeEntryAlignment);
 2789     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2790     StubCodeMark mark(this, stub_id);
 2791 
 2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2793 
 2794     const Register from        = c_rarg0;  // source array address
 2795     const Register to          = c_rarg1;  // destination array address
 2796     const Register key         = c_rarg2;  // key array address
2797     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
2798                                            // and left holding the last encrypted block
2799     const Register len_reg     = c_rarg4;  // src len (must be a multiple of the block size, 16)
 2800     const Register keylen      = rscratch1;
 2801 
 2802     address start = __ pc();
 2803 
 2804       __ enter();
 2805 
 2806       __ movw(rscratch2, len_reg);
 2807 
 2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2809 
 2810       __ ld1(v0, __ T16B, rvec);
 2811 
 2812       __ cmpw(keylen, 52);
 2813       __ br(Assembler::CC, L_loadkeys_44);
 2814       __ br(Assembler::EQ, L_loadkeys_52);
 2815 
 2816       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2817       __ rev32(v17, __ T16B, v17);
 2818       __ rev32(v18, __ T16B, v18);
 2819     __ BIND(L_loadkeys_52);
 2820       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2821       __ rev32(v19, __ T16B, v19);
 2822       __ rev32(v20, __ T16B, v20);
 2823     __ BIND(L_loadkeys_44);
 2824       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2825       __ rev32(v21, __ T16B, v21);
 2826       __ rev32(v22, __ T16B, v22);
 2827       __ rev32(v23, __ T16B, v23);
 2828       __ rev32(v24, __ T16B, v24);
 2829       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2830       __ rev32(v25, __ T16B, v25);
 2831       __ rev32(v26, __ T16B, v26);
 2832       __ rev32(v27, __ T16B, v27);
 2833       __ rev32(v28, __ T16B, v28);
 2834       __ ld1(v29, v30, v31, __ T16B, key);
 2835       __ rev32(v29, __ T16B, v29);
 2836       __ rev32(v30, __ T16B, v30);
 2837       __ rev32(v31, __ T16B, v31);
 2838 
 2839     __ BIND(L_aes_loop);
 2840       __ ld1(v1, __ T16B, __ post(from, 16));
 2841       __ eor(v0, __ T16B, v0, v1);
 2842 
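           // NB: the condition flags set by cmpw(keylen, 52) above are
           // still live here; nothing in this loop modifies the flags.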
 2843       __ br(Assembler::CC, L_rounds_44);
 2844       __ br(Assembler::EQ, L_rounds_52);
 2845 
 2846       __ aese(v0, v17); __ aesmc(v0, v0);
 2847       __ aese(v0, v18); __ aesmc(v0, v0);
 2848     __ BIND(L_rounds_52);
 2849       __ aese(v0, v19); __ aesmc(v0, v0);
 2850       __ aese(v0, v20); __ aesmc(v0, v0);
 2851     __ BIND(L_rounds_44);
 2852       __ aese(v0, v21); __ aesmc(v0, v0);
 2853       __ aese(v0, v22); __ aesmc(v0, v0);
 2854       __ aese(v0, v23); __ aesmc(v0, v0);
 2855       __ aese(v0, v24); __ aesmc(v0, v0);
 2856       __ aese(v0, v25); __ aesmc(v0, v0);
 2857       __ aese(v0, v26); __ aesmc(v0, v0);
 2858       __ aese(v0, v27); __ aesmc(v0, v0);
 2859       __ aese(v0, v28); __ aesmc(v0, v0);
 2860       __ aese(v0, v29); __ aesmc(v0, v0);
 2861       __ aese(v0, v30);
 2862       __ eor(v0, __ T16B, v0, v31);
 2863 
 2864       __ st1(v0, __ T16B, __ post(to, 16));
 2865 
 2866       __ subw(len_reg, len_reg, 16);
 2867       __ cbnzw(len_reg, L_aes_loop);
 2868 
 2869       __ st1(v0, __ T16B, rvec);
 2870 
 2871       __ mov(r0, rscratch2);
 2872 
 2873       __ leave();
 2874       __ ret(lr);
 2875 
 2876       return start;
 2877   }
 2878 
 2879   // Arguments:
 2880   //
 2881   // Inputs:
 2882   //   c_rarg0   - source byte array address
 2883   //   c_rarg1   - destination byte array address
 2884   //   c_rarg2   - K (key) in little endian int array
 2885   //   c_rarg3   - r vector byte array address
 2886   //   c_rarg4   - input length
 2887   //
 2888   // Output:
 2889   //   r0        - input length
 2890   //
 2891   address generate_cipherBlockChaining_decryptAESCrypt() {
 2892     assert(UseAES, "need AES cryptographic extension support");
 2893     __ align(CodeEntryAlignment);
 2894     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2895     StubCodeMark mark(this, stub_id);
 2896 
 2897     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2898 
 2899     const Register from        = c_rarg0;  // source array address
 2900     const Register to          = c_rarg1;  // destination array address
 2901     const Register key         = c_rarg2;  // key array address
2902     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
2903                                            // and left holding the last input (ciphertext) block
2904     const Register len_reg     = c_rarg4;  // src len (must be a multiple of the block size, 16)
 2905     const Register keylen      = rscratch1;
 2906 
 2907     address start = __ pc();
 2908 
 2909       __ enter();
 2910 
 2911       __ movw(rscratch2, len_reg);
 2912 
 2913       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2914 
 2915       __ ld1(v2, __ T16B, rvec);
 2916 
 2917       __ ld1(v31, __ T16B, __ post(key, 16));
 2918       __ rev32(v31, __ T16B, v31);
 2919 
 2920       __ cmpw(keylen, 52);
 2921       __ br(Assembler::CC, L_loadkeys_44);
 2922       __ br(Assembler::EQ, L_loadkeys_52);
 2923 
 2924       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2925       __ rev32(v17, __ T16B, v17);
 2926       __ rev32(v18, __ T16B, v18);
 2927     __ BIND(L_loadkeys_52);
 2928       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2929       __ rev32(v19, __ T16B, v19);
 2930       __ rev32(v20, __ T16B, v20);
 2931     __ BIND(L_loadkeys_44);
 2932       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2933       __ rev32(v21, __ T16B, v21);
 2934       __ rev32(v22, __ T16B, v22);
 2935       __ rev32(v23, __ T16B, v23);
 2936       __ rev32(v24, __ T16B, v24);
 2937       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2938       __ rev32(v25, __ T16B, v25);
 2939       __ rev32(v26, __ T16B, v26);
 2940       __ rev32(v27, __ T16B, v27);
 2941       __ rev32(v28, __ T16B, v28);
 2942       __ ld1(v29, v30, __ T16B, key);
 2943       __ rev32(v29, __ T16B, v29);
 2944       __ rev32(v30, __ T16B, v30);
 2945 
 2946     __ BIND(L_aes_loop);
 2947       __ ld1(v0, __ T16B, __ post(from, 16));
2948       __ orr(v1, __ T16B, v0, v0);  // keep a copy of the input ciphertext for chaining
 2949 
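           // As in the encrypt loop, the flags from cmpw(keylen, 52)
           // above survive each iteration of this loop.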
 2950       __ br(Assembler::CC, L_rounds_44);
 2951       __ br(Assembler::EQ, L_rounds_52);
 2952 
 2953       __ aesd(v0, v17); __ aesimc(v0, v0);
 2954       __ aesd(v0, v18); __ aesimc(v0, v0);
 2955     __ BIND(L_rounds_52);
 2956       __ aesd(v0, v19); __ aesimc(v0, v0);
 2957       __ aesd(v0, v20); __ aesimc(v0, v0);
 2958     __ BIND(L_rounds_44);
 2959       __ aesd(v0, v21); __ aesimc(v0, v0);
 2960       __ aesd(v0, v22); __ aesimc(v0, v0);
 2961       __ aesd(v0, v23); __ aesimc(v0, v0);
 2962       __ aesd(v0, v24); __ aesimc(v0, v0);
 2963       __ aesd(v0, v25); __ aesimc(v0, v0);
 2964       __ aesd(v0, v26); __ aesimc(v0, v0);
 2965       __ aesd(v0, v27); __ aesimc(v0, v0);
 2966       __ aesd(v0, v28); __ aesimc(v0, v0);
 2967       __ aesd(v0, v29); __ aesimc(v0, v0);
 2968       __ aesd(v0, v30);
 2969       __ eor(v0, __ T16B, v0, v31);
 2970       __ eor(v0, __ T16B, v0, v2);
 2971 
 2972       __ st1(v0, __ T16B, __ post(to, 16));
2973       __ orr(v2, __ T16B, v1, v1);  // the saved input block becomes the next chaining value
 2974 
 2975       __ subw(len_reg, len_reg, 16);
 2976       __ cbnzw(len_reg, L_aes_loop);
 2977 
 2978       __ st1(v2, __ T16B, rvec);
 2979 
 2980       __ mov(r0, rscratch2);
 2981 
 2982       __ leave();
 2983       __ ret(lr);
 2984 
 2985     return start;
 2986   }
 2987 
2988   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2989   // Inputs: in (the 128-bit addend) and inc (the 64-bit increment);
2990   // both are preserved. The least-significant 64-bit word is in the
2991   // upper dword of each vector, and inc's lower dword must be zero.
2992   // Output: result
 2993   void be_add_128_64(FloatRegister result, FloatRegister in,
 2994                      FloatRegister inc, FloatRegister tmp) {
 2995     assert_different_registers(result, tmp, inc);
 2996 
 2997     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 2998                                            // input
2999     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3000     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3001                                            // MSD == 0 (must be!) to LSD
 3002     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3003   }
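
       // For reference, a minimal scalar sketch of the operation above
       // (illustrative only; ms/ls are hypothetical names for the most/
       // least significant halves of the 128-bit counter):
       //
       //   void be_add_128_64_model(uint64_t& ms, uint64_t& ls, uint64_t inc) {
       //     ls += inc;
       //     if (ls < inc) ms += 1;  // unsigned wrap-around => carry into the MSD
       //   }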
 3004 
 3005   // CTR AES crypt.
 3006   // Arguments:
 3007   //
 3008   // Inputs:
 3009   //   c_rarg0   - source byte array address
 3010   //   c_rarg1   - destination byte array address
 3011   //   c_rarg2   - K (key) in little endian int array
 3012   //   c_rarg3   - counter vector byte array address
 3013   //   c_rarg4   - input length
 3014   //   c_rarg5   - saved encryptedCounter start
 3015   //   c_rarg6   - saved used length
 3016   //
 3017   // Output:
 3018   //   r0       - input length
 3019   //
 3020   address generate_counterMode_AESCrypt() {
 3021     const Register in = c_rarg0;
 3022     const Register out = c_rarg1;
 3023     const Register key = c_rarg2;
 3024     const Register counter = c_rarg3;
 3025     const Register saved_len = c_rarg4, len = r10;
 3026     const Register saved_encrypted_ctr = c_rarg5;
 3027     const Register used_ptr = c_rarg6, used = r12;
 3028 
 3029     const Register offset = r7;
 3030     const Register keylen = r11;
 3031 
 3032     const unsigned char block_size = 16;
 3033     const int bulk_width = 4;
3034     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3035     // performance with larger data sizes, but it also means that the
3036     // fast path isn't used until you have at least 8 blocks, so up
3037     // to 127 bytes of data would be processed on the slow path. For
3038     // that reason, and also so as not to blow away too much icache, 4
3039     // blocks seems like a sensible compromise.
 3040 
 3041     // Algorithm:
 3042     //
 3043     //    if (len == 0) {
 3044     //        goto DONE;
 3045     //    }
 3046     //    int result = len;
 3047     //    do {
 3048     //        if (used >= blockSize) {
 3049     //            if (len >= bulk_width * blockSize) {
 3050     //                CTR_large_block();
 3051     //                if (len == 0)
 3052     //                    goto DONE;
 3053     //            }
 3054     //            for (;;) {
 3055     //                16ByteVector v0 = counter;
 3056     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3057     //                used = 0;
 3058     //                if (len < blockSize)
 3059     //                    break;    /* goto NEXT */
 3060     //                16ByteVector v1 = load16Bytes(in, offset);
 3061     //                v1 = v1 ^ encryptedCounter;
3062     //                store16Bytes(v1, out, offset);
 3063     //                used = blockSize;
 3064     //                offset += blockSize;
 3065     //                len -= blockSize;
 3066     //                if (len == 0)
 3067     //                    goto DONE;
 3068     //            }
 3069     //        }
 3070     //      NEXT:
 3071     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3072     //        len--;
 3073     //    } while (len != 0);
 3074     //  DONE:
 3075     //    return result;
 3076     //
 3077     // CTR_large_block()
 3078     //    Wide bulk encryption of whole blocks.
 3079 
 3080     __ align(CodeEntryAlignment);
 3081     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3082     StubCodeMark mark(this, stub_id);
 3083     const address start = __ pc();
 3084     __ enter();
 3085 
 3086     Label DONE, CTR_large_block, large_block_return;
 3087     __ ldrw(used, Address(used_ptr));
 3088     __ cbzw(saved_len, DONE);
 3089 
 3090     __ mov(len, saved_len);
 3091     __ mov(offset, 0);
 3092 
 3093     // Compute #rounds for AES based on the length of the key array
 3094     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3095 
 3096     __ aesenc_loadkeys(key, keylen);
 3097 
 3098     {
 3099       Label L_CTR_loop, NEXT;
 3100 
 3101       __ bind(L_CTR_loop);
 3102 
 3103       __ cmp(used, block_size);
 3104       __ br(__ LO, NEXT);
 3105 
 3106       // Maybe we have a lot of data
 3107       __ subsw(rscratch1, len, bulk_width * block_size);
 3108       __ br(__ HS, CTR_large_block);
 3109       __ BIND(large_block_return);
 3110       __ cbzw(len, DONE);
 3111 
 3112       // Setup the counter
 3113       __ movi(v4, __ T4S, 0);
 3114       __ movi(v5, __ T4S, 1);
 3115       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
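           // Viewed as a T2D pair this is { lo = 0, hi = 1 }: the increment
           // sits in the upper dword and the lower dword is zero, exactly
           // the form be_add_128_64 requires.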
 3116 
 3117       // 128-bit big-endian increment
 3118       __ ld1(v0, __ T16B, counter);
 3119       __ rev64(v16, __ T16B, v0);
 3120       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3121       __ rev64(v16, __ T16B, v16);
 3122       __ st1(v16, __ T16B, counter);
 3123       // Previous counter value is in v0
 3124       // v4 contains { 0, 1 }
 3125 
 3126       {
 3127         // We have fewer than bulk_width blocks of data left. Encrypt
 3128         // them one by one until there is less than a full block
 3129         // remaining, being careful to save both the encrypted counter
 3130         // and the counter.
 3131 
 3132         Label inner_loop;
 3133         __ bind(inner_loop);
 3134         // Counter to encrypt is in v0
 3135         __ aesecb_encrypt(noreg, noreg, keylen);
 3136         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3137 
 3138         // Do we have a remaining full block?
 3139 
 3140         __ mov(used, 0);
 3141         __ cmp(len, block_size);
 3142         __ br(__ LO, NEXT);
 3143 
 3144         // Yes, we have a full block
 3145         __ ldrq(v1, Address(in, offset));
 3146         __ eor(v1, __ T16B, v1, v0);
 3147         __ strq(v1, Address(out, offset));
 3148         __ mov(used, block_size);
 3149         __ add(offset, offset, block_size);
 3150 
 3151         __ subw(len, len, block_size);
 3152         __ cbzw(len, DONE);
 3153 
 3154         // Increment the counter, store it back
 3155         __ orr(v0, __ T16B, v16, v16);
 3156         __ rev64(v16, __ T16B, v16);
 3157         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3158         __ rev64(v16, __ T16B, v16);
 3159         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3160 
 3161         __ b(inner_loop);
 3162       }
 3163 
 3164       __ BIND(NEXT);
 3165 
 3166       // Encrypt a single byte, and loop.
 3167       // We expect this to be a rare event.
 3168       __ ldrb(rscratch1, Address(in, offset));
 3169       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3170       __ eor(rscratch1, rscratch1, rscratch2);
 3171       __ strb(rscratch1, Address(out, offset));
 3172       __ add(offset, offset, 1);
 3173       __ add(used, used, 1);
3174       __ subw(len, len, 1);
 3175       __ cbnzw(len, L_CTR_loop);
 3176     }
 3177 
 3178     __ bind(DONE);
 3179     __ strw(used, Address(used_ptr));
 3180     __ mov(r0, saved_len);
 3181 
 3182     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3183     __ ret(lr);
 3184 
 3185     // Bulk encryption
 3186 
3187     __ BIND(CTR_large_block);
 3188     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3189 
 3190     if (bulk_width == 8) {
 3191       __ sub(sp, sp, 4 * 16);
 3192       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3193     }
 3194     __ sub(sp, sp, 4 * 16);
 3195     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3196     RegSet saved_regs = (RegSet::of(in, out, offset)
 3197                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3198     __ push(saved_regs, sp);
 3199     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3200     __ add(in, in, offset);
 3201     __ add(out, out, offset);
 3202 
 3203     // Keys should already be loaded into the correct registers
 3204 
 3205     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3206     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3207 
 3208     // AES/CTR loop
 3209     {
 3210       Label L_CTR_loop;
 3211       __ BIND(L_CTR_loop);
 3212 
 3213       // Setup the counters
 3214       __ movi(v8, __ T4S, 0);
 3215       __ movi(v9, __ T4S, 1);
 3216       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3217 
 3218       for (int i = 0; i < bulk_width; i++) {
 3219         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3220         __ rev64(v0_ofs, __ T16B, v16);
 3221         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3222       }
 3223 
 3224       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3225 
 3226       // Encrypt the counters
 3227       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3228 
 3229       if (bulk_width == 8) {
 3230         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3231       }
 3232 
 3233       // XOR the encrypted counters with the inputs
 3234       for (int i = 0; i < bulk_width; i++) {
 3235         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3236         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3237         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3238       }
 3239 
 3240       // Write the encrypted data
 3241       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3242       if (bulk_width == 8) {
 3243         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3244       }
 3245 
 3246       __ subw(len, len, 16 * bulk_width);
 3247       __ cbnzw(len, L_CTR_loop);
 3248     }
 3249 
 3250     // Save the counter back where it goes
 3251     __ rev64(v16, __ T16B, v16);
 3252     __ st1(v16, __ T16B, counter);
 3253 
 3254     __ pop(saved_regs, sp);
 3255 
 3256     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3257     if (bulk_width == 8) {
 3258       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3259     }
 3260 
 3261     __ andr(rscratch1, len, -16 * bulk_width);
 3262     __ sub(len, len, rscratch1);
 3263     __ add(offset, offset, rscratch1);
 3264     __ mov(used, 16);
 3265     __ strw(used, Address(used_ptr));
 3266     __ b(large_block_return);
 3267 
 3268     return start;
 3269   }
 3270 
 3271   // Vector AES Galois Counter Mode implementation. Parameters:
 3272   //
 3273   // in = c_rarg0
 3274   // len = c_rarg1
 3275   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3276   // out = c_rarg3
 3277   // key = c_rarg4
 3278   // state = c_rarg5 - GHASH.state
 3279   // subkeyHtbl = c_rarg6 - powers of H
 3280   // counter = c_rarg7 - 16 bytes of CTR
 3281   // return - number of processed bytes
 3282   address generate_galoisCounterMode_AESCrypt() {
 3283     address ghash_polynomial = __ pc();
 3284     __ emit_int64(0x87);  // The low-order bits of the field
 3285                           // polynomial (i.e. p = z^7+z^2+z+1)
 3286                           // repeated in the low and high parts of a
 3287                           // 128-bit vector
 3288     __ emit_int64(0x87);
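
         // For reference, a minimal scalar sketch of why 0x87 works
         // (illustrative only: it shows a times-x step in GF(2^128) with
         // p(x) = x^128 + x^7 + x^2 + x + 1, ignoring the bit-reflected
         // convention the vector ghash code actually uses):
         //
         //   void gf128_mulx(uint64_t& hi, uint64_t& lo) {
         //     uint64_t carry = hi >> 63;            // bit falling off x^127
         //     hi = (hi << 1) | (lo >> 63);
         //     lo = (lo << 1) ^ (carry ? 0x87 : 0);  // fold x^128 back in as x^7+x^2+x+1
         //   }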
 3289 
 3290     __ align(CodeEntryAlignment);
 3291     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3292     StubCodeMark mark(this, stub_id);
 3293     address start = __ pc();
 3294     __ enter();
 3295 
 3296     const Register in = c_rarg0;
 3297     const Register len = c_rarg1;
 3298     const Register ct = c_rarg2;
 3299     const Register out = c_rarg3;
 3301 
 3302     const Register key = c_rarg4;
 3303     const Register state = c_rarg5;
 3304 
 3305     const Register subkeyHtbl = c_rarg6;
 3306 
3307     const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the incremented counter at the end
 3308 
 3309     const Register keylen = r10;
 3310     // Save state before entering routine
 3311     __ sub(sp, sp, 4 * 16);
 3312     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3313     __ sub(sp, sp, 4 * 16);
 3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3315 
 3317     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3318     __ str(len, __ pre(sp, -2 * wordSize));
 3319 
 3320     Label DONE;
 3321     __ cbz(len, DONE);
 3322 
 3323     // Compute #rounds for AES based on the length of the key array
 3324     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3325 
 3326     __ aesenc_loadkeys(key, keylen);
 3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3328     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3329 
 3330     // AES/CTR loop
 3331     {
 3332       Label L_CTR_loop;
 3333       __ BIND(L_CTR_loop);
 3334 
 3335       // Setup the counters
 3336       __ movi(v8, __ T4S, 0);
 3337       __ movi(v9, __ T4S, 1);
 3338       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
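           // GCM increments only the low 32 bits of the counter (inc32).
           // After the rev32 byte swap that counter is lane 3, so the T4S
           // add below bumps it without carrying into the rest of the block.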
 3339 
3340       assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
 3341       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3342         FloatRegister f = as_FloatRegister(i);
 3343         __ rev32(f, __ T16B, v16);
 3344         __ addv(v16, __ T4S, v16, v8);
 3345       }
 3346 
 3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3348 
 3349       // Encrypt the counters
 3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3351 
 3352       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3353 
 3354       // XOR the encrypted counters with the inputs
 3355       for (int i = 0; i < 8; i++) {
 3356         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3357         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3358         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3359       }
 3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3361       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3362 
 3363       __ subw(len, len, 16 * 8);
 3364       __ cbnzw(len, L_CTR_loop);
 3365     }
 3366 
 3367     __ rev32(v16, __ T16B, v16);
 3368     __ st1(v16, __ T16B, counter);
 3369 
 3370     __ ldr(len, Address(sp));
 3371     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3372 
 3373     // GHASH/CTR loop
 3374     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3375                                 len, /*unrolls*/4);
 3376 
 3377 #ifdef ASSERT
 3378     { Label L;
 3379       __ cmp(len, (unsigned char)0);
 3380       __ br(Assembler::EQ, L);
 3381       __ stop("stubGenerator: abort");
 3382       __ bind(L);
3383     }
 3384 #endif
 3385 
3386     __ bind(DONE);
 3387     // Return the number of bytes processed
 3388     __ ldr(r0, __ post(sp, 2 * wordSize));
 3389 
 3390     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3391     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3392 
 3393     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3394     __ ret(lr);
3395     return start;
 3396   }
 3397 
 3398   class Cached64Bytes {
 3399   private:
 3400     MacroAssembler *_masm;
 3401     Register _regs[8];
 3402 
 3403   public:
 3404     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3405       assert(rs.size() == 8, "need 8 registers to cache 64 bytes (16 4-byte words), got %u", rs.size());
 3406       auto it = rs.begin();
 3407       for (auto &r: _regs) {
 3408         r = *it;
 3409         ++it;
 3410       }
 3411     }
 3412 
 3413     void gen_loads(Register base) {
 3414       for (int i = 0; i < 8; i += 2) {
 3415         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3416       }
 3417     }
 3418 
 3419     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
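         // Word i lives in _regs[i / 2]; odd-numbered words occupy the high
         // 32 bits, e.g. extract_u32(dest, 5) reads bits 32..63 of _regs[2].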
 3420     void extract_u32(Register dest, int i) {
 3421       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3422     }
 3423   };
 3424 
3425   // Utility routines for MD5.
3426   // Each of these clobbers rscratch1, rscratch2, r10 and r11.
 3427   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3428               int k, int s, int t) {
 3429     Register rscratch3 = r10;
 3430     Register rscratch4 = r11;
 3431 
 3432     __ eorw(rscratch3, r3, r4);
 3433     __ movw(rscratch2, t);
 3434     __ andw(rscratch3, rscratch3, r2);
 3435     __ addw(rscratch4, r1, rscratch2);
 3436     reg_cache.extract_u32(rscratch1, k);
 3437     __ eorw(rscratch3, rscratch3, r4);
 3438     __ addw(rscratch4, rscratch4, rscratch1);
 3439     __ addw(rscratch3, rscratch3, rscratch4);
 3440     __ rorw(rscratch2, rscratch3, 32 - s);
 3441     __ addw(r1, rscratch2, r2);
 3442   }
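
       // For reference, one FF step from RFC 1321, which the instruction
       // scheduling above interleaves (a minimal scalar sketch; rotl32 is
       // a hypothetical 32-bit rotate-left helper):
       //
       //   uint32_t f = (b & c) | (~b & d);     // computed here as ((c ^ d) & b) ^ d
       //   a = b + rotl32(a + f + x[k] + t, s);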
 3443 
 3444   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3445               int k, int s, int t) {
 3446     Register rscratch3 = r10;
 3447     Register rscratch4 = r11;
 3448 
 3449     reg_cache.extract_u32(rscratch1, k);
 3450     __ movw(rscratch2, t);
 3451     __ addw(rscratch4, r1, rscratch2);
 3452     __ addw(rscratch4, rscratch4, rscratch1);
 3453     __ bicw(rscratch2, r3, r4);
 3454     __ andw(rscratch3, r2, r4);
 3455     __ addw(rscratch2, rscratch2, rscratch4);
 3456     __ addw(rscratch2, rscratch2, rscratch3);
 3457     __ rorw(rscratch2, rscratch2, 32 - s);
 3458     __ addw(r1, rscratch2, r2);
 3459   }
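
       // Note: (b & d) and (c & ~d) are bitwise disjoint, so the two addws
       // that fold in rscratch3 and rscratch2 compute G = (b & d) | (c & ~d)
       // without needing an orr.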
 3460 
 3461   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3462               int k, int s, int t) {
 3463     Register rscratch3 = r10;
 3464     Register rscratch4 = r11;
 3465 
 3466     __ eorw(rscratch3, r3, r4);
 3467     __ movw(rscratch2, t);
 3468     __ addw(rscratch4, r1, rscratch2);
 3469     reg_cache.extract_u32(rscratch1, k);
 3470     __ eorw(rscratch3, rscratch3, r2);
 3471     __ addw(rscratch4, rscratch4, rscratch1);
 3472     __ addw(rscratch3, rscratch3, rscratch4);
 3473     __ rorw(rscratch2, rscratch3, 32 - s);
 3474     __ addw(r1, rscratch2, r2);
 3475   }
 3476 
 3477   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3478               int k, int s, int t) {
 3479     Register rscratch3 = r10;
 3480     Register rscratch4 = r11;
 3481 
 3482     __ movw(rscratch3, t);
 3483     __ ornw(rscratch2, r2, r4);
 3484     __ addw(rscratch4, r1, rscratch3);
 3485     reg_cache.extract_u32(rscratch1, k);
 3486     __ eorw(rscratch3, rscratch2, r3);
 3487     __ addw(rscratch4, rscratch4, rscratch1);
 3488     __ addw(rscratch3, rscratch3, rscratch4);
 3489     __ rorw(rscratch2, rscratch3, 32 - s);
 3490     __ addw(r1, rscratch2, r2);
 3491   }
 3492 
 3493   // Arguments:
 3494   //
 3495   // Inputs:
 3496   //   c_rarg0   - byte[]  source+offset
 3497   //   c_rarg1   - int[]   SHA.state
 3498   //   c_rarg2   - int     offset
 3499   //   c_rarg3   - int     limit
 3500   //
 3501   address generate_md5_implCompress(StubGenStubId stub_id) {
 3502     bool multi_block;
 3503     switch (stub_id) {
 3504     case md5_implCompress_id:
 3505       multi_block = false;
 3506       break;
 3507     case md5_implCompressMB_id:
 3508       multi_block = true;
 3509       break;
 3510     default:
 3511       ShouldNotReachHere();
 3512     }
 3513     __ align(CodeEntryAlignment);
 3514 
 3515     StubCodeMark mark(this, stub_id);
 3516     address start = __ pc();
 3517 
 3518     Register buf       = c_rarg0;
 3519     Register state     = c_rarg1;
 3520     Register ofs       = c_rarg2;
 3521     Register limit     = c_rarg3;
 3522     Register a         = r4;
 3523     Register b         = r5;
 3524     Register c         = r6;
 3525     Register d         = r7;
 3526     Register rscratch3 = r10;
 3527     Register rscratch4 = r11;
 3528 
 3529     Register state_regs[2] = { r12, r13 };
 3530     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3531     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3532 
 3533     __ push(saved_regs, sp);
 3534 
 3535     __ ldp(state_regs[0], state_regs[1], Address(state));
 3536     __ ubfx(a, state_regs[0],  0, 32);
 3537     __ ubfx(b, state_regs[0], 32, 32);
 3538     __ ubfx(c, state_regs[1],  0, 32);
 3539     __ ubfx(d, state_regs[1], 32, 32);
 3540 
 3541     Label md5_loop;
 3542     __ BIND(md5_loop);
 3543 
 3544     reg_cache.gen_loads(buf);
 3545 
 3546     // Round 1
 3547     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3548     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3549     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3550     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3551     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3552     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3553     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3554     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3555     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3556     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3557     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3558     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3559     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3560     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3561     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3562     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3563 
 3564     // Round 2
 3565     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3566     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3567     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3568     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3569     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3570     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3571     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3572     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3573     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3574     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3575     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3576     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3577     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3578     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3579     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3580     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3581 
 3582     // Round 3
 3583     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3584     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3585     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3586     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3587     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3588     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3589     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3590     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3591     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3592     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3593     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3594     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3595     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3596     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3597     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3598     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3599 
 3600     // Round 4
 3601     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3602     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3603     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3604     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3605     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3606     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3607     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3608     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3609     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3610     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3611     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3612     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3613     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3614     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3615     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3616     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3617 
 3618     __ addw(a, state_regs[0], a);
 3619     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3620     __ addw(b, rscratch2, b);
 3621     __ addw(c, state_regs[1], c);
 3622     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3623     __ addw(d, rscratch4, d);
 3624 
 3625     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3626     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3627 
 3628     if (multi_block) {
 3629       __ add(buf, buf, 64);
 3630       __ add(ofs, ofs, 64);
 3631       __ cmp(ofs, limit);
 3632       __ br(Assembler::LE, md5_loop);
 3633       __ mov(c_rarg0, ofs); // return ofs
 3634     }
 3635 
 3636     // write hash values back in the correct order
 3637     __ stp(state_regs[0], state_regs[1], Address(state));
 3638 
 3639     __ pop(saved_regs, sp);
 3640 
 3641     __ ret(lr);
 3642 
 3643     return start;
 3644   }
 3645 
 3646   // Arguments:
 3647   //
 3648   // Inputs:
 3649   //   c_rarg0   - byte[]  source+offset
 3650   //   c_rarg1   - int[]   SHA.state
 3651   //   c_rarg2   - int     offset
 3652   //   c_rarg3   - int     limit
 3653   //
 3654   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3655     bool multi_block;
 3656     switch (stub_id) {
 3657     case sha1_implCompress_id:
 3658       multi_block = false;
 3659       break;
 3660     case sha1_implCompressMB_id:
 3661       multi_block = true;
 3662       break;
 3663     default:
 3664       ShouldNotReachHere();
 3665     }
 3666 
 3667     __ align(CodeEntryAlignment);
 3668 
 3669     StubCodeMark mark(this, stub_id);
 3670     address start = __ pc();
 3671 
 3672     Register buf   = c_rarg0;
 3673     Register state = c_rarg1;
 3674     Register ofs   = c_rarg2;
 3675     Register limit = c_rarg3;
 3676 
 3677     Label keys;
 3678     Label sha1_loop;
 3679 
3680     // load the four round constants into v0..v3 (ld4r replicates each across all lanes)
 3681     __ adr(rscratch1, keys);
 3682     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3683     // load the 5-word state into v6, v7
3684     __ ldrq(v6, Address(state, 0));
3685     __ ldrs(v7, Address(state, 16));
3686
 3688     __ BIND(sha1_loop);
 3689     // load 64 bytes of data into v16..v19
 3690     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3691     __ rev32(v16, __ T16B, v16);
 3692     __ rev32(v17, __ T16B, v17);
 3693     __ rev32(v18, __ T16B, v18);
 3694     __ rev32(v19, __ T16B, v19);
 3695 
 3696     // do the sha1
 3697     __ addv(v4, __ T4S, v16, v0);
 3698     __ orr(v20, __ T16B, v6, v6);
 3699 
 3700     FloatRegister d0 = v16;
 3701     FloatRegister d1 = v17;
 3702     FloatRegister d2 = v18;
 3703     FloatRegister d3 = v19;
 3704 
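         // 80 SHA-1 rounds as 20 iterations of 4. sha1c/sha1p/sha1m apply
         // the Ch/Parity/Maj functions of the four 20-round groups; the
         // addv at the top of iteration r pre-computes w + k for iteration
         // r + 1, which is why the key vector switches at rounds 4, 9 and 14.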
 3705     for (int round = 0; round < 20; round++) {
 3706       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3707       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3708       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3709       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3710       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3711 
 3712       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3713       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3714       __ sha1h(tmp2, __ T4S, v20);
 3715       if (round < 5)
 3716         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3717       else if (round < 10 || round >= 15)
 3718         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3719       else
 3720         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3721       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3722 
 3723       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3724     }
 3725 
 3726     __ addv(v7, __ T2S, v7, v21);
 3727     __ addv(v6, __ T4S, v6, v20);
 3728 
 3729     if (multi_block) {
 3730       __ add(ofs, ofs, 64);
 3731       __ cmp(ofs, limit);
 3732       __ br(Assembler::LE, sha1_loop);
 3733       __ mov(c_rarg0, ofs); // return ofs
 3734     }
 3735 
 3736     __ strq(v6, Address(state, 0));
 3737     __ strs(v7, Address(state, 16));
 3738 
 3739     __ ret(lr);
 3740 
 3741     __ bind(keys);
 3742     __ emit_int32(0x5a827999);
 3743     __ emit_int32(0x6ed9eba1);
 3744     __ emit_int32(0x8f1bbcdc);
 3745     __ emit_int32(0xca62c1d6);
 3746 
 3747     return start;
 3748   }
 3749 
 3750 
 3751   // Arguments:
 3752   //
 3753   // Inputs:
 3754   //   c_rarg0   - byte[]  source+offset
 3755   //   c_rarg1   - int[]   SHA.state
 3756   //   c_rarg2   - int     offset
 3757   //   c_rarg3   - int     limit
 3758   //
 3759   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3760     bool multi_block;
 3761     switch (stub_id) {
 3762     case sha256_implCompress_id:
 3763       multi_block = false;
 3764       break;
 3765     case sha256_implCompressMB_id:
 3766       multi_block = true;
 3767       break;
 3768     default:
 3769       ShouldNotReachHere();
 3770     }
 3771 
 3772     static const uint32_t round_consts[64] = {
 3773       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3774       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3775       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3776       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3777       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3778       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3779       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3780       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3781       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3782       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3783       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3784       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3785       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3786       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3787       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3788       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3789     };
 3790 
 3791     __ align(CodeEntryAlignment);
 3792 
 3793     StubCodeMark mark(this, stub_id);
 3794     address start = __ pc();
 3795 
 3796     Register buf   = c_rarg0;
 3797     Register state = c_rarg1;
 3798     Register ofs   = c_rarg2;
 3799     Register limit = c_rarg3;
 3800 
3801     Label sha256_loop;
 3802 
 3803     __ stpd(v8, v9, __ pre(sp, -32));
 3804     __ stpd(v10, v11, Address(sp, 16));
 3805 
3806     // dga == v0
3807     // dgb == v1
3808     // dg0 == v2
3809     // dg1 == v3
3810     // dg2 == v4
3811     // t0 == v6
3812     // t1 == v7
 3813 
3814     // load the 64 round constants into v16..v31 (four per vector)
 3815     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3816     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3817     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3818     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3819     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3820 
3821     // load the 8-word (256-bit) state
 3822     __ ldpq(v0, v1, state);
 3823 
3824     __ BIND(sha256_loop);
 3825     // load 64 bytes of data into v8..v11
 3826     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3827     __ rev32(v8, __ T16B, v8);
 3828     __ rev32(v9, __ T16B, v9);
 3829     __ rev32(v10, __ T16B, v10);
 3830     __ rev32(v11, __ T16B, v11);
 3831 
 3832     __ addv(v6, __ T4S, v8, v16);
 3833     __ orr(v2, __ T16B, v0, v0);
 3834     __ orr(v3, __ T16B, v1, v1);
 3835 
 3836     FloatRegister d0 = v8;
 3837     FloatRegister d1 = v9;
 3838     FloatRegister d2 = v10;
 3839     FloatRegister d3 = v11;
 3840 
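         // 64 SHA-256 rounds as 16 iterations of 4. v16..v31 hold the round
         // constants; the addv pre-computes w + k for the next iteration
         // (hence round + 17), and sha256su0/su1 extend the message schedule
         // only while later words are still needed (round < 12).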
 3842     for (int round = 0; round < 16; round++) {
 3843       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3844       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3845       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3846       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3847 
 3848       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3849       __ orr(v4, __ T16B, v2, v2);
 3850       if (round < 15)
 3851         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3852       __ sha256h(v2, __ T4S, v3, tmp2);
 3853       __ sha256h2(v3, __ T4S, v4, tmp2);
 3854       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3855 
 3856       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3857     }
 3858 
 3859     __ addv(v0, __ T4S, v0, v2);
 3860     __ addv(v1, __ T4S, v1, v3);
 3861 
 3862     if (multi_block) {
 3863       __ add(ofs, ofs, 64);
 3864       __ cmp(ofs, limit);
3865       __ br(Assembler::LE, sha256_loop);
 3866       __ mov(c_rarg0, ofs); // return ofs
 3867     }
 3868 
 3869     __ ldpd(v10, v11, Address(sp, 16));
 3870     __ ldpd(v8, v9, __ post(sp, 32));
 3871 
 3872     __ stpq(v0, v1, state);
 3873 
 3874     __ ret(lr);
 3875 
 3876     return start;
 3877   }
 3878 
3879   // Double rounds for sha512: each call processes two of the 80
       // SHA-512 rounds, so one block takes 40 calls.
3880   void sha512_dround(int dr,
 3881                      FloatRegister vi0, FloatRegister vi1,
 3882                      FloatRegister vi2, FloatRegister vi3,
 3883                      FloatRegister vi4, FloatRegister vrc0,
 3884                      FloatRegister vrc1, FloatRegister vin0,
 3885                      FloatRegister vin1, FloatRegister vin2,
 3886                      FloatRegister vin3, FloatRegister vin4) {
 3887       if (dr < 36) {
 3888         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3889       }
 3890       __ addv(v5, __ T2D, vrc0, vin0);
 3891       __ ext(v6, __ T16B, vi2, vi3, 8);
 3892       __ ext(v5, __ T16B, v5, v5, 8);
 3893       __ ext(v7, __ T16B, vi1, vi2, 8);
 3894       __ addv(vi3, __ T2D, vi3, v5);
 3895       if (dr < 32) {
 3896         __ ext(v5, __ T16B, vin3, vin4, 8);
 3897         __ sha512su0(vin0, __ T2D, vin1);
 3898       }
 3899       __ sha512h(vi3, __ T2D, v6, v7);
 3900       if (dr < 32) {
 3901         __ sha512su1(vin0, __ T2D, vin2, v5);
 3902       }
 3903       __ addv(vi4, __ T2D, vi1, vi3);
 3904       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3905   }
 3906 
 3907   // Arguments:
 3908   //
 3909   // Inputs:
 3910   //   c_rarg0   - byte[]  source+offset
 3911   //   c_rarg1   - int[]   SHA.state
 3912   //   c_rarg2   - int     offset
 3913   //   c_rarg3   - int     limit
 3914   //
 3915   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3916     bool multi_block;
 3917     switch (stub_id) {
 3918     case sha512_implCompress_id:
 3919       multi_block = false;
 3920       break;
 3921     case sha512_implCompressMB_id:
 3922       multi_block = true;
 3923       break;
 3924     default:
 3925       ShouldNotReachHere();
 3926     }
 3927 
 3928     static const uint64_t round_consts[80] = {
 3929       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3930       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3931       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3932       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3933       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3934       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3935       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3936       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3937       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3938       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3939       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3940       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3941       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3942       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3943       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3944       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3945       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3946       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3947       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3948       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3949       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3950       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3951       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3952       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3953       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3954       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3955       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3956     };
 3957 
 3958     __ align(CodeEntryAlignment);
 3959 
 3960     StubCodeMark mark(this, stub_id);
 3961     address start = __ pc();
 3962 
 3963     Register buf   = c_rarg0;
 3964     Register state = c_rarg1;
 3965     Register ofs   = c_rarg2;
 3966     Register limit = c_rarg3;
 3967 
 3968     __ stpd(v8, v9, __ pre(sp, -64));
 3969     __ stpd(v10, v11, Address(sp, 16));
 3970     __ stpd(v12, v13, Address(sp, 32));
 3971     __ stpd(v14, v15, Address(sp, 48));
 3972 
 3973     Label sha512_loop;
 3974 
 3975     // load state
 3976     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3977 
 3978     // load first 4 round constants
 3979     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3980     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 3981 
 3982     __ BIND(sha512_loop);
 3983     // load 128B of data into v12..v19
 3984     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 3985     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 3986     __ rev64(v12, __ T16B, v12);
 3987     __ rev64(v13, __ T16B, v13);
 3988     __ rev64(v14, __ T16B, v14);
 3989     __ rev64(v15, __ T16B, v15);
 3990     __ rev64(v16, __ T16B, v16);
 3991     __ rev64(v17, __ T16B, v17);
 3992     __ rev64(v18, __ T16B, v18);
 3993     __ rev64(v19, __ T16B, v19);
 3994 
 3995     __ mov(rscratch2, rscratch1);
 3996 
 3997     __ mov(v0, __ T16B, v8);
 3998     __ mov(v1, __ T16B, v9);
 3999     __ mov(v2, __ T16B, v10);
 4000     __ mov(v3, __ T16B, v11);
 4001 
 4002     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4003     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4004     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4005     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4006     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4007     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4008     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4009     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4010     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4011     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4012     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4013     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4014     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4015     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4016     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4017     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4018     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4019     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4020     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4021     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4022     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4023     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4024     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4025     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4026     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4027     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4028     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4029     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4030     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4031     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4032     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4033     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4034     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4035     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4036     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4037     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4038     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4039     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4040     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4041     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4042 
 4043     __ addv(v8, __ T2D, v8, v0);
 4044     __ addv(v9, __ T2D, v9, v1);
 4045     __ addv(v10, __ T2D, v10, v2);
 4046     __ addv(v11, __ T2D, v11, v3);
 4047 
 4048     if (multi_block) {
 4049       __ add(ofs, ofs, 128);
 4050       __ cmp(ofs, limit);
 4051       __ br(Assembler::LE, sha512_loop);
 4052       __ mov(c_rarg0, ofs); // return ofs
 4053     }
 4054 
 4055     __ st1(v8, v9, v10, v11, __ T2D, state);
 4056 
 4057     __ ldpd(v14, v15, Address(sp, 48));
 4058     __ ldpd(v12, v13, Address(sp, 32));
 4059     __ ldpd(v10, v11, Address(sp, 16));
 4060     __ ldpd(v8, v9, __ post(sp, 64));
 4061 
 4062     __ ret(lr);
 4063 
 4064     return start;
 4065   }
 4066 
 4067   // Execute one round of keccak of two computations in parallel.
 4068   // One of the states should be loaded into the lower halves of
 4069   // the vector registers v0-v24, the other should be loaded into
 4070   // the upper halves of those registers. The ld1r instruction loads
 4071   // the round constant into both halves of register v31.
 4072   // Intermediate results c0...c5 and d0...d5 are computed
 4073   // in registers v25...v30.
 4074   // All vector instructions that are used operate on both register
 4075   // halves in parallel.
 4076   // If only a single computation is needed, one can only load the lower halves.
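       // Below, the eor3/rax1 group implements theta, the xar group folds
       // rho and pi together, the bcax groups implement chi, and the final
       // eor applies iota (the round constant).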
 4077   void keccak_round(Register rscratch1) {
 4078   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
4079   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
4080   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4081   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4082   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4083   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4084   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4085   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4086   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4087   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4088 
 4089   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4090   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4091   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4092   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4093   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4094 
 4095   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4096   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
4097   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4098   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4099   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4100   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4101   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4102   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4103   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4104   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4105   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4106   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4107   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4108   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4109   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4110   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4111   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4112   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4113   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4114   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4115   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4116   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4117   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4118   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4119   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4120 
4121   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4122   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4123   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4124   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4125   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4126 
 4127   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4128 
 4129   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
  __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15)
 4131   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4132   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4133   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4134 
 4135   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4136   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4137   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4138   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4139   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4140 
 4141   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4142   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4143   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4144   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4145   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4146 
 4147   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4148   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4149   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4150   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4151   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4152 
 4153   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4154   }
 4155 
 4156   // Arguments:
 4157   //
 4158   // Inputs:
 4159   //   c_rarg0   - byte[]  source+offset
 4160   //   c_rarg1   - byte[]  SHA.state
 4161   //   c_rarg2   - int     block_size
 4162   //   c_rarg3   - int     offset
 4163   //   c_rarg4   - int     limit
 4164   //
 4165   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4166     bool multi_block;
 4167     switch (stub_id) {
 4168     case sha3_implCompress_id:
 4169       multi_block = false;
 4170       break;
 4171     case sha3_implCompressMB_id:
 4172       multi_block = true;
 4173       break;
 4174     default:
 4175       ShouldNotReachHere();
 4176     }
 4177 
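    // The 24 round constants of the Keccak-f[1600] permutation
    // (FIPS 202), consumed one per round by the iota step.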
 4178     static const uint64_t round_consts[24] = {
 4179       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4180       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4181       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4182       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4183       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4184       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4185       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4186       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4187     };
 4188 
 4189     __ align(CodeEntryAlignment);
 4190 
 4191     StubCodeMark mark(this, stub_id);
 4192     address start = __ pc();
 4193 
 4194     Register buf           = c_rarg0;
 4195     Register state         = c_rarg1;
 4196     Register block_size    = c_rarg2;
 4197     Register ofs           = c_rarg3;
 4198     Register limit         = c_rarg4;
 4199 
 4200     Label sha3_loop, rounds24_loop;
 4201     Label sha3_512_or_sha3_384, shake128;
 4202 
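    // save callee-saved registers v8 - v15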
 4203     __ stpd(v8, v9, __ pre(sp, -64));
 4204     __ stpd(v10, v11, Address(sp, 16));
 4205     __ stpd(v12, v13, Address(sp, 32));
 4206     __ stpd(v14, v15, Address(sp, 48));
 4207 
 4208     // load state
 4209     __ add(rscratch1, state, 32);
 4210     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4211     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4212     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4213     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4214     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4215     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4216     __ ld1(v24, __ T1D, rscratch1);
 4217 
 4218     __ BIND(sha3_loop);
 4219 
 4220     // 24 keccak rounds
 4221     __ movw(rscratch2, 24);
 4222 
 4223     // load round_constants base
 4224     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4225 
 4226     // load input
 4227     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4228     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4229     __ eor(v0, __ T8B, v0, v25);
 4230     __ eor(v1, __ T8B, v1, v26);
 4231     __ eor(v2, __ T8B, v2, v27);
 4232     __ eor(v3, __ T8B, v3, v28);
 4233     __ eor(v4, __ T8B, v4, v29);
 4234     __ eor(v5, __ T8B, v5, v30);
 4235     __ eor(v6, __ T8B, v6, v31);
 4236 
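    // Dispatch on the remaining rate (block_size, in bytes):
    //   72  = 0b01001000  SHA3-512
    //   104 = 0b01101000  SHA3-384
    //   136 = 0b10001000  SHA3-256 or SHAKE256
    //   144 = 0b10010000  SHA3-224
    //   168 = 0b10101000  SHAKE128
    // Bit 7 separates the two smallest rates from the rest; bits 4 and 5
    // then distinguish 136 (both clear), 144 (bit 4 set) and 168 (bit 5 set).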
 4237     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4238     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4239 
 4240     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4241     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4242     __ eor(v7, __ T8B, v7, v25);
 4243     __ eor(v8, __ T8B, v8, v26);
 4244     __ eor(v9, __ T8B, v9, v27);
 4245     __ eor(v10, __ T8B, v10, v28);
 4246     __ eor(v11, __ T8B, v11, v29);
 4247     __ eor(v12, __ T8B, v12, v30);
 4248     __ eor(v13, __ T8B, v13, v31);
 4249 
 4250     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4251     __ eor(v14, __ T8B, v14, v25);
 4252     __ eor(v15, __ T8B, v15, v26);
 4253     __ eor(v16, __ T8B, v16, v27);
 4254 
 4255     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4256     __ andw(c_rarg5, block_size, 48);
 4257     __ cbzw(c_rarg5, rounds24_loop);
 4258 
 4259     __ tbnz(block_size, 5, shake128);
 4260     // block_size == 144, bit5 == 0, SHA3-224
 4261     __ ldrd(v28, __ post(buf, 8));
 4262     __ eor(v17, __ T8B, v17, v28);
 4263     __ b(rounds24_loop);
 4264 
 4265     __ BIND(shake128);
 4266     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4267     __ eor(v17, __ T8B, v17, v28);
 4268     __ eor(v18, __ T8B, v18, v29);
 4269     __ eor(v19, __ T8B, v19, v30);
 4270     __ eor(v20, __ T8B, v20, v31);
 4271     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4272 
 4273     __ BIND(sha3_512_or_sha3_384);
 4274     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4275     __ eor(v7, __ T8B, v7, v25);
 4276     __ eor(v8, __ T8B, v8, v26);
 4277     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4278 
 4279     // SHA3-384
 4280     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4281     __ eor(v9,  __ T8B, v9,  v27);
 4282     __ eor(v10, __ T8B, v10, v28);
 4283     __ eor(v11, __ T8B, v11, v29);
 4284     __ eor(v12, __ T8B, v12, v30);
 4285 
 4286     __ BIND(rounds24_loop);
 4287     __ subw(rscratch2, rscratch2, 1);
 4288 
 4289     keccak_round(rscratch1);
 4290 
 4291     __ cbnzw(rscratch2, rounds24_loop);
 4292 
 4293     if (multi_block) {
 4294       __ add(ofs, ofs, block_size);
 4295       __ cmp(ofs, limit);
 4296       __ br(Assembler::LE, sha3_loop);
 4297       __ mov(c_rarg0, ofs); // return ofs
 4298     }
 4299 
 4300     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4301     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4302     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4303     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4304     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4305     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4306     __ st1(v24, __ T1D, state);
 4307 
 4308     // restore callee-saved registers
 4309     __ ldpd(v14, v15, Address(sp, 48));
 4310     __ ldpd(v12, v13, Address(sp, 32));
 4311     __ ldpd(v10, v11, Address(sp, 16));
 4312     __ ldpd(v8, v9, __ post(sp, 64));
 4313 
 4314     __ ret(lr);
 4315 
 4316     return start;
 4317   }
 4318 
 4319   // Inputs:
 4320   //   c_rarg0   - long[]  state0
 4321   //   c_rarg1   - long[]  state1
 4322   address generate_double_keccak() {
 4323     static const uint64_t round_consts[24] = {
 4324       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4325       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4326       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4327       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4328       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4329       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4330       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4331       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4332     };
 4333 
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 4336     __ align(CodeEntryAlignment);
 4337     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4338     address start = __ pc();
 4339     __ enter();
 4340 
 4341     Register state0        = c_rarg0;
 4342     Register state1        = c_rarg1;
 4343 
 4344     Label rounds24_loop;
 4345 
 4346     // save callee-saved registers
 4347     __ stpd(v8, v9, __ pre(sp, -64));
 4348     __ stpd(v10, v11, Address(sp, 16));
 4349     __ stpd(v12, v13, Address(sp, 32));
 4350     __ stpd(v14, v15, Address(sp, 48));
 4351 
 4352     // load states
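    // Each ld4 below loads four consecutive 64-bit words into the given
    // lane (0 for state0, 1 for state1) of four vector registers, so every
    // 2D register ends up holding corresponding words of both states and a
    // single keccak_round pass advances the two computations in parallel.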
 4353     __ add(rscratch1, state0, 32);
 4354     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4355     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4356     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4357     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4358     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4359     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4360     __ ld1(v24, __ D, 0, rscratch1);
 4361     __ add(rscratch1, state1, 32);
 4362     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4363     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4364     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4365     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4366     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4367     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4368     __ ld1(v24, __ D, 1, rscratch1);
 4369 
 4370     // 24 keccak rounds
 4371     __ movw(rscratch2, 24);
 4372 
 4373     // load round_constants base
 4374     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4375 
 4376     __ BIND(rounds24_loop);
 4377     __ subw(rscratch2, rscratch2, 1);
 4378     keccak_round(rscratch1);
 4379     __ cbnzw(rscratch2, rounds24_loop);
 4380 
 4381     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4382     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4383     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4384     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4385     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4386     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4387     __ st1(v24, __ D, 0, state0);
 4388     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4389     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4390     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4391     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4392     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4393     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4394     __ st1(v24, __ D, 1, state1);
 4395 
 4396     // restore callee-saved vector registers
 4397     __ ldpd(v14, v15, Address(sp, 48));
 4398     __ ldpd(v12, v13, Address(sp, 32));
 4399     __ ldpd(v10, v11, Address(sp, 16));
 4400     __ ldpd(v8, v9, __ post(sp, 64));
 4401 
 4402     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4403     __ mov(r0, zr); // return 0
 4404     __ ret(lr);
 4405 
 4406     return start;
 4407   }
 4408 
 4409   /**
 4410    *  Arguments:
 4411    *
 4412    * Inputs:
 4413    *   c_rarg0   - int crc
 4414    *   c_rarg1   - byte* buf
 4415    *   c_rarg2   - int length
 4416    *
 4417    * Output:
   *       r0    - int crc result
 4419    */
 4420   address generate_updateBytesCRC32() {
 4421     assert(UseCRC32Intrinsics, "what are we doing here?");
 4422 
 4423     __ align(CodeEntryAlignment);
 4424     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4425     StubCodeMark mark(this, stub_id);
 4426 
 4427     address start = __ pc();
 4428 
 4429     const Register crc   = c_rarg0;  // crc
 4430     const Register buf   = c_rarg1;  // source java byte array address
 4431     const Register len   = c_rarg2;  // length
 4432     const Register table0 = c_rarg3; // crc_table address
 4433     const Register table1 = c_rarg4;
 4434     const Register table2 = c_rarg5;
 4435     const Register table3 = c_rarg6;
 4436     const Register tmp3 = c_rarg7;
 4437 
 4438     BLOCK_COMMENT("Entry:");
 4439     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4440 
 4441     __ kernel_crc32(crc, buf, len,
 4442               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4443 
 4444     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4445     __ ret(lr);
 4446 
 4447     return start;
 4448   }
 4449 
 4450   // ChaCha20 block function.  This version parallelizes 4 quarter
 4451   // round operations at a time.  It uses 16 SIMD registers to
 4452   // produce 4 blocks of key stream.
 4453   //
 4454   // state (int[16]) = c_rarg0
 4455   // keystream (byte[256]) = c_rarg1
 4456   // return - number of bytes of keystream (always 256)
 4457   //
 4458   // In this approach, we load the 512-bit start state sequentially into
 4459   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4460   // state, with each successive set of 4 vectors having a +1 added into
 4461   // the first 32-bit lane of the 4th vector in that group (the counter).
 4462   // By doing this, we can perform the block function on 4 512-bit blocks
 4463   // within one run of this intrinsic.
 4464   // The alignment of the data across the 4-vector group is such that at
 4465   // the start it is already aligned for the first round of each two-round
 4466   // loop iteration.  In other words, the corresponding lanes of each vector
 4467   // will contain the values needed for that quarter round operation (e.g.
 4468   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4469   // In between each full round, a lane shift must occur.  Within a loop
 4470   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4471   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4472   // is effectively a diagonal orientation in columnar form.  After the
 4473   // second full round, those registers are left-rotated again, this time
 4474   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4475   // After all 10 iterations, the original state is added to each 4-vector
 4476   // working state along with the add mask, and the 4 vector groups are
 4477   // sequentially written to the memory dedicated for the output key stream.
 4478   //
 4479   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4480   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4481   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
 4482   address generate_chacha20Block_qrpar() {
 4483     Label L_Q_twoRounds, L_Q_cc20_const;
 4484     // The constant data is broken into two 128-bit segments to be loaded
 4485     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4486     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations
    // on 32-bit lanes within a SIMD register.
 4489     __ BIND(L_Q_cc20_const);
 4490     __ emit_int64(0x0000000000000001UL);
 4491     __ emit_int64(0x0000000000000000UL);
 4492     __ emit_int64(0x0605040702010003UL);
 4493     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4494 
 4495     __ align(CodeEntryAlignment);
 4496     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4497     StubCodeMark mark(this, stub_id);
 4498     address start = __ pc();
 4499     __ enter();
 4500 
 4501     const Register state = c_rarg0;
 4502     const Register keystream = c_rarg1;
 4503     const Register loopCtr = r10;
 4504     const Register tmpAddr = r11;
 4505 
 4506     const FloatRegister aState = v0;
 4507     const FloatRegister bState = v1;
 4508     const FloatRegister cState = v2;
 4509     const FloatRegister dState = v3;
 4510     const FloatRegister a1Vec = v4;
 4511     const FloatRegister b1Vec = v5;
 4512     const FloatRegister c1Vec = v6;
 4513     const FloatRegister d1Vec = v7;
 4514     // Skip the callee-saved registers v8 - v15
 4515     const FloatRegister a2Vec = v16;
 4516     const FloatRegister b2Vec = v17;
 4517     const FloatRegister c2Vec = v18;
 4518     const FloatRegister d2Vec = v19;
 4519     const FloatRegister a3Vec = v20;
 4520     const FloatRegister b3Vec = v21;
 4521     const FloatRegister c3Vec = v22;
 4522     const FloatRegister d3Vec = v23;
 4523     const FloatRegister a4Vec = v24;
 4524     const FloatRegister b4Vec = v25;
 4525     const FloatRegister c4Vec = v26;
 4526     const FloatRegister d4Vec = v27;
 4527     const FloatRegister scratch = v28;
 4528     const FloatRegister addMask = v29;
 4529     const FloatRegister lrot8Tbl = v30;
 4530 
 4531     // Load the initial state in the first 4 quadword registers,
 4532     // then copy the initial state into the next 4 quadword registers
 4533     // that will be used for the working state.
 4534     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4535 
 4536     // Load the index register for 2 constant 128-bit data fields.
 4537     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4538     // the 8-bit left rotation.
 4539     __ adr(tmpAddr, L_Q_cc20_const);
 4540     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
 4541 
 4542     __ mov(a1Vec, __ T16B, aState);
 4543     __ mov(b1Vec, __ T16B, bState);
 4544     __ mov(c1Vec, __ T16B, cState);
 4545     __ mov(d1Vec, __ T16B, dState);
 4546 
 4547     __ mov(a2Vec, __ T16B, aState);
 4548     __ mov(b2Vec, __ T16B, bState);
 4549     __ mov(c2Vec, __ T16B, cState);
 4550     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4551 
 4552     __ mov(a3Vec, __ T16B, aState);
 4553     __ mov(b3Vec, __ T16B, bState);
 4554     __ mov(c3Vec, __ T16B, cState);
 4555     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4556 
 4557     __ mov(a4Vec, __ T16B, aState);
 4558     __ mov(b4Vec, __ T16B, bState);
 4559     __ mov(c4Vec, __ T16B, cState);
 4560     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4561 
 4562     // Set up the 10 iteration loop
 4563     __ mov(loopCtr, 10);
 4564     __ BIND(L_Q_twoRounds);
 4565 
 4566     // The first set of operations on the vectors covers the first 4 quarter
 4567     // round operations:
 4568     //  Qround(state, 0, 4, 8,12)
 4569     //  Qround(state, 1, 5, 9,13)
 4570     //  Qround(state, 2, 6,10,14)
 4571     //  Qround(state, 3, 7,11,15)
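    // Each cc20_quarter_round applies the standard ChaCha20 quarter round
    // (RFC 8439) to the corresponding lanes of its four vector arguments:
    //   a += b; d ^= a; d <<<= 16;
    //   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<= 8;   (done via the lrot8Tbl byte shuffle)
    //   c += d; b ^= c; b <<<= 7;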
 4572     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4573     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4574     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4575     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4576 
 4577     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4578     // diagonals. The a1Vec does not need to change orientation.
 4579     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4580     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4581     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4582     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4583 
 4584     // The second set of operations on the vectors covers the second 4 quarter
 4585     // round operations, now acting on the diagonals:
 4586     //  Qround(state, 0, 5,10,15)
 4587     //  Qround(state, 1, 6,11,12)
 4588     //  Qround(state, 2, 7, 8,13)
 4589     //  Qround(state, 3, 4, 9,14)
 4590     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4591     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4592     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4593     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4594 
 4595     // Before we start the next iteration, we need to perform shuffles
 4596     // on the b/c/d vectors to move them back to columnar organizations
 4597     // from their current diagonal orientation.
 4598     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4599     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4600     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4601     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4602 
 4603     // Decrement and iterate
 4604     __ sub(loopCtr, loopCtr, 1);
 4605     __ cbnz(loopCtr, L_Q_twoRounds);
 4606 
 4607     // Once the counter reaches zero, we fall out of the loop
 4608     // and need to add the initial state back into the working state
 4609     // represented by the a/b/c/d1Vec registers.  This is destructive
    // on the dState register, but we will no longer need it.
 4611     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4612     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4613     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4614     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4615 
 4616     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4617     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4618     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4619     __ addv(dState, __ T4S, dState, addMask);
 4620     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4621 
 4622     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4623     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4624     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4625     __ addv(dState, __ T4S, dState, addMask);
 4626     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4627 
 4628     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4629     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4630     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4631     __ addv(dState, __ T4S, dState, addMask);
 4632     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4633 
 4634     // Write the final state back to the result buffer
 4635     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4636     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4637     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4638     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4639 
 4640     __ mov(r0, 256);             // Return length of output keystream
 4641     __ leave();
 4642     __ ret(lr);
 4643 
 4644     return start;
 4645   }
 4646 
 4647   // Helpers to schedule parallel operation bundles across vector
 4648   // register sequences of size 2, 4 or 8.
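  // n.b. a VSeq<N>(base, delta) denotes the N vector registers base,
  // base + delta, ...; the default delta of 1 gives the contiguous run
  // v<base> .. v<base+N-1>, while a delta of 0 replicates one register
  // across the whole sequence (used below to broadcast a constant).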
 4649 
 4650   // Implement various primitive computations across vector sequences
 4651 
 4652   template<int N>
 4653   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4654                const VSeq<N>& v1, const VSeq<N>& v2) {
 4655     for (int i = 0; i < N; i++) {
 4656       __ addv(v[i], T, v1[i], v2[i]);
 4657     }
 4658   }
 4659 
 4660   template<int N>
 4661   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4662                const VSeq<N>& v1, const VSeq<N>& v2) {
 4663     for (int i = 0; i < N; i++) {
 4664       __ subv(v[i], T, v1[i], v2[i]);
 4665     }
 4666   }
 4667 
 4668   template<int N>
 4669   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4670                const VSeq<N>& v1, const VSeq<N>& v2) {
 4671     for (int i = 0; i < N; i++) {
 4672       __ mulv(v[i], T, v1[i], v2[i]);
 4673     }
 4674   }
 4675 
 4676   template<int N>
 4677   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4678     for (int i = 0; i < N; i++) {
 4679       __ negr(v[i], T, v1[i]);
 4680     }
 4681   }
 4682 
 4683   template<int N>
 4684   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4685                const VSeq<N>& v1, int shift) {
 4686     for (int i = 0; i < N; i++) {
 4687       __ sshr(v[i], T, v1[i], shift);
 4688     }
 4689   }
 4690 
 4691   template<int N>
 4692   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4693     for (int i = 0; i < N; i++) {
 4694       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4695     }
 4696   }
 4697 
 4698   template<int N>
 4699   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4700     for (int i = 0; i < N; i++) {
 4701       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4702     }
 4703   }
 4704 
 4705   template<int N>
  void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4707     for (int i = 0; i < N; i++) {
 4708       __ notr(v[i], __ T16B, v1[i]);
 4709     }
 4710   }
 4711 
 4712   // load N/2 successive pairs of quadword values from memory in order
 4713   // into N successive vector registers of the sequence via the
 4714   // address supplied in base.
 4715   template<int N>
 4716   void vs_ldpq(const VSeq<N>& v, Register base) {
 4717     for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], Address(base, 32 * (i / 2)));
 4719     }
 4720   }
 4721 
 4722   // load N/2 successive pairs of quadword values from memory in order
 4723   // into N vector registers of the sequence via the address supplied
 4724   // in base using post-increment addressing
 4725   template<int N>
 4726   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4727     for (int i = 0; i < N; i += 2) {
 4728       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4729     }
 4730   }
 4731 
 4732   // store N successive vector registers of the sequence into N/2
 4733   // successive pairs of quadword memory locations via the address
 4734   // supplied in base using post-increment addressing
 4735   template<int N>
 4736   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4737     for (int i = 0; i < N; i += 2) {
 4738       __ stpq(v[i], v[i+1], __ post(base, 32));
 4739     }
 4740   }
 4741 
 4742   // load N/2 pairs of quadword values from memory into N vector
 4743   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4745   // offsets array
 4746   template<int N>
 4747   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4748     for (int i = 0; i < N/2; i++) {
 4749       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4750     }
 4751   }
 4752 
 4753   // store N vector registers into N/2 pairs of quadword memory
 4754   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4756   // offsets array
 4757   template<int N>
 4758   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4759     for (int i = 0; i < N/2; i++) {
 4760       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4761     }
 4762   }
 4763 
 4764   // load N single quadword values from memory into N vector registers
 4765   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4767   // array
 4768   template<int N>
 4769   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4770                       int start, int (&offsets)[N]) {
 4771     for (int i = 0; i < N; i++) {
 4772       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4773     }
 4774   }
 4775 
 4776   // store N vector registers into N single quadword memory locations
 4777   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4779   // array
 4780   template<int N>
 4781   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4782                       int start, int (&offsets)[N]) {
 4783     for (int i = 0; i < N; i++) {
 4784       __ str(v[i], T, Address(base, start + offsets[i]));
 4785     }
 4786   }
 4787 
 4788   // load N/2 pairs of quadword values from memory de-interleaved into
 4789   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 4791   // corresponding entry in the offsets array
 4792   template<int N>
 4793   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4794                       Register tmp, int start, int (&offsets)[N/2]) {
 4795     for (int i = 0; i < N/2; i++) {
 4796       __ add(tmp, base, start + offsets[i]);
 4797       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4798     }
 4799   }
 4800 
 4801   // store N vector registers 2 at a time interleaved into N/2 pairs
 4802   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 4804   // corresponding entry in the offsets array
 4805   template<int N>
 4806   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4807                       Register tmp, int start, int (&offsets)[N/2]) {
 4808     for (int i = 0; i < N/2; i++) {
 4809       __ add(tmp, base, start + offsets[i]);
 4810       __ st2(v[2*i], v[2*i+1], T, tmp);
 4811     }
 4812   }
 4813 
  // Helper routines for various flavours of Dilithium Montgomery
  // multiply
 4816 
 4817   // Perform 16 32-bit Montgomery multiplications in parallel
 4818   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4819   //
 4820   // Computes 4x4S results
 4821   //    a = b * c * 2^-32 mod MONT_Q
 4822   // Inputs:  vb, vc - 4x4S vector register sequences
  //          vq - 2x4S constants <MONT_Q_INV_MOD_R, MONT_Q>
 4824   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4825   // Outputs: va - 4x4S vector register sequences
 4826   // vb, vc, vtmp and vq must all be disjoint
 4827   // va must be disjoint from all other inputs/temps or must equal vc
 4828   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
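  // In detail, writing R = 2^32, qinv = MONT_Q_INV_MOD_R and q = MONT_Q,
  // the four steps below compute
  //   aHigh = hi32(2 * b * c)          (sqdmulh doubles the product)
  //   aLow  = lo32(b * c)
  //   m     = lo32(aLow * qinv)
  //   n     = hi32(2 * m * q)
  //   a     = (aHigh - n) / 2          (shsubv halves the difference)
  // Since m * q == aLow (mod R), the low halves of b * c and m * q cancel,
  // so a == (b * c - m * q) / R == b * c * R^-1 (mod q), the usual
  // (signed) Montgomery reduction identity.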
 4829   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4830                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4831     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4832     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4833     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4834 
 4835     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4836     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4837 
 4838     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4839 
 4840     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4841     assert(vs_disjoint(va, vb), "va and vb overlap");
 4842     assert(vs_disjoint(va, vq), "va and vq overlap");
 4843     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4844 
 4845     // schedule 4 streams of instructions across the vector sequences
 4846     for (int i = 0; i < 4; i++) {
 4847       __ sqdmulh(vtmp[i], __ T4S, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4848       __ mulv(va[i], __ T4S, vb[i], vc[i]);    // aLow = lo32(b * c)
 4849     }
 4850 
 4851     for (int i = 0; i < 4; i++) {
 4852       __ mulv(va[i], __ T4S, va[i], vq[0]);     // m = aLow * qinv
 4853     }
 4854 
 4855     for (int i = 0; i < 4; i++) {
 4856       __ sqdmulh(va[i], __ T4S, va[i], vq[1]);  // n = hi32(2 * m * q)
 4857     }
 4858 
 4859     for (int i = 0; i < 4; i++) {
 4860       __ shsubv(va[i], __ T4S, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4861     }
 4862   }
 4863 
 4864   // Perform 2x16 32-bit Montgomery multiplications in parallel
 4865   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4866   //
 4867   // Computes 8x4S results
 4868   //    a = b * c * 2^-32 mod MONT_Q
 4869   // Inputs:  vb, vc - 8x4S vector register sequences
  //          vq - 2x4S constants <MONT_Q_INV_MOD_R, MONT_Q>
 4871   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4872   // Outputs: va - 8x4S vector register sequences
 4873   // vb, vc, vtmp and vq must all be disjoint
 4874   // va must be disjoint from all other inputs/temps or must equal vc
 4875   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
 4876   void vs_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 4877                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4878     // vb, vc, vtmp and vq must be disjoint. va must either be
 4879     // disjoint from all other registers or equal vc
 4880 
 4881     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4882     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4883     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4884 
 4885     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4886     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4887 
 4888     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4889 
 4890     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4891     assert(vs_disjoint(va, vb), "va and vb overlap");
 4892     assert(vs_disjoint(va, vq), "va and vq overlap");
 4893     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4894 
 4895     // we need to multiply the front and back halves of each sequence
 4896     // 4x4S at a time because
 4897     //
 4898     // 1) we are currently only able to get 4-way instruction
 4899     // parallelism at best
 4900     //
 4901     // 2) we need registers for the constants in vq and temporary
 4902     // scratch registers to hold intermediate results so vtmp can only
 4903     // be a VSeq<4> which means we only have 4 scratch slots
 4904 
 4905     dilithium_montmul16(vs_front(va), vs_front(vb), vs_front(vc), vtmp, vq);
 4906     dilithium_montmul16(vs_back(va), vs_back(vb), vs_back(vc), vtmp, vq);
 4907   }
 4908 
 4909   // perform combined montmul then add/sub on 4x4S vectors
 4910 
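  // i.e. a Cooley-Tukey style butterfly with twiddle factor c:
  //   (a0, a1) <- (a0 + a1*c*R^-1, a0 - a1*c*R^-1)  (mod MONT_Q)
  // n.b. vc is clobbered: it receives the intermediate montmul result.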
 4911   void dilithium_montmul16_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 4912                                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4913     // compute a = montmul(a1, c)
 4914     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 4916     vs_subv(va1, __ T4S, va0, vc);
 4917     //    and a0 = a0 + a
 4918     vs_addv(va0, __ T4S, va0, vc);
 4919   }
 4920 
  // perform combined add/sub then montmul on 4x4S vectors
 4922 
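  // i.e. a Gentleman-Sande style butterfly with twiddle factor b:
  //   (a0, a1) <- (a0 + a1, (a0 - a1)*b*R^-1)  (mod MONT_Q)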
 4923   void dilithium_sub_add_montmul16(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 4924                                    const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 4925     // compute c = a0 - a1
 4926     vs_subv(vtmp1, __ T4S, va0, va1);
 4927     // output a0 = a0 + a1
 4928     vs_addv(va0, __ T4S, va0, va1);
 4929     // output a1 = b montmul c
 4930     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 4931   }
 4932 
 4933   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4934   // in the Java implementation come in sequences of at least 8, so we
 4935   // can use ldpq to collect the corresponding data into pairs of vector
 4936   // registers.
 4937   // We collect the coefficients corresponding to the 'j+l' indexes into
 4938   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 4939   // then we do the (Montgomery) multiplications by the zetas in parallel
 4940   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4941   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4942   // v0-v7 and finally save the results back to the coeffs array.
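  // In the Java version each inner step is the forward NTT butterfly
  //   t = montmul(zeta, coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;
  // here applied to 32 coefficients per bundle of vector operations.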
 4943   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4944     const Register coeffs, const Register zetas) {
 4945     int c1 = 0;
 4946     int c2 = 512;
 4947     int startIncr;
 4948     // don't use callee save registers v8 - v15
 4949     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 4950     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 4951     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 4952     int offsets[4] = { 0, 32, 64, 96 };
 4953 
 4954     for (int level = 0; level < 5; level++) {
 4955       int c1Start = c1;
 4956       int c2Start = c2;
 4957       if (level == 3) {
 4958         offsets[1] = 32;
 4959         offsets[2] = 128;
 4960         offsets[3] = 160;
 4961       } else if (level == 4) {
 4962         offsets[1] = 64;
 4963         offsets[2] = 128;
 4964         offsets[3] = 192;
 4965       }
 4966 
      // for levels 0 - 4 we simply load 2 x 4 adjacent values at a
 4968       // time at 4 different offsets and multiply them in order by the
 4969       // next set of input values. So we employ indexed load and store
 4970       // pair instructions with arrangement 4S
 4971       for (int i = 0; i < 4; i++) {
 4972         // reload q and qinv
 4973         vs_ldpq(vq, dilithiumConsts); // qInv, q
 4974         // load 8x4S coefficients via second start pos == c2
 4975         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 4976         // load next 8x4S inputs == b
 4977         vs_ldpq_post(vs2, zetas);
        // compute a = c2 montmul b == c2 * b * R^-1 mod MONT_Q
 4979         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 4980         // load 8x4s coefficients via first start pos == c1
 4981         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 4982         // compute a1 =  c1 + a
 4983         vs_addv(vs3, __ T4S, vs1, vs2);
 4984         // compute a2 =  c1 - a
 4985         vs_subv(vs1, __ T4S, vs1, vs2);
 4986         // output a1 and a2
 4987         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 4988         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 4989 
 4990         int k = 4 * level + i;
 4991 
 4992         if (k > 7) {
 4993           startIncr = 256;
 4994         } else if (k == 5) {
 4995           startIncr = 384;
 4996         } else {
 4997           startIncr = 128;
 4998         }
 4999 
 5000         c1Start += startIncr;
 5001         c2Start += startIncr;
 5002       }
 5003 
 5004       c2 /= 2;
 5005     }
 5006   }
 5007 
 5008   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 5009   // Implements the method
  // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  // of the Java class sun.security.provider.ML_DSA
 5012   //
 5013   // coeffs (int[256]) = c_rarg0
 5014   // zetas (int[256]) = c_rarg1
 5015   address generate_dilithiumAlmostNtt() {
 5016 
 5017     __ align(CodeEntryAlignment);
 5018     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 5019     StubCodeMark mark(this, stub_id);
 5020     address start = __ pc();
 5021     __ enter();
 5022 
 5023     const Register coeffs = c_rarg0;
 5024     const Register zetas = c_rarg1;
 5025 
 5026     const Register tmpAddr = r9;
 5027     const Register dilithiumConsts = r10;
 5028     const Register result = r11;
 5029     // don't use callee save registers v8 - v15
 5030     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5031     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5032     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5033     int offsets[4] = {0, 32, 64, 96};
    int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 5035     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5036     __ add(result, coeffs, 0);
 5037     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5038 
 5039     // Each level represents one iteration of the outer for loop of the Java version
 5040 
 5041     // level 0-4
 5042     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 5043 
 5044     // level 5
 5045 
 5046     // at level 5 the coefficients we need to combine with the zetas
 5047     // are grouped in memory in blocks of size 4. So, for both sets of
 5048     // coefficients we load 4 adjacent values at 8 different offsets
 5049     // using an indexed ldr with register variant Q and multiply them
 5050     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 5052     for (int i = 0; i < 1024; i += 256) {
 5053       // reload constants q, qinv each iteration as they get clobbered later
 5054       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5055       // load 32 (8x4S) coefficients via first offsets = c1
 5056       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5057       // load next 32 (8x4S) inputs = b
 5058       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 5060       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5061       // load 32 (8x4S) coefficients via second offsets = c2
 5062       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 5063       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a0 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a1 = c2 - a
 5066       // write back new coefficients using same offsets
 5067       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 5068       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 5069     }
 5070 
 5071     // level 6
 5072     // at level 6 the coefficients we need to combine with the zetas
 5073     // are grouped in memory in pairs, the first two being montmul
 5074     // inputs and the second add/sub inputs. We can still implement
 5075     // the montmul+sub+add using 4-way parallelism but only if we
 5076     // combine the coefficients with the zetas 16 at a time. We load 8
 5077     // adjacent values at 4 different offsets using an ld2 load with
 5078     // arrangement 2D. That interleaves the lower and upper halves of
 5079     // each pair of quadwords into successive vector registers. We
 5080     // then need to montmul the 4 even elements of the coefficients
 5081     // register sequence by the zetas in order and then add/sub the 4
 5082     // odd elements of the coefficients register sequence. We use an
 5083     // equivalent st2 operation to store the results back into memory
 5084     // de-interleaved.
 5085     for (int i = 0; i < 1024; i += 128) {
 5086       // reload constants q, qinv each iteration as they get clobbered later
 5087       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5088       // load interleaved 16 (4x2D) coefficients via offsets
 5089       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5090       // load next 16 (4x4S) inputs
 5091       vs_ldpq_post(vs_front(vs2), zetas);
 5092       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5093       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5094                                   vs_front(vs2), vtmp, vq);
 5095       // store interleaved 16 (4x2D) coefficients via offsets
 5096       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5097     }
 5098 
 5099     // level 7
 5100     // at level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
 5102     // inputs. Once again we can use 4-way parallelism to combine 16
 5103     // zetas at a time. However, we have to load 8 adjacent values at
 5104     // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
 5106     // coefficients vector register and the even words of the pair
 5107     // into the next register. We then need to montmul the 4 even
 5108     // elements of the coefficients register sequence by the zetas in
 5109     // order and then add/sub the 4 odd elements of the coefficients
 5110     // register sequence. We use an equivalent st2 operation to store
 5111     // the results back into memory de-interleaved.
 5112 
 5113     for (int i = 0; i < 1024; i += 128) {
 5114       // reload constants q, qinv each iteration as they get clobbered later
 5115       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5116       // load interleaved 16 (4x4S) coefficients via offsets
 5117       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5118       // load next 16 (4x4S) inputs
 5119       vs_ldpq_post(vs_front(vs2), zetas);
 5120       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5121       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5122                                   vs_front(vs2), vtmp, vq);
 5123       // store interleaved 16 (4x4S) coefficients via offsets
 5124       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5125     }
 5126     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5127     __ mov(r0, zr); // return 0
 5128     __ ret(lr);
 5129 
 5130     return start;
 5131   }
 5132 
 5133   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5134   // in the Java implementation come in sequences of at least 8, so we
 5135   // can use ldpq to collect the corresponding data into pairs of vector
 5136   // registers
 5137   // We collect the coefficients that correspond to the 'j's into vs1
 5138   // the coefficiets that correspond to the 'j+l's into vs2 then
 5139   // do the additions into vs3 and the subtractions into vs1 then
 5140   // save the result of the additions, load the zetas into vs2
 5141   // do the (Montgomery) multiplications by zeta in parallel into vs2
 5142   // finally save the results back to the coeffs array
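  // In the Java version each inner step is the inverse NTT butterfly
  //   t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montmul(t - coeffs[j + l], zeta);
  // here applied to 32 coefficients per bundle of vector operations.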
 5143   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5144     const Register coeffs, const Register zetas) {
 5145     int c1 = 0;
 5146     int c2 = 32;
 5147     int startIncr;
 5148     int offsets[4];
 5149     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5150     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5151     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5152 
 5153     offsets[0] = 0;
 5154 
 5155     for (int level = 3; level < 8; level++) {
 5156       int c1Start = c1;
 5157       int c2Start = c2;
 5158       if (level == 3) {
 5159         offsets[1] = 64;
 5160         offsets[2] = 128;
 5161         offsets[3] = 192;
 5162       } else if (level == 4) {
 5163         offsets[1] = 32;
 5164         offsets[2] = 128;
 5165         offsets[3] = 160;
 5166       } else {
 5167         offsets[1] = 32;
 5168         offsets[2] = 64;
 5169         offsets[3] = 96;
 5170       }
 5171 
 5172       // for levels 3 - 7 we simply load 2 x 4 adjacent values at a
 5173       // time at 4 different offsets and multiply them in order by the
 5174       // next set of input values. So we employ indexed load and store
 5175       // pair instructions with arrangement 4S
 5176       for (int i = 0; i < 4; i++) {
 5177         // load v1 32 (8x4S) coefficients relative to first start index
 5178         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 5179         // load v2 32 (8x4S) coefficients relative to second start index
 5180         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
 5182         vs_addv(vs3, __ T4S, vs1, vs2);
 5183         // a1 = v1 - v2
 5184         vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
 5186         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 5187         // load constants q, qinv each iteration as they get clobbered above
 5188         vs_ldpq(vq, dilithiumConsts); // qInv, q
 5189         // load b next 32 (8x4S) inputs
 5190         vs_ldpq_post(vs2, zetas);
 5191         // a = a1 montmul b
 5192         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5193         // save a relative to second start index
 5194         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 5195 
 5196         int k = 4 * level + i;
 5197 
 5198         if (k < 24) {
 5199           startIncr = 256;
 5200         } else if (k == 25) {
 5201           startIncr = 384;
 5202         } else {
 5203           startIncr = 128;
 5204         }
 5205 
 5206         c1Start += startIncr;
 5207         c2Start += startIncr;
 5208       }
 5209 
 5210       c2 *= 2;
 5211     }
 5212   }
 5213 
  // Dilithium Inverse NTT function except for the final mod Q division by 2^256.
 5215   // Implements the method
 5216   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5217   // the sun.security.provider.ML_DSA class.
 5218   //
 5219   // coeffs (int[256]) = c_rarg0
 5220   // zetas (int[256]) = c_rarg1
 5221   address generate_dilithiumAlmostInverseNtt() {
 5222 
 5223     __ align(CodeEntryAlignment);
 5224     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5225     StubCodeMark mark(this, stub_id);
 5226     address start = __ pc();
 5227     __ enter();
 5228 
 5229     const Register coeffs = c_rarg0;
 5230     const Register zetas = c_rarg1;
 5231 
 5232     const Register tmpAddr = r9;
 5233     const Register dilithiumConsts = r10;
 5234     const Register result = r11;
 5235     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5236     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 5237     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5238     int offsets[4] = { 0, 32, 64, 96 };
 5239     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5240     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 5241 
 5242     __ add(result, coeffs, 0);
 5243     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5244 
 5245     // Each level represents one iteration of the outer for loop of the Java version
 5247 
 5248     // level 0
 5249     // At level 0 we need to interleave adjacent quartets of
 5250     // coefficients before we multiply and add/sub by the next 16
 5251     // zetas just as we did for level 7 in the multiply code. So we
 5252     // load and store the values using an ld2/st2 with arrangement 4S
 5253     for (int i = 0; i < 1024; i += 128) {
 5254       // load constants q, qinv
 5255       // n.b. this can be moved out of the loop as they do not get
 5256       // clobbered by first two loops
 5257       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5258       // a0/a1 load interleaved 32 (8x4S) coefficients
 5259       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5260       // b load next 32 (8x4S) inputs
 5261       vs_ldpq_post(vs_front(vs2), zetas);
 5262       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5263       // n.b. second half of vs2 provides temporary register storage
 5264       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5265                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5266       // a0/a1 store interleaved 32 (8x4S) coefficients
 5267       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5268     }
 5269 
 5270     // level 1
 5271     // At level 1 we need to interleave pairs of adjacent pairs of
 5272     // coefficients before we multiply by the next 16 zetas just as we
 5273     // did for level 6 in the multiply code. So we load and store the
 5274     // values an ld2/st2 with arrangement 2D
 5275     for (int i = 0; i < 1024; i += 128) {
 5276       // a0/a1 load interleaved 32 (8x2D) coefficients
 5277       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5278       // b load next 16 (4x4S) inputs
 5279       vs_ldpq_post(vs_front(vs2), zetas);
 5280       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5281       // n.b. second half of vs2 provides temporary register storage
 5282       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5283                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5284       // a0/a1 store interleaved 32 (8x2D) coefficients
 5285       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5286     }
 5287 
 5288     // level 2
 5289     // At level 2 coefficients come in blocks of 4. So, we load 4
 5290     // adjacent coefficients at 8 distinct offsets for both the first
 5291     // and second coefficient sequences, using an ldr with register
 5292     // variant Q then combine them with next set of 32 zetas. Likewise
 5293     // we store the results using an str with register variant Q.
 5294     for (int i = 0; i < 1024; i += 256) {
 5295       // c0 load 32 (8x4S) coefficients via first offsets
 5296       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5297       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 5299       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 5300       vs_addv(vs3, __ T4S, vs1, vs2);
 5301       // c = c0 - c1
 5302       vs_subv(vs1, __ T4S, vs1, vs2);
 5303       // store a0 32 (8x4S) coefficients via first offsets
 5304       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 5305       // b load 32 (8x4S) next inputs
 5306       vs_ldpq_post(vs2, zetas);
 5307       // reload constants q, qinv -- they were clobbered earlier
 5308       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5309       // compute a1 = b montmul c
 5310       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5311       // store a1 32 (8x4S) coefficients via second offsets
 5312       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 5313     }
 5314 
 5315     // level 3-7
 5316     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5317 
 5318     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5319     __ mov(r0, zr); // return 0
 5320     __ ret(lr);
 5321 
 5322     return start;
 5323 
 5324   }
 5325 
 5326   // Dilithium multiply polynomials in the NTT domain.
 5327   // Straightforward implementation of the method
 5328   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 5330   // the sun.security.provider.ML_DSA class.
 5331   //
 5332   // result (int[256]) = c_rarg0
 5333   // poly1 (int[256]) = c_rarg1
 5334   // poly2 (int[256]) = c_rarg2
 5335   address generate_dilithiumNttMult() {
 5336 
    __ align(CodeEntryAlignment);
 5338     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5339     StubCodeMark mark(this, stub_id);
 5340     address start = __ pc();
 5341     __ enter();
 5342 
 5343     Label L_loop;
 5344 
 5345     const Register result = c_rarg0;
 5346     const Register poly1 = c_rarg1;
 5347     const Register poly2 = c_rarg2;
 5348 
 5349     const Register dilithiumConsts = r10;
 5350     const Register len = r11;
 5351 
 5352     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5353     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5354     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5355     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 5356 
 5357     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5358 
 5359     // load constants q, qinv
 5360     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5361     // load constant rSquare into v29
 5362     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
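    // n.b. chaining two Montgomery multiplies cancels the R^-1 factors:
    //   montmul(rsquare, montmul(b, c)) == R^2 * (b * c * R^-1) * R^-1
    //                                   == b * c (mod MONT_Q)
    // so the loop below yields plain modular products.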
 5363 
 5364     __ mov(len, zr);
 5365     __ add(len, len, 1024);
 5366 
 5367     __ BIND(L_loop);
 5368 
 5369     // b load 32 (8x4S) next inputs from poly1
 5370     vs_ldpq_post(vs1, poly1);
 5371     // c load 32 (8x4S) next inputs from poly2
 5372     vs_ldpq_post(vs2, poly2);
 5373     // compute a = b montmul c
 5374     vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5375     // compute a = rsquare montmul a
 5376     vs_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 5377     // save a 32 (8x4S) results
 5378     vs_stpq_post(vs2, result);
 5379 
 5380     __ sub(len, len, 128);
 5381     __ cmp(len, (u1)128);
 5382     __ br(Assembler::GE, L_loop);
 5383 
 5384     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5385     __ mov(r0, zr); // return 0
 5386     __ ret(lr);
 5387 
 5388     return start;
 5389 
 5390   }
 5391 
  // Dilithium Montgomery multiply an array by a constant.
  // A straightforward implementation of the method
  // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 5396   //
 5397   // coeffs (int[256]) = c_rarg0
 5398   // constant (int) = c_rarg1
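  //
  // For reference, a scalar sketch of the Montgomery step used below
  // (illustrative; it follows the shape of the reference Dilithium
  // montgomery_reduce, with q = 8380417 and qinv = q^-1 mod 2^32
  // = 58728449 -- the actual Java code may differ in detail):
  //
  //   static int32_t montMul(int32_t a, int32_t b) {
  //     int64_t t = (int64_t) a * b;                      // full 64-bit product
  //     int32_t m = (int32_t) ((uint32_t) t * 58728449u); // t * q^-1 mod 2^32
  //     return (int32_t) ((t - (int64_t) m * 8380417) >> 32); // a*b*R^-1 mod q
  //   }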
 5399   address generate_dilithiumMontMulByConstant() {
 5400 
 5401     __ align(CodeEntryAlignment);
 5402     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5403     StubCodeMark mark(this, stub_id);
 5404     address start = __ pc();
 5405     __ enter();
 5406 
 5407     Label L_loop;
 5408 
 5409     const Register coeffs = c_rarg0;
 5410     const Register constant = c_rarg1;
 5411 
 5412     const Register dilithiumConsts = r10;
 5413     const Register result = r11;
 5414     const Register len = r12;
 5415 
 5416     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5417     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5418     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5419     VSeq<8> vconst(29, 0);             // for montmul by constant
 5420 
 5421     // results track inputs
 5422     __ add(result, coeffs, 0);
 5423     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5424 
 5425     // load constants q, qinv -- they do not get clobbered by first two loops
 5426     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5427     // copy caller supplied constant across vconst
 5428     __ dup(vconst[0], __ T4S, constant);
 5429     __ mov(len, zr);
 5430     __ add(len, len, 1024);
 5431 
 5432     __ BIND(L_loop);
 5433 
 5434     // load next 32 inputs
 5435     vs_ldpq_post(vs2, coeffs);
 5436     // mont mul by constant
 5437     vs_montmul32(vs2, vconst, vs2, vtmp, vq);
 5438     // write next 32 results
 5439     vs_stpq_post(vs2, result);
 5440 
 5441     __ sub(len, len, 128);
 5442     __ cmp(len, (u1)128);
 5443     __ br(Assembler::GE, L_loop);
 5444 
 5445     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5446     __ mov(r0, zr); // return 0
 5447     __ ret(lr);
 5448 
 5449     return start;
 5450 
 5451   }
 5452 
  // Dilithium decompose poly.
  // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //              int[] highPart, int twoGamma2, int multiplier) {}
  // of the sun.security.provider.ML_DSA class
 5457   //
 5458   // input (int[256]) = c_rarg0
 5459   // lowPart (int[256]) = c_rarg1
 5460   // highPart (int[256]) = c_rarg2
 5461   // twoGamma2  (int) = c_rarg3
 5462   // multiplier (int) = c_rarg4
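  //
  // For reference, the scalar computation performed on each coefficient,
  // assembled from the per-lane comments in the loop below (illustrative;
  // q is dilithium_q and qadd is the mod-q reduction addend loaded below):
  //
  //   int rplus = input[m];
  //   rplus = rplus - ((rplus + qadd) >> 23) * q;  // reduce mod q
  //   rplus = rplus + ((rplus >> 31) & q);         // make non-negative
  //   int quotient = (rplus * multiplier) >> 22;
  //   int r0 = rplus - quotient * twoGamma2;
  //   int mask = (twoGamma2 - r0) >> 22;
  //   r0 -= (mask & twoGamma2);
  //   quotient += (mask & 1);
  //   mask = (twoGamma2 / 2 - r0) >> 31;
  //   r0 -= (mask & twoGamma2);
  //   quotient += (mask & 1);
  //   int r1 = rplus - r0 - (q - 1);
  //   r1 = (r1 | (-r1)) >> 31;  // 0 if rplus - r0 == q - 1, -1 otherwise
  //   r0 += ~r1;
  //   r1 = r1 & quotient;
  //   lowPart[m] = r0;
  //   highPart[m] = r1;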
 5463   address generate_dilithiumDecomposePoly() {
 5464 
 5465     __ align(CodeEntryAlignment);
 5466     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5467     StubCodeMark mark(this, stub_id);
 5468     address start = __ pc();
 5469     Label L_loop;
 5470 
 5471     const Register input = c_rarg0;
 5472     const Register lowPart = c_rarg1;
 5473     const Register highPart = c_rarg2;
 5474     const Register twoGamma2 = c_rarg3;
 5475     const Register multiplier = c_rarg4;
 5476 
 5477     const Register len = r9;
 5478     const Register dilithiumConsts = r10;
 5479     const Register tmp = r11;
 5480 
 5481     VSeq<4> vs1(0), vs2(4), vs3(8); // 6 independent sets of 4x4s values
 5482     VSeq<4> vs4(12), vs5(16), vtmp(20);
 5483     VSeq<4> one(25, 0);            // 7 constants for cross-multiplying
 5484     VSeq<4> qminus1(26, 0);
 5485     VSeq<4> g2(27, 0);
 5486     VSeq<4> twog2(28, 0);
 5487     VSeq<4> mult(29, 0);
 5488     VSeq<4> q(30, 0);
 5489     VSeq<4> qadd(31, 0);
 5490 
 5491     __ enter();
 5492 
 5493     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5494 
    // save callee-saved vector registers
 5496     __ stpd(v8, v9, __ pre(sp, -64));
 5497     __ stpd(v10, v11, Address(sp, 16));
 5498     __ stpd(v12, v13, Address(sp, 32));
 5499     __ stpd(v14, v15, Address(sp, 48));
 5500 
 5501     // populate constant registers
 5502     __ mov(tmp, zr);
 5503     __ add(tmp, tmp, 1);
 5504     __ dup(one[0], __ T4S, tmp); // 1
 5505     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 5506     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5507     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 5508     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5509     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 5510     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 5511 
 5512     __ mov(len, zr);
 5513     __ add(len, len, 1024);
 5514 
 5515     __ BIND(L_loop);
 5516 
 5517     // load next 4x4S inputs interleaved: rplus --> vs1
 5518     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 5519 
 5520     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 5521     vs_addv(vtmp, __ T4S, vs1, qadd);
 5522     vs_sshr(vtmp, __ T4S, vtmp, 23);
 5523     vs_mulv(vtmp, __ T4S, vtmp, q);
 5524     vs_subv(vs1, __ T4S, vs1, vtmp);
 5525 
 5526     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5527     vs_sshr(vtmp, __ T4S, vs1, 31);
 5528     vs_andr(vtmp, vtmp, q);
 5529     vs_addv(vs1, __ T4S, vs1, vtmp);
 5530 
 5531     // quotient --> vs2
 5532     // int quotient = (rplus * multiplier) >> 22;
 5533     vs_mulv(vtmp, __ T4S, vs1, mult);
 5534     vs_sshr(vs2, __ T4S, vtmp, 22);
 5535 
 5536     // r0 --> vs3
 5537     // int r0 = rplus - quotient * twoGamma2;
 5538     vs_mulv(vtmp, __ T4S, vs2, twog2);
 5539     vs_subv(vs3, __ T4S, vs1, vtmp);
 5540 
 5541     // mask --> vs4
 5542     // int mask = (twoGamma2 - r0) >> 22;
 5543     vs_subv(vtmp, __ T4S, twog2, vs3);
 5544     vs_sshr(vs4, __ T4S, vtmp, 22);
 5545 
 5546     // r0 -= (mask & twoGamma2);
 5547     vs_andr(vtmp, vs4, twog2);
 5548     vs_subv(vs3, __ T4S, vs3, vtmp);
 5549 
 5550     //  quotient += (mask & 1);
 5551     vs_andr(vtmp, vs4, one);
 5552     vs_addv(vs2, __ T4S, vs2, vtmp);
 5553 
 5554     // mask = (twoGamma2 / 2 - r0) >> 31;
 5555     vs_subv(vtmp, __ T4S, g2, vs3);
 5556     vs_sshr(vs4, __ T4S, vtmp, 31);
 5557 
 5558     // r0 -= (mask & twoGamma2);
 5559     vs_andr(vtmp, vs4, twog2);
 5560     vs_subv(vs3, __ T4S, vs3, vtmp);
 5561 
 5562     // quotient += (mask & 1);
 5563     vs_andr(vtmp, vs4, one);
 5564     vs_addv(vs2, __ T4S, vs2, vtmp);
 5565 
 5566     // r1 --> vs5
 5567     // int r1 = rplus - r0 - (dilithium_q - 1);
 5568     vs_subv(vtmp, __ T4S, vs1, vs3);
 5569     vs_subv(vs5, __ T4S, vtmp, qminus1);
 5570 
 5571     // r1 --> vs1 (overwriting rplus)
 5572     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5573     vs_negr(vtmp, __ T4S, vs5);
 5574     vs_orr(vtmp, vs5, vtmp);
 5575     vs_sshr(vs1, __ T4S, vtmp, 31);
 5576 
 5577     // r0 += ~r1;
 5578     vs_notr(vtmp, vs1);
 5579     vs_addv(vs3, __ T4S, vs3, vtmp);
 5580 
 5581     // r1 = r1 & quotient;
 5582     vs_andr(vs1, vs2, vs1);
 5583 
    // store results interleaved
 5585     // lowPart[m] = r0;
 5586     // highPart[m] = r1;
 5587     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 5588     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 5589 
 5591     __ sub(len, len, 64);
 5592     __ cmp(len, (u1)64);
 5593     __ br(Assembler::GE, L_loop);
 5594 
 5595     // restore callee-saved vector registers
 5596     __ ldpd(v14, v15, Address(sp, 48));
 5597     __ ldpd(v12, v13, Address(sp, 32));
 5598     __ ldpd(v10, v11, Address(sp, 16));
 5599     __ ldpd(v8, v9, __ post(sp, 64));
 5600 
 5601     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5602     __ mov(r0, zr); // return 0
 5603     __ ret(lr);
 5604 
 5605     return start;
 5606 
 5607   }
 5608 
 5609   /**
 5610    *  Arguments:
 5611    *
 5612    * Inputs:
 5613    *   c_rarg0   - int crc
 5614    *   c_rarg1   - byte* buf
 5615    *   c_rarg2   - int length
 5616    *   c_rarg3   - int* table
 5617    *
 5618    * Output:
 5619    *       r0   - int crc result
 5620    */
 5621   address generate_updateBytesCRC32C() {
 5622     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5623 
 5624     __ align(CodeEntryAlignment);
 5625     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5626     StubCodeMark mark(this, stub_id);
 5627 
 5628     address start = __ pc();
 5629 
 5630     const Register crc   = c_rarg0;  // crc
 5631     const Register buf   = c_rarg1;  // source java byte array address
 5632     const Register len   = c_rarg2;  // length
 5633     const Register table0 = c_rarg3; // crc_table address
 5634     const Register table1 = c_rarg4;
 5635     const Register table2 = c_rarg5;
 5636     const Register table3 = c_rarg6;
 5637     const Register tmp3 = c_rarg7;
 5638 
 5639     BLOCK_COMMENT("Entry:");
 5640     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5641 
 5642     __ kernel_crc32c(crc, buf, len,
 5643               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5644 
 5645     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5646     __ ret(lr);
 5647 
 5648     return start;
 5649   }
 5650 
  /**
 5652    *  Arguments:
 5653    *
 5654    *  Inputs:
 5655    *   c_rarg0   - int   adler
 5656    *   c_rarg1   - byte* buff
 5657    *   c_rarg2   - int   len
 5658    *
 5659    * Output:
 5660    *   c_rarg0   - int adler result
 5661    */
 5662   address generate_updateBytesAdler32() {
 5663     __ align(CodeEntryAlignment);
 5664     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5665     StubCodeMark mark(this, stub_id);
 5666     address start = __ pc();
 5667 
 5668     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5669 
 5670     // Aliases
 5671     Register adler  = c_rarg0;
 5672     Register s1     = c_rarg0;
 5673     Register s2     = c_rarg3;
 5674     Register buff   = c_rarg1;
 5675     Register len    = c_rarg2;
 5676     Register nmax  = r4;
 5677     Register base  = r5;
 5678     Register count = r6;
 5679     Register temp0 = rscratch1;
 5680     Register temp1 = rscratch2;
 5681     FloatRegister vbytes = v0;
 5682     FloatRegister vs1acc = v1;
 5683     FloatRegister vs2acc = v2;
 5684     FloatRegister vtable = v3;
 5685 
 5686     // Max number of bytes we can process before having to take the mod
 5687     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
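    // (Checking the bound for n = 5552: 255*5552*5553/2 + 5553*(BASE-1)
    // = 3930857640 + 363832560 = 4294690200 <= 4294967295 = 2^32-1,
    // while n = 5553 already exceeds it.)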
 5688     uint64_t BASE = 0xfff1;
 5689     uint64_t NMAX = 0x15B0;
 5690 
 5691     __ mov(base, BASE);
 5692     __ mov(nmax, NMAX);
 5693 
 5694     // Load accumulation coefficients for the upper 16 bits
 5695     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5696     __ ld1(vtable, __ T16B, Address(temp0));
 5697 
 5698     // s1 is initialized to the lower 16 bits of adler
 5699     // s2 is initialized to the upper 16 bits of adler
 5700     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5701     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5702 
 5703     // The pipelined loop needs at least 16 elements for 1 iteration
 5704     // It does check this, but it is more effective to skip to the cleanup loop
 5705     __ cmp(len, (u1)16);
 5706     __ br(Assembler::HS, L_nmax);
 5707     __ cbz(len, L_combine);
 5708 
 5709     __ bind(L_simple_by1_loop);
 5710     __ ldrb(temp0, Address(__ post(buff, 1)));
 5711     __ add(s1, s1, temp0);
 5712     __ add(s2, s2, s1);
 5713     __ subs(len, len, 1);
 5714     __ br(Assembler::HI, L_simple_by1_loop);
 5715 
 5716     // s1 = s1 % BASE
 5717     __ subs(temp0, s1, base);
 5718     __ csel(s1, temp0, s1, Assembler::HS);
 5719 
 5720     // s2 = s2 % BASE
 5721     __ lsr(temp0, s2, 16);
 5722     __ lsl(temp1, temp0, 4);
 5723     __ sub(temp1, temp1, temp0);
 5724     __ add(s2, temp1, s2, ext::uxth);
 5725 
 5726     __ subs(temp0, s2, base);
 5727     __ csel(s2, temp0, s2, Assembler::HS);
 5728 
 5729     __ b(L_combine);
 5730 
 5731     __ bind(L_nmax);
 5732     __ subs(len, len, nmax);
 5733     __ sub(count, nmax, 16);
 5734     __ br(Assembler::LO, L_by16);
 5735 
 5736     __ bind(L_nmax_loop);
 5737 
 5738     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5739                                       vbytes, vs1acc, vs2acc, vtable);
 5740 
 5741     __ subs(count, count, 16);
 5742     __ br(Assembler::HS, L_nmax_loop);
 5743 
 5744     // s1 = s1 % BASE
 5745     __ lsr(temp0, s1, 16);
 5746     __ lsl(temp1, temp0, 4);
 5747     __ sub(temp1, temp1, temp0);
 5748     __ add(temp1, temp1, s1, ext::uxth);
 5749 
 5750     __ lsr(temp0, temp1, 16);
 5751     __ lsl(s1, temp0, 4);
 5752     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 5754 
 5755     __ subs(temp0, s1, base);
 5756     __ csel(s1, temp0, s1, Assembler::HS);
 5757 
 5758     // s2 = s2 % BASE
 5759     __ lsr(temp0, s2, 16);
 5760     __ lsl(temp1, temp0, 4);
 5761     __ sub(temp1, temp1, temp0);
 5762     __ add(temp1, temp1, s2, ext::uxth);
 5763 
 5764     __ lsr(temp0, temp1, 16);
 5765     __ lsl(s2, temp0, 4);
 5766     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 5768 
 5769     __ subs(temp0, s2, base);
 5770     __ csel(s2, temp0, s2, Assembler::HS);
 5771 
 5772     __ subs(len, len, nmax);
 5773     __ sub(count, nmax, 16);
 5774     __ br(Assembler::HS, L_nmax_loop);
 5775 
 5776     __ bind(L_by16);
 5777     __ adds(len, len, count);
 5778     __ br(Assembler::LO, L_by1);
 5779 
 5780     __ bind(L_by16_loop);
 5781 
 5782     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5783                                       vbytes, vs1acc, vs2acc, vtable);
 5784 
 5785     __ subs(len, len, 16);
 5786     __ br(Assembler::HS, L_by16_loop);
 5787 
 5788     __ bind(L_by1);
 5789     __ adds(len, len, 15);
 5790     __ br(Assembler::LO, L_do_mod);
 5791 
 5792     __ bind(L_by1_loop);
 5793     __ ldrb(temp0, Address(__ post(buff, 1)));
 5794     __ add(s1, temp0, s1);
 5795     __ add(s2, s2, s1);
 5796     __ subs(len, len, 1);
 5797     __ br(Assembler::HS, L_by1_loop);
 5798 
 5799     __ bind(L_do_mod);
 5800     // s1 = s1 % BASE
 5801     __ lsr(temp0, s1, 16);
 5802     __ lsl(temp1, temp0, 4);
 5803     __ sub(temp1, temp1, temp0);
 5804     __ add(temp1, temp1, s1, ext::uxth);
 5805 
 5806     __ lsr(temp0, temp1, 16);
 5807     __ lsl(s1, temp0, 4);
 5808     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 5810 
 5811     __ subs(temp0, s1, base);
 5812     __ csel(s1, temp0, s1, Assembler::HS);
 5813 
 5814     // s2 = s2 % BASE
 5815     __ lsr(temp0, s2, 16);
 5816     __ lsl(temp1, temp0, 4);
 5817     __ sub(temp1, temp1, temp0);
 5818     __ add(temp1, temp1, s2, ext::uxth);
 5819 
 5820     __ lsr(temp0, temp1, 16);
 5821     __ lsl(s2, temp0, 4);
 5822     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 5824 
 5825     __ subs(temp0, s2, base);
 5826     __ csel(s2, temp0, s2, Assembler::HS);
 5827 
 5828     // Combine lower bits and higher bits
 5829     __ bind(L_combine);
 5830     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5831 
 5832     __ ret(lr);
 5833 
 5834     return start;
 5835   }
 5836 
 5837   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5838           Register temp0, Register temp1, FloatRegister vbytes,
 5839           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5840     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5841     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5842     // In non-vectorized code, we update s1 and s2 as:
 5843     //   s1 <- s1 + b1
 5844     //   s2 <- s2 + s1
 5845     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 5847     //   ...
 5848     //   s1 <- s1 + b16
 5849     //   s2 <- s2 + s1
 5850     // Putting above assignments together, we have:
 5851     //   s1_new = s1 + b1 + b2 + ... + b16
 5852     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5853     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5854     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
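    // As a quick sanity check: if all 16 bytes equal 1, this gives
    // s1_new = s1 + 16 and s2_new = s2 + 16 * s1 + (16 + 15 + ... + 1)
    //        = s2 + 16 * s1 + 136.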
 5855     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5856 
 5857     // s2 = s2 + s1 * 16
 5858     __ add(s2, s2, s1, Assembler::LSL, 4);
 5859 
 5860     // vs1acc = b1 + b2 + b3 + ... + b16
 5861     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5862     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5863     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5864     __ uaddlv(vs1acc, __ T16B, vbytes);
 5865     __ uaddlv(vs2acc, __ T8H, vs2acc);
 5866 
 5867     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5868     __ fmovd(temp0, vs1acc);
 5869     __ fmovd(temp1, vs2acc);
 5870     __ add(s1, s1, temp0);
 5871     __ add(s2, s2, temp1);
 5872   }
 5873 
 5874   /**
 5875    *  Arguments:
 5876    *
 5877    *  Input:
 5878    *    c_rarg0   - x address
 5879    *    c_rarg1   - x length
 5880    *    c_rarg2   - y address
 5881    *    c_rarg3   - y length
 5882    *    c_rarg4   - z address
 5883    */
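  //
  // For reference, the stub computes z = x * y over unsigned 32-bit limbs
  // stored most-significant first, i.e. the schoolbook product computed by
  // BigInteger.multiplyToLen. An illustrative scalar sketch (assuming z is
  // zero-initialized; the real code folds the first pass in):
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1; j >= 0; j--) {
  //       uint64_t p = (uint64_t) x[i] * y[j] + z[i + j + 1] + carry;
  //       z[i + j + 1] = (uint32_t) p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (uint32_t) carry;
  //   }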
 5884   address generate_multiplyToLen() {
 5885     __ align(CodeEntryAlignment);
 5886     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5887     StubCodeMark mark(this, stub_id);
 5888 
 5889     address start = __ pc();
 5890  
 5891     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 5892       return start;
 5893     }
 5894     const Register x     = r0;
 5895     const Register xlen  = r1;
 5896     const Register y     = r2;
 5897     const Register ylen  = r3;
 5898     const Register z     = r4;
 5899 
 5900     const Register tmp0  = r5;
 5901     const Register tmp1  = r10;
 5902     const Register tmp2  = r11;
 5903     const Register tmp3  = r12;
 5904     const Register tmp4  = r13;
 5905     const Register tmp5  = r14;
 5906     const Register tmp6  = r15;
 5907     const Register tmp7  = r16;
 5908 
 5909     BLOCK_COMMENT("Entry:");
 5910     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5911     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5912     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5913     __ ret(lr);
 5914 
 5915     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 5916     return start;
 5917   }
 5918 
 5919   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code
    // works faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better results overall.
 5923     __ align(CodeEntryAlignment);
 5924     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5925     StubCodeMark mark(this, stub_id);
 5926     address start = __ pc();
 5927 
 5928     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 5929       return start;
 5930     }
 5931     const Register x     = r0;
 5932     const Register xlen  = r1;
 5933     const Register z     = r2;
 5934     const Register y     = r4; // == x
 5935     const Register ylen  = r5; // == xlen
 5936 
 5937     const Register tmp0  = r3;
 5938     const Register tmp1  = r10;
 5939     const Register tmp2  = r11;
 5940     const Register tmp3  = r12;
 5941     const Register tmp4  = r13;
 5942     const Register tmp5  = r14;
 5943     const Register tmp6  = r15;
 5944     const Register tmp7  = r16;
 5945 
 5946     RegSet spilled_regs = RegSet::of(y, ylen);
 5947     BLOCK_COMMENT("Entry:");
 5948     __ enter();
 5949     __ push(spilled_regs, sp);
 5950     __ mov(y, x);
 5951     __ mov(ylen, xlen);
 5952     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5953     __ pop(spilled_regs, sp);
 5954     __ leave();
 5955     __ ret(lr);
 5956 
 5957     SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 5958     return start;
 5959   }
 5960 
 5961   address generate_mulAdd() {
 5962     __ align(CodeEntryAlignment);
 5963     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5964     StubCodeMark mark(this, stub_id);
 5965 
 5966     address start = __ pc();
 5967 
 5968     if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 5969       return start;
 5970     }
 5971     const Register out     = r0;
 5972     const Register in      = r1;
 5973     const Register offset  = r2;
 5974     const Register len     = r3;
 5975     const Register k       = r4;
 5976 
 5977     BLOCK_COMMENT("Entry:");
 5978     __ enter();
 5979     __ mul_add(out, in, offset, len, k);
 5980     __ leave();
 5981     __ ret(lr);
 5982 
 5983     SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 5984     return start;
 5985   }
 5986 
 5987   // Arguments:
 5988   //
 5989   // Input:
 5990   //   c_rarg0   - newArr address
 5991   //   c_rarg1   - oldArr address
 5992   //   c_rarg2   - newIdx
 5993   //   c_rarg3   - shiftCount
 5994   //   c_rarg4   - numIter
 5995   //
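  // For reference, the scalar equivalent (uint32_t limbs, most significant
  // first, as in BigInteger.shiftRightImplWorker; illustrative, assuming
  // 0 < shiftCount < 32):
  //
  //   for (int i = numIter - 1; i >= 0; i--)
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                        | (oldArr[i] << (32 - shiftCount));
  //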
 5996   address generate_bigIntegerRightShift() {
 5997     __ align(CodeEntryAlignment);
 5998     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 5999     StubCodeMark mark(this, stub_id);
 6000     address start = __ pc();
 6001 
 6002     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6003 
 6004     Register newArr        = c_rarg0;
 6005     Register oldArr        = c_rarg1;
 6006     Register newIdx        = c_rarg2;
 6007     Register shiftCount    = c_rarg3;
 6008     Register numIter       = c_rarg4;
 6009     Register idx           = numIter;
 6010 
 6011     Register newArrCur     = rscratch1;
 6012     Register shiftRevCount = rscratch2;
 6013     Register oldArrCur     = r13;
 6014     Register oldArrNext    = r14;
 6015 
 6016     FloatRegister oldElem0        = v0;
 6017     FloatRegister oldElem1        = v1;
 6018     FloatRegister newElem         = v2;
 6019     FloatRegister shiftVCount     = v3;
 6020     FloatRegister shiftVRevCount  = v4;
 6021 
 6022     __ cbz(idx, Exit);
 6023 
 6024     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6025 
 6026     // left shift count
 6027     __ movw(shiftRevCount, 32);
 6028     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6029 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6031     __ cmp(numIter, (u1)4);
 6032     __ br(Assembler::LT, ShiftThree);
 6033 
 6034     __ dup(shiftVCount,    __ T4S, shiftCount);
 6035     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6036     __ negr(shiftVCount,   __ T4S, shiftVCount);
 6037 
 6038     __ BIND(ShiftSIMDLoop);
 6039 
 6040     // Calculate the load addresses
 6041     __ sub(idx, idx, 4);
 6042     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6043     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6044     __ add(oldArrCur,  oldArrNext, 4);
 6045 
 6046     // Load 4 words and process
 6047     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6048     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6049     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6050     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6051     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6052     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6053 
 6054     __ cmp(idx, (u1)4);
 6055     __ br(Assembler::LT, ShiftTwoLoop);
 6056     __ b(ShiftSIMDLoop);
 6057 
 6058     __ BIND(ShiftTwoLoop);
 6059     __ cbz(idx, Exit);
 6060     __ cmp(idx, (u1)1);
 6061     __ br(Assembler::EQ, ShiftOne);
 6062 
 6063     // Calculate the load addresses
 6064     __ sub(idx, idx, 2);
 6065     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6066     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6067     __ add(oldArrCur,  oldArrNext, 4);
 6068 
 6069     // Load 2 words and process
 6070     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6071     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6072     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6073     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6074     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6075     __ st1(newElem,   __ T2S, Address(newArrCur));
 6076     __ b(ShiftTwoLoop);
 6077 
 6078     __ BIND(ShiftThree);
 6079     __ tbz(idx, 1, ShiftOne);
 6080     __ tbz(idx, 0, ShiftTwo);
 6081     __ ldrw(r10,  Address(oldArr, 12));
 6082     __ ldrw(r11,  Address(oldArr, 8));
 6083     __ lsrvw(r10, r10, shiftCount);
 6084     __ lslvw(r11, r11, shiftRevCount);
 6085     __ orrw(r12,  r10, r11);
 6086     __ strw(r12,  Address(newArr, 8));
 6087 
 6088     __ BIND(ShiftTwo);
 6089     __ ldrw(r10,  Address(oldArr, 8));
 6090     __ ldrw(r11,  Address(oldArr, 4));
 6091     __ lsrvw(r10, r10, shiftCount);
 6092     __ lslvw(r11, r11, shiftRevCount);
 6093     __ orrw(r12,  r10, r11);
 6094     __ strw(r12,  Address(newArr, 4));
 6095 
 6096     __ BIND(ShiftOne);
 6097     __ ldrw(r10,  Address(oldArr, 4));
 6098     __ ldrw(r11,  Address(oldArr));
 6099     __ lsrvw(r10, r10, shiftCount);
 6100     __ lslvw(r11, r11, shiftRevCount);
 6101     __ orrw(r12,  r10, r11);
 6102     __ strw(r12,  Address(newArr));
 6103 
 6104     __ BIND(Exit);
 6105     __ ret(lr);
 6106 
 6107     return start;
 6108   }
 6109 
 6110   // Arguments:
 6111   //
 6112   // Input:
 6113   //   c_rarg0   - newArr address
 6114   //   c_rarg1   - oldArr address
 6115   //   c_rarg2   - newIdx
 6116   //   c_rarg3   - shiftCount
 6117   //   c_rarg4   - numIter
 6118   //
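  // For reference, the scalar equivalent (uint32_t limbs, most significant
  // first, as in BigInteger.shiftLeftImplWorker; illustrative, assuming
  // 0 < shiftCount < 32):
  //
  //   for (int i = 0; i < numIter; i++)
  //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                        | (oldArr[i + 1] >> (32 - shiftCount));
  //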
 6119   address generate_bigIntegerLeftShift() {
 6120     __ align(CodeEntryAlignment);
 6121     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6122     StubCodeMark mark(this, stub_id);
 6123     address start = __ pc();
 6124 
 6125     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6126 
 6127     Register newArr        = c_rarg0;
 6128     Register oldArr        = c_rarg1;
 6129     Register newIdx        = c_rarg2;
 6130     Register shiftCount    = c_rarg3;
 6131     Register numIter       = c_rarg4;
 6132 
 6133     Register shiftRevCount = rscratch1;
 6134     Register oldArrNext    = rscratch2;
 6135 
 6136     FloatRegister oldElem0        = v0;
 6137     FloatRegister oldElem1        = v1;
 6138     FloatRegister newElem         = v2;
 6139     FloatRegister shiftVCount     = v3;
 6140     FloatRegister shiftVRevCount  = v4;
 6141 
 6142     __ cbz(numIter, Exit);
 6143 
 6144     __ add(oldArrNext, oldArr, 4);
 6145     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6146 
 6147     // right shift count
 6148     __ movw(shiftRevCount, 32);
 6149     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6150 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 6152     __ cmp(numIter, (u1)4);
 6153     __ br(Assembler::LT, ShiftThree);
 6154 
 6155     __ dup(shiftVCount,     __ T4S, shiftCount);
 6156     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6157     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 6158 
 6159     __ BIND(ShiftSIMDLoop);
 6160 
 6161     // load 4 words and process
 6162     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6163     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6164     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6165     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6166     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6167     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6168     __ sub(numIter,   numIter, 4);
 6169 
 6170     __ cmp(numIter, (u1)4);
 6171     __ br(Assembler::LT, ShiftTwoLoop);
 6172     __ b(ShiftSIMDLoop);
 6173 
 6174     __ BIND(ShiftTwoLoop);
 6175     __ cbz(numIter, Exit);
 6176     __ cmp(numIter, (u1)1);
 6177     __ br(Assembler::EQ, ShiftOne);
 6178 
 6179     // load 2 words and process
 6180     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6181     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6182     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6183     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6184     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6185     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6186     __ sub(numIter,   numIter, 2);
 6187     __ b(ShiftTwoLoop);
 6188 
 6189     __ BIND(ShiftThree);
 6190     __ ldrw(r10,  __ post(oldArr, 4));
 6191     __ ldrw(r11,  __ post(oldArrNext, 4));
 6192     __ lslvw(r10, r10, shiftCount);
 6193     __ lsrvw(r11, r11, shiftRevCount);
 6194     __ orrw(r12,  r10, r11);
 6195     __ strw(r12,  __ post(newArr, 4));
 6196     __ tbz(numIter, 1, Exit);
 6197     __ tbz(numIter, 0, ShiftOne);
 6198 
 6199     __ BIND(ShiftTwo);
 6200     __ ldrw(r10,  __ post(oldArr, 4));
 6201     __ ldrw(r11,  __ post(oldArrNext, 4));
 6202     __ lslvw(r10, r10, shiftCount);
 6203     __ lsrvw(r11, r11, shiftRevCount);
 6204     __ orrw(r12,  r10, r11);
 6205     __ strw(r12,  __ post(newArr, 4));
 6206 
 6207     __ BIND(ShiftOne);
 6208     __ ldrw(r10,  Address(oldArr));
 6209     __ ldrw(r11,  Address(oldArrNext));
 6210     __ lslvw(r10, r10, shiftCount);
 6211     __ lsrvw(r11, r11, shiftRevCount);
 6212     __ orrw(r12,  r10, r11);
 6213     __ strw(r12,  Address(newArr));
 6214 
 6215     __ BIND(Exit);
 6216     __ ret(lr);
 6217 
 6218     return start;
 6219   }
 6220 
 6221   address generate_count_positives(address &count_positives_long) {
 6222     const u1 large_loop_size = 64;
 6223     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 6224     int dcache_line = VM_Version::dcache_line_size();
 6225 
 6226     Register ary1 = r1, len = r2, result = r0;
 6227 
 6228     __ align(CodeEntryAlignment);
 6229 
 6230     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6231     StubCodeMark mark(this, stub_id);
 6232 
 6233     address entry = __ pc();
 6234 
 6235     __ enter();
 6236     // precondition: a copy of len is already in result
 6237     // __ mov(result, len);
 6238 
 6239   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6240         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6241 
 6242   __ cmp(len, (u1)15);
 6243   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page.
 6246   __ add(ary1, ary1, len);
 6247   __ subs(len, len, 8);
 6248   __ br(Assembler::GT, LEN_OVER_8);
 6249   __ ldr(rscratch2, Address(ary1, -8));
 6250   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6251   __ lsrv(rscratch2, rscratch2, rscratch1);
 6252   __ tst(rscratch2, UPPER_BIT_MASK);
 6253   __ csel(result, zr, result, Assembler::NE);
 6254   __ leave();
 6255   __ ret(lr);
 6256   __ bind(LEN_OVER_8);
 6257   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 6258   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 6259   __ tst(rscratch2, UPPER_BIT_MASK);
 6260   __ br(Assembler::NE, RET_NO_POP);
 6261   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6262   __ lsrv(rscratch1, rscratch1, rscratch2);
 6263   __ tst(rscratch1, UPPER_BIT_MASK);
 6264   __ bind(RET_NO_POP);
 6265   __ csel(result, zr, result, Assembler::NE);
 6266   __ leave();
 6267   __ ret(lr);
 6268 
 6269   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6270   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6271 
 6272   count_positives_long = __ pc(); // 2nd entry point
 6273 
 6274   __ enter();
 6275 
 6276   __ bind(LEN_OVER_15);
 6277     __ push(spilled_regs, sp);
 6278     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6279     __ cbz(rscratch2, ALIGNED);
 6280     __ ldp(tmp6, tmp1, Address(ary1));
 6281     __ mov(tmp5, 16);
 6282     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6283     __ add(ary1, ary1, rscratch1);
 6284     __ orr(tmp6, tmp6, tmp1);
 6285     __ tst(tmp6, UPPER_BIT_MASK);
 6286     __ br(Assembler::NE, RET_ADJUST);
 6287     __ sub(len, len, rscratch1);
 6288 
 6289   __ bind(ALIGNED);
 6290     __ cmp(len, large_loop_size);
 6291     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at
    // its starting bytes, in which case LARGE_LOOP would do 4 reads instead
    // of 1 (in the worst case), which is slower. Cases with negative bytes
    // further ahead won't be affected much. In fact, they'll be faster due to
    // the early loads, fewer instructions and fewer branches in LARGE_LOOP.
 6298     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6299     __ sub(len, len, 16);
 6300     __ orr(tmp6, tmp6, tmp1);
 6301     __ tst(tmp6, UPPER_BIT_MASK);
 6302     __ br(Assembler::NE, RET_ADJUST_16);
 6303     __ cmp(len, large_loop_size);
 6304     __ br(Assembler::LT, CHECK_16);
 6305 
 6306     if (SoftwarePrefetchHintDistance >= 0
 6307         && SoftwarePrefetchHintDistance >= dcache_line) {
 6308       // initial prefetch
 6309       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6310     }
 6311   __ bind(LARGE_LOOP);
 6312     if (SoftwarePrefetchHintDistance >= 0) {
 6313       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6314     }
    // Issue the load instructions first, since this can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches, but this approach disables early return, so all 64 bytes are
    // loaded and checked every time.
 6320     __ ldp(tmp2, tmp3, Address(ary1));
 6321     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6322     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6323     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6324     __ add(ary1, ary1, large_loop_size);
 6325     __ sub(len, len, large_loop_size);
 6326     __ orr(tmp2, tmp2, tmp3);
 6327     __ orr(tmp4, tmp4, tmp5);
 6328     __ orr(rscratch1, rscratch1, rscratch2);
 6329     __ orr(tmp6, tmp6, tmp1);
 6330     __ orr(tmp2, tmp2, tmp4);
 6331     __ orr(rscratch1, rscratch1, tmp6);
 6332     __ orr(tmp2, tmp2, rscratch1);
 6333     __ tst(tmp2, UPPER_BIT_MASK);
 6334     __ br(Assembler::NE, RET_ADJUST_LONG);
 6335     __ cmp(len, large_loop_size);
 6336     __ br(Assembler::GE, LARGE_LOOP);
 6337 
 6338   __ bind(CHECK_16); // small 16-byte load pre-loop
 6339     __ cmp(len, (u1)16);
 6340     __ br(Assembler::LT, POST_LOOP16);
 6341 
 6342   __ bind(LOOP16); // small 16-byte load loop
 6343     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6344     __ sub(len, len, 16);
 6345     __ orr(tmp2, tmp2, tmp3);
 6346     __ tst(tmp2, UPPER_BIT_MASK);
 6347     __ br(Assembler::NE, RET_ADJUST_16);
 6348     __ cmp(len, (u1)16);
 6349     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6350 
 6351   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6352     __ cmp(len, (u1)8);
 6353     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6354     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6355     __ tst(tmp3, UPPER_BIT_MASK);
 6356     __ br(Assembler::NE, RET_ADJUST);
 6357     __ sub(len, len, 8);
 6358 
 6359   __ bind(POST_LOOP16_LOAD_TAIL);
 6360     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6361     __ ldr(tmp1, Address(ary1));
 6362     __ mov(tmp2, 64);
 6363     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6364     __ lslv(tmp1, tmp1, tmp4);
 6365     __ tst(tmp1, UPPER_BIT_MASK);
 6366     __ br(Assembler::NE, RET_ADJUST);
 6367     // Fallthrough
 6368 
 6369   __ bind(RET_LEN);
 6370     __ pop(spilled_regs, sp);
 6371     __ leave();
 6372     __ ret(lr);
 6373 
    // The difference result - len is the count of bytes that are guaranteed
    // to be positive.
 6376 
 6377   __ bind(RET_ADJUST_LONG);
 6378     __ add(len, len, (u1)(large_loop_size - 16));
 6379   __ bind(RET_ADJUST_16);
 6380     __ add(len, len, 16);
 6381   __ bind(RET_ADJUST);
 6382     __ pop(spilled_regs, sp);
 6383     __ leave();
 6384     __ sub(result, result, len);
 6385     __ ret(lr);
 6386 
 6387     return entry;
 6388   }
 6389 
 6390   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6391         bool usePrefetch, Label &NOT_EQUAL) {
 6392     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6393         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6394         tmp7 = r12, tmp8 = r13;
 6395     Label LOOP;
 6396 
 6397     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6398     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6399     __ bind(LOOP);
 6400     if (usePrefetch) {
 6401       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6402       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6403     }
 6404     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6405     __ eor(tmp1, tmp1, tmp2);
 6406     __ eor(tmp3, tmp3, tmp4);
 6407     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6408     __ orr(tmp1, tmp1, tmp3);
 6409     __ cbnz(tmp1, NOT_EQUAL);
 6410     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6411     __ eor(tmp5, tmp5, tmp6);
 6412     __ eor(tmp7, tmp7, tmp8);
 6413     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6414     __ orr(tmp5, tmp5, tmp7);
 6415     __ cbnz(tmp5, NOT_EQUAL);
 6416     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6417     __ eor(tmp1, tmp1, tmp2);
 6418     __ eor(tmp3, tmp3, tmp4);
 6419     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6420     __ orr(tmp1, tmp1, tmp3);
 6421     __ cbnz(tmp1, NOT_EQUAL);
 6422     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6423     __ eor(tmp5, tmp5, tmp6);
 6424     __ sub(cnt1, cnt1, 8 * wordSize);
 6425     __ eor(tmp7, tmp7, tmp8);
 6426     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6427     // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 6429     __ subs(tmp6, cnt1, loopThreshold);
 6430     __ orr(tmp5, tmp5, tmp7);
 6431     __ cbnz(tmp5, NOT_EQUAL);
 6432     __ br(__ GE, LOOP);
 6433     // post-loop
 6434     __ eor(tmp1, tmp1, tmp2);
 6435     __ eor(tmp3, tmp3, tmp4);
 6436     __ orr(tmp1, tmp1, tmp3);
 6437     __ sub(cnt1, cnt1, 2 * wordSize);
 6438     __ cbnz(tmp1, NOT_EQUAL);
 6439   }
 6440 
 6441   void generate_large_array_equals_loop_simd(int loopThreshold,
 6442         bool usePrefetch, Label &NOT_EQUAL) {
 6443     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6444         tmp2 = rscratch2;
 6445     Label LOOP;
 6446 
 6447     __ bind(LOOP);
 6448     if (usePrefetch) {
 6449       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6450       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6451     }
 6452     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6453     __ sub(cnt1, cnt1, 8 * wordSize);
 6454     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6455     __ subs(tmp1, cnt1, loopThreshold);
 6456     __ eor(v0, __ T16B, v0, v4);
 6457     __ eor(v1, __ T16B, v1, v5);
 6458     __ eor(v2, __ T16B, v2, v6);
 6459     __ eor(v3, __ T16B, v3, v7);
 6460     __ orr(v0, __ T16B, v0, v1);
 6461     __ orr(v1, __ T16B, v2, v3);
 6462     __ orr(v0, __ T16B, v0, v1);
 6463     __ umov(tmp1, v0, __ D, 0);
 6464     __ umov(tmp2, v0, __ D, 1);
 6465     __ orr(tmp1, tmp1, tmp2);
 6466     __ cbnz(tmp1, NOT_EQUAL);
 6467     __ br(__ GE, LOOP);
 6468   }
 6469 
 6470   // a1 = r1 - array1 address
 6471   // a2 = r2 - array2 address
 6472   // result = r0 - return value. Already contains "false"
 6473   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6474   // r3-r5 are reserved temporary registers
 6475   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
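  //
  // For reference (illustrative): conceptually the stub computes
  //
  //   result = (memcmp(a1, a2, cnt1) == 0);
  //
  // over the bytes that remain to be checked, the caller having already
  // compared the first 8-byte chunk (hence the cnt1 adjustment below).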
 6476   address generate_large_array_equals() {
 6477     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6478         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6479         tmp7 = r12, tmp8 = r13;
 6480     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6481         SMALL_LOOP, POST_LOOP;
 6482     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 6483     // calculate if at least 32 prefetched bytes are used
 6484     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6485     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6486     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6487     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6488         tmp5, tmp6, tmp7, tmp8);
 6489 
 6490     __ align(CodeEntryAlignment);
 6491 
 6492     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6493     StubCodeMark mark(this, stub_id);
 6494 
 6495     address entry = __ pc();
 6496     __ enter();
 6497     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6498     // also advance pointers to use post-increment instead of pre-increment
 6499     __ add(a1, a1, wordSize);
 6500     __ add(a2, a2, wordSize);
 6501     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which incur a huge penalty (up to 2x exec
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so we can do an additional
      // 8-byte load, if needed, for the 1st address to make it 16-byte
      // aligned.
 6507       Label ALIGNED16;
 6508       __ tbz(a1, 3, ALIGNED16);
 6509       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6510       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6511       __ sub(cnt1, cnt1, wordSize);
 6512       __ eor(tmp1, tmp1, tmp2);
 6513       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6514       __ bind(ALIGNED16);
 6515     }
 6516     if (UseSIMDForArrayEquals) {
 6517       if (SoftwarePrefetchHintDistance >= 0) {
 6518         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6519         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6520         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6521             /* prfm = */ true, NOT_EQUAL);
 6522         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6523         __ br(__ LT, TAIL);
 6524       }
 6525       __ bind(NO_PREFETCH_LARGE_LOOP);
 6526       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6527           /* prfm = */ false, NOT_EQUAL);
 6528     } else {
 6529       __ push(spilled_regs, sp);
 6530       if (SoftwarePrefetchHintDistance >= 0) {
 6531         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6532         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6533         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6534             /* prfm = */ true, NOT_EQUAL);
 6535         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6536         __ br(__ LT, TAIL);
 6537       }
 6538       __ bind(NO_PREFETCH_LARGE_LOOP);
 6539       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6540           /* prfm = */ false, NOT_EQUAL);
 6541     }
 6542     __ bind(TAIL);
 6543       __ cbz(cnt1, EQUAL);
 6544       __ subs(cnt1, cnt1, wordSize);
 6545       __ br(__ LE, POST_LOOP);
 6546     __ bind(SMALL_LOOP);
 6547       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6548       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6549       __ subs(cnt1, cnt1, wordSize);
 6550       __ eor(tmp1, tmp1, tmp2);
 6551       __ cbnz(tmp1, NOT_EQUAL);
 6552       __ br(__ GT, SMALL_LOOP);
 6553     __ bind(POST_LOOP);
 6554       __ ldr(tmp1, Address(a1, cnt1));
 6555       __ ldr(tmp2, Address(a2, cnt1));
 6556       __ eor(tmp1, tmp1, tmp2);
 6557       __ cbnz(tmp1, NOT_EQUAL);
 6558     __ bind(EQUAL);
 6559       __ mov(result, true);
 6560     __ bind(NOT_EQUAL);
 6561       if (!UseSIMDForArrayEquals) {
 6562         __ pop(spilled_regs, sp);
 6563       }
 6564     __ bind(NOT_EQUAL_NO_POP);
 6565     __ leave();
 6566     __ ret(lr);
 6567     return entry;
 6568   }
 6569 
 6570   // result = r0 - return value. Contains initial hashcode value on entry.
 6571   // ary = r1 - array address
 6572   // cnt = r2 - elements count
 6573   // Clobbers: v0-v13, rscratch1, rscratch2
 6574   address generate_large_arrays_hashcode(BasicType eltype) {
 6575     const Register result = r0, ary = r1, cnt = r2;
 6576     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6577     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6578     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6579     const FloatRegister vpowm = v13;
 6580 
 6581     ARRAYS_HASHCODE_REGISTERS;
 6582 
 6583     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6584 
 6585     unsigned int vf; // vectorization factor
 6586     bool multiply_by_halves;
 6587     Assembler::SIMD_Arrangement load_arrangement;
 6588     switch (eltype) {
 6589     case T_BOOLEAN:
 6590     case T_BYTE:
 6591       load_arrangement = Assembler::T8B;
 6592       multiply_by_halves = true;
 6593       vf = 8;
 6594       break;
 6595     case T_CHAR:
 6596     case T_SHORT:
 6597       load_arrangement = Assembler::T8H;
 6598       multiply_by_halves = true;
 6599       vf = 8;
 6600       break;
 6601     case T_INT:
 6602       load_arrangement = Assembler::T4S;
 6603       multiply_by_halves = false;
 6604       vf = 4;
 6605       break;
 6606     default:
 6607       ShouldNotReachHere();
 6608     }
 6609 
 6610     // Unroll factor
 6611     const unsigned uf = 4;
 6612 
 6613     // Effective vectorization factor
 6614     const unsigned evf = vf * uf;
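
    // The loops below rely on the usual Horner factoring of the hash,
    //   h = (...((h0 * 31 + c0) * 31 + c1)...) * 31 + c_(n-1),
    // evaluated vf lanes at a time: each iteration computes
    // vmul = vmul * 31^vf + <next vf elements>, and the epilogue folds the
    // four lanes by multiplying with vpow = <31^3, 31^2, 31^1, 31^0> and
    // summing across the vector.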
 6615 
 6616     __ align(CodeEntryAlignment);
 6617 
 6618     StubGenStubId stub_id;
 6619     switch (eltype) {
 6620     case T_BOOLEAN:
 6621       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6622       break;
 6623     case T_BYTE:
 6624       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6625       break;
 6626     case T_CHAR:
 6627       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6628       break;
 6629     case T_SHORT:
 6630       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6631       break;
 6632     case T_INT:
 6633       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6634       break;
 6635     default:
 6636       stub_id = StubGenStubId::NO_STUBID;
 6637       ShouldNotReachHere();
 6638     };
 6639 
 6640     StubCodeMark mark(this, stub_id);
 6641 
 6642     address entry = __ pc();
 6643     __ enter();
 6644 
    // Put the 0th through 3rd powers of 31 together into a single SIMD
    // register. The register is used in the SMALL and LARGE LOOPS'
    // epilogues. The initialization is hoisted here, and the register's
    // value does not change throughout both loops.
 6648     __ movw(rscratch1, intpow(31U, 3));
 6649     __ mov(vpow, Assembler::S, 0, rscratch1);
 6650     __ movw(rscratch1, intpow(31U, 2));
 6651     __ mov(vpow, Assembler::S, 1, rscratch1);
 6652     __ movw(rscratch1, intpow(31U, 1));
 6653     __ mov(vpow, Assembler::S, 2, rscratch1);
 6654     __ movw(rscratch1, intpow(31U, 0));
 6655     __ mov(vpow, Assembler::S, 3, rscratch1);
 6656 
 6657     __ mov(vmul0, Assembler::T16B, 0);
 6658     __ mov(vmul0, Assembler::S, 3, result);
 6659 
 6660     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6661     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6662 
 6663     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6664     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6665 
 6666     // SMALL LOOP
 6667     __ bind(SMALL_LOOP);
 6668 
 6669     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6670     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6671     __ subsw(rscratch2, rscratch2, vf);
 6672 
 6673     if (load_arrangement == Assembler::T8B) {
 6674       // Extend 8B to 8H to be able to use vector multiply
 6675       // instructions
 6676       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6677       if (is_signed_subword_type(eltype)) {
 6678         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6679       } else {
 6680         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6681       }
 6682     }
 6683 
 6684     switch (load_arrangement) {
 6685     case Assembler::T4S:
 6686       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6687       break;
 6688     case Assembler::T8B:
 6689     case Assembler::T8H:
 6690       assert(is_subword_type(eltype), "subword type expected");
 6691       if (is_signed_subword_type(eltype)) {
 6692         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6693       } else {
 6694         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6695       }
 6696       break;
 6697     default:
 6698       __ should_not_reach_here();
 6699     }
 6700 
 6701     // Process the upper half of a vector
 6702     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6703       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6704       if (is_signed_subword_type(eltype)) {
 6705         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6706       } else {
 6707         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6708       }
 6709     }
 6710 
 6711     __ br(Assembler::HI, SMALL_LOOP);
 6712 
    // SMALL LOOP'S EPILOGUE
 6714     __ lsr(rscratch2, cnt, exact_log2(evf));
 6715     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6716 
 6717     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6718     __ addv(vmul0, Assembler::T4S, vmul0);
 6719     __ umov(result, vmul0, Assembler::S, 0);
 6720 
 6721     // TAIL
 6722     __ bind(TAIL);
 6723 
    // The andr computes cnt % vf. The subtract, shifted left by 3, offsets
    // past vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only
    // cnt % vf load + madd pairs are executed.
 6726     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
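    // For example, with vf == 8 and cnt % vf == 3, rscratch1 ends up at
    // BR_BASE - 3 * 8, so exactly the last 3 of the 7 load + madd pairs
    // below (2 insns, i.e. 8 bytes, each) are executed before BR_BASE.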
 6727     __ andr(rscratch2, cnt, vf - 1);
 6728     __ bind(TAIL_SHORTCUT);
 6729     __ adr(rscratch1, BR_BASE);
 6730     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6731     __ movw(rscratch2, 0x1f);
 6732     __ br(rscratch1);
 6733 
 6734     for (size_t i = 0; i < vf - 1; ++i) {
 6735       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6736                                    eltype);
 6737       __ maddw(result, result, rscratch2, rscratch1);
 6738     }
 6739     __ bind(BR_BASE);
 6740 
 6741     __ leave();
 6742     __ ret(lr);
 6743 
 6744     // LARGE LOOP
 6745     __ bind(LARGE_LOOP_PREHEADER);
 6746 
 6747     __ lsr(rscratch2, cnt, exact_log2(evf));
 6748 
 6749     if (multiply_by_halves) {
 6750       // 31^4 - multiplier between lower and upper parts of a register
 6751       __ movw(rscratch1, intpow(31U, vf / 2));
 6752       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6754       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6755       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6756     } else {
 6757       // 31^16
 6758       __ movw(rscratch1, intpow(31U, evf));
 6759       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6760     }
 6761 
 6762     __ mov(vmul3, Assembler::T16B, 0);
 6763     __ mov(vmul2, Assembler::T16B, 0);
 6764     __ mov(vmul1, Assembler::T16B, 0);
 6765 
 6766     __ bind(LARGE_LOOP);
 6767 
 6768     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6769     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6770     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6771     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6772 
 6773     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6774            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6775 
 6776     if (load_arrangement == Assembler::T8B) {
 6777       // Extend 8B to 8H to be able to use vector multiply
 6778       // instructions
 6779       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6780       if (is_signed_subword_type(eltype)) {
 6781         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6782         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6783         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6784         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6785       } else {
 6786         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6787         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6788         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6789         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6790       }
 6791     }
 6792 
 6793     switch (load_arrangement) {
 6794     case Assembler::T4S:
 6795       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6796       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6797       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6798       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6799       break;
 6800     case Assembler::T8B:
 6801     case Assembler::T8H:
 6802       assert(is_subword_type(eltype), "subword type expected");
 6803       if (is_signed_subword_type(eltype)) {
 6804         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6805         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6806         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6807         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6808       } else {
 6809         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6810         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6811         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6812         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6813       }
 6814       break;
 6815     default:
 6816       __ should_not_reach_here();
 6817     }
 6818 
 6819     // Process the upper half of a vector
 6820     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6821       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6822       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6823       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6824       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6825       if (is_signed_subword_type(eltype)) {
 6826         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6827         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6828         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6829         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6830       } else {
 6831         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6832         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6833         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6834         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6835       }
 6836     }
 6837 
 6838     __ subsw(rscratch2, rscratch2, 1);
 6839     __ br(Assembler::HI, LARGE_LOOP);
 6840 
 6841     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6842     __ addv(vmul3, Assembler::T4S, vmul3);
 6843     __ umov(result, vmul3, Assembler::S, 0);
 6844 
 6845     __ mov(rscratch2, intpow(31U, vf));
 6846 
 6847     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6848     __ addv(vmul2, Assembler::T4S, vmul2);
 6849     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6850     __ maddw(result, result, rscratch2, rscratch1);
 6851 
 6852     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6853     __ addv(vmul1, Assembler::T4S, vmul1);
 6854     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6855     __ maddw(result, result, rscratch2, rscratch1);
 6856 
 6857     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6858     __ addv(vmul0, Assembler::T4S, vmul0);
 6859     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6860     __ maddw(result, result, rscratch2, rscratch1);
 6861 
 6862     __ andr(rscratch2, cnt, vf - 1);
 6863     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6864 
 6865     __ leave();
 6866     __ ret(lr);
 6867 
 6868     return entry;
 6869   }
 6870 
 6871   address generate_dsin_dcos(bool isCos) {
 6872     __ align(CodeEntryAlignment);
 6873     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6874     StubCodeMark mark(this, stub_id);
 6875     address start = __ pc();
 6876     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6877         (address)StubRoutines::aarch64::_two_over_pi,
 6878         (address)StubRoutines::aarch64::_pio2,
 6879         (address)StubRoutines::aarch64::_dsin_coef,
 6880         (address)StubRoutines::aarch64::_dcos_coef);
 6881     return start;
 6882   }
 6883 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 6885   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6886       Label &DIFF2) {
 6887     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6888     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
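    // tmp2 points at the Latin1 data and cnt1 at the UTF-16 data; both are
    // post-incremented, and loads run ahead of the compares to hide latency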
 6889 
 6890     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6891     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6892     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // vtmp3 now holds the low 8 Latin1 chars zero-extended to UTF-16; the
    // high 8 are widened by the zip2 below, for 32 bytes of U characters total
 6894 
 6895     __ fmovd(tmpL, vtmp3);
 6896     __ eor(rscratch2, tmp3, tmpL);
 6897     __ cbnz(rscratch2, DIFF2);
 6898 
 6899     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6900     __ umov(tmpL, vtmp3, __ D, 1);
 6901     __ eor(rscratch2, tmpU, tmpL);
 6902     __ cbnz(rscratch2, DIFF1);
 6903 
 6904     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6905     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6906     __ fmovd(tmpL, vtmp);
 6907     __ eor(rscratch2, tmp3, tmpL);
 6908     __ cbnz(rscratch2, DIFF2);
 6909 
 6910     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6911     __ umov(tmpL, vtmp, __ D, 1);
 6912     __ eor(rscratch2, tmpU, tmpL);
 6913     __ cbnz(rscratch2, DIFF1);
 6914   }
 6915 
 6916   // r0  = result
 6917   // r1  = str1
 6918   // r2  = cnt1
 6919   // r3  = str2
 6920   // r4  = cnt2
 6921   // r10 = tmp1
 6922   // r11 = tmp2
 6923   address generate_compare_long_string_different_encoding(bool isLU) {
 6924     __ align(CodeEntryAlignment);
 6925     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6926     StubCodeMark mark(this, stub_id);
 6927     address entry = __ pc();
 6928     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6929         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6930         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6931     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6932         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6933     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6934     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6935 
 6936     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6937 
 6938     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 6941     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6942     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6943     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6944     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // 4 characters already loaded; the last 4 are a special case
 6946     __ eor(rscratch2, tmp1, tmp2);
 6947     __ mov(rscratch1, tmp2);
 6948     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6949     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6950              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6951     __ push(spilled_regs, sp);
 6952     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 6953     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 6954 
 6955     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6956 
 6957     if (SoftwarePrefetchHintDistance >= 0) {
 6958       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6959       __ br(__ LT, NO_PREFETCH);
 6960       __ bind(LARGE_LOOP_PREFETCH);
 6961         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6962         __ mov(tmp4, 2);
 6963         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6964         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6965           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6966           __ subs(tmp4, tmp4, 1);
 6967           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6968           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6969           __ mov(tmp4, 2);
 6970         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6971           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6972           __ subs(tmp4, tmp4, 1);
 6973           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6974           __ sub(cnt2, cnt2, 64);
 6975           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6976           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6977     }
 6978     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 6979     __ bind(NO_PREFETCH);
 6980     __ subs(cnt2, cnt2, 16);
 6981     __ br(__ LT, TAIL);
 6982     __ align(OptoLoopAlignment);
 6983     __ bind(SMALL_LOOP); // smaller loop
 6984       __ subs(cnt2, cnt2, 16);
 6985       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6986       __ br(__ GE, SMALL_LOOP);
 6987       __ cmn(cnt2, (u1)16);
 6988       __ br(__ EQ, LOAD_LAST);
 6989     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6990       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6991       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6992       __ ldr(tmp3, Address(cnt1, -8));
 6993       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 6994       __ b(LOAD_LAST);
 6995     __ bind(DIFF2);
 6996       __ mov(tmpU, tmp3);
 6997     __ bind(DIFF1);
 6998       __ pop(spilled_regs, sp);
 6999       __ b(CALCULATE_DIFFERENCE);
 7000     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3
      // by compare_string_16_x_LU. No need to load them again.
 7003       __ mov(tmpU, tmp3);
 7004       __ pop(spilled_regs, sp);
 7005 
 7006       // tmp2 points to the address of the last 4 Latin1 characters right now
 7007       __ ldrs(vtmp, Address(tmp2));
 7008       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 7009       __ fmovd(tmpL, vtmp);
 7010 
 7011       __ eor(rscratch2, tmpU, tmpL);
 7012       __ cbz(rscratch2, DONE);
 7013 
 7014     // Find the first different characters in the longwords and
 7015     // compute their difference.
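    // rev reverses the bytes so clz yields the bit index of the lowest
    // differing byte (the characters sit in little-endian order in the
    // registers); and-ing with -16 rounds down to the containing 16-bit
    // char, which both values are then shifted by before the subtract.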
 7016     __ bind(CALCULATE_DIFFERENCE);
 7017       __ rev(rscratch2, rscratch2);
 7018       __ clz(rscratch2, rscratch2);
 7019       __ andr(rscratch2, rscratch2, -16);
 7020       __ lsrv(tmp1, tmp1, rscratch2);
 7021       __ uxthw(tmp1, tmp1);
 7022       __ lsrv(rscratch1, rscratch1, rscratch2);
 7023       __ uxthw(rscratch1, rscratch1);
 7024       __ subw(result, tmp1, rscratch1);
 7025     __ bind(DONE);
 7026       __ ret(lr);
 7027     return entry;
 7028   }
 7029 
 7030   // r0 = input (float16)
 7031   // v0 = result (float)
 7032   // v1 = temporary float register
 7033   address generate_float16ToFloat() {
 7034     __ align(CodeEntryAlignment);
 7035     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7036     StubCodeMark mark(this, stub_id);
 7037     address entry = __ pc();
 7038     BLOCK_COMMENT("Entry:");
 7039     __ flt16_to_flt(v0, r0, v1);
 7040     __ ret(lr);
 7041     return entry;
 7042   }
 7043 
 7044   // v0 = input (float)
 7045   // r0 = result (float16)
 7046   // v1 = temporary float register
 7047   address generate_floatToFloat16() {
 7048     __ align(CodeEntryAlignment);
 7049     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7050     StubCodeMark mark(this, stub_id);
 7051     address entry = __ pc();
 7052     BLOCK_COMMENT("Entry:");
 7053     __ flt_to_flt16(r0, v0, v1);
 7054     __ ret(lr);
 7055     return entry;
 7056   }
 7057 
 7058   address generate_method_entry_barrier() {
 7059     __ align(CodeEntryAlignment);
 7060     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7061     StubCodeMark mark(this, stub_id);
 7062 
 7063     Label deoptimize_label;
 7064 
 7065     address start = __ pc();
 7066 
 7067     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7068 
 7069     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7070       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 7071       // We can get here despite the nmethod being good, if we have not
 7072       // yet applied our cross modification fence (or data fence).
 7073       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7074       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7075       __ ldrw(rscratch2, rscratch2);
 7076       __ strw(rscratch2, thread_epoch_addr);
 7077       __ isb();
 7078       __ membar(__ LoadLoad);
 7079     }
 7080 
 7081     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7082 
 7083     __ enter();
 7084     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7085 
 7086     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 7087 
 7088     __ push_call_clobbered_registers();
 7089 
 7090     __ mov(c_rarg0, rscratch2);
 7091     __ call_VM_leaf
 7092          (CAST_FROM_FN_PTR
 7093           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7094 
 7095     __ reset_last_Java_frame(true);
 7096 
 7097     __ mov(rscratch1, r0);
 7098 
 7099     __ pop_call_clobbered_registers();
 7100 
 7101     __ cbnz(rscratch1, deoptimize_label);
 7102 
 7103     __ leave();
 7104     __ ret(lr);
 7105 
 7106     __ BIND(deoptimize_label);
 7107 
 7108     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7109     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7110 
 7111     __ mov(sp, rscratch1);
 7112     __ br(rscratch2);
 7113 
 7114     return start;
 7115   }
 7116 
 7117   // r0  = result
 7118   // r1  = str1
 7119   // r2  = cnt1
 7120   // r3  = str2
 7121   // r4  = cnt2
 7122   // r10 = tmp1
 7123   // r11 = tmp2
 7124   address generate_compare_long_string_same_encoding(bool isLL) {
 7125     __ align(CodeEntryAlignment);
 7126     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7127     StubCodeMark mark(this, stub_id);
 7128     address entry = __ pc();
 7129     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7130         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7131 
 7132     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7133 
    // Exit the large loop when fewer than 64 bytes are left to read, or when
    // the next prefetch would reach beyond the array bounds.
 7136     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
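    // (cnt2 counts characters, so the byte threshold above is halved for
    // UTF-16 strings)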
 7137 
    // The caller has already pre-loaded 8 bytes into tmp1 and tmp2 before
    // jumping to this stub, so the first comparison can be done directly.
 7139     __ eor(rscratch2, tmp1, tmp2);
 7140     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7141 
 7142     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 7143     // update pointers, because of previous read
 7144     __ add(str1, str1, wordSize);
 7145     __ add(str2, str2, wordSize);
 7146     if (SoftwarePrefetchHintDistance >= 0) {
 7147       __ align(OptoLoopAlignment);
 7148       __ bind(LARGE_LOOP_PREFETCH);
 7149         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7150         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7151 
 7152         for (int i = 0; i < 4; i++) {
 7153           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7154           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7155           __ cmp(tmp1, tmp2);
 7156           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7157           __ br(Assembler::NE, DIFF);
 7158         }
 7159         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7160         __ add(str1, str1, 64);
 7161         __ add(str2, str2, 64);
 7162         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7163         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7164         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7165     }
 7166 
 7167     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7168     __ br(Assembler::LE, LESS16);
 7169     __ align(OptoLoopAlignment);
 7170     __ bind(LOOP_COMPARE16);
 7171       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7172       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7173       __ cmp(tmp1, tmp2);
 7174       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7175       __ br(Assembler::NE, DIFF);
 7176       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7177       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7178       __ br(Assembler::LT, LESS16);
 7179 
 7180       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7181       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7182       __ cmp(tmp1, tmp2);
 7183       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7184       __ br(Assembler::NE, DIFF);
 7185       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7186       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7187       __ br(Assembler::GE, LOOP_COMPARE16);
 7188       __ cbz(cnt2, LENGTH_DIFF);
 7189 
 7190     __ bind(LESS16);
      // compare 8 bytes at a time (8 Latin1 or 4 UTF-16 characters)
 7192       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7193       __ br(Assembler::LE, LESS8);
 7194       __ ldr(tmp1, Address(__ post(str1, 8)));
 7195       __ ldr(tmp2, Address(__ post(str2, 8)));
 7196       __ eor(rscratch2, tmp1, tmp2);
 7197       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7198       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7199 
 7200     __ bind(LESS8); // directly load last 8 bytes
 7201       if (!isLL) {
 7202         __ add(cnt2, cnt2, cnt2);
 7203       }
 7204       __ ldr(tmp1, Address(str1, cnt2));
 7205       __ ldr(tmp2, Address(str2, cnt2));
 7206       __ eor(rscratch2, tmp1, tmp2);
 7207       __ cbz(rscratch2, LENGTH_DIFF);
 7208       __ b(CAL_DIFFERENCE);
 7209 
 7210     __ bind(DIFF);
 7211       __ cmp(tmp1, tmp2);
 7212       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7213       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7214       // reuse rscratch2 register for the result of eor instruction
 7215       __ eor(rscratch2, tmp1, tmp2);
 7216 
 7217     __ bind(CAL_DIFFERENCE);
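      // Same rev/clz technique as in the different-encoding stub: locate the
      // lowest differing byte, round down to a char boundary (8 or 16 bits),
      // then extract and subtract the characters.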
 7218       __ rev(rscratch2, rscratch2);
 7219       __ clz(rscratch2, rscratch2);
 7220       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7221       __ lsrv(tmp1, tmp1, rscratch2);
 7222       __ lsrv(tmp2, tmp2, rscratch2);
 7223       if (isLL) {
 7224         __ uxtbw(tmp1, tmp1);
 7225         __ uxtbw(tmp2, tmp2);
 7226       } else {
 7227         __ uxthw(tmp1, tmp1);
 7228         __ uxthw(tmp2, tmp2);
 7229       }
 7230       __ subw(result, tmp1, tmp2);
 7231 
 7232     __ bind(LENGTH_DIFF);
 7233       __ ret(lr);
 7234     return entry;
 7235   }
 7236 
 7237   enum string_compare_mode {
 7238     LL,
 7239     LU,
 7240     UL,
 7241     UU,
 7242   };
 7243 
 7244   // The following registers are declared in aarch64.ad
 7245   // r0  = result
 7246   // r1  = str1
 7247   // r2  = cnt1
 7248   // r3  = str2
 7249   // r4  = cnt2
 7250   // r10 = tmp1
 7251   // r11 = tmp2
 7252   // z0  = ztmp1
 7253   // z1  = ztmp2
 7254   // p0  = pgtmp1
 7255   // p1  = pgtmp2
 7256   address generate_compare_long_string_sve(string_compare_mode mode) {
 7257     StubGenStubId stub_id;
 7258     switch (mode) {
 7259       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7260       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7261       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7262       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7263       default: ShouldNotReachHere();
 7264     }
 7265 
 7266     __ align(CodeEntryAlignment);
 7267     address entry = __ pc();
 7268     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7269              tmp1 = r10, tmp2 = r11;
 7270 
 7271     Label LOOP, DONE, MISMATCH;
 7272     Register vec_len = tmp1;
 7273     Register idx = tmp2;
 7274     // The minimum of the string lengths has been stored in cnt2.
 7275     Register cnt = cnt2;
 7276     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7277     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7278 
#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, idx)                                   \
 7280     switch (mode) {                                                            \
 7281       case LL:                                                                 \
 7282         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7283         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7284         break;                                                                 \
 7285       case LU:                                                                 \
 7286         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7287         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7288         break;                                                                 \
 7289       case UL:                                                                 \
 7290         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7291         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7292         break;                                                                 \
 7293       case UU:                                                                 \
 7294         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7295         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7296         break;                                                                 \
 7297       default:                                                                 \
 7298         ShouldNotReachHere();                                                  \
 7299     }
 7300 
 7301     StubCodeMark mark(this, stub_id);
 7302 
 7303     __ mov(idx, 0);
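    // whilelt makes a predicate lane active for each element index i with
    // idx + i < cnt, so partial tails need no separate scalar loop.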
 7304     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7305 
 7306     if (mode == LL) {
 7307       __ sve_cntb(vec_len);
 7308     } else {
 7309       __ sve_cnth(vec_len);
 7310     }
 7311 
 7312     __ sub(rscratch1, cnt, vec_len);
 7313 
 7314     __ bind(LOOP);
 7315 
 7316       // main loop
      LOAD_PAIR(ztmp1, ztmp2, pgtmp1, idx);
 7318       __ add(idx, idx, vec_len);
 7319       // Compare strings.
 7320       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7321       __ br(__ NE, MISMATCH);
 7322       __ cmp(idx, rscratch1);
 7323       __ br(__ LT, LOOP);
 7324 
 7325     // post loop, last iteration
 7326     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7327 
    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, idx);
 7329     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7330     __ br(__ EQ, DONE);
 7331 
 7332     __ bind(MISMATCH);
 7333 
    // Deactivate all predicate lanes from the first mismatch onwards (brkb),
    // so that lasta below extracts the first differing element of each string.
 7335     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7336     // Extract the first different characters of each string.
 7337     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7338     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7339 
 7340     // Compute the difference of the first different characters.
 7341     __ sub(result, rscratch1, rscratch2);
 7342 
 7343     __ bind(DONE);
 7344     __ ret(lr);
 7345 #undef LOAD_PAIR
 7346     return entry;
 7347   }
 7348 
 7349   void generate_compare_long_strings() {
 7350     if (UseSVE == 0) {
 7351       StubRoutines::aarch64::_compare_long_string_LL
 7352           = generate_compare_long_string_same_encoding(true);
 7353       StubRoutines::aarch64::_compare_long_string_UU
 7354           = generate_compare_long_string_same_encoding(false);
 7355       StubRoutines::aarch64::_compare_long_string_LU
 7356           = generate_compare_long_string_different_encoding(true);
 7357       StubRoutines::aarch64::_compare_long_string_UL
 7358           = generate_compare_long_string_different_encoding(false);
 7359     } else {
 7360       StubRoutines::aarch64::_compare_long_string_LL
 7361           = generate_compare_long_string_sve(LL);
 7362       StubRoutines::aarch64::_compare_long_string_UU
 7363           = generate_compare_long_string_sve(UU);
 7364       StubRoutines::aarch64::_compare_long_string_LU
 7365           = generate_compare_long_string_sve(LU);
 7366       StubRoutines::aarch64::_compare_long_string_UL
 7367           = generate_compare_long_string_sve(UL);
 7368     }
 7369   }
 7370 
 7371   // R0 = result
 7372   // R1 = str2
 7373   // R2 = cnt1
 7374   // R3 = str1
 7375   // R4 = cnt2
 7376   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7377   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, its first register-full of characters
  // can safely be kept loaded, skipping repeated initial loads (this helps on
  // systems with a single load pipeline)
  // 2) the "fast" single-character search algorithm is used to find the first
  // symbol with fewer branches (one branch per loaded register instead of one
  // per symbol); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after the first register of the source string is loaded and analyzed,
  // it can be reused to search for every occurrence of the first character,
  // saving a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
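  //
  // As an illustration of idea 2) for the LL case: after "eor ch2, first, ch2"
  // every byte of ch2 that matched the first pattern character is 0x00. The
  // sequence "sub tmp2, ch2, 0x0101...01; orr ch2, ch2, 0x7f7f...7f;
  // bics tmp2, tmp2, ch2" then leaves 0x80 in tmp2 exactly at those zero
  // bytes, so one flags check detects all candidate positions at once.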
 7392   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7393     StubGenStubId stub_id;
 7394     if (str1_isL) {
 7395       if (str2_isL) {
 7396         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7397       } else {
 7398         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7399       }
 7400     } else {
 7401       if (str2_isL) {
 7402         ShouldNotReachHere();
 7403       } else {
 7404         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7405       }
 7406     }
 7407     __ align(CodeEntryAlignment);
 7408     StubCodeMark mark(this, stub_id);
 7409     address entry = __ pc();
 7410 
 7411     int str1_chr_size = str1_isL ? 1 : 2;
 7412     int str2_chr_size = str2_isL ? 1 : 2;
 7413     int str1_chr_shift = str1_isL ? 0 : 1;
 7414     int str2_chr_shift = str2_isL ? 0 : 1;
 7415     bool isL = str1_isL && str2_isL;
    // parameters
 7417     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7418     // temporary registers
 7419     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7420     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7421     // redefinitions
 7422     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7423 
 7424     __ push(spilled_regs, sp);
 7425     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7426         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7427         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7428         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7429         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7430         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 7431     // Read whole register from str1. It is safe, because length >=8 here
 7432     __ ldr(ch1, Address(str1));
 7433     // Read whole register from str2. It is safe, because length >=8 here
 7434     __ ldr(ch2, Address(str2));
 7435     __ sub(cnt2, cnt2, cnt1);
 7436     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7437     if (str1_isL != str2_isL) {
 7438       __ eor(v0, __ T16B, v0, v0);
 7439     }
 7440     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7441     __ mul(first, first, tmp1);
    // check if we have fewer than one register's worth of characters to check
 7443     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7444     if (str1_isL != str2_isL) {
 7445       __ fmovd(v1, ch1);
 7446     }
 7447     __ br(__ LE, L_SMALL);
 7448     __ eor(ch2, first, ch2);
 7449     if (str1_isL != str2_isL) {
 7450       __ zip1(v1, __ T16B, v1, v0);
 7451     }
 7452     __ sub(tmp2, ch2, tmp1);
 7453     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7454     __ bics(tmp2, tmp2, ch2);
 7455     if (str1_isL != str2_isL) {
 7456       __ fmovd(ch1, v1);
 7457     }
 7458     __ br(__ NE, L_HAS_ZERO);
 7459     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7460     __ add(result, result, wordSize/str2_chr_size);
 7461     __ add(str2, str2, wordSize);
 7462     __ br(__ LT, L_POST_LOOP);
 7463     __ BIND(L_LOOP);
 7464       __ ldr(ch2, Address(str2));
 7465       __ eor(ch2, first, ch2);
 7466       __ sub(tmp2, ch2, tmp1);
 7467       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7468       __ bics(tmp2, tmp2, ch2);
 7469       __ br(__ NE, L_HAS_ZERO);
 7470     __ BIND(L_LOOP_PROCEED);
 7471       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7472       __ add(str2, str2, wordSize);
 7473       __ add(result, result, wordSize/str2_chr_size);
 7474       __ br(__ GE, L_LOOP);
 7475     __ BIND(L_POST_LOOP);
 7476       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7477       __ br(__ LE, NOMATCH);
 7478       __ ldr(ch2, Address(str2));
 7479       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7480       __ eor(ch2, first, ch2);
 7481       __ sub(tmp2, ch2, tmp1);
 7482       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7483       __ mov(tmp4, -1); // all bits set
 7484       __ b(L_SMALL_PROCEED);
 7485     __ align(OptoLoopAlignment);
 7486     __ BIND(L_SMALL);
 7487       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7488       __ eor(ch2, first, ch2);
 7489       if (str1_isL != str2_isL) {
 7490         __ zip1(v1, __ T16B, v1, v0);
 7491       }
 7492       __ sub(tmp2, ch2, tmp1);
 7493       __ mov(tmp4, -1); // all bits set
 7494       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7495       if (str1_isL != str2_isL) {
 7496         __ fmovd(ch1, v1); // move converted 4 symbols
 7497       }
 7498     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
 7500       __ bic(tmp2, tmp2, ch2);
 7501       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7502       __ rbit(tmp2, tmp2);
 7503       __ br(__ EQ, NOMATCH);
 7504     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long, up to 4 cycles on some CPUs
 7506       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7507       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7508       if (str2_isL) { // LL
 7509         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7510         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7511         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7512         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7513         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7514       } else {
 7515         __ mov(ch2, 0xE); // all bits in byte set except last one
 7516         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7517         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7518         __ lslv(tmp2, tmp2, tmp4);
 7519         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7520         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7521         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7522         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7523       }
 7524       __ cmp(ch1, ch2);
 7525       __ mov(tmp4, wordSize/str2_chr_size);
 7526       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7527     __ BIND(L_SMALL_CMP_LOOP);
 7528       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7529                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7530       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7531                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7532       __ add(tmp4, tmp4, 1);
 7533       __ cmp(tmp4, cnt1);
 7534       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7535       __ cmp(first, ch2);
 7536       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7537     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7538       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7539       __ clz(tmp4, tmp2);
 7540       __ add(result, result, 1); // advance index
 7541       __ add(str2, str2, str2_chr_size); // advance pointer
 7542       __ b(L_SMALL_HAS_ZERO_LOOP);
 7543     __ align(OptoLoopAlignment);
 7544     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7545       __ cmp(first, ch2);
 7546       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7547       __ b(DONE);
 7548     __ align(OptoLoopAlignment);
 7549     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7550       if (str2_isL) { // LL
 7551         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7552         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7553         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7554         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7555         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7556       } else {
 7557         __ mov(ch2, 0xE); // all bits in byte set except last one
 7558         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7559         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7560         __ lslv(tmp2, tmp2, tmp4);
 7561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7562         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7563         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7564         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7565       }
 7566       __ cmp(ch1, ch2);
 7567       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7568       __ b(DONE);
 7569     __ align(OptoLoopAlignment);
 7570     __ BIND(L_HAS_ZERO);
 7571       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long, up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and unchanged in this loop;
      // they are restored on exit, so cnt1 can be reused within the loop.
 7576       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7577       __ sub(result, result, 1);
 7578     __ BIND(L_HAS_ZERO_LOOP);
 7579       __ mov(cnt1, wordSize/str2_chr_size);
 7580       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7581       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7582       if (str2_isL) {
 7583         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7584         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7585         __ lslv(tmp2, tmp2, tmp4);
 7586         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7587         __ add(tmp4, tmp4, 1);
 7588         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7589         __ lsl(tmp2, tmp2, 1);
 7590         __ mov(tmp4, wordSize/str2_chr_size);
 7591       } else {
 7592         __ mov(ch2, 0xE);
 7593         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7594         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7595         __ lslv(tmp2, tmp2, tmp4);
 7596         __ add(tmp4, tmp4, 1);
 7597         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7598         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7599         __ lsl(tmp2, tmp2, 1);
 7600         __ mov(tmp4, wordSize/str2_chr_size);
 7601         __ sub(str2, str2, str2_chr_size);
 7602       }
 7603       __ cmp(ch1, ch2);
 7604       __ mov(tmp4, wordSize/str2_chr_size);
 7605       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7606     __ BIND(L_CMP_LOOP);
 7607       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7608                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7609       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7610                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7611       __ add(tmp4, tmp4, 1);
 7612       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7613       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7614       __ cmp(cnt1, ch2);
 7615       __ br(__ EQ, L_CMP_LOOP);
 7616     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
 7618       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7619       __ clz(tmp4, tmp2);
 7620       __ add(str2, str2, str2_chr_size); // advance pointer
 7621       __ b(L_HAS_ZERO_LOOP);
 7622     __ align(OptoLoopAlignment);
 7623     __ BIND(L_CMP_LOOP_LAST_CMP);
 7624       __ cmp(cnt1, ch2);
 7625       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7626       __ b(DONE);
 7627     __ align(OptoLoopAlignment);
 7628     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7629       if (str2_isL) {
 7630         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7631         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7632         __ lslv(tmp2, tmp2, tmp4);
 7633         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7634         __ add(tmp4, tmp4, 1);
 7635         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7636         __ lsl(tmp2, tmp2, 1);
 7637       } else {
 7638         __ mov(ch2, 0xE);
 7639         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7640         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7641         __ lslv(tmp2, tmp2, tmp4);
 7642         __ add(tmp4, tmp4, 1);
 7643         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7644         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7645         __ lsl(tmp2, tmp2, 1);
 7646         __ sub(str2, str2, str2_chr_size);
 7647       }
 7648       __ cmp(ch1, ch2);
 7649       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7650       __ b(DONE);
 7651     __ align(OptoLoopAlignment);
 7652     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1, leaving the respective high bits unchanged.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next octet. result & 7 (or & 3) is the index of
      // the last analyzed substring inside the current octet, so str2 points at
      // the respective start address and must be advanced to the next octet.
 7663       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7664       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7665       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7666       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7667       __ movw(cnt2, cnt2);
 7668       __ b(L_LOOP_PROCEED);
 7669     __ align(OptoLoopAlignment);
 7670     __ BIND(NOMATCH);
 7671       __ mov(result, -1);
 7672     __ BIND(DONE);
 7673       __ pop(spilled_regs, sp);
 7674       __ ret(lr);
 7675     return entry;
 7676   }
 7677 
 7678   void generate_string_indexof_stubs() {
 7679     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7680     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7681     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7682   }
 7683 
 7684   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7685       FloatRegister src1, FloatRegister src2) {
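    // Inflate 32 Latin1 bytes from src1:src2 into 32 UTF-16 chars by
    // interleaving each byte with a zero byte from v0 (zip1/zip2), then
    // store the resulting 64 bytes at dst with a single st1.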
 7686     Register dst = r1;
 7687     __ zip1(v1, __ T16B, src1, v0);
 7688     __ zip2(v2, __ T16B, src1, v0);
 7689     if (generatePrfm) {
 7690       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7691     }
 7692     __ zip1(v3, __ T16B, src2, v0);
 7693     __ zip2(v4, __ T16B, src2, v0);
 7694     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7695   }
 7696 
 7697   // R0 = src
 7698   // R1 = dst
 7699   // R2 = len
 7700   // R3 = len >> 3
 7701   // V0 = 0
 7702   // v1 = loaded 8 bytes
 7703   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 7704   address generate_large_byte_array_inflate() {
 7705     __ align(CodeEntryAlignment);
 7706     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7707     StubCodeMark mark(this, stub_id);
 7708     address entry = __ pc();
 7709     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7710     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7711     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7712 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can be used.
 7715     __ ldrd(v2, __ post(src, 8));
 7716     __ sub(octetCounter, octetCounter, 2);
 7717     __ zip1(v1, __ T16B, v1, v0);
 7718     __ zip1(v2, __ T16B, v2, v0);
 7719     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7720     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7721     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7722     __ br(__ LE, LOOP_START);
 7723     __ b(LOOP_PRFM_START);
 7724     __ bind(LOOP_PRFM);
 7725       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7726     __ bind(LOOP_PRFM_START);
 7727       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7728       __ sub(octetCounter, octetCounter, 8);
 7729       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7730       inflate_and_store_2_fp_registers(true, v3, v4);
 7731       inflate_and_store_2_fp_registers(true, v5, v6);
 7732       __ br(__ GT, LOOP_PRFM);
 7733       __ cmp(octetCounter, (u1)8);
 7734       __ br(__ LT, DONE);
 7735     __ bind(LOOP);
 7736       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7737       __ bind(LOOP_START);
 7738       __ sub(octetCounter, octetCounter, 8);
 7739       __ cmp(octetCounter, (u1)8);
 7740       inflate_and_store_2_fp_registers(false, v3, v4);
 7741       inflate_and_store_2_fp_registers(false, v5, v6);
 7742       __ br(__ GE, LOOP);
 7743     __ bind(DONE);
 7744       __ ret(lr);
 7745     return entry;
 7746   }
 7747 
 7748   /**
 7749    *  Arguments:
 7750    *
 7751    *  Input:
 7752    *  c_rarg0   - current state address
 7753    *  c_rarg1   - H key address
 7754    *  c_rarg2   - data address
 7755    *  c_rarg3   - number of blocks
 7756    *
 7757    *  Output:
 7758    *  Updated state at c_rarg0
 7759    */
 7760   address generate_ghash_processBlocks() {
 7761     // Bafflingly, GCM uses little-endian for the byte order, but
 7762     // big-endian for the bit order.  For example, the polynomial 1 is
 7763     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7764     //
 7765     // So, we must either reverse the bytes in each word and do
 7766     // everything big-endian or reverse the bits in each byte and do
 7767     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7768     // the bits in each byte (we have an instruction, RBIT, to do
 7769     // that) and keep the data in little-endian bit order through the
 7770     // calculation, bit-reversing the inputs and outputs.
 7771 
 7772     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7773     StubCodeMark mark(this, stub_id);
 7774     __ align(wordSize * 2);
 7775     address p = __ pc();
 7776     __ emit_int64(0x87);  // The low-order bits of the field
 7777                           // polynomial (i.e. p = z^7+z^2+z+1)
 7778                           // repeated in the low and high parts of a
 7779                           // 128-bit vector
 7780     __ emit_int64(0x87);
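                          // (0x87 == 0b10000111 encodes z^7+z^2+z+1; the full
                          // GCM field polynomial is z^128+z^7+z^2+z+1, with
                          // the z^128 term implicit in the reduction)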
 7781 
 7782     __ align(CodeEntryAlignment);
 7783     address start = __ pc();
 7784 
 7785     Register state   = c_rarg0;
 7786     Register subkeyH = c_rarg1;
 7787     Register data    = c_rarg2;
 7788     Register blocks  = c_rarg3;
 7789 
 7790     FloatRegister vzr = v30;
 7791     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7792 
 7793     __ ldrq(v24, p);    // The field polynomial
 7794 
 7795     __ ldrq(v0, Address(state));
 7796     __ ldrq(v1, Address(subkeyH));
 7797 
 7798     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7799     __ rbit(v0, __ T16B, v0);
 7800     __ rev64(v1, __ T16B, v1);
 7801     __ rbit(v1, __ T16B, v1);
 7802 
    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
 7804     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7805 
 7806     {
 7807       Label L_ghash_loop;
 7808       __ bind(L_ghash_loop);
 7809 
 7810       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7811                                                  // reversing each byte
 7812       __ rbit(v2, __ T16B, v2);
 7813       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7814 
 7815       // Multiply state in v2 by subkey in v1
 7816       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7817                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7818                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7819       // Reduce v7:v5 by the field polynomial
 7820       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7821 
 7822       __ sub(blocks, blocks, 1);
 7823       __ cbnz(blocks, L_ghash_loop);
 7824     }
 7825 
 7826     // The bit-reversed result is at this point in v0
 7827     __ rev64(v0, __ T16B, v0);
 7828     __ rbit(v0, __ T16B, v0);
 7829 
 7830     __ st1(v0, __ T16B, state);
 7831     __ ret(lr);
 7832 
 7833     return start;
 7834   }
 7835 
 7836   address generate_ghash_processBlocks_wide() {
 7837     address small = generate_ghash_processBlocks();
 7838 
 7839     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7840     StubCodeMark mark(this, stub_id);
 7841     __ align(wordSize * 2);
 7842     address p = __ pc();
 7843     __ emit_int64(0x87);  // The low-order bits of the field
 7844                           // polynomial (i.e. p = z^7+z^2+z+1)
 7845                           // repeated in the low and high parts of a
 7846                           // 128-bit vector
 7847     __ emit_int64(0x87);
 7848 
 7849     __ align(CodeEntryAlignment);
 7850     address start = __ pc();
 7851 
 7852     Register state   = c_rarg0;
 7853     Register subkeyH = c_rarg1;
 7854     Register data    = c_rarg2;
 7855     Register blocks  = c_rarg3;
 7856 
 7857     const int unroll = 4;
 7858 
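    // fewer than unroll * 2 == 8 blocks: not worth the wide path, use the
    // non-unrolled stub instead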
 7859     __ cmp(blocks, (unsigned char)(unroll * 2));
 7860     __ br(__ LT, small);
 7861 
    if (unroll > 1) {
      // Save SIMD state: the low 64 bits of v8..v15 are callee-saved in AAPCS64
 7864       __ sub(sp, sp, 4 * 16);
 7865       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7866       __ sub(sp, sp, 4 * 16);
 7867       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7868     }
 7869 
 7870     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7871 
 7872     if (unroll > 1) {
 7873       // And restore state
 7874       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7875       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7876     }
 7877 
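    // The wide loop may leave a tail of fewer than unroll blocks; finish
    // any remainder in the non-unrolled stub.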
 7878     __ cmp(blocks, (unsigned char)0);
 7879     __ br(__ GT, small);
 7880 
 7881     __ ret(lr);
 7882 
 7883     return start;
 7884   }
 7885 
 7886   void generate_base64_encode_simdround(Register src, Register dst,
 7887         FloatRegister codec, u8 size) {
 7888 
 7889     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7890     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7891     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7892 
 7893     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7894 
 7895     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 7896 
 7897     __ ushr(ind0, arrangement, in0,  2);
 7898 
 7899     __ ushr(ind1, arrangement, in1,  2);
 7900     __ shl(in0,   arrangement, in0,  6);
 7901     __ orr(ind1,  arrangement, ind1, in0);
 7902     __ ushr(ind1, arrangement, ind1, 2);
 7903 
 7904     __ ushr(ind2, arrangement, in2,  4);
 7905     __ shl(in1,   arrangement, in1,  4);
 7906     __ orr(ind2,  arrangement, in1,  ind2);
 7907     __ ushr(ind2, arrangement, ind2, 2);
 7908 
 7909     __ shl(ind3,  arrangement, in2,  2);
 7910     __ ushr(ind3, arrangement, ind3, 2);
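    // At this point, per byte lane: ind0 = b0 >> 2,
    // ind1 = (b0 & 0x3) << 4 | b1 >> 4, ind2 = (b1 & 0xF) << 2 | b2 >> 6,
    // ind3 = b2 & 0x3F, i.e. the four 6-bit Base64 indices for bytes b0..b2.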
 7911 
 7912     __ tbl(out0,  arrangement, codec,  4, ind0);
 7913     __ tbl(out1,  arrangement, codec,  4, ind1);
 7914     __ tbl(out2,  arrangement, codec,  4, ind2);
 7915     __ tbl(out3,  arrangement, codec,  4, ind3);
 7916 
 7917     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7918   }
 7919 
 7920    /**
 7921    *  Arguments:
 7922    *
 7923    *  Input:
 7924    *  c_rarg0   - src_start
 7925    *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive end offset)
 7927    *  c_rarg3   - dest_start
 7928    *  c_rarg4   - dest_offset
 7929    *  c_rarg5   - isURL
 7930    *
 7931    */
 7932   address generate_base64_encodeBlock() {
 7933 
 7934     static const char toBase64[64] = {
 7935       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7936       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7937       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7938       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7939       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7940     };
 7941 
 7942     static const char toBase64URL[64] = {
 7943       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7944       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7945       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7946       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7947       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7948     };
 7949 
 7950     __ align(CodeEntryAlignment);
 7951     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7952     StubCodeMark mark(this, stub_id);
 7953     address start = __ pc();
 7954 
 7955     Register src   = c_rarg0;  // source array
 7956     Register soff  = c_rarg1;  // source start offset
 7957     Register send  = c_rarg2;  // source end offset
 7958     Register dst   = c_rarg3;  // dest array
 7959     Register doff  = c_rarg4;  // position for writing to dest array
 7960     Register isURL = c_rarg5;  // Base64 or URL character set
 7961 
 7962     // c_rarg6 and c_rarg7 are free to use as temps
 7963     Register codec  = c_rarg6;
 7964     Register length = c_rarg7;
 7965 
 7966     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7967 
 7968     __ add(src, src, soff);
 7969     __ add(dst, dst, doff);
 7970     __ sub(length, send, soff);
 7971 
 7972     // load the codec base address
 7973     __ lea(codec, ExternalAddress((address) toBase64));
 7974     __ cbz(isURL, ProcessData);
 7975     __ lea(codec, ExternalAddress((address) toBase64URL));
 7976 
 7977     __ BIND(ProcessData);
 7978 
    // too short to form a SIMD loop; fall back to byte-at-a-time processing
 7980     __ cmp(length, (u1)24);
 7981     __ br(Assembler::LT, Process3B);
 7982 
 7983     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7984 
 7985     __ BIND(Process48B);
 7986     __ cmp(length, (u1)48);
 7987     __ br(Assembler::LT, Process24B);
 7988     generate_base64_encode_simdround(src, dst, v0, 16);
 7989     __ sub(length, length, 48);
 7990     __ b(Process48B);
 7991 
 7992     __ BIND(Process24B);
 7993     __ cmp(length, (u1)24);
 7994     __ br(Assembler::LT, SIMDExit);
 7995     generate_base64_encode_simdround(src, dst, v0, 8);
 7996     __ sub(length, length, 24);
 7997 
 7998     __ BIND(SIMDExit);
 7999     __ cbz(length, Exit);
 8000 
 8001     __ BIND(Process3B);
 8002     //  3 src bytes, 24 bits
 8003     __ ldrb(r10, __ post(src, 1));
 8004     __ ldrb(r11, __ post(src, 1));
 8005     __ ldrb(r12, __ post(src, 1));
 8006     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 8007     __ orrw(r12, r12, r11, Assembler::LSL, 8);
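    // r12 = b0 << 16 | b1 << 8 | b2. E.g. for input "Man" (0x4D, 0x61, 0x6E),
    // r12 = 0x4D616E and the four 6-bit indices below are 19, 22, 5, 46,
    // which encode to "TWFu" in the standard alphabet (RFC 4648).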
 8008     // codec index
 8009     __ ubfmw(r15, r12, 18, 23);
 8010     __ ubfmw(r14, r12, 12, 17);
 8011     __ ubfmw(r13, r12, 6,  11);
 8012     __ andw(r12,  r12, 63);
 8013     // get the code based on the codec
 8014     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 8015     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 8016     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 8017     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 8018     __ strb(r15, __ post(dst, 1));
 8019     __ strb(r14, __ post(dst, 1));
 8020     __ strb(r13, __ post(dst, 1));
 8021     __ strb(r12, __ post(dst, 1));
 8022     __ sub(length, length, 3);
 8023     __ cbnz(length, Process3B);
 8024 
 8025     __ BIND(Exit);
 8026     __ ret(lr);
 8027 
 8028     return start;
 8029   }
 8030 
 8031   void generate_base64_decode_simdround(Register src, Register dst,
 8032         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8033 
 8034     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8035     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8036 
 8037     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8038     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8039 
 8040     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8041 
 8042     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8043 
 8044     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8045 
    // We need an unsigned saturating subtract so that every input value in
    // the range [0, 63] yields 0 for the higher-half lookup.
 8048     __ uqsubv(decH0, __ T16B, in0, v27);
 8049     __ uqsubv(decH1, __ T16B, in1, v27);
 8050     __ uqsubv(decH2, __ T16B, in2, v27);
 8051     __ uqsubv(decH3, __ T16B, in3, v27);
 8052 
 8053     // lower half lookup
 8054     __ tbl(decL0, arrangement, codecL, 4, in0);
 8055     __ tbl(decL1, arrangement, codecL, 4, in1);
 8056     __ tbl(decL2, arrangement, codecL, 4, in2);
 8057     __ tbl(decL3, arrangement, codecL, 4, in3);
 8058 
 8059     // higher half lookup
 8060     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8061     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8062     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8063     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8064 
 8065     // combine lower and higher
 8066     __ orr(decL0, arrangement, decL0, decH0);
 8067     __ orr(decL1, arrangement, decL1, decH1);
 8068     __ orr(decL2, arrangement, decL2, decH2);
 8069     __ orr(decL3, arrangement, decL3, decH3);
 8070 
    // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
 8072     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8073     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8074     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8075     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8076     __ orr(in0, arrangement, decH0, decH1);
 8077     __ orr(in1, arrangement, decH2, decH3);
 8078     __ orr(in2, arrangement, in0,   in1);
 8079     __ umaxv(in3, arrangement, in2);
 8080     __ umov(rscratch2, in3, __ B, 0);
 8081 
 8082     // get the data to output
 8083     __ shl(out0,  arrangement, decL0, 2);
 8084     __ ushr(out1, arrangement, decL1, 4);
 8085     __ orr(out0,  arrangement, out0,  out1);
 8086     __ shl(out1,  arrangement, decL1, 4);
 8087     __ ushr(out2, arrangement, decL2, 2);
 8088     __ orr(out1,  arrangement, out1,  out2);
 8089     __ shl(out2,  arrangement, decL2, 6);
 8090     __ orr(out2,  arrangement, out2,  decL3);
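    // Per byte lane: out0 = d0 << 2 | d1 >> 4, out1 = d1 << 4 | d2 >> 2,
    // out2 = d2 << 6 | d3, reassembling three bytes from the sextets d0..d3.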
 8091 
 8092     __ cbz(rscratch2, NoIllegalData);
 8093 
 8094     // handle illegal input
 8095     __ umov(r10, in2, __ D, 0);
 8096     if (size == 16) {
 8097       __ cbnz(r10, ErrorInLowerHalf);
 8098 
 8099       // illegal input is in higher half, store the lower half now.
 8100       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8101 
 8102       __ umov(r10, in2,  __ D, 1);
 8103       __ umov(r11, out0, __ D, 1);
 8104       __ umov(r12, out1, __ D, 1);
 8105       __ umov(r13, out2, __ D, 1);
 8106       __ b(StoreLegalData);
 8107 
 8108       __ BIND(ErrorInLowerHalf);
 8109     }
 8110     __ umov(r11, out0, __ D, 0);
 8111     __ umov(r12, out1, __ D, 0);
 8112     __ umov(r13, out2, __ D, 0);
 8113 
 8114     __ BIND(StoreLegalData);
 8115     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8116     __ strb(r11, __ post(dst, 1));
 8117     __ strb(r12, __ post(dst, 1));
 8118     __ strb(r13, __ post(dst, 1));
 8119     __ lsr(r10, r10, 8);
 8120     __ lsr(r11, r11, 8);
 8121     __ lsr(r12, r12, 8);
 8122     __ lsr(r13, r13, 8);
 8123     __ b(StoreLegalData);
 8124 
 8125     __ BIND(NoIllegalData);
 8126     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8127   }
 8128 
 8129 
 8130    /**
 8131    *  Arguments:
 8132    *
 8133    *  Input:
 8134    *  c_rarg0   - src_start
 8135    *  c_rarg1   - src_offset
 8136    *  c_rarg2   - src_length
 8137    *  c_rarg3   - dest_start
 8138    *  c_rarg4   - dest_offset
 8139    *  c_rarg5   - isURL
 8140    *  c_rarg6   - isMIME
 8141    *
 8142    */
 8143   address generate_base64_decodeBlock() {
 8144 
 8145     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 8146     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
 8147     // titled "Base64 decoding".
 8148 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while
    // fromBase(URL)64ForNoSIMD['='] == 255 here.
 8152     static const uint8_t fromBase64ForNoSIMD[256] = {
 8153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8156        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8157       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8158        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8159       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8160        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8162       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8163       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8167       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8168       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8169     };
 8170 
 8171     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8175        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8176       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8177        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8178       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8179        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8180       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8181       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8182       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8183       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8184       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8185       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8186       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8187       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8188     };
 8189 
    // A legal base64 code value is in the range [0, 127].  That is too wide for a
    // single four-register tbl/tbx lookup (64 bytes), so we need two lookups and
    // must combine the results to get the decoded data.  The 1st table vector
    // lookup uses tbl: out-of-range indices are set to 0 in the destination.  The
    // 2nd table vector lookup uses tbx: out-of-range indices leave the destination
    // unchanged.  Inputs [64..126] are mapped to indices [65, 127] for the second
    // lookup.  The value at index 64 is set to 0, so that for inputs below 64 we
    // know the 1st lookup has already produced the decoded data.
 8197     static const uint8_t fromBase64ForSIMD[128] = {
 8198       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8199       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8200       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8201        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8202         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8203        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8204       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8205        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8206     };
 8207 
 8208     static const uint8_t fromBase64URLForSIMD[128] = {
 8209       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8210       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8211       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8212        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8213         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8214        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8215        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8216        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8217     };
 8218 
 8219     __ align(CodeEntryAlignment);
 8220     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8221     StubCodeMark mark(this, stub_id);
 8222     address start = __ pc();
 8223 
 8224     Register src    = c_rarg0;  // source array
 8225     Register soff   = c_rarg1;  // source start offset
 8226     Register send   = c_rarg2;  // source end offset
 8227     Register dst    = c_rarg3;  // dest array
 8228     Register doff   = c_rarg4;  // position for writing to dest array
 8229     Register isURL  = c_rarg5;  // Base64 or URL character set
 8230     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8231 
 8232     Register length = send;    // reuse send as length of source data to process
 8233 
 8234     Register simd_codec   = c_rarg6;
 8235     Register nosimd_codec = c_rarg7;
 8236 
 8237     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8238 
 8239     __ enter();
 8240 
 8241     __ add(src, src, soff);
 8242     __ add(dst, dst, doff);
 8243 
 8244     __ mov(doff, dst);
 8245 
 8246     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the low two bits: round length down to a multiple of 4
 8248 
 8249     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8250     __ cbz(isURL, ProcessData);
 8251     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8252 
 8253     __ BIND(ProcessData);
 8254     __ mov(rscratch1, length);
 8255     __ cmp(length, (u1)144); // 144 = 80 + 64
 8256     __ br(Assembler::LT, Process4B);
 8257 
 8258     // In the MIME case, the line length cannot be more than 76
 8259     // bytes (see RFC 2045). This is too short a block for SIMD
 8260     // to be worthwhile, so we use non-SIMD here.
 8261     __ movw(rscratch1, 79);
 8262 
 8263     __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));  // load four base64 characters
    __ ubfxw(r10, r14, 0,  8);      // and unpack them into r10..r13
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // look up the decoded 6-bit values
 8270     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8271     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8272     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8273     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8274     // error detection, 255u indicates an illegal input
 8275     __ orrw(r14, r10, r11);
 8276     __ orrw(r15, r12, r13);
 8277     __ orrw(r14, r14, r15);
 8278     __ tbnz(r14, 7, Exit);
    // recombine the four 6-bit values c0..c3 (r10..r13) into three output bytes
    __ lslw(r14, r10, 10);    // r14[15:10] = c0
    __ bfiw(r14, r11, 4, 6);  // r14[9:4]   = c1
    __ bfmw(r14, r12, 2, 5);  // r14[3:0]   = c2 >> 2
    __ rev16w(r14, r14);      // put the two output bytes in memory order
    __ bfiw(r13, r12, 6, 2);  // r13 = ((c2 & 3) << 6) | c3
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
 8287     // non-simd loop
 8288     __ subsw(rscratch1, rscratch1, 4);
 8289     __ br(Assembler::GT, Process4B);
 8290 
    // If we came through the 80-byte pre-processing path (rscratch1 was
    // set to 79 above), rscratch1 is now -1; if the whole input was
    // consumed by this loop, rscratch1 is 0.
 8293     __ cbzw(rscratch1, Exit);
 8294     __ sub(length, length, 80);
 8295 
 8296     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8297     __ cbz(isURL, SIMDEnter);
 8298     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8299 
 8300     __ BIND(SIMDEnter);
 8301     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 8302     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 8303     __ mov(rscratch1, 63);
 8304     __ dup(v27, __ T16B, rscratch1);
 8305 
 8306     __ BIND(Process64B);
 8307     __ cmp(length, (u1)64);
 8308     __ br(Assembler::LT, Process32B);
 8309     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8310     __ sub(length, length, 64);
 8311     __ b(Process64B);
 8312 
 8313     __ BIND(Process32B);
 8314     __ cmp(length, (u1)32);
 8315     __ br(Assembler::LT, SIMDExit);
 8316     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8317     __ sub(length, length, 32);
 8318     __ b(Process32B);
 8319 
 8320     __ BIND(SIMDExit);
 8321     __ cbz(length, Exit);
 8322     __ movw(rscratch1, length);
 8323     __ b(Process4B);
 8324 
 8325     __ BIND(Exit);
 8326     __ sub(c_rarg0, dst, doff);
 8327 
 8328     __ leave();
 8329     __ ret(lr);
 8330 
 8331     return start;
 8332   }
 8333 
 8334   // Support for spin waits.
 8335   address generate_spin_wait() {
 8336     __ align(CodeEntryAlignment);
 8337     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8338     StubCodeMark mark(this, stub_id);
 8339     address start = __ pc();
 8340 
 8341     __ spin_wait();
 8342     __ ret(lr);
 8343 
 8344     return start;
 8345   }
 8346 
 8347   void generate_lookup_secondary_supers_table_stub() {
 8348     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8349     StubCodeMark mark(this, stub_id);
 8350 
 8351     const Register
 8352       r_super_klass  = r0,
 8353       r_array_base   = r1,
 8354       r_array_length = r2,
 8355       r_array_index  = r3,
 8356       r_sub_klass    = r4,
 8357       r_bitmap       = rscratch2,
 8358       result         = r5;
 8359     const FloatRegister
 8360       vtemp          = v0;
 8361 
 8362     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8363       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8364       Label L_success;
 8365       __ enter();
 8366       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8367                                              r_array_base, r_array_length, r_array_index,
 8368                                              vtemp, result, slot,
 8369                                              /*stub_is_near*/true);
 8370       __ leave();
 8371       __ ret(lr);
 8372     }
 8373   }
 8374 
 8375   // Slow path implementation for UseSecondarySupersTable.
 8376   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8377     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8378     StubCodeMark mark(this, stub_id);
 8379 
 8380     address start = __ pc();
 8381     const Register
 8382       r_super_klass  = r0,        // argument
 8383       r_array_base   = r1,        // argument
 8384       temp1          = r2,        // temp
 8385       r_array_index  = r3,        // argument
 8386       r_bitmap       = rscratch2, // argument
 8387       result         = r5;        // argument
 8388 
 8389     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8390     __ ret(lr);
 8391 
 8392     return start;
 8393   }
 8394 
 8395 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8396 
 8397   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8398   //
 8399   // If LSE is in use, generate LSE versions of all the stubs. The
 8400   // non-LSE versions are in atomic_aarch64.S.
 8401 
 8402   // class AtomicStubMark records the entry point of a stub and the
 8403   // stub pointer which will point to it. The stub pointer is set to
 8404   // the entry point when ~AtomicStubMark() is called, which must be
 8405   // after ICache::invalidate_range. This ensures safe publication of
 8406   // the generated code.
 8407   class AtomicStubMark {
 8408     address _entry_point;
 8409     aarch64_atomic_stub_t *_stub;
 8410     MacroAssembler *_masm;
 8411   public:
 8412     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8413       _masm = masm;
 8414       __ align(32);
 8415       _entry_point = __ pc();
 8416       _stub = stub;
 8417     }
 8418     ~AtomicStubMark() {
 8419       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8420     }
 8421   };
 8422 
 8423   // NB: For memory_order_conservative we need a trailing membar after
 8424   // LSE atomic operations but not a leading membar.
 8425   //
 8426   // We don't need a leading membar because a clause in the Arm ARM
 8427   // says:
 8428   //
 8429   //   Barrier-ordered-before
 8430   //
 8431   //   Barrier instructions order prior Memory effects before subsequent
 8432   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 8436   //   instruction with both Acquire and Release semantics.
 8437   //
 8438   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8439   // and Release semantics, therefore we don't need a leading
 8440   // barrier. However, there is no corresponding Barrier-ordered-after
 8441   // relationship, therefore we need a trailing membar to prevent a
 8442   // later store or load from being reordered with the store in an
 8443   // atomic instruction.
 8444   //
 8445   // This was checked by using the herd7 consistency model simulator
 8446   // (http://diy.inria.fr/) with this test case:
 8447   //
 8448   // AArch64 LseCas
 8449   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8450   // P0 | P1;
 8451   // LDR W4, [X2] | MOV W3, #0;
 8452   // DMB LD       | MOV W4, #1;
 8453   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8454   //              | DMB ISH;
 8455   //              | STR W4, [X2];
 8456   // exists
 8457   // (0:X3=0 /\ 0:X4=1)
 8458   //
 8459   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8460   // with the store to x in P1. Without the DMB in P1 this may happen.
 8461   //
 8462   // At the time of writing we don't know of any AArch64 hardware that
 8463   // reorders stores in this way, but the Reference Manual permits it.
 8464 
 8465   void gen_cas_entry(Assembler::operand_size size,
 8466                      atomic_memory_order order) {
 8467     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8468       exchange_val = c_rarg2;
 8469     bool acquire, release;
 8470     switch (order) {
 8471       case memory_order_relaxed:
 8472         acquire = false;
 8473         release = false;
 8474         break;
 8475       case memory_order_release:
 8476         acquire = false;
 8477         release = true;
 8478         break;
 8479       default:
 8480         acquire = true;
 8481         release = true;
 8482         break;
 8483     }
 8484     __ mov(prev, compare_val);
 8485     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8486     if (order == memory_order_conservative) {
 8487       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8488     }
 8489     if (size == Assembler::xword) {
 8490       __ mov(r0, prev);
 8491     } else {
 8492       __ movw(r0, prev);
 8493     }
 8494     __ ret(lr);
 8495   }
 8496 
 8497   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8498     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8499     // If not relaxed, then default to conservative.  Relaxed is the only
 8500     // case we use enough to be worth specializing.
 8501     if (order == memory_order_relaxed) {
 8502       __ ldadd(size, incr, prev, addr);
 8503     } else {
 8504       __ ldaddal(size, incr, prev, addr);
 8505       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8506     }
 8507     if (size == Assembler::xword) {
 8508       __ mov(r0, prev);
 8509     } else {
 8510       __ movw(r0, prev);
 8511     }
 8512     __ ret(lr);
 8513   }
 8514 
 8515   void gen_swpal_entry(Assembler::operand_size size) {
 8516     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8517     __ swpal(size, incr, prev, addr);
 8518     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8519     if (size == Assembler::xword) {
 8520       __ mov(r0, prev);
 8521     } else {
 8522       __ movw(r0, prev);
 8523     }
 8524     __ ret(lr);
 8525   }
 8526 
 8527   void generate_atomic_entry_points() {
    if (!UseLSE) {
 8529       return;
 8530     }
 8531     __ align(CodeEntryAlignment);
 8532     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8533     StubCodeMark mark(this, stub_id);
 8534     address first_entry = __ pc();
 8535 
 8536     // ADD, memory_order_conservative
 8537     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8538     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8539     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8540     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8541 
 8542     // ADD, memory_order_relaxed
 8543     AtomicStubMark mark_fetch_add_4_relaxed
 8544       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8545     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8546     AtomicStubMark mark_fetch_add_8_relaxed
 8547       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8548     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8549 
 8550     // XCHG, memory_order_conservative
 8551     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8552     gen_swpal_entry(Assembler::word);
 8553     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8554     gen_swpal_entry(Assembler::xword);
 8555 
 8556     // CAS, memory_order_conservative
 8557     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8558     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8559     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8560     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8561     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8562     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8563 
 8564     // CAS, memory_order_relaxed
 8565     AtomicStubMark mark_cmpxchg_1_relaxed
 8566       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8567     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8568     AtomicStubMark mark_cmpxchg_4_relaxed
 8569       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8570     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8571     AtomicStubMark mark_cmpxchg_8_relaxed
 8572       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8573     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8574 
 8575     AtomicStubMark mark_cmpxchg_4_release
 8576       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8577     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8578     AtomicStubMark mark_cmpxchg_8_release
 8579       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8580     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8581 
 8582     AtomicStubMark mark_cmpxchg_4_seq_cst
 8583       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8584     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8585     AtomicStubMark mark_cmpxchg_8_seq_cst
 8586       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8587     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8588 
 8589     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8590   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
 8592 
 8593   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8594     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8595     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8596 
 8597     address start = __ pc();
 8598 
 8599     if (return_barrier) {
 8600       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8601       __ mov(sp, rscratch1);
 8602     }
 8603     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8604 
 8605     if (return_barrier) {
 8606       // preserve possible return value from a method returning to the return barrier
 8607       __ fmovd(rscratch1, v0);
 8608       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8609     }
 8610 
 8611     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8612     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8613     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8614 
 8615     if (return_barrier) {
 8616       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8617       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8618       __ fmovd(v0, rscratch1);
 8619     }
 8620     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8621 
 8622 
 8623     Label thaw_success;
 8624     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8625     __ cbnz(rscratch2, thaw_success);
 8626     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8627     __ br(rscratch1);
 8628     __ bind(thaw_success);
 8629 
 8630     // make room for the thawed frames
 8631     __ sub(rscratch1, sp, rscratch2);
 8632     __ andr(rscratch1, rscratch1, -16); // align
 8633     __ mov(sp, rscratch1);
 8634 
 8635     if (return_barrier) {
 8636       // save original return value -- again
 8637       __ fmovd(rscratch1, v0);
 8638       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8639     }
 8640 
 8641     // If we want, we can templatize thaw by kind, and have three different entries
 8642     __ movw(c_rarg1, (uint32_t)kind);
 8643 
 8644     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8645     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8646 
 8647     if (return_barrier) {
 8648       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8649       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8650       __ fmovd(v0, rscratch1);
 8651     } else {
 8652       __ mov(r0, zr); // return 0 (success) from doYield
 8653     }
 8654 
    // We're now on the yield frame (which is at an address above us because sp has been moved down)
 8656     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8657     __ mov(rfp, sp);
 8658 
 8659     if (return_barrier_exception) {
 8660       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8661       __ authenticate_return_address(c_rarg1);
 8662       __ verify_oop(r0);
 8663       // save return value containing the exception oop in callee-saved R19
 8664       __ mov(r19, r0);
 8665 
 8666       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8667 
 8668       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8669       // __ reinitialize_ptrue();
 8670 
 8671       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8672 
 8673       __ mov(r1, r0); // the exception handler
 8674       __ mov(r0, r19); // restore return value containing the exception oop
 8675       __ verify_oop(r0);
 8676 
 8677       __ leave();
 8678       __ mov(r3, lr);
 8679       __ br(r1); // the exception handler
 8680     } else {
 8681       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8682       __ leave();
 8683       __ ret(lr);
 8684     }
 8685 
 8686     return start;
 8687   }
 8688 
 8689   address generate_cont_thaw() {
 8690     if (!Continuations::enabled()) return nullptr;
 8691 
 8692     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8693     StubCodeMark mark(this, stub_id);
 8694     address start = __ pc();
 8695     generate_cont_thaw(Continuation::thaw_top);
 8696     return start;
 8697   }
 8698 
 8699   address generate_cont_returnBarrier() {
 8700     if (!Continuations::enabled()) return nullptr;
 8701 
 8702     // TODO: will probably need multiple return barriers depending on return type
 8703     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8704     StubCodeMark mark(this, stub_id);
 8705     address start = __ pc();
 8706 
 8707     generate_cont_thaw(Continuation::thaw_return_barrier);
 8708 
 8709     return start;
 8710   }
 8711 
 8712   address generate_cont_returnBarrier_exception() {
 8713     if (!Continuations::enabled()) return nullptr;
 8714 
 8715     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8716     StubCodeMark mark(this, stub_id);
 8717     address start = __ pc();
 8718 
 8719     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8720 
 8721     return start;
 8722   }
 8723 
 8724   address generate_cont_preempt_stub() {
 8725     if (!Continuations::enabled()) return nullptr;
 8726     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8727     StubCodeMark mark(this, stub_id);
 8728     address start = __ pc();
 8729 
 8730     __ reset_last_Java_frame(true);
 8731 
 8732     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8733     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8734     __ mov(sp, rscratch2);
 8735 
 8736     Label preemption_cancelled;
 8737     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8738     __ cbnz(rscratch1, preemption_cancelled);
 8739 
 8740     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8741     SharedRuntime::continuation_enter_cleanup(_masm);
 8742     __ leave();
 8743     __ ret(lr);
 8744 
 8745     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8746     __ bind(preemption_cancelled);
 8747     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8748     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8749     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8750     __ ldr(rscratch1, Address(rscratch1));
 8751     __ br(rscratch1);
 8752 
 8753     return start;
 8754   }
 8755 
 8756   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8757   // are represented as long[5], with BITS_PER_LIMB = 26.
 8758   // Pack five 26-bit limbs into three 64-bit registers.
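  // In C, approximately (a sketch; limbs is the long[5] input):
  //   dest0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);
  //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);
  //   dest2 = limbs[4] >> 24;   // two bits; must be zero if dest2 is noreg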
 8759   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8760     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8761     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8762     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8763     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8764 
 8765     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8766     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8767     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8768     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8769 
 8770     if (dest2->is_valid()) {
 8771       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8772     } else {
 8773 #ifdef ASSERT
 8774       Label OK;
 8775       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8776       __ br(__ EQ, OK);
 8777       __ stop("high bits of Poly1305 integer should be zero");
 8778       __ should_not_reach_here();
 8779       __ bind(OK);
 8780 #endif
 8781     }
 8782   }
 8783 
 8784   // As above, but return only a 128-bit integer, packed into two
 8785   // 64-bit registers.
 8786   void pack_26(Register dest0, Register dest1, Register src) {
 8787     pack_26(dest0, dest1, noreg, src);
 8788   }
 8789 
 8790   // Multiply and multiply-accumulate unsigned 64-bit registers.
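  // In C, approximately:
  //   wide_mul:  prod_hi:prod_lo  = n * m;   // full 128-bit product
  //   wide_madd: sum_hi:sum_lo   += n * m;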
 8791   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8792     __ mul(prod_lo, n, m);
 8793     __ umulh(prod_hi, n, m);
 8794   }
 8795   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8796     wide_mul(rscratch1, rscratch2, n, m);
 8797     __ adds(sum_lo, sum_lo, rscratch1);
 8798     __ adc(sum_hi, sum_hi, rscratch2);
 8799   }
 8800 
 8801   // Poly1305, RFC 7539
 8802 
 8803   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8804   // description of the tricks used to simplify and accelerate this
 8805   // computation.
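  // In C, approximately (one iteration of the main loop below; a sketch
  // of the arithmetic, not of the register-level schedule):
  //   S = U + block + (1 << 128);      // append the "1" byte
  //   U = (S * R) mod (2**130 - 5);    // the precomputed RR_n = (R_n >> 2) * 5
  //                                    // make the modular reduction cheap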
 8806 
 8807   address generate_poly1305_processBlocks() {
 8808     __ align(CodeEntryAlignment);
 8809     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8810     StubCodeMark mark(this, stub_id);
 8811     address start = __ pc();
 8812     Label here;
 8813     __ enter();
 8814     RegSet callee_saved = RegSet::range(r19, r28);
 8815     __ push(callee_saved, sp);
 8816 
 8817     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8818 
 8819     // Arguments
 8820     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8821 
 8822     // R_n is the 128-bit randomly-generated key, packed into two
 8823     // registers.  The caller passes this key to us as long[5], with
 8824     // BITS_PER_LIMB = 26.
 8825     const Register R_0 = *++regs, R_1 = *++regs;
 8826     pack_26(R_0, R_1, r_start);
 8827 
 8828     // RR_n is (R_n >> 2) * 5
 8829     const Register RR_0 = *++regs, RR_1 = *++regs;
 8830     __ lsr(RR_0, R_0, 2);
 8831     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8832     __ lsr(RR_1, R_1, 2);
 8833     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8834 
 8835     // U_n is the current checksum
 8836     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8837     pack_26(U_0, U_1, U_2, acc_start);
 8838 
 8839     static constexpr int BLOCK_LENGTH = 16;
 8840     Label DONE, LOOP;
 8841 
 8842     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8843     __ br(Assembler::LT, DONE); {
 8844       __ bind(LOOP);
 8845 
 8846       // S_n is to be the sum of U_n and the next block of data
 8847       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8848       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8849       __ adds(S_0, U_0, S_0);
 8850       __ adcs(S_1, U_1, S_1);
 8851       __ adc(S_2, U_2, zr);
      __ add(S_2, S_2, 1); // append the "1" byte: add 2**128 to the block
 8853 
 8854       const Register U_0HI = *++regs, U_1HI = *++regs;
 8855 
 8856       // NB: this logic depends on some of the special properties of
 8857       // Poly1305 keys. In particular, because we know that the top
 8858       // four bits of R_0 and R_1 are zero, we can add together
 8859       // partial products without any risk of needing to propagate a
 8860       // carry out.
 8861       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8862       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8863       __ andr(U_2, R_0, 3);
 8864       __ mul(U_2, S_2, U_2);
 8865 
 8866       // Recycle registers S_0, S_1, S_2
 8867       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8868 
 8869       // Partial reduction mod 2**130 - 5
 8870       __ adds(U_1, U_0HI, U_1);
 8871       __ adc(U_2, U_1HI, U_2);
 8872       // Sum now in U_2:U_1:U_0.
 8873       // Dead: U_0HI, U_1HI.
 8874       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8875 
 8876       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
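      // (2**130 == 5 (mod 2**130 - 5), so the bits of U_2 above bit 1
      // fold back into the sum multiplied by 5, computed as x + 4x.)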
 8877 
 8878       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8879       __ lsr(rscratch1, U_2, 2);
 8880       __ andr(U_2, U_2, (u8)3);
 8881       __ adds(U_0, U_0, rscratch1);
 8882       __ adcs(U_1, U_1, zr);
 8883       __ adc(U_2, U_2, zr);
 8884       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8885       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8886       __ adcs(U_1, U_1, zr);
 8887       __ adc(U_2, U_2, zr);
 8888 
 8889       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8890       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP); // ~LT is GE: loop while length >= BLOCK_LENGTH
 8892     }
 8893 
 8894     // Further reduce modulo 2^130 - 5
 8895     __ lsr(rscratch1, U_2, 2);
 8896     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
 8897     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
 8898     __ adcs(U_1, U_1, zr);
 8899     __ andr(U_2, U_2, (u1)3);
 8900     __ adc(U_2, U_2, zr);
 8901 
 8902     // Unpack the sum into five 26-bit limbs and write to memory.
 8903     __ ubfiz(rscratch1, U_0, 0, 26);
 8904     __ ubfx(rscratch2, U_0, 26, 26);
 8905     __ stp(rscratch1, rscratch2, Address(acc_start));
 8906     __ ubfx(rscratch1, U_0, 52, 12);
 8907     __ bfi(rscratch1, U_1, 12, 14);
 8908     __ ubfx(rscratch2, U_1, 14, 26);
 8909     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8910     __ ubfx(rscratch1, U_1, 40, 24);
 8911     __ bfi(rscratch1, U_2, 24, 3);
 8912     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8913 
 8914     __ bind(DONE);
 8915     __ pop(callee_saved, sp);
 8916     __ leave();
 8917     __ ret(lr);
 8918 
 8919     return start;
 8920   }
 8921 
 8922   // exception handler for upcall stubs
 8923   address generate_upcall_stub_exception_handler() {
 8924     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8925     StubCodeMark mark(this, stub_id);
 8926     address start = __ pc();
 8927 
 8928     // Native caller has no idea how to handle exceptions,
 8929     // so we just crash here. Up to callee to catch exceptions.
 8930     __ verify_oop(r0);
 8931     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8932     __ blr(rscratch1);
 8933     __ should_not_reach_here();
 8934 
 8935     return start;
 8936   }
 8937 
 8938   // load Method* target of MethodHandle
 8939   // j_rarg0 = jobject receiver
 8940   // rmethod = result
 8941   address generate_upcall_stub_load_target() {
 8942     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8943     StubCodeMark mark(this, stub_id);
 8944     address start = __ pc();
 8945 
 8946     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
 8947       // Load target method from receiver
 8948     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8949     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8950     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8951     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8952                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8953                       noreg, noreg);
 8954     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8955 
 8956     __ ret(lr);
 8957 
 8958     return start;
 8959   }
 8960 
 8961 #undef __
 8962 #define __ masm->
 8963 
 8964   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8965 
 8966     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8967       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8968 
 8969     RegSet _toSave;
 8970     bool _squaring;
 8971 
 8972   public:
 8973     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8974       : MacroAssembler(as->code()), _squaring(squaring) {
 8975 
 8976       // Register allocation
 8977 
 8978       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8979       Pa_base = *regs;       // Argument registers
 8980       if (squaring)
 8981         Pb_base = Pa_base;
 8982       else
 8983         Pb_base = *++regs;
 8984       Pn_base = *++regs;
      Rlen = *++regs;
 8986       inv = *++regs;
 8987       Pm_base = *++regs;
 8988 
      // Working registers:
 8990       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8991       Rb =  *++regs;
 8992       Rm =  *++regs;
 8993       Rn =  *++regs;
 8994 
 8995       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 8996       Pb =  *++regs;
 8997       Pm =  *++regs;
 8998       Pn =  *++regs;
 8999 
 9000       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
 9002       t2 =  *++regs;
 9003 
 9004       Ri =  *++regs;        // Inner and outer loop indexes.
 9005       Rj =  *++regs;
 9006 
 9007       Rhi_ab = *++regs;     // Product registers: low and high parts
 9008       Rlo_ab = *++regs;     // of a*b and m*n.
 9009       Rhi_mn = *++regs;
 9010       Rlo_mn = *++regs;
 9011 
 9012       // r19 and up are callee-saved.
 9013       _toSave = RegSet::range(r19, *regs) + Pm_base;
 9014     }
 9015 
 9016   private:
 9017     void save_regs() {
 9018       push(_toSave, sp);
 9019     }
 9020 
 9021     void restore_regs() {
 9022       pop(_toSave, sp);
 9023     }
 9024 
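    // Unroll a loop body (block) by a factor of two.  An odd count
    // enters at the second copy; a zero count skips the loop entirely.
    // In C, approximately:
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //   do { block(); odd: block(); } while ((count -= 2) > 0);
    //   end: ;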
 9025     template <typename T>
 9026     void unroll_2(Register count, T block) {
 9027       Label loop, end, odd;
 9028       tbnz(count, 0, odd);
 9029       cbz(count, end);
 9030       align(16);
 9031       bind(loop);
 9032       (this->*block)();
 9033       bind(odd);
 9034       (this->*block)();
 9035       subs(count, count, 2);
 9036       br(Assembler::GT, loop);
 9037       bind(end);
 9038     }
 9039 
 9040     template <typename T>
 9041     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9042       Label loop, end, odd;
 9043       tbnz(count, 0, odd);
 9044       cbz(count, end);
 9045       align(16);
 9046       bind(loop);
 9047       (this->*block)(d, s, tmp);
 9048       bind(odd);
 9049       (this->*block)(d, s, tmp);
 9050       subs(count, count, 2);
 9051       br(Assembler::GT, loop);
 9052       bind(end);
 9053     }
 9054 
 9055     void pre1(RegisterOrConstant i) {
 9056       block_comment("pre1");
 9057       // Pa = Pa_base;
 9058       // Pb = Pb_base + i;
 9059       // Pm = Pm_base;
 9060       // Pn = Pn_base + i;
 9061       // Ra = *Pa;
 9062       // Rb = *Pb;
 9063       // Rm = *Pm;
 9064       // Rn = *Pn;
 9065       ldr(Ra, Address(Pa_base));
 9066       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9067       ldr(Rm, Address(Pm_base));
 9068       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9069       lea(Pa, Address(Pa_base));
 9070       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9071       lea(Pm, Address(Pm_base));
 9072       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9073 
 9074       // Zero the m*n result.
 9075       mov(Rhi_mn, zr);
 9076       mov(Rlo_mn, zr);
 9077     }
 9078 
 9079     // The core multiply-accumulate step of a Montgomery
 9080     // multiplication.  The idea is to schedule operations as a
 9081     // pipeline so that instructions with long latencies (loads and
 9082     // multiplies) have time to complete before their results are
 9083     // used.  This most benefits in-order implementations of the
 9084     // architecture but out-of-order ones also benefit.
 9085     void step() {
 9086       block_comment("step");
 9087       // MACC(Ra, Rb, t0, t1, t2);
 9088       // Ra = *++Pa;
 9089       // Rb = *--Pb;
 9090       umulh(Rhi_ab, Ra, Rb);
 9091       mul(Rlo_ab, Ra, Rb);
 9092       ldr(Ra, pre(Pa, wordSize));
 9093       ldr(Rb, pre(Pb, -wordSize));
 9094       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9095                                        // previous iteration.
 9096       // MACC(Rm, Rn, t0, t1, t2);
 9097       // Rm = *++Pm;
 9098       // Rn = *--Pn;
 9099       umulh(Rhi_mn, Rm, Rn);
 9100       mul(Rlo_mn, Rm, Rn);
 9101       ldr(Rm, pre(Pm, wordSize));
 9102       ldr(Rn, pre(Pn, -wordSize));
 9103       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9104     }
 9105 
 9106     void post1() {
 9107       block_comment("post1");
 9108 
 9109       // MACC(Ra, Rb, t0, t1, t2);
 9110       // Ra = *++Pa;
 9111       // Rb = *--Pb;
 9112       umulh(Rhi_ab, Ra, Rb);
 9113       mul(Rlo_ab, Ra, Rb);
 9114       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9115       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9116 
 9117       // *Pm = Rm = t0 * inv;
 9118       mul(Rm, t0, inv);
 9119       str(Rm, Address(Pm));
 9120 
 9121       // MACC(Rm, Rn, t0, t1, t2);
 9122       // t0 = t1; t1 = t2; t2 = 0;
 9123       umulh(Rhi_mn, Rm, Rn);
 9124 
 9125 #ifndef PRODUCT
 9126       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9127       {
 9128         mul(Rlo_mn, Rm, Rn);
 9129         add(Rlo_mn, t0, Rlo_mn);
 9130         Label ok;
 9131         cbz(Rlo_mn, ok); {
 9132           stop("broken Montgomery multiply");
 9133         } bind(ok);
 9134       }
 9135 #endif
 9136       // We have very carefully set things up so that
 9137       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9138       // the lower half of Rm * Rn because we know the result already:
 9139       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9140       // t0 != 0.  So, rather than do a mul and an adds we just set
 9141       // the carry flag iff t0 is nonzero.
 9142       //
 9143       // mul(Rlo_mn, Rm, Rn);
 9144       // adds(zr, t0, Rlo_mn);
 9145       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9146       adcs(t0, t1, Rhi_mn);
 9147       adc(t1, t2, zr);
 9148       mov(t2, zr);
 9149     }
 9150 
 9151     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9152       block_comment("pre2");
 9153       // Pa = Pa_base + i-len;
 9154       // Pb = Pb_base + len;
 9155       // Pm = Pm_base + i-len;
 9156       // Pn = Pn_base + len;
 9157 
 9158       if (i.is_register()) {
 9159         sub(Rj, i.as_register(), len);
 9160       } else {
 9161         mov(Rj, i.as_constant());
 9162         sub(Rj, Rj, len);
 9163       }
 9164       // Rj == i-len
 9165 
 9166       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9167       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9168       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9169       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9170 
 9171       // Ra = *++Pa;
 9172       // Rb = *--Pb;
 9173       // Rm = *++Pm;
 9174       // Rn = *--Pn;
 9175       ldr(Ra, pre(Pa, wordSize));
 9176       ldr(Rb, pre(Pb, -wordSize));
 9177       ldr(Rm, pre(Pm, wordSize));
 9178       ldr(Rn, pre(Pn, -wordSize));
 9179 
 9180       mov(Rhi_mn, zr);
 9181       mov(Rlo_mn, zr);
 9182     }
 9183 
 9184     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9185       block_comment("post2");
 9186       if (i.is_constant()) {
 9187         mov(Rj, i.as_constant()-len.as_constant());
 9188       } else {
 9189         sub(Rj, i.as_register(), len);
 9190       }
 9191 
 9192       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9193 
 9194       // As soon as we know the least significant digit of our result,
 9195       // store it.
 9196       // Pm_base[i-len] = t0;
 9197       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9198 
 9199       // t0 = t1; t1 = t2; t2 = 0;
 9200       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9201       adc(t1, t2, zr);
 9202       mov(t2, zr);
 9203     }
 9204 
 9205     // A carry in t0 after Montgomery multiplication means that we
 9206     // should subtract multiples of n from our result in m.  We'll
 9207     // keep doing that until there is no carry.
 9208     void normalize(RegisterOrConstant len) {
 9209       block_comment("normalize");
 9210       // while (t0)
 9211       //   t0 = sub(Pm_base, Pn_base, t0, len);
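      // where sub() is a multi-word subtraction of n from m that returns
      // t0 minus the final borrow.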
 9212       Label loop, post, again;
 9213       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9214       cbz(t0, post); {
 9215         bind(again); {
 9216           mov(i, zr);
 9217           mov(cnt, len);
 9218           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9219           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9220           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9221           align(16);
 9222           bind(loop); {
 9223             sbcs(Rm, Rm, Rn);
 9224             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9225             add(i, i, 1);
 9226             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9227             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9228             sub(cnt, cnt, 1);
 9229           } cbnz(cnt, loop);
 9230           sbc(t0, t0, zr);
 9231         } cbnz(t0, again);
 9232       } bind(post);
 9233     }
 9234 
 9235     // Move memory at s to d, reversing words.
 9236     //    Increments d to end of copied memory
 9237     //    Destroys tmp1, tmp2
 9238     //    Preserves len
 9239     //    Leaves s pointing to the address which was in d at start
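    //    The rotate by 32 in reverse1 swaps the two 32-bit halves of each
    //    word: the inputs are Java int arrays, most significant int first,
    //    while the multiply works on little-endian 64-bit words.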
 9240     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9241       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9242       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9243 
 9244       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9245       mov(tmp1, len);
 9246       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9247       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9248     }
    // where reverse1 moves one word, swapping its two int halves:
    void reverse1(Register d, Register s, Register tmp) {
 9251       ldr(tmp, pre(s, -wordSize));
 9252       ror(tmp, tmp, 32);
 9253       str(tmp, post(d, wordSize));
 9254     }
 9255 
 9256     void step_squaring() {
 9257       // An extra ACC
 9258       step();
 9259       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9260     }
 9261 
 9262     void last_squaring(RegisterOrConstant i) {
 9263       Label dont;
 9264       // if ((i & 1) == 0) {
 9265       tbnz(i.as_register(), 0, dont); {
 9266         // MACC(Ra, Rb, t0, t1, t2);
 9267         // Ra = *++Pa;
 9268         // Rb = *--Pb;
 9269         umulh(Rhi_ab, Ra, Rb);
 9270         mul(Rlo_ab, Ra, Rb);
 9271         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9272       } bind(dont);
 9273     }
 9274 
 9275     void extra_step_squaring() {
 9276       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9277 
 9278       // MACC(Rm, Rn, t0, t1, t2);
 9279       // Rm = *++Pm;
 9280       // Rn = *--Pn;
 9281       umulh(Rhi_mn, Rm, Rn);
 9282       mul(Rlo_mn, Rm, Rn);
 9283       ldr(Rm, pre(Pm, wordSize));
 9284       ldr(Rn, pre(Pn, -wordSize));
 9285     }
 9286 
 9287     void post1_squaring() {
 9288       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9289 
 9290       // *Pm = Rm = t0 * inv;
 9291       mul(Rm, t0, inv);
 9292       str(Rm, Address(Pm));
 9293 
 9294       // MACC(Rm, Rn, t0, t1, t2);
 9295       // t0 = t1; t1 = t2; t2 = 0;
 9296       umulh(Rhi_mn, Rm, Rn);
 9297 
 9298 #ifndef PRODUCT
 9299       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9300       {
 9301         mul(Rlo_mn, Rm, Rn);
 9302         add(Rlo_mn, t0, Rlo_mn);
 9303         Label ok;
 9304         cbz(Rlo_mn, ok); {
 9305           stop("broken Montgomery multiply");
 9306         } bind(ok);
 9307       }
 9308 #endif
 9309       // We have very carefully set things up so that
 9310       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9311       // the lower half of Rm * Rn because we know the result already:
 9312       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9313       // t0 != 0.  So, rather than do a mul and an adds we just set
 9314       // the carry flag iff t0 is nonzero.
 9315       //
 9316       // mul(Rlo_mn, Rm, Rn);
 9317       // adds(zr, t0, Rlo_mn);
 9318       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9319       adcs(t0, t1, Rhi_mn);
 9320       adc(t1, t2, zr);
 9321       mov(t2, zr);
 9322     }
 9323 
 9324     void acc(Register Rhi, Register Rlo,
 9325              Register t0, Register t1, Register t2) {
 9326       adds(t0, t0, Rlo);
 9327       adcs(t1, t1, Rhi);
 9328       adc(t2, t2, zr);
 9329     }
 9330 
 9331   public:
 9332     /**
 9333      * Fast Montgomery multiplication.  The derivation of the
 9334      * algorithm is in A Cryptographic Library for the Motorola
 9335      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9336      *
 9337      * Arguments:
 9338      *
 9339      * Inputs for multiplication:
 9340      *   c_rarg0   - int array elements a
 9341      *   c_rarg1   - int array elements b
 9342      *   c_rarg2   - int array elements n (the modulus)
 9343      *   c_rarg3   - int length
 9344      *   c_rarg4   - int inv
 9345      *   c_rarg5   - int array elements m (the result)
 9346      *
 9347      * Inputs for squaring:
 9348      *   c_rarg0   - int array elements a
 9349      *   c_rarg1   - int array elements n (the modulus)
 9350      *   c_rarg2   - int length
 9351      *   c_rarg3   - int inv
 9352      *   c_rarg4   - int array elements m (the result)
 9353      *
 9354      */
 9355     address generate_multiply() {
 9356       Label argh, nothing;
 9357       bind(argh);
 9358       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9359 
 9360       align(CodeEntryAlignment);
 9361       address entry = pc();
 9362 
 9363       cbzw(Rlen, nothing);
 9364 
 9365       enter();
 9366 
 9367       // Make room.
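      // We allocate stack space for copies of a, b and n, plus the working
      // result m, each Rlen ints long: Rlen * 4 * sizeof (jint) bytes in
      // all, which is at most 8192 when Rlen <= 512.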
 9368       cmpw(Rlen, 512);
 9369       br(Assembler::HI, argh);
 9370       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9371       andr(sp, Ra, -2 * wordSize);
 9372 
 9373       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9374 
 9375       {
 9376         // Copy input args, reversing as we go.  We use Ra as a
 9377         // temporary variable.
 9378         reverse(Ra, Pa_base, Rlen, t0, t1);
 9379         if (!_squaring)
 9380           reverse(Ra, Pb_base, Rlen, t0, t1);
 9381         reverse(Ra, Pn_base, Rlen, t0, t1);
 9382       }
 9383 
 9384       // Push all call-saved registers and also Pm_base which we'll need
 9385       // at the end.
 9386       save_regs();
 9387 
 9388 #ifndef PRODUCT
 9389       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9390       {
 9391         ldr(Rn, Address(Pn_base, 0));
 9392         mul(Rlo_mn, Rn, inv);
 9393         subs(zr, Rlo_mn, -1);
 9394         Label ok;
 9395         br(EQ, ok); {
 9396           stop("broken inverse in Montgomery multiply");
 9397         } bind(ok);
 9398       }
 9399 #endif
 9400 
 9401       mov(Pm_base, Ra);
 9402 
 9403       mov(t0, zr);
 9404       mov(t1, zr);
 9405       mov(t2, zr);
 9406 
 9407       block_comment("for (int i = 0; i < len; i++) {");
 9408       mov(Ri, zr); {
 9409         Label loop, end;
 9410         cmpw(Ri, Rlen);
 9411         br(Assembler::GE, end);
 9412 
 9413         bind(loop);
 9414         pre1(Ri);
 9415 
 9416         block_comment("  for (j = i; j; j--) {"); {
 9417           movw(Rj, Ri);
 9418           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9419         } block_comment("  } // j");
 9420 
 9421         post1();
 9422         addw(Ri, Ri, 1);
 9423         cmpw(Ri, Rlen);
 9424         br(Assembler::LT, loop);
 9425         bind(end);
 9426         block_comment("} // i");
 9427       }
 9428 
 9429       block_comment("for (int i = len; i < 2*len; i++) {");
 9430       mov(Ri, Rlen); {
 9431         Label loop, end;
 9432         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9433         br(Assembler::GE, end);
 9434 
 9435         bind(loop);
 9436         pre2(Ri, Rlen);
 9437 
 9438         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9439           lslw(Rj, Rlen, 1);
 9440           subw(Rj, Rj, Ri);
 9441           subw(Rj, Rj, 1);
 9442           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9443         } block_comment("  } // j");
 9444 
 9445         post2(Ri, Rlen);
 9446         addw(Ri, Ri, 1);
 9447         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9448         br(Assembler::LT, loop);
 9449         bind(end);
 9450       }
 9451       block_comment("} // i");
 9452 
 9453       normalize(Rlen);
 9454 
 9455       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9456       restore_regs();  // Restore caller's Pm_base
 9457 
 9458       // Copy our result into caller's Pm_base
 9459       reverse(Pm_base, Ra, Rlen, t0, t1);
 9460 
 9461       leave();
 9462       bind(nothing);
 9463       ret(lr);
 9464 
 9465       return entry;
 9466     }
 9467     // In C, approximately:
 9468 
 9469     // void
 9470     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9471     //                     julong Pn_base[], julong Pm_base[],
 9472     //                     julong inv, int len) {
 9473     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9474     //   julong *Pa, *Pb, *Pn, *Pm;
 9475     //   julong Ra, Rb, Rn, Rm;
 9476 
 9477     //   int i;
 9478 
 9479     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9480 
 9481     //   for (i = 0; i < len; i++) {
 9482     //     int j;
 9483 
 9484     //     Pa = Pa_base;
 9485     //     Pb = Pb_base + i;
 9486     //     Pm = Pm_base;
 9487     //     Pn = Pn_base + i;
 9488 
 9489     //     Ra = *Pa;
 9490     //     Rb = *Pb;
 9491     //     Rm = *Pm;
 9492     //     Rn = *Pn;
 9493 
 9494     //     int iters = i;
 9495     //     for (j = 0; iters--; j++) {
 9496     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9497     //       MACC(Ra, Rb, t0, t1, t2);
 9498     //       Ra = *++Pa;
 9499     //       Rb = *--Pb;
 9500     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9501     //       MACC(Rm, Rn, t0, t1, t2);
 9502     //       Rm = *++Pm;
 9503     //       Rn = *--Pn;
 9504     //     }
 9505 
 9506     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9507     //     MACC(Ra, Rb, t0, t1, t2);
 9508     //     *Pm = Rm = t0 * inv;
 9509     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9510     //     MACC(Rm, Rn, t0, t1, t2);
 9511 
 9512     //     assert(t0 == 0, "broken Montgomery multiply");
 9513 
 9514     //     t0 = t1; t1 = t2; t2 = 0;
 9515     //   }
 9516 
 9517     //   for (i = len; i < 2*len; i++) {
 9518     //     int j;
 9519 
 9520     //     Pa = Pa_base + i-len;
 9521     //     Pb = Pb_base + len;
 9522     //     Pm = Pm_base + i-len;
 9523     //     Pn = Pn_base + len;
 9524 
 9525     //     Ra = *++Pa;
 9526     //     Rb = *--Pb;
 9527     //     Rm = *++Pm;
 9528     //     Rn = *--Pn;
 9529 
 9530     //     int iters = len*2-i-1;
 9531     //     for (j = i-len+1; iters--; j++) {
 9532     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9533     //       MACC(Ra, Rb, t0, t1, t2);
 9534     //       Ra = *++Pa;
 9535     //       Rb = *--Pb;
 9536     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9537     //       MACC(Rm, Rn, t0, t1, t2);
 9538     //       Rm = *++Pm;
 9539     //       Rn = *--Pn;
 9540     //     }
 9541 
 9542     //     Pm_base[i-len] = t0;
 9543     //     t0 = t1; t1 = t2; t2 = 0;
 9544     //   }
 9545 
 9546     //   while (t0)
 9547     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9548     // }
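      
          // Note on the pseudocode above: MACC(A, B, t0, t1, t2) denotes a
          // multiply-accumulate into the triple-precision accumulator,
          // t0:t1:t2 += A * B, and the trailing while loop is the final
          // Montgomery reduction step: while a carry remains in t0, the
          // modulus (Pn_base) is subtracted from the result (Pm_base).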
 9549 
 9550     /**
 9551      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9552      * multiplies than Montgomery multiplication so it should be up to
 9553      * 25% faster.  However, its loop control is more complex and it
 9554      * may actually run slower on some machines.
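           *
           * The saving comes from symmetry: a[i]*a[j] == a[j]*a[i], so each
           * off-diagonal product is computed once and accumulated twice (via
           * MACC2 in the pseudocode below), while the diagonal terms
           * a[j]*a[j] are accumulated singly.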
 9555      *
 9556      * Arguments:
 9557      *
 9558      * Inputs:
 9559      *   c_rarg0   - int array elements a
 9560      *   c_rarg1   - int array elements n (the modulus)
 9561      *   c_rarg2   - int length
 9562      *   c_rarg3   - int inv
 9563      *   c_rarg4   - int array elements m (the result)
 9564      *
 9565      */
 9566     address generate_square() {
 9567       Label argh;
 9568       bind(argh);
 9569       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9570 
 9571       align(CodeEntryAlignment);
 9572       address entry = pc();
 9573 
 9574       enter();
 9575 
 9576       // Make room.
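            // Bail out (to argh above) unless len <= 512 ints: the scratch
            // area is 4 * sizeof(jint) = 16 bytes per int element, so
            // total_allocation is at most 512 * 16 = 8192 bytes.  The andr
            // below then aligns the new sp down to 2 * wordSize.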
 9577       cmpw(Rlen, 512);
 9578       br(Assembler::HI, argh);
 9579       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9580       andr(sp, Ra, -2 * wordSize);
 9581 
 9582       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9583 
 9584       {
 9585         // Copy input args, reversing as we go.  We use Ra as a
 9586         // temporary variable.
 9587         reverse(Ra, Pa_base, Rlen, t0, t1);
 9588         reverse(Ra, Pn_base, Rlen, t0, t1);
 9589       }
 9590 
 9591       // Push all callee-saved registers and also Pm_base which we'll need
 9592       // at the end.
 9593       save_regs();
 9594 
 9595       mov(Pm_base, Ra);
 9596 
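            // Zero the triple-precision accumulator t0:t1:t2.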
 9597       mov(t0, zr);
 9598       mov(t1, zr);
 9599       mov(t2, zr);
 9600 
 9601       block_comment("for (int i = 0; i < len; i++) {");
 9602       mov(Ri, zr); {
 9603         Label loop, end;
 9604         bind(loop);
 9605         cmp(Ri, Rlen);
 9606         br(Assembler::GE, end);
 9607 
 9608         pre1(Ri);
 9609 
 9610         block_comment("  for (j = (i+1)/2; j; j--) {"); {
 9611           add(Rj, Ri, 1);
 9612           lsr(Rj, Rj, 1);
 9613           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9614         } block_comment("  } // j");
 9615 
 9616         last_squaring(Ri);
 9617 
 9618         block_comment("  for (j = i/2; j; j--) {"); {
 9619           lsr(Rj, Ri, 1);
 9620           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9621         } block_comment("  } // j");
 9622 
 9623         post1_squaring();
 9624         add(Ri, Ri, 1);
 9625         cmp(Ri, Rlen);
 9626         br(Assembler::LT, loop);
 9627 
 9628         bind(end);
 9629         block_comment("} // i");
 9630       }
 9631 
 9632       block_comment("for (int i = len; i < 2*len; i++) {");
 9633       mov(Ri, Rlen); {
 9634         Label loop, end;
 9635         bind(loop);
 9636         cmp(Ri, Rlen, Assembler::LSL, 1);
 9637         br(Assembler::GE, end);
 9638 
 9639         pre2(Ri, Rlen);
 9640 
 9641         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9642           lsl(Rj, Rlen, 1);
 9643           sub(Rj, Rj, Ri);
 9644           sub(Rj, Rj, 1);
 9645           lsr(Rj, Rj, 1);
 9646           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9647         } block_comment("  } // j");
 9648 
 9649         last_squaring(Ri);
 9650 
 9651         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9652           lsl(Rj, Rlen, 1);
 9653           sub(Rj, Rj, Ri);
 9654           lsr(Rj, Rj, 1);
 9655           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9656         } block_comment("  } // j");
 9657 
 9658         post2(Ri, Rlen);
 9659         add(Ri, Ri, 1);
 9660         cmp(Ri, Rlen, Assembler::LSL, 1);
 9661 
 9662         br(Assembler::LT, loop);
 9663         bind(end);
 9664         block_comment("} // i");
 9665       }
 9666 
 9667       normalize(Rlen);
 9668 
 9669       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9670       restore_regs();  // Restore caller's Pm_base
 9671 
 9672       // Copy our result into caller's Pm_base
 9673       reverse(Pm_base, Ra, Rlen, t0, t1);
 9674 
 9675       leave();
 9676       ret(lr);
 9677 
 9678       return entry;
 9679     }
 9680     // In C, approximately:
 9681 
 9682     // void
 9683     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9684     //                   julong Pm_base[], julong inv, int len) {
 9685     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9686     //   julong *Pa, *Pb, *Pn, *Pm;
 9687     //   julong Ra, Rb, Rn, Rm;
 9688 
 9689     //   int i;
 9690 
 9691     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9692 
 9693     //   for (i = 0; i < len; i++) {
 9694     //     int j;
 9695 
 9696     //     Pa = Pa_base;
 9697     //     Pb = Pa_base + i;
 9698     //     Pm = Pm_base;
 9699     //     Pn = Pn_base + i;
 9700 
 9701     //     Ra = *Pa;
 9702     //     Rb = *Pb;
 9703     //     Rm = *Pm;
 9704     //     Rn = *Pn;
 9705 
 9706     //     int iters = (i+1)/2;
 9707     //     for (j = 0; iters--; j++) {
 9708     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9709     //       MACC2(Ra, Rb, t0, t1, t2);
 9710     //       Ra = *++Pa;
 9711     //       Rb = *--Pb;
 9712     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9713     //       MACC(Rm, Rn, t0, t1, t2);
 9714     //       Rm = *++Pm;
 9715     //       Rn = *--Pn;
 9716     //     }
 9717     //     if ((i & 1) == 0) {
 9718     //       assert(Ra == Pa_base[j], "must be");
 9719     //       MACC(Ra, Ra, t0, t1, t2);
 9720     //     }
 9721     //     iters = i/2;
 9722     //     assert(iters == i-j, "must be");
 9723     //     for (; iters--; j++) {
 9724     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9725     //       MACC(Rm, Rn, t0, t1, t2);
 9726     //       Rm = *++Pm;
 9727     //       Rn = *--Pn;
 9728     //     }
 9729 
 9730     //     *Pm = Rm = t0 * inv;
 9731     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9732     //     MACC(Rm, Rn, t0, t1, t2);
 9733 
 9734     //     assert(t0 == 0, "broken Montgomery multiply");
 9735 
 9736     //     t0 = t1; t1 = t2; t2 = 0;
 9737     //   }
 9738 
 9739     //   for (i = len; i < 2*len; i++) {
 9740     //     int start = i-len+1;
 9741     //     int end = start + (len - start)/2;
 9742     //     int j;
 9743 
 9744     //     Pa = Pa_base + i-len;
 9745     //     Pb = Pa_base + len;
 9746     //     Pm = Pm_base + i-len;
 9747     //     Pn = Pn_base + len;
 9748 
 9749     //     Ra = *++Pa;
 9750     //     Rb = *--Pb;
 9751     //     Rm = *++Pm;
 9752     //     Rn = *--Pn;
 9753 
 9754     //     int iters = (2*len-i-1)/2;
 9755     //     assert(iters == end-start, "must be");
 9756     //     for (j = start; iters--; j++) {
 9757     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9758     //       MACC2(Ra, Rb, t0, t1, t2);
 9759     //       Ra = *++Pa;
 9760     //       Rb = *--Pb;
 9761     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9762     //       MACC(Rm, Rn, t0, t1, t2);
 9763     //       Rm = *++Pm;
 9764     //       Rn = *--Pn;
 9765     //     }
 9766     //     if ((i & 1) == 0) {
 9767     //       assert(Ra == Pa_base[j], "must be");
 9768     //       MACC(Ra, Ra, t0, t1, t2);
 9769     //     }
 9770     //     iters =  (2*len-i)/2;
 9771     //     assert(iters == len-j, "must be");
 9772     //     for (; iters--; j++) {
 9773     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9774     //       MACC(Rm, Rn, t0, t1, t2);
 9775     //       Rm = *++Pm;
 9776     //       Rn = *--Pn;
 9777     //     }
 9778     //     Pm_base[i-len] = t0;
 9779     //     t0 = t1; t1 = t2; t2 = 0;
 9780     //   }
 9781 
 9782     //   while (t0)
 9783     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9784     // }
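      
          // Here MACC2(A, B, t0, t1, t2) denotes the doubled multiply-accumulate
          // t0:t1:t2 += 2 * A * B, used for the symmetric cross terms of the
          // square (each off-diagonal product computed once, counted twice).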
 9785   };
 9786 
 9787   void generate_vector_math_stubs() {
 9788     // Get native vector math stub routine addresses
 9789     void* libsleef = nullptr;
 9790     char ebuf[1024];
 9791     char dll_name[JVM_MAXPATHLEN];
 9792     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9793       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9794     }
 9795     if (libsleef == nullptr) {
 9796       log_info(library)("Failed to load native vector math library, %s!", ebuf);
 9797       return;
 9798     }
 9799     // Method naming convention
 9800     //   All the methods are named as <OP><T><N>_<U><suffix>
 9801     //   Where:
 9802     //     <OP>     is the operation name, e.g. sin
 9803     //     <T>      optionally indicates the element type
 9804     //              "f"/"d" for vector float/double operations
 9805     //     <N>      is the number of elements in the vector
 9806     //              "2"/"4" for NEON, and "x" for SVE
 9807     //     <U>      is the precision level
 9808     //              "u10"/"u05" represent 1.0/0.5 ULP error bounds
 9809     //               We use "u10" for all operations by default,
 9810     //               but fall back to "u05" for functions without u10 support
 9811     //     <suffix> indicates the instruction set
 9812     //              "sve"/"advsimd" for SVE/NEON implementations
 9813     //     e.g. sinfx_u10sve computes vector float sin using SVE instructions,
 9814     //          cosd2_u10advsimd computes 2-element vector double cos using NEON instructions
 9815     //
 9816     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9817 
 9818     // Math vector stubs implemented with SVE for scalable vector size.
 9819     if (UseSVE > 0) {
 9820       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9821         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9822         // Skip "tanh" because of a performance regression
 9823         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9824           continue;
 9825         }
 9826 
 9827         // The native library does not support u10 level of "hypot".
 9828         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9829 
 9830         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9831         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9832 
 9833         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9834         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9835       }
 9836     }
 9837 
 9838     // Math vector stubs implemented with NEON for 64/128 bits vector size.
 9839     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9840       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9841       // Skip "tanh" because of a performance regression
 9842       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9843         continue;
 9844       }
 9845 
 9846       // The native library does not support u10 level of "hypot".
 9847       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9848 
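            // Note: the same 4-element float variant ("f4") is looked up for
            // both the 64-bit and the 128-bit float vector sizes below, since
            // sleef's AdvSIMD float entry points are 4-wide; the 64-bit case
            // presumably uses only the low half of the vector.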
 9849       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9850       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
 9851 
 9852       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9853       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9854 
 9855       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9856       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9857     }
 9858   }
 9859 
 9860   // Initialization
 9861   void generate_initial_stubs() {
 9862     // Generates the initial stubs and initializes the entry points
 9863 
 9864     // entry points that exist on all platforms. Note: This is code
 9865     // that could be shared among different platforms - however the
 9866     // benefit seems to be smaller than the disadvantage of having a
 9867     // much more complicated generator structure. See also comment in
 9868     // stubRoutines.hpp.
 9869 
 9870     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9871 
 9872     StubRoutines::_call_stub_entry =
 9873       generate_call_stub(StubRoutines::_call_stub_return_address);
 9874 
 9875     // is referenced by megamorphic calls
 9876     StubRoutines::_catch_exception_entry = generate_catch_exception();
 9877 
 9878     // Initialize table for copy memory (arraycopy) check.
 9879     if (UnsafeMemoryAccess::_table == nullptr) {
 9880       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
 9881     }
 9882 
 9883     if (UseCRC32Intrinsics) {
 9884       // Set the table address before generating the stubs that use it.
 9885       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
 9886       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 9887     }
 9888 
 9889     if (UseCRC32CIntrinsics) {
 9890       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
 9891     }
 9892 
 9893     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
 9894       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
 9895     }
 9896 
 9897     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
 9898       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
 9899     }
 9900 
 9901     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
 9902         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
 9903       StubRoutines::_hf2f = generate_float16ToFloat();
 9904       StubRoutines::_f2hf = generate_floatToFloat16();
 9905     }
 9906   }
 9907 
 9908   void generate_continuation_stubs() {
 9909     // Continuation stubs:
 9910     StubRoutines::_cont_thaw          = generate_cont_thaw();
 9911     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
 9912     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
 9913     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
 9914   }
 9915 
 9916   void generate_final_stubs() {
 9917     // support for verify_oop (must happen after universe_init)
 9918     if (VerifyOops) {
 9919       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
 9920     }
 9921 
 9922     // arraycopy stubs used by compilers
 9923     generate_arraycopy_stubs();
 9924 
 9925     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
 9926 
 9927     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
 9928 
 9929     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
 9930     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
 9931 
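          // Generate the out-of-line atomic entry points only when the
          // toolchain does not define __ARM_FEATURE_ATOMICS (i.e. LSE
          // atomics are not guaranteed at build time); otherwise the
          // compiler can presumably emit the atomic instructions inline.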
 9932 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9933 
 9934     generate_atomic_entry_points();
 9935 
 9936 #endif // LINUX && !__ARM_FEATURE_ATOMICS
 9937 
 9938 #ifdef COMPILER2
 9939     if (UseSecondarySupersTable) {
 9940       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
 9941       if (!InlineSecondarySupersTest) {
 9942         generate_lookup_secondary_supers_table_stub();
 9943       }
 9944     }
 9945 #endif
 9946 
 9947     StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
 9948   }
 9949 
 9950   void generate_compiler_stubs() {
 9951 #if COMPILER2_OR_JVMCI
 9952 
 9953     if (UseSVE == 0) {
 9954       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
 9955     }
 9956 
 9957     // array equals stub for large arrays.
 9958     if (!UseSimpleArrayEquals) {
 9959       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
 9960     }
 9961 
 9962     // arrays_hashcode stubs for large arrays.
 9963     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
 9964     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
 9965     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
 9966     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
 9967     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
 9968 
 9969     // byte_array_inflate stub for large arrays.
 9970     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
 9971 
 9972     // countPositives stub for large arrays.
 9973     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
 9974 
 9975     generate_compare_long_strings();
 9976 
 9977     generate_string_indexof_stubs();
 9978 
 9979 #ifdef COMPILER2
 9980     if (UseMultiplyToLenIntrinsic) {
 9981       StubRoutines::_multiplyToLen = generate_multiplyToLen();
 9982     }
 9983 
 9984     if (UseSquareToLenIntrinsic) {
 9985       StubRoutines::_squareToLen = generate_squareToLen();
 9986     }
 9987 
 9988     if (UseMulAddIntrinsic) {
 9989       StubRoutines::_mulAdd = generate_mulAdd();
 9990     }
 9991 
 9992     if (UseSIMDForBigIntegerShiftIntrinsics) {
 9993       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
 9994       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
 9995     }
 9996 
 9997     if (UseMontgomeryMultiplyIntrinsic) {
 9998       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
 9999       StubCodeMark mark(this, stub_id);
10000       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
10001       StubRoutines::_montgomeryMultiply = g.generate_multiply();
10002     }
10003 
10004     if (UseMontgomerySquareIntrinsic) {
10005       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
10006       StubCodeMark mark(this, stub_id);
10007       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
10008       // We use generate_multiply() rather than generate_square()
10009       // because it's faster for the sizes of modulus we care about.
10010       StubRoutines::_montgomerySquare = g.generate_multiply();
10011     }
10012 
10013     generate_vector_math_stubs();
10014 
10015 #endif // COMPILER2
10016 
10017     if (UseChaCha20Intrinsics) {
10018       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10019     }
10020 
10021     if (UseDilithiumIntrinsics) {
10022       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10023       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10024       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10025       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10026       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10027     }
10028 
10029     if (UseBASE64Intrinsics) {
10030       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
10031       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10032     }
10033 
10034     // data cache line writeback
10035     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10036     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10037 
10038     if (UseAESIntrinsics) {
10039       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10040       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10041       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10042       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10043       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10044     }
10045     if (UseGHASHIntrinsics) {
10046       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10047       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10048     }
10049     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10050       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10051     }
10052 
10053     if (UseMD5Intrinsics) {
10054       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10055       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10056     }
10057     if (UseSHA1Intrinsics) {
10058       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10059       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10060     }
10061     if (UseSHA256Intrinsics) {
10062       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10063       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10064     }
10065     if (UseSHA512Intrinsics) {
10066       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10067       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10068     }
10069     if (UseSHA3Intrinsics) {
10070       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10071       StubRoutines::_double_keccak         = generate_double_keccak();
10072       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10073     }
10074 
10075     if (UsePoly1305Intrinsics) {
10076       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10077     }
10078 
10079     // generate Adler32 intrinsics code
10080     if (UseAdler32Intrinsics) {
10081       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10082     }
10083 
10084 #endif // COMPILER2_OR_JVMCI
10085   }
10086 
10087  public:
10088   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10089     switch(blob_id) {
10090     case initial_id:
10091       generate_initial_stubs();
10092       break;
10093     case continuation_id:
10094       generate_continuation_stubs();
10095       break;
10096     case compiler_id:
10097       generate_compiler_stubs();
10098       break;
10099     case final_id:
10100       generate_final_stubs();
10101       break;
10102     default:
10103       fatal("unexpected blob id: %d", blob_id);
10104       break;
10105     }
10106   }
10107 }; // end class declaration
10108 
10109 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10110   StubGenerator g(code, blob_id);
10111 }
10112 
10113 
10114 #if defined (LINUX)
10115 
10116 // Define pointers to atomic stubs and initialize them to point to the
10117 // code in atomic_aarch64.S.
10118 
10119 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10120   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10121     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10122   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10123     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
10124 
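      // For illustration, the first instantiation below,
      // DEFAULT_ATOMIC_OP(fetch_add, 4, ), expands to approximately:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
      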
10125 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10126 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10127 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10128 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10129 DEFAULT_ATOMIC_OP(xchg, 4, )
10130 DEFAULT_ATOMIC_OP(xchg, 8, )
10131 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10132 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10133 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10134 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10135 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10136 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10137 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10138 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10139 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10140 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10141 
10142 #undef DEFAULT_ATOMIC_OP
10143 
10144 #endif // LINUX