1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "code/aotCodeCache.hpp"
   31 #include "compiler/oopMap.hpp"
   32 #include "gc/shared/barrierSet.hpp"
   33 #include "gc/shared/barrierSetAssembler.hpp"
   34 #include "gc/shared/gc_globals.hpp"
   35 #include "gc/shared/tlab_globals.hpp"
   36 #include "interpreter/interpreter.hpp"
   37 #include "memory/universe.hpp"
   38 #include "nativeInst_aarch64.hpp"
   39 #include "oops/instanceOop.hpp"
   40 #include "oops/method.hpp"
   41 #include "oops/objArrayKlass.hpp"
   42 #include "oops/oop.inline.hpp"
   43 #include "prims/methodHandles.hpp"
   44 #include "prims/upcallLinker.hpp"
   45 #include "runtime/arguments.hpp"
   46 #include "runtime/atomic.hpp"
   47 #include "runtime/continuation.hpp"
   48 #include "runtime/continuationEntry.inline.hpp"
   49 #include "runtime/frame.inline.hpp"
   50 #include "runtime/handles.inline.hpp"
   51 #include "runtime/javaThread.hpp"
   52 #include "runtime/sharedRuntime.hpp"
   53 #include "runtime/stubCodeGenerator.hpp"
   54 #include "runtime/stubRoutines.hpp"
   55 #include "utilities/align.hpp"
   56 #include "utilities/checkedCast.hpp"
   57 #include "utilities/debug.hpp"
   58 #include "utilities/globalDefinitions.hpp"
   59 #include "utilities/intpow.hpp"
   60 #include "utilities/powerOfTwo.hpp"
   61 #ifdef COMPILER2
   62 #include "opto/runtime.hpp"
   63 #endif
   64 #if INCLUDE_ZGC
   65 #include "gc/z/zThreadLocalData.hpp"
   66 #endif
   67 
   68 // Declaration and definition of StubGenerator (no .hpp file).
   69 // For a more detailed description of the stub routine structure
   70 // see the comment in stubRoutines.hpp
   71 
   72 #undef __
   73 #define __ _masm->
   74 
   75 #ifdef PRODUCT
   76 #define BLOCK_COMMENT(str) /* nothing */
   77 #else
   78 #define BLOCK_COMMENT(str) __ block_comment(str)
   79 #endif
   80 
   81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   82 
   83 // Stub Code definitions
   84 
   85 class StubGenerator: public StubCodeGenerator {
   86  private:
   87 
   88 #ifdef PRODUCT
   89 #define inc_counter_np(counter) ((void)0)
   90 #else
   91   void inc_counter_np_(uint& counter) {
   92     __ incrementw(ExternalAddress((address)&counter));
   93   }
   94 #define inc_counter_np(counter) \
   95   BLOCK_COMMENT("inc_counter " #counter); \
   96   inc_counter_np_(counter);
   97 #endif
   98 
   99   // Call stubs are used to call Java from C
  100   //
  101   // Arguments:
  102   //    c_rarg0:   call wrapper address                   address
  103   //    c_rarg1:   result                                 address
  104   //    c_rarg2:   result type                            BasicType
  105   //    c_rarg3:   method                                 Method*
  106   //    c_rarg4:   (interpreter) entry point              address
  107   //    c_rarg5:   parameters                             intptr_t*
  108   //    c_rarg6:   parameter size (in words)              int
  109   //    c_rarg7:   thread                                 Thread*
  110   //
  111   // The stub itself does not return a value; any Java result is
  112   // written through the result pointer
  113   //
  114   // we save r30 (lr) as the return PC at the base of the frame and
  115   // link r29 (fp) below it as the frame pointer, then install sp (r31)
  116   // into fp.
  117   //
  118   // we save r0-r7, which accounts for all the c arguments.
  119   //
  120   // TODO: strictly do we need to save them all? they are treated as
  121   // volatile by C so could we omit saving the ones we are going to
  122   // place in global registers (thread? method?) or those we only use
  123   // during setup of the Java call?
  124   //
  125   // we don't need to save r8 which C uses as an indirect result location
  126   // return register.
  127   //
  128   // we don't need to save r9-r15 which both C and Java treat as
  129   // volatile
  130   //
  131   // we don't need to save r16-18 because Java does not use them
  132   //
  133   // we save r19-r28 which Java uses as scratch registers and C
  134   // expects to be callee-save
  135   //
  136   // we save the bottom 64 bits of each value stored in v8-v15; it is
  137   // the responsibility of the caller to preserve larger values.
  138   //
  139   // so the stub frame looks like this when we enter Java code
  140   //
  141   //     [ return_from_Java     ] <--- sp
  142   //     [ argument word n      ]
  143   //      ...
  144   // -29 [ argument word 1      ]
  145   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  146   // -26 [ saved v15            ]
  147   // -25 [ saved v14            ]
  148   // -24 [ saved v13            ]
  149   // -23 [ saved v12            ]
  150   // -22 [ saved v11            ]
  151   // -21 [ saved v10            ]
  152   // -20 [ saved v9             ]
  153   // -19 [ saved v8             ]
  154   // -18 [ saved r28            ]
  155   // -17 [ saved r27            ]
  156   // -16 [ saved r26            ]
  157   // -15 [ saved r25            ]
  158   // -14 [ saved r24            ]
  159   // -13 [ saved r23            ]
  160   // -12 [ saved r22            ]
  161   // -11 [ saved r21            ]
  162   // -10 [ saved r20            ]
  163   //  -9 [ saved r19            ]
  164   //  -8 [ call wrapper    (r0) ]
  165   //  -7 [ result          (r1) ]
  166   //  -6 [ result type     (r2) ]
  167   //  -5 [ method          (r3) ]
  168   //  -4 [ entry point     (r4) ]
  169   //  -3 [ parameters      (r5) ]
  170   //  -2 [ parameter size  (r6) ]
  171   //  -1 [ thread (r7)          ]
  172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  173   //   1 [ saved lr       (r30) ]
  174 
  175   // Call stub stack layout word offsets from fp
  176   enum call_stub_layout {
  177     sp_after_call_off  = -28,
  178 
  179     fpcr_off           = sp_after_call_off,
  180     d15_off            = -26,
  181     d13_off            = -24,
  182     d11_off            = -22,
  183     d9_off             = -20,
  184 
  185     r28_off            = -18,
  186     r26_off            = -16,
  187     r24_off            = -14,
  188     r22_off            = -12,
  189     r20_off            = -10,
  190     call_wrapper_off   =  -8,
  191     result_off         =  -7,
  192     result_type_off    =  -6,
  193     method_off         =  -5,
  194     entry_point_off    =  -4,
  195     parameter_size_off =  -2,
  196     thread_off         =  -1,
  197     fp_f               =   0,
  198     retaddr_off        =   1,
  199   };
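
        // For orientation, the eight register arguments listed above line up
        // with the C-side entry type the VM uses to invoke this stub. The
        // following is only an illustrative sketch of the CallStub typedef
        // declared in stubRoutines.hpp (see that header for the authoritative
        // signature):
        //
        //   typedef void (*CallStub)(address   link,               // c_rarg0: call wrapper
        //                            intptr_t* result,             // c_rarg1
        //                            int       result_type,        // c_rarg2: BasicType
        //                            Method*   method,             // c_rarg3
        //                            address   entry_point,        // c_rarg4
        //                            intptr_t* parameters,         // c_rarg5
        //                            int       size_of_parameters, // c_rarg6
        //                            TRAPS);                       // c_rarg7: thread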
  200 
  201   address generate_call_stub(address& return_address) {
  202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  204            "adjust this code");
  205 
  206     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  207     StubCodeMark mark(this, stub_id);
  208     address start = __ pc();
  209 
  210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  211 
  212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  214     const Address result        (rfp, result_off         * wordSize);
  215     const Address result_type   (rfp, result_type_off    * wordSize);
  216     const Address method        (rfp, method_off         * wordSize);
  217     const Address entry_point   (rfp, entry_point_off    * wordSize);
  218     const Address parameter_size(rfp, parameter_size_off * wordSize);
  219 
  220     const Address thread        (rfp, thread_off         * wordSize);
  221 
  222     const Address d15_save      (rfp, d15_off * wordSize);
  223     const Address d13_save      (rfp, d13_off * wordSize);
  224     const Address d11_save      (rfp, d11_off * wordSize);
  225     const Address d9_save       (rfp, d9_off * wordSize);
  226 
  227     const Address r28_save      (rfp, r28_off * wordSize);
  228     const Address r26_save      (rfp, r26_off * wordSize);
  229     const Address r24_save      (rfp, r24_off * wordSize);
  230     const Address r22_save      (rfp, r22_off * wordSize);
  231     const Address r20_save      (rfp, r20_off * wordSize);
  232 
  233     // stub code
  234 
  235     address aarch64_entry = __ pc();
  236 
  237     // set up frame and move sp to end of save area
  238     __ enter();
  239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  240 
  241     // save register parameters and Java scratch/global registers
  242     // n.b. we save thread even though it gets installed in
  243     // rthread because we want to sanity check rthread later
  244     __ str(c_rarg7,  thread);
  245     __ strw(c_rarg6, parameter_size);
  246     __ stp(c_rarg4, c_rarg5,  entry_point);
  247     __ stp(c_rarg2, c_rarg3,  result_type);
  248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  249 
  250     __ stp(r20, r19,   r20_save);
  251     __ stp(r22, r21,   r22_save);
  252     __ stp(r24, r23,   r24_save);
  253     __ stp(r26, r25,   r26_save);
  254     __ stp(r28, r27,   r28_save);
  255 
  256     __ stpd(v9,  v8,   d9_save);
  257     __ stpd(v11, v10,  d11_save);
  258     __ stpd(v13, v12,  d13_save);
  259     __ stpd(v15, v14,  d15_save);
  260 
  261     __ get_fpcr(rscratch1);
  262     __ str(rscratch1, fpcr_save);
  263     // Set FPCR to the state we need. We do want Round to Nearest. We
  264     // don't want non-IEEE rounding modes or floating-point traps.
  265     __ bfi(rscratch1, zr, 22, 4); // Clear DN (bit 25), FZ (bit 24) and RMode (bits 23:22)
  266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  267     __ set_fpcr(rscratch1);
  268 
  269     // install Java thread in global register now we have saved
  270     // whatever value it held
  271     __ mov(rthread, c_rarg7);
  272     // And method
  273     __ mov(rmethod, c_rarg3);
  274 
  275     // set up the heapbase register
  276     __ reinit_heapbase();
  277 
  278 #ifdef ASSERT
  279     // make sure we have no pending exceptions
  280     {
  281       Label L;
  282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  283       __ cmp(rscratch1, (u1)NULL_WORD);
  284       __ br(Assembler::EQ, L);
  285       __ stop("StubRoutines::call_stub: entered with pending exception");
  286       __ BIND(L);
  287     }
  288 #endif
  289     // pass parameters if any
  290     __ mov(esp, sp);
  291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  292     __ andr(sp, rscratch1, -2 * wordSize);
  293 
  294     BLOCK_COMMENT("pass parameters if any");
  295     Label parameters_done;
  296     // parameter count is still in c_rarg6
  297     // and parameter pointer identifying param 1 is in c_rarg5
  298     __ cbzw(c_rarg6, parameters_done);
  299 
  300     address loop = __ pc();
  301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  302     __ subsw(c_rarg6, c_rarg6, 1);
  303     __ push(rscratch1);
  304     __ br(Assembler::GT, loop);
  305 
  306     __ BIND(parameters_done);
  307 
  308     // call Java entry -- passing the Method* and current sp
  309     //      rmethod: Method*
  310     //      r19_sender_sp: sender sp
  311     BLOCK_COMMENT("call Java function");
  312     __ mov(r19_sender_sp, sp);
  313     __ blr(c_rarg4);
  314 
  315     // we do this here because the notify will already have been done
  316     // if we get to the next instruction via an exception
  317     //
  318     // n.b. adding this instruction here affects the calculation of
  319     // whether or not a routine returns to the call stub (used when
  320     // doing stack walks) since the normal test is to check the return
  321     // pc against the address saved below. so we may need to allow for
  322     // this extra instruction in the check.
  323 
  324     // save current address for use by exception handling code
  325 
  326     return_address = __ pc();
  327 
  328     // store result depending on type (everything that is not
  329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  330     // n.b. this assumes Java returns an integral result in r0
  331     // and a floating result in j_farg0
  332     __ ldr(j_rarg2, result);
  333     Label is_long, is_float, is_double, exit;
  334     __ ldr(j_rarg1, result_type);
  335     __ cmp(j_rarg1, (u1)T_OBJECT);
  336     __ br(Assembler::EQ, is_long);
  337     __ cmp(j_rarg1, (u1)T_LONG);
  338     __ br(Assembler::EQ, is_long);
  339     __ cmp(j_rarg1, (u1)T_FLOAT);
  340     __ br(Assembler::EQ, is_float);
  341     __ cmp(j_rarg1, (u1)T_DOUBLE);
  342     __ br(Assembler::EQ, is_double);
  343 
  344     // handle T_INT case
  345     __ strw(r0, Address(j_rarg2));
  346 
  347     __ BIND(exit);
  348 
  349     // pop parameters
  350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  351 
  352 #ifdef ASSERT
  353     // verify that threads correspond
  354     {
  355       Label L, S;
  356       __ ldr(rscratch1, thread);
  357       __ cmp(rthread, rscratch1);
  358       __ br(Assembler::NE, S);
  359       __ get_thread(rscratch1);
  360       __ cmp(rthread, rscratch1);
  361       __ br(Assembler::EQ, L);
  362       __ BIND(S);
  363       __ stop("StubRoutines::call_stub: threads must correspond");
  364       __ BIND(L);
  365     }
  366 #endif
  367 
  368     __ pop_cont_fastpath(rthread);
  369 
  370     // restore callee-save registers
  371     __ ldpd(v15, v14,  d15_save);
  372     __ ldpd(v13, v12,  d13_save);
  373     __ ldpd(v11, v10,  d11_save);
  374     __ ldpd(v9,  v8,   d9_save);
  375 
  376     __ ldp(r28, r27,   r28_save);
  377     __ ldp(r26, r25,   r26_save);
  378     __ ldp(r24, r23,   r24_save);
  379     __ ldp(r22, r21,   r22_save);
  380     __ ldp(r20, r19,   r20_save);
  381 
  382     // restore fpcr
  383     __ ldr(rscratch1,  fpcr_save);
  384     __ set_fpcr(rscratch1);
  385 
  386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  387     __ ldrw(c_rarg2, result_type);
  388     __ ldr(c_rarg3,  method);
  389     __ ldp(c_rarg4, c_rarg5,  entry_point);
  390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  391 
  392     // leave frame and return to caller
  393     __ leave();
  394     __ ret(lr);
  395 
  396     // handle return types different from T_INT
  397 
  398     __ BIND(is_long);
  399     __ str(r0, Address(j_rarg2, 0));
  400     __ br(Assembler::AL, exit);
  401 
  402     __ BIND(is_float);
  403     __ strs(j_farg0, Address(j_rarg2, 0));
  404     __ br(Assembler::AL, exit);
  405 
  406     __ BIND(is_double);
  407     __ strd(j_farg0, Address(j_rarg2, 0));
  408     __ br(Assembler::AL, exit);
  409 
  410     return start;
  411   }
  412 
  413   // Return point for a Java call if there's an exception thrown in
  414   // Java code.  The exception is caught and transformed into a
  415   // pending exception stored in JavaThread that can be tested from
  416   // within the VM.
  417   //
  418   // Note: Usually the parameters are removed by the callee. In case
  419   // of an exception crossing an activation frame boundary, that is
  420   // not the case if the callee is compiled code => need to set up
  421   // the stack pointer.
  422   //
  423   // r0: exception oop
  424 
  425   address generate_catch_exception() {
  426     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  427     StubCodeMark mark(this, stub_id);
  428     address start = __ pc();
  429 
  430     // same as in generate_call_stub():
  431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  432     const Address thread        (rfp, thread_off         * wordSize);
  433 
  434 #ifdef ASSERT
  435     // verify that threads correspond
  436     {
  437       Label L, S;
  438       __ ldr(rscratch1, thread);
  439       __ cmp(rthread, rscratch1);
  440       __ br(Assembler::NE, S);
  441       __ get_thread(rscratch1);
  442       __ cmp(rthread, rscratch1);
  443       __ br(Assembler::EQ, L);
  444       __ bind(S);
  445       __ stop("StubRoutines::catch_exception: threads must correspond");
  446       __ bind(L);
  447     }
  448 #endif
  449 
  450     // set pending exception
  451     __ verify_oop(r0);
  452 
  453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  454     __ mov(rscratch1, (address)__FILE__);
  455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  456     __ movw(rscratch1, (int)__LINE__);
  457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  458 
  459     // complete return to VM
  460     assert(StubRoutines::_call_stub_return_address != nullptr,
  461            "_call_stub_return_address must have been generated before");
  462     __ b(StubRoutines::_call_stub_return_address);
  463 
  464     return start;
  465   }
  466 
  467   // Continuation point for runtime calls returning with a pending
  468   // exception.  The pending exception check happened in the runtime
  469   // or native call stub.  The pending exception in Thread is
  470   // converted into a Java-level exception.
  471   //
  472   // Contract with Java-level exception handlers:
  473   // r0: exception
  474   // r3: throwing pc
  475   //
  476   // NOTE: At entry of this stub, exception-pc must be in LR !!
  477 
  478   // NOTE: this is always used as a jump target within generated code
  479   // so it just needs to be generated code with no prolog
  480 
  481   address generate_forward_exception() {
  482     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  483     StubCodeMark mark(this, stub_id);
  484     address start = __ pc();
  485 
  486     // Upon entry, LR points to the return address returning into
  487     // Java (interpreted or compiled) code; i.e., the return address
  488     // becomes the throwing pc.
  489     //
  490     // Arguments pushed before the runtime call are still on the stack
  491     // but the exception handler will reset the stack pointer ->
  492     // ignore them.  A potential result in registers can be ignored as
  493     // well.
  494 
  495 #ifdef ASSERT
  496     // make sure this code is only executed if there is a pending exception
  497     {
  498       Label L;
  499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  500       __ cbnz(rscratch1, L);
  501       __ stop("StubRoutines::forward exception: no pending exception (1)");
  502       __ bind(L);
  503     }
  504 #endif
  505 
  506     // compute exception handler into r19
  507 
  508     // call the VM to find the handler address associated with the
  509     // caller address. pass thread in r0 and caller pc (ret address)
  510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  511     // the stack.
  512     __ mov(c_rarg1, lr);
  513     // lr will be trashed by the VM call so we move it to R19
  514     // (callee-saved) because we also need to pass it to the handler
  515     // returned by this call.
  516     __ mov(r19, lr);
  517     BLOCK_COMMENT("call exception_handler_for_return_address");
  518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  519                          SharedRuntime::exception_handler_for_return_address),
  520                     rthread, c_rarg1);
  521     // Reinitialize the ptrue predicate register, in case the external runtime
  522     // call clobbers ptrue reg, as we may return to SVE compiled code.
  523     __ reinitialize_ptrue();
  524 
  525     // we should not really care that lr is no longer the callee
  526     // address. we saved the value the handler needs in r19 so we can
  527     // just copy it to r3. however, the C2 handler will push its own
  528     // frame and then calls into the VM and the VM code asserts that
  529     // the PC for the frame above the handler belongs to a compiled
  530     // Java method. So, we restore lr here to satisfy that assert.
  531     __ mov(lr, r19);
  532     // setup r0 & r3 & clear pending exception
  533     __ mov(r3, r19);
  534     __ mov(r19, r0);
  535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  537 
  538 #ifdef ASSERT
  539     // make sure exception is set
  540     {
  541       Label L;
  542       __ cbnz(r0, L);
  543       __ stop("StubRoutines::forward exception: no pending exception (2)");
  544       __ bind(L);
  545     }
  546 #endif
  547 
  548     // continue at exception handler
  549     // r0: exception
  550     // r3: throwing pc
  551     // r19: exception handler
  552     __ verify_oop(r0);
  553     __ br(r19);
  554 
  555     return start;
  556   }
  557 
  558   // Non-destructive plausibility checks for oops
  559   //
  560   // Arguments:
  561   //    r0: oop to verify
  562   //    rscratch1: error message
  563   //
  564   // Stack after saving c_rarg3:
  565   //    [tos + 0]: saved c_rarg3
  566   //    [tos + 1]: saved c_rarg2
  567   //    [tos + 2]: saved lr
  568   //    [tos + 3]: saved rscratch2
  569   //    [tos + 4]: saved r0
  570   //    [tos + 5]: saved rscratch1
  571   address generate_verify_oop() {
  572     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  573     StubCodeMark mark(this, stub_id);
  574     address start = __ pc();
  575 
  576     Label exit, error;
  577 
  578     // save c_rarg2 and c_rarg3
  579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  580 
  581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  583     __ ldr(c_rarg3, Address(c_rarg2));
  584     __ add(c_rarg3, c_rarg3, 1);
  585     __ str(c_rarg3, Address(c_rarg2));
  586 
  587     // object is in r0
  588     // make sure object is 'reasonable'
  589     __ cbz(r0, exit); // if obj is null it is OK
  590 
  591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  593 
  594     // return if everything seems ok
  595     __ bind(exit);
  596 
  597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  598     __ ret(lr);
  599 
  600     // handle errors
  601     __ bind(error);
  602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  603 
  604     __ push(RegSet::range(r0, r29), sp);
  605     // debug(char* msg, int64_t pc, int64_t regs[])
  606     __ mov(c_rarg0, rscratch1);      // pass address of error message
  607     __ mov(c_rarg1, lr);             // pass return address
  608     __ mov(c_rarg2, sp);             // pass address of regs on stack
  609 #ifndef PRODUCT
  610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  611 #endif
  612     BLOCK_COMMENT("call MacroAssembler::debug");
  613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  614     __ blr(rscratch1);
  615     __ hlt(0);
  616 
  617     return start;
  618   }
  619 
  620   // Generate indices for iota vector.
  621   address generate_iota_indices(StubGenStubId stub_id) {
  622     __ align(CodeEntryAlignment);
  623     StubCodeMark mark(this, stub_id);
  624     address start = __ pc();
  625     // B
  626     __ emit_data64(0x0706050403020100, relocInfo::none);
  627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  628     // H
  629     __ emit_data64(0x0003000200010000, relocInfo::none);
  630     __ emit_data64(0x0007000600050004, relocInfo::none);
  631     // S
  632     __ emit_data64(0x0000000100000000, relocInfo::none);
  633     __ emit_data64(0x0000000300000002, relocInfo::none);
  634     // D
  635     __ emit_data64(0x0000000000000000, relocInfo::none);
  636     __ emit_data64(0x0000000000000001, relocInfo::none);
  637     // S - FP
  638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  640     // D - FP
  641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  643     return start;
  644   }
  645 
  646   // The inner part of zero_words().  This is the bulk operation,
  647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  648   // caller is responsible for zeroing the last few words.
  649   //
  650   // Inputs:
  651   // r10: the HeapWord-aligned base address of an array to zero.
  652   // r11: the count in HeapWords, r11 > 0.
  653   //
  654   // Returns r10 and r11, adjusted for the caller to clear.
  655   // r10: the base address of the tail of words left to clear.
  656   // r11: the number of words in the tail.
  657   //      r11 < MacroAssembler::zero_words_block_size.
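        //
        // A typical caller is MacroAssembler::zero_words. As a rough sketch
        // (not the exact implementation): when the count is large enough it
        // branches to this stub with the base in r10 and the count in r11,
        // then stores zr into the (fewer than zero_words_block_size) tail
        // words that remain.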
  658 
  659   address generate_zero_blocks() {
  660     Label done;
  661     Label base_aligned;
  662 
  663     Register base = r10, cnt = r11;
  664 
  665     __ align(CodeEntryAlignment);
  666     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  667     StubCodeMark mark(this, stub_id);
  668     address start = __ pc();
  669 
  670     if (UseBlockZeroing) {
  671       int zva_length = VM_Version::zva_length();
  672 
  673       // Ensure the ZVA length is a multiple of 16. This is required by
  674       // the subsequent operations.
  675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  676 
  677       __ tbz(base, 3, base_aligned);
  678       __ str(zr, Address(__ post(base, 8)));
  679       __ sub(cnt, cnt, 1);
  680       __ bind(base_aligned);
  681 
  682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  683       // alignment.
  684       Label small;
  685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  686       __ subs(rscratch1, cnt, low_limit >> 3);
  687       __ br(Assembler::LT, small);
  688       __ zero_dcache_blocks(base, cnt);
  689       __ bind(small);
  690     }
  691 
  692     {
  693       // Number of stp instructions we'll unroll
  694       const int unroll =
  695         MacroAssembler::zero_words_block_size / 2;
  696       // Clear the remaining blocks.
  697       Label loop;
  698       __ subs(cnt, cnt, unroll * 2);
  699       __ br(Assembler::LT, done);
  700       __ bind(loop);
  701       for (int i = 0; i < unroll; i++)
  702         __ stp(zr, zr, __ post(base, 16));
  703       __ subs(cnt, cnt, unroll * 2);
  704       __ br(Assembler::GE, loop);
  705       __ bind(done);
  706       __ add(cnt, cnt, unroll * 2);
  707     }
  708 
  709     __ ret(lr);
  710 
  711     return start;
  712   }
  713 
  714 
  715   typedef enum {
  716     copy_forwards = 1,
  717     copy_backwards = -1
  718   } copy_direction;
  719 
  720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  721   // for arraycopy stubs.
  722   class ArrayCopyBarrierSetHelper : StackObj {
  723     BarrierSetAssembler* _bs_asm;
  724     MacroAssembler* _masm;
  725     DecoratorSet _decorators;
  726     BasicType _type;
  727     Register _gct1;
  728     Register _gct2;
  729     Register _gct3;
  730     FloatRegister _gcvt1;
  731     FloatRegister _gcvt2;
  732     FloatRegister _gcvt3;
  733 
  734   public:
  735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  736                               DecoratorSet decorators,
  737                               BasicType type,
  738                               Register gct1,
  739                               Register gct2,
  740                               Register gct3,
  741                               FloatRegister gcvt1,
  742                               FloatRegister gcvt2,
  743                               FloatRegister gcvt3)
  744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  745         _masm(masm),
  746         _decorators(decorators),
  747         _type(type),
  748         _gct1(gct1),
  749         _gct2(gct2),
  750         _gct3(gct3),
  751         _gcvt1(gcvt1),
  752         _gcvt2(gcvt2),
  753         _gcvt3(gcvt3) {
  754     }
  755 
  756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  758                             dst1, dst2, src,
  759                             _gct1, _gct2, _gcvt1);
  760     }
  761 
  762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  764                              dst, src1, src2,
  765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  766     }
  767 
  768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  770                             dst1, dst2, src,
  771                             _gct1);
  772     }
  773 
  774     void copy_store_at_16(Address dst, Register src1, Register src2) {
  775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  776                              dst, src1, src2,
  777                              _gct1, _gct2, _gct3);
  778     }
  779 
  780     void copy_load_at_8(Register dst, Address src) {
  781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  782                             dst, noreg, src,
  783                             _gct1);
  784     }
  785 
  786     void copy_store_at_8(Address dst, Register src) {
  787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  788                              dst, src, noreg,
  789                              _gct1, _gct2, _gct3);
  790     }
  791   };
  792 
  793   // Bulk copy of blocks of 8 words.
  794   //
  795   // count is a count of words.
  796   //
  797   // Precondition: count >= 8
  798   //
  799   // Postconditions:
  800   //
  801   // The least significant bit of count contains the remaining count
  802   // of words to copy.  The rest of count is trash.
  803   //
  804   // s and d are adjusted to point to the remaining words to copy
  805   //
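        // Worked example, derived from the code below: for count == 13 the
        // main loop/drain copies 8 words; bit 2 of the remaining count then
        // selects an extra 4-word copy and bit 1 (clear here) would select a
        // 2-word copy, so 12 words are copied and bit 0 == 1 tells the caller
        // that one word is still left to copy.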
  806   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  807     BasicType type;
  808     copy_direction direction;
  809 
  810     switch (stub_id) {
  811     case copy_byte_f_id:
  812       direction = copy_forwards;
  813       type = T_BYTE;
  814       break;
  815     case copy_byte_b_id:
  816       direction = copy_backwards;
  817       type = T_BYTE;
  818       break;
  819     case copy_oop_f_id:
  820       direction = copy_forwards;
  821       type = T_OBJECT;
  822       break;
  823     case copy_oop_b_id:
  824       direction = copy_backwards;
  825       type = T_OBJECT;
  826       break;
  827     case copy_oop_uninit_f_id:
  828       direction = copy_forwards;
  829       type = T_OBJECT;
  830       break;
  831     case copy_oop_uninit_b_id:
  832       direction = copy_backwards;
  833       type = T_OBJECT;
  834       break;
  835     default:
  836       ShouldNotReachHere();
  837     }
  838 
  839     int unit = wordSize * direction;
  840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  841 
  842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  844     const Register stride = r14;
  845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  848 
  849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  850     assert_different_registers(s, d, count, rscratch1, rscratch2);
  851 
  852     Label again, drain;
  853 
  854     __ align(CodeEntryAlignment);
  855 
  856     StubCodeMark mark(this, stub_id);
  857 
  858     __ bind(start);
  859 
  860     Label unaligned_copy_long;
  861     if (AvoidUnalignedAccesses) {
  862       __ tbnz(d, 3, unaligned_copy_long);
  863     }
  864 
  865     if (direction == copy_forwards) {
  866       __ sub(s, s, bias);
  867       __ sub(d, d, bias);
  868     }
  869 
  870 #ifdef ASSERT
  871     // Make sure we are never given < 8 words
  872     {
  873       Label L;
  874       __ cmp(count, (u1)8);
  875       __ br(Assembler::GE, L);
  876       __ stop("generate_copy_longs called with < 8 words");
  877       __ bind(L);
  878     }
  879 #endif
  880 
  881     // Fill 8 registers
  882     if (UseSIMDForMemoryOps) {
  883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  885     } else {
  886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  890     }
  891 
  892     __ subs(count, count, 16);
  893     __ br(Assembler::LO, drain);
  894 
  895     int prefetch = PrefetchCopyIntervalInBytes;
  896     bool use_stride = false;
  897     if (direction == copy_backwards) {
  898        use_stride = prefetch > 256;
  899        prefetch = -prefetch;
  900        if (use_stride) __ mov(stride, prefetch);
  901     }
  902 
  903     __ bind(again);
  904 
  905     if (PrefetchCopyIntervalInBytes > 0)
  906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  907 
  908     if (UseSIMDForMemoryOps) {
  909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  913     } else {
  914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  922     }
  923 
  924     __ subs(count, count, 8);
  925     __ br(Assembler::HS, again);
  926 
  927     // Drain
  928     __ bind(drain);
  929     if (UseSIMDForMemoryOps) {
  930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  932     } else {
  933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  937     }
  938 
  939     {
  940       Label L1, L2;
  941       __ tbz(count, exact_log2(4), L1);
  942       if (UseSIMDForMemoryOps) {
  943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  945       } else {
  946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  950       }
  951       __ bind(L1);
  952 
  953       if (direction == copy_forwards) {
  954         __ add(s, s, bias);
  955         __ add(d, d, bias);
  956       }
  957 
  958       __ tbz(count, 1, L2);
  959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  961       __ bind(L2);
  962     }
  963 
  964     __ ret(lr);
  965 
  966     if (AvoidUnalignedAccesses) {
  967       Label drain, again;
  968       // Register order for storing. Order is different for backward copy.
  969 
  970       __ bind(unaligned_copy_long);
  971 
  972       // source address is 2-word (even word) aligned, target is odd word aligned
  973       //
  974       // when forward copying word pairs we read long pairs at offsets
  975       // {0, 2, 4, 6} (in long words). when backwards copying we read
  976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  977       // address by -2 in the forwards case so we can compute the
  978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  979       // or -1.
  980       //
  981       // when forward copying we need to store 1 word, 3 pairs and
  982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  983       // zero offset we adjust the destination by -1, which means we
  984       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  985       //
  986       // When backwards copying we need to store 1 word, 3 pairs and
  987       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
  988       // offsets {1, 3, 5, 7, 8} * unit.
  989 
  990       if (direction == copy_forwards) {
  991         __ sub(s, s, 16);
  992         __ sub(d, d, 8);
  993       }
  994 
  995       // Fill 8 registers
  996       //
  997       // for forwards copy s was offset by -16 from the original input
  998       // value of s so the register contents are at these offsets
  999       // relative to the 64 byte block addressed by that original input
 1000       // and so on for each successive 64 byte block when s is updated
 1001       //
 1002       // t0 at offset 0,  t1 at offset 8
 1003       // t2 at offset 16, t3 at offset 24
 1004       // t4 at offset 32, t5 at offset 40
 1005       // t6 at offset 48, t7 at offset 56
 1006 
 1007       // for backwards copy s was not offset so the register contents
 1008       // are at these offsets into the preceding 64 byte block
 1009       // relative to that original input and so on for each successive
 1010       // preceding 64 byte block when s is updated. this explains the
 1011       // slightly counter-intuitive looking pattern of register usage
 1012       // in the stp instructions for backwards copy.
 1013       //
 1014       // t0 at offset -16, t1 at offset -8
 1015       // t2 at offset -32, t3 at offset -24
 1016       // t4 at offset -48, t5 at offset -40
 1017       // t6 at offset -64, t7 at offset -56
 1018 
 1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1023 
 1024       __ subs(count, count, 16);
 1025       __ br(Assembler::LO, drain);
 1026 
 1027       int prefetch = PrefetchCopyIntervalInBytes;
 1028       bool use_stride = false;
 1029       if (direction == copy_backwards) {
 1030          use_stride = prefetch > 256;
 1031          prefetch = -prefetch;
 1032          if (use_stride) __ mov(stride, prefetch);
 1033       }
 1034 
 1035       __ bind(again);
 1036 
 1037       if (PrefetchCopyIntervalInBytes > 0)
 1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1039 
 1040       if (direction == copy_forwards) {
 1041        // allowing for the offset of -8 the store instructions place
 1042        // registers into the target 64 byte block at the following
 1043        // offsets
 1044        //
 1045        // t0 at offset 0
 1046        // t1 at offset 8,  t2 at offset 16
 1047        // t3 at offset 24, t4 at offset 32
 1048        // t5 at offset 40, t6 at offset 48
 1049        // t7 at offset 56
 1050 
 1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1060       } else {
 1061        // d was not offset when we started so the registers are
 1062        // written into the 64 byte block preceding d with the following
 1063        // offsets
 1064        //
 1065        // t1 at offset -8
 1066        // t3 at offset -24, t0 at offset -16
 1067        // t5 at offset -40, t2 at offset -32
 1068        // t7 at offset -56, t4 at offset -48
 1069        //                   t6 at offset -64
 1070        //
 1071        // note that this matches the offsets previously noted for the
 1072        // loads
 1073 
 1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1083       }
 1084 
 1085       __ subs(count, count, 8);
 1086       __ br(Assembler::HS, again);
 1087 
 1088       // Drain
 1089       //
 1090       // this uses the same pattern of offsets and register arguments
 1091       // as above
 1092       __ bind(drain);
 1093       if (direction == copy_forwards) {
 1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1099       } else {
 1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1105       }
 1106       // now we need to copy any remaining partial block, which may
 1107       // include a 4 word subblock and/or a 2 word subblock.
 1108       // bits 2 and 1 in the count are the tell-tale for whether we
 1109       // have each such subblock
 1110       {
 1111         Label L1, L2;
 1112         __ tbz(count, exact_log2(4), L1);
 1113        // this is the same as above but copying only 4 longs hence
 1114        // with only one intervening stp between the str instructions
 1115        // but note that the offsets and registers still follow the
 1116        // same pattern
 1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1119         if (direction == copy_forwards) {
 1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1123         } else {
 1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1127         }
 1128         __ bind(L1);
 1129 
 1130         __ tbz(count, 1, L2);
 1131        // this is the same as above but copying only 2 longs hence
 1132        // there is no intervening stp between the str instructions
 1133        // but note that the offset and register patterns are still
 1134        // the same
 1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1136         if (direction == copy_forwards) {
 1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1139         } else {
 1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1142         }
 1143         __ bind(L2);
 1144 
 1145        // for forwards copy we need to re-adjust the offsets we
 1146        // applied so that s and d follow the last words written
 1147 
 1148        if (direction == copy_forwards) {
 1149          __ add(s, s, 16);
 1150          __ add(d, d, 8);
 1151        }
 1152 
 1153       }
 1154 
 1155       __ ret(lr);
 1156     }
 1157   }
 1158 
 1159   // Small copy: less than 16 bytes.
 1160   //
 1161   // NB: Ignores all of the bits of count which represent more than 15
 1162   // bytes, so a caller doesn't have to mask them.
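        //
        // Worked example, derived from the code below: a byte copy
        // (granularity 1) with count == 11 (0b1011) copies 8 bytes (bit 3),
        // skips the 4-byte step (bit 2 is clear), then copies 2 bytes (bit 1)
        // and 1 byte (bit 0), moving all 11 bytes.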
 1163 
 1164   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1165     bool is_backwards = step < 0;
 1166     size_t granularity = g_uabs(step);
 1167     int direction = is_backwards ? -1 : 1;
 1168 
 1169     Label Lword, Lint, Lshort, Lbyte;
 1170 
 1171     assert(granularity
 1172            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1173 
 1174     const Register t0 = r3;
 1175     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1176     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1177 
 1178     // ??? I don't know if this bit-test-and-branch is the right thing
 1179     // to do.  It does a lot of jumping, resulting in several
 1180     // mispredicted branches.  It might make more sense to do this
 1181     // with something like Duff's device with a single computed branch.
 1182 
 1183     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1184     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1185     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1186     __ bind(Lword);
 1187 
 1188     if (granularity <= sizeof (jint)) {
 1189       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1190       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1191       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1192       __ bind(Lint);
 1193     }
 1194 
 1195     if (granularity <= sizeof (jshort)) {
 1196       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1197       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1198       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1199       __ bind(Lshort);
 1200     }
 1201 
 1202     if (granularity <= sizeof (jbyte)) {
 1203       __ tbz(count, 0, Lbyte);
 1204       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1205       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1206       __ bind(Lbyte);
 1207     }
 1208   }
 1209 
 1210   Label copy_f, copy_b;
 1211   Label copy_obj_f, copy_obj_b;
 1212   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1213 
 1214   // All-singing all-dancing memory copy.
 1215   //
 1216   // Copy count units of memory from s to d.  The size of a unit is
 1217   // step, which can be positive or negative depending on the direction
 1218   // of copy.  If is_aligned is false, we align the source address.
 1219   //
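        // Worked example, derived from the code below: an unaligned 50-byte
        // copy (granularity 1) stays on the inline path and falls into the
        // 33..64 byte case, which loads the first 32 and the last 32 bytes
        // and then stores both halves; the overlapping stores in the middle
        // write identical bytes, and all loads happen before any store, so
        // the direction of the copy doesn't matter.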
 1220 
 1221   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1222                    Register s, Register d, Register count, int step) {
 1223     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1224     bool is_backwards = step < 0;
 1225     unsigned int granularity = g_uabs(step);
 1226     const Register t0 = r3, t1 = r4;
 1227 
 1228     // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction doesn't matter
 1229     // because we always load all the data before writing anything
 1230     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1231     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1232     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1233     const Register send = r17, dend = r16;
 1234     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1235     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1236     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1237 
 1238     if (PrefetchCopyIntervalInBytes > 0)
 1239       __ prfm(Address(s, 0), PLDL1KEEP);
 1240     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1241     __ br(Assembler::HI, copy_big);
 1242 
 1243     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1244     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1245 
 1246     __ cmp(count, u1(16/granularity));
 1247     __ br(Assembler::LS, copy16);
 1248 
 1249     __ cmp(count, u1(64/granularity));
 1250     __ br(Assembler::HI, copy80);
 1251 
 1252     __ cmp(count, u1(32/granularity));
 1253     __ br(Assembler::LS, copy32);
 1254 
 1255     // 33..64 bytes
 1256     if (UseSIMDForMemoryOps) {
 1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1258       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1259       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1260       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1261     } else {
 1262       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1263       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1264       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1265       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1266 
 1267       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1268       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1269       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1270       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1271     }
 1272     __ b(finish);
 1273 
 1274     // 17..32 bytes
 1275     __ bind(copy32);
 1276     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1277     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1278 
 1279     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1280     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1281     __ b(finish);
 1282 
 1283     // 65..80/96 bytes
 1284     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1285     __ bind(copy80);
 1286     if (UseSIMDForMemoryOps) {
 1287       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1288       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1289       // Unaligned pointers can be an issue for copying.
 1290       // The issue is more likely when the granularity of the data is
 1291       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
 1292       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1293       // The largest performance drop has been seen for the 65-80 byte range.
 1294       // For such cases, using a pair of ldp/stp instead of a third pair of
 1295       // ldpq/stpq fixes the performance issue.
 1296       if (granularity < sizeof (jint)) {
 1297         Label copy96;
 1298         __ cmp(count, u1(80/granularity));
 1299         __ br(Assembler::HI, copy96);
 1300         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1301 
 1302         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1303         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1304 
 1305         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1306         __ b(finish);
 1307 
 1308         __ bind(copy96);
 1309       }
 1310       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1311 
 1312       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1313       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1314 
 1315       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1316     } else {
 1317       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1318       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1319       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1320       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1321       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1322 
 1323       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1324       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1325       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1326       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1327       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1328     }
 1329     __ b(finish);
 1330 
 1331     // 0..16 bytes
 1332     __ bind(copy16);
 1333     __ cmp(count, u1(8/granularity));
 1334     __ br(Assembler::LO, copy8);
 1335 
 1336     // 8..16 bytes
 1337     bs.copy_load_at_8(t0, Address(s, 0));
 1338     bs.copy_load_at_8(t1, Address(send, -8));
 1339     bs.copy_store_at_8(Address(d, 0), t0);
 1340     bs.copy_store_at_8(Address(dend, -8), t1);
 1341     __ b(finish);
 1342 
 1343     if (granularity < 8) {
 1344       // 4..7 bytes
 1345       __ bind(copy8);
 1346       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1347       __ ldrw(t0, Address(s, 0));
 1348       __ ldrw(t1, Address(send, -4));
 1349       __ strw(t0, Address(d, 0));
 1350       __ strw(t1, Address(dend, -4));
 1351       __ b(finish);
 1352       if (granularity < 4) {
 1353         // 0..3 bytes
 1354         __ bind(copy4);
 1355         __ cbz(count, finish); // get rid of 0 case
 1356         if (granularity == 2) {
 1357           __ ldrh(t0, Address(s, 0));
 1358           __ strh(t0, Address(d, 0));
 1359         } else { // granularity == 1
 1360           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1361           // the first and last byte.
 1362           // Handle the 3 byte case by loading and storing base + count/2
 1363           // (count == 1: (s+0)->(d+0); count == 2,3: (s+1)->(d+1)).
 1364           // This does mean that in the 1 byte case we load/store the same
 1365           // byte 3 times.
 1366           __ lsr(count, count, 1);
 1367           __ ldrb(t0, Address(s, 0));
 1368           __ ldrb(t1, Address(send, -1));
 1369           __ ldrb(t2, Address(s, count));
 1370           __ strb(t0, Address(d, 0));
 1371           __ strb(t1, Address(dend, -1));
 1372           __ strb(t2, Address(d, count));
 1373         }
 1374         __ b(finish);
 1375       }
 1376     }
 1377 
 1378     __ bind(copy_big);
 1379     if (is_backwards) {
 1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1382     }
 1383 
 1384     // Now that we've got the small case out of the way, we can align the
 1385     // source address on a 2-word boundary.
 1386 
 1387     // Here we will materialize a count in r15, which is used by copy_memory_small
 1388     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
 1389     // Up until here, we have used t9, which aliases r15, but from here on, that register
 1390     // can not be used as a temp register, as it contains the count.
 1391 
 1392     Label aligned;
 1393 
 1394     if (is_aligned) {
 1395       // We may have to adjust by 1 word to get s 2-word-aligned.
 1396       __ tbz(s, exact_log2(wordSize), aligned);
 1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1399       __ sub(count, count, wordSize/granularity);
 1400     } else {
 1401       if (is_backwards) {
 1402         __ andr(r15, s, 2 * wordSize - 1);
 1403       } else {
 1404         __ neg(r15, s);
 1405         __ andr(r15, r15, 2 * wordSize - 1);
 1406       }
 1407       // r15 is the byte adjustment needed to align s.
 1408       __ cbz(r15, aligned);
 1409       int shift = exact_log2(granularity);
 1410       if (shift > 0) {
 1411         __ lsr(r15, r15, shift);
 1412       }
 1413       __ sub(count, count, r15);
 1414 
 1415 #if 0
 1416       // ?? This code is only correct for a disjoint copy.  It may or
 1417       // may not make sense to use it in that case.
 1418 
 1419       // Copy the first pair; s and d may not be aligned.
 1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1422 
 1423       // Align s and d, adjust count
 1424       if (is_backwards) {
 1425         __ sub(s, s, r15);
 1426         __ sub(d, d, r15);
 1427       } else {
 1428         __ add(s, s, r15);
 1429         __ add(d, d, r15);
 1430       }
 1431 #else
 1432       copy_memory_small(decorators, type, s, d, r15, step);
 1433 #endif
 1434     }
 1435 
 1436     __ bind(aligned);
 1437 
 1438     // s is now 2-word-aligned.
 1439 
 1440     // We have a count of units and some trailing bytes. Adjust the
 1441     // count and do a bulk copy of words. If the shift is zero,
 1442     // perform a register move instead to benefit from zero-latency moves.
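          // In effect (sketch): r15 = count >> log2(wordSize/granularity) is
          // the number of whole 8-byte words handed to the bulk copy stub
          // below; the remaining tail elements are finished afterwards by
          // copy_memory_small.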
 1443     int shift = exact_log2(wordSize/granularity);
 1444     if (shift > 0) {
 1445       __ lsr(r15, count, shift);
 1446     } else {
 1447       __ mov(r15, count);
 1448     }
 1449     if (direction == copy_forwards) {
 1450       if (type != T_OBJECT) {
 1451         __ bl(copy_f);
 1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1453         __ bl(copy_obj_uninit_f);
 1454       } else {
 1455         __ bl(copy_obj_f);
 1456       }
 1457     } else {
 1458       if (type != T_OBJECT) {
 1459         __ bl(copy_b);
 1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1461         __ bl(copy_obj_uninit_b);
 1462       } else {
 1463         __ bl(copy_obj_b);
 1464       }
 1465     }
 1466 
 1467     // And the tail.
 1468     copy_memory_small(decorators, type, s, d, count, step);
 1469 
 1470     if (granularity >= 8) __ bind(copy8);
 1471     if (granularity >= 4) __ bind(copy4);
 1472     __ bind(finish);
 1473   }
 1474 
 1475 
 1476   void clobber_registers() {
 1477 #ifdef ASSERT
 1478     RegSet clobbered
 1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1483       __ mov(*it, rscratch1);
 1484     }
 1485 #endif
 1486 
 1487   }
 1488 
 1489   // Scan over array at a for count oops, verifying each one.
 1490   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1491   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1492     Label loop, end;
 1493     __ mov(rscratch1, a);
 1494     __ mov(rscratch2, zr);
 1495     __ bind(loop);
 1496     __ cmp(rscratch2, count);
 1497     __ br(Assembler::HS, end);
 1498     if (size == wordSize) {
 1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ verify_oop(temp);
 1501     } else {
 1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1503       __ decode_heap_oop(temp); // calls verify_oop
 1504     }
 1505     __ add(rscratch2, rscratch2, 1);
 1506     __ b(loop);
 1507     __ bind(end);
 1508   }
 1509 
 1510   // Arguments:
 1511   //   stub_id - is used to name the stub and identify all details of
 1512   //             how to perform the copy.
 1513   //
 1514   //   entry - is assigned to the stub's post push entry point unless
 1515   //           it is null
 1516   //
 1517   // Inputs:
 1518   //   c_rarg0   - source array address
 1519   //   c_rarg1   - destination array address
 1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1521   //
 1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1523   // the hardware handle it.  The two dwords within qwords that span
 1524   // cache line boundaries will still be loaded and stored atomically.
 1525   //
 1526   // Side Effects: entry is set to the (post push) entry point so it
 1527   //               can be used by the corresponding conjoint copy
 1528   //               method
 1529   //
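        // Roughly, the generated stub behaves like the following C sketch
        // (barrier-set prologue/epilogue and oop verification omitted):
        //
        //   int disjoint_copy(T *s, T *d, size_t count) {
        //     for (size_t i = 0; i < count; i++) d[i] = s[i];
        //     return 0;
        //   }
        //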
 1530   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1532     RegSet saved_reg = RegSet::of(s, d, count);
 1533     int size;
 1534     bool aligned;
 1535     bool is_oop;
 1536     bool dest_uninitialized;
 1537     switch (stub_id) {
 1538     case jbyte_disjoint_arraycopy_id:
 1539       size = sizeof(jbyte);
 1540       aligned = false;
 1541       is_oop = false;
 1542       dest_uninitialized = false;
 1543       break;
 1544     case arrayof_jbyte_disjoint_arraycopy_id:
 1545       size = sizeof(jbyte);
 1546       aligned = true;
 1547       is_oop = false;
 1548       dest_uninitialized = false;
 1549       break;
 1550     case jshort_disjoint_arraycopy_id:
 1551       size = sizeof(jshort);
 1552       aligned = false;
 1553       is_oop = false;
 1554       dest_uninitialized = false;
 1555       break;
 1556     case arrayof_jshort_disjoint_arraycopy_id:
 1557       size = sizeof(jshort);
 1558       aligned = true;
 1559       is_oop = false;
 1560       dest_uninitialized = false;
 1561       break;
 1562     case jint_disjoint_arraycopy_id:
 1563       size = sizeof(jint);
 1564       aligned = false;
 1565       is_oop = false;
 1566       dest_uninitialized = false;
 1567       break;
 1568     case arrayof_jint_disjoint_arraycopy_id:
 1569       size = sizeof(jint);
 1570       aligned = true;
 1571       is_oop = false;
 1572       dest_uninitialized = false;
 1573       break;
 1574     case jlong_disjoint_arraycopy_id:
 1575       // since this is always aligned we can (should!) use the same
 1576       // stub as for case arrayof_jlong_disjoint_arraycopy
 1577       ShouldNotReachHere();
 1578       break;
 1579     case arrayof_jlong_disjoint_arraycopy_id:
 1580       size = sizeof(jlong);
 1581       aligned = true;
 1582       is_oop = false;
 1583       dest_uninitialized = false;
 1584       break;
 1585     case oop_disjoint_arraycopy_id:
 1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1587       aligned = !UseCompressedOops;
 1588       is_oop = true;
 1589       dest_uninitialized = false;
 1590       break;
 1591     case arrayof_oop_disjoint_arraycopy_id:
 1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1593       aligned = !UseCompressedOops;
 1594       is_oop = true;
 1595       dest_uninitialized = false;
 1596       break;
 1597     case oop_disjoint_arraycopy_uninit_id:
 1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1599       aligned = !UseCompressedOops;
 1600       is_oop = true;
 1601       dest_uninitialized = true;
 1602       break;
 1603     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1605       aligned = !UseCompressedOops;
 1606       is_oop = true;
 1607       dest_uninitialized = true;
 1608       break;
 1609     default:
 1610       ShouldNotReachHere();
 1611       break;
 1612     }
 1613 
 1614     __ align(CodeEntryAlignment);
 1615     StubCodeMark mark(this, stub_id);
 1616     address start = __ pc();
 1617     __ enter();
 1618 
 1619     if (entry != nullptr) {
 1620       *entry = __ pc();
 1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1622       BLOCK_COMMENT("Entry:");
 1623     }
 1624 
 1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1626     if (dest_uninitialized) {
 1627       decorators |= IS_DEST_UNINITIALIZED;
 1628     }
 1629     if (aligned) {
 1630       decorators |= ARRAYCOPY_ALIGNED;
 1631     }
 1632 
 1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1635 
 1636     if (is_oop) {
 1637       // save regs before copy_memory
 1638       __ push(RegSet::of(d, count), sp);
 1639     }
 1640     {
 1641       // UnsafeMemoryAccess page error: continue after unsafe access
 1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1645     }
 1646 
 1647     if (is_oop) {
 1648       __ pop(RegSet::of(d, count), sp);
 1649       if (VerifyOops)
 1650         verify_oop_array(size, d, count, r16);
 1651     }
 1652 
 1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1654 
 1655     __ leave();
 1656     __ mov(r0, zr); // return 0
 1657     __ ret(lr);
 1658     return start;
 1659   }
 1660 
 1661   // Arguments:
 1662   //   stub_id - is used to name the stub and identify all details of
 1663   //             how to perform the copy.
 1664   //
 1665   //   nooverlap_target - identifies the (post push) entry for the
 1666   //             corresponding disjoint copy routine which can be
 1667   //             jumped to if the ranges do not actually overlap
 1668   //
 1669   //   entry - is assigned to the stub's post push entry point unless
 1670   //           it is null
 1671   //
 1672   //
 1673   // Inputs:
 1674   //   c_rarg0   - source array address
 1675   //   c_rarg1   - destination array address
 1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1677   //
 1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1679   // the hardware handle it.  The two dwords within qwords that span
 1680   // cache line boundaries will still be loaded and stored atomically.
 1681   //
 1682   // Side Effects:
 1683   //   entry is set to the no-overlap entry point so it can be used by
 1684   //   some other conjoint copy method
 1685   //
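        // Roughly (sketch): if (uintptr_t)(d - s) >= count * size the ranges
        // cannot overlap destructively and the stub branches to the disjoint
        // (forward) copy at nooverlap_target; otherwise it copies backwards,
        // giving memmove-style semantics.
        //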
 1686   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1688     RegSet saved_regs = RegSet::of(s, d, count);
 1689     int size;
 1690     bool aligned;
 1691     bool is_oop;
 1692     bool dest_uninitialized;
 1693     switch (stub_id) {
 1694     case jbyte_arraycopy_id:
 1695       size = sizeof(jbyte);
 1696       aligned = false;
 1697       is_oop = false;
 1698       dest_uninitialized = false;
 1699       break;
 1700     case arrayof_jbyte_arraycopy_id:
 1701       size = sizeof(jbyte);
 1702       aligned = true;
 1703       is_oop = false;
 1704       dest_uninitialized = false;
 1705       break;
 1706     case jshort_arraycopy_id:
 1707       size = sizeof(jshort);
 1708       aligned = false;
 1709       is_oop = false;
 1710       dest_uninitialized = false;
 1711       break;
 1712     case arrayof_jshort_arraycopy_id:
 1713       size = sizeof(jshort);
 1714       aligned = true;
 1715       is_oop = false;
 1716       dest_uninitialized = false;
 1717       break;
 1718     case jint_arraycopy_id:
 1719       size = sizeof(jint);
 1720       aligned = false;
 1721       is_oop = false;
 1722       dest_uninitialized = false;
 1723       break;
 1724     case arrayof_jint_arraycopy_id:
 1725       size = sizeof(jint);
 1726       aligned = true;
 1727       is_oop = false;
 1728       dest_uninitialized = false;
 1729       break;
 1730     case jlong_arraycopy_id:
 1731       // since this is always aligned we can (should!) use the same
 1732       // stub as for case arrayof_jlong_arraycopy
 1733       ShouldNotReachHere();
 1734       break;
 1735     case arrayof_jlong_arraycopy_id:
 1736       size = sizeof(jlong);
 1737       aligned = true;
 1738       is_oop = false;
 1739       dest_uninitialized = false;
 1740       break;
 1741     case oop_arraycopy_id:
 1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1743       aligned = !UseCompressedOops;
 1744       is_oop = true;
 1745       dest_uninitialized = false;
 1746       break;
 1747     case arrayof_oop_arraycopy_id:
 1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1749       aligned = !UseCompressedOops;
 1750       is_oop = true;
 1751       dest_uninitialized = false;
 1752       break;
 1753     case oop_arraycopy_uninit_id:
 1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1755       aligned = !UseCompressedOops;
 1756       is_oop = true;
 1757       dest_uninitialized = true;
 1758       break;
 1759     case arrayof_oop_arraycopy_uninit_id:
 1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1761       aligned = !UseCompressedOops;
 1762       is_oop = true;
 1763       dest_uninitialized = true;
 1764       break;
 1765     default:
 1766       ShouldNotReachHere();
 1767     }
 1768 
 1769     StubCodeMark mark(this, stub_id);
 1770     address start = __ pc();
 1771     __ enter();
 1772 
 1773     if (entry != nullptr) {
 1774       *entry = __ pc();
 1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1776       BLOCK_COMMENT("Entry:");
 1777     }
 1778 
 1779     // use fwd copy when (d-s) above_equal (count*size)
 1780     __ sub(rscratch1, d, s);
 1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1782     __ br(Assembler::HS, nooverlap_target);
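          // Note: the unsigned (HS) comparison above also routes the d < s
          // case to the forward copy, since d - s then wraps to a large
          // unsigned value; a forward copy is always safe when the
          // destination starts below the source.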
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
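        //  (Since -1^K == ~K, the caller can recover the partial transfer
        //   count on failure as ~r0.)
        //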
 1854   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
 1881     const Register count_save  = r21;       // orig elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
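          // In effect (sketch): for each element o in src[0..count):
          //   if (o != null && o's klass is not a subtype of ckval)
          //     return ~(number of elements already copied);
          //   *to++ = o;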
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (entry != nullptr) {
 1916       *entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
 1920     // Empty array: Nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop);// query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_save = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
 2000     __ br(Assembler::EQ, L_done_pop);
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
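        // In effect (sketch):
        //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
        //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
        //   src_pos = (uint32_t)src_pos; dst_pos = (uint32_t)dst_pos;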
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
 2022                               Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs are only called from a simple test routine.
 2051   // They will be written properly when they are called from
 2052   // something that actually does something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
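        // Dispatch sketch, with r = src | dst | count:
        //   (r & 7) == 0 -> long copy  (count >>= 3)
        //   (r & 3) == 0 -> int copy   (count >>= 2)
        //   (r & 1) == 0 -> short copy (count >>= 1)
        //   otherwise    -> byte copy
        //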
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
 2090 
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
 2268     // 'from', 'to', 'count' registers should be set in such order
 2269     // since they are the same as 'src', 'src_pos', 'dst'.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
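          // (With _lh_log2_element_size_shift == 0, the two low bits of the
          // layout helper are exact_log2(element size):
          //   00 -> byte, 01 -> short, 10 -> int, 11 -> long.)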
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
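        // In outline (sketch): the element value is replicated into a 64-bit
        // pattern, the destination is aligned to 8 bytes element by element,
        // the bulk of the fill is done a word (or zeroed block) at a time,
        // and any tail shorter than 8 bytes is finished with one overlapping
        // store or with per-element stores.
        //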
 2399   address generate_fill(StubGenStubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
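          // At this point the low 32 bits of 'value' hold the element value
          // replicated to 32 bits; it is widened to 64 bits (bfi below)
          // before the word-at-a-time fill.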
 2471 
 2472     // Align source address at 8 bytes address boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
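          // The final 8-byte store covers the last 8 bytes of the fill region
          // and overlaps bytes that were already filled; the overlap is
          // harmless because the 64-bit pattern repeats the element value.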
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
 2537     // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_unsafecopy_common_error_exit() {
 2571     address start_pc = __ pc();
 2572     __ leave();
 2573     __ mov(r0, 0);
 2574     __ ret(lr);
 2575     return start_pc;
 2576   }
 2577 
 2578   //
 2579   //  Generate 'unsafe' set memory stub
 2580   //  Though just as safe as the other stubs, it takes an unscaled
 2581   //  size_t (# bytes) argument instead of an element count.
 2582   //
 2583   //  This fill operation is atomicity preserving: as long as the
 2584   //  address supplied is sufficiently aligned, all writes of up to 64
 2585   //  bits in size are single-copy atomic.
 2586   //
 2587   //  Input:
 2588   //    c_rarg0   - destination array address
 2589   //    c_rarg1   - byte count (size_t)
 2590   //    c_rarg2   - byte value
 2591   //
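        //  Roughly equivalent to memset(dest, (unsigned char)value, count),
        //  arranged so that each sufficiently aligned store of up to 64 bits
        //  is single-copy atomic (sketch of intent, not a literal call).
        //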
 2592   address generate_unsafe_setmemory() {
 2593     __ align(CodeEntryAlignment);
 2594     StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id);
 2595     address start = __ pc();
 2596 
 2597     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2598     Label tail;
 2599 
 2600     UnsafeMemoryAccessMark umam(this, true, false);
 2601 
 2602     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2603 
 2604     __ dup(v0, __ T16B, value);
 2605 
 2606     if (AvoidUnalignedAccesses) {
 2607       __ cmp(count, (u1)16);
 2608       __ br(__ LO, tail);
 2609 
 2610       __ mov(rscratch1, 16);
 2611       __ andr(rscratch2, dest, 15);
 2612       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2613       __ strq(v0, Address(dest));
 2614       __ sub(count, count, rscratch1);
 2615       __ add(dest, dest, rscratch1);
 2616     }
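          // (With AvoidUnalignedAccesses, the block above wrote one, possibly
          // unaligned, 16-byte chunk to cover the head, then advanced dest to
          // the next 16-byte boundary and reduced count so that the loop
          // below issues only 16-byte-aligned stores.)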
 2617 
 2618     __ subs(count, count, (u1)64);
 2619     __ br(__ LO, tail);
 2620     {
 2621       Label again;
 2622       __ bind(again);
 2623       __ stpq(v0, v0, Address(dest));
 2624       __ stpq(v0, v0, Address(dest, 32));
 2625 
 2626       __ subs(count, count, 64);
 2627       __ add(dest, dest, 64);
 2628       __ br(__ HS, again);
 2629     }
 2630 
 2631     __ bind(tail);
 2632     // The count of bytes is off by 64, but we don't need to correct
 2633     // it because we're only going to use the least-significant few
 2634     // count bits from here on.
 2635     // __ add(count, count, 64);
 2636 
 2637     {
 2638       Label dont;
 2639       __ tbz(count, exact_log2(32), dont);
 2640       __ stpq(v0, v0, __ post(dest, 32));
 2641       __ bind(dont);
 2642     }
 2643     {
 2644       Label dont;
 2645       __ tbz(count, exact_log2(16), dont);
 2646       __ strq(v0, __ post(dest, 16));
 2647       __ bind(dont);
 2648     }
 2649     {
 2650       Label dont;
 2651       __ tbz(count, exact_log2(8), dont);
 2652       __ strd(v0, __ post(dest, 8));
 2653       __ bind(dont);
 2654     }
 2655 
 2656     Label finished;
 2657     __ tst(count, 7);
 2658     __ br(__ EQ, finished);
 2659 
 2660     {
 2661       Label dont;
 2662       __ tbz(count, exact_log2(4), dont);
 2663       __ strs(v0, __ post(dest, 4));
 2664       __ bind(dont);
 2665     }
 2666     {
 2667       Label dont;
 2668       __ tbz(count, exact_log2(2), dont);
 2669       __ bfi(value, value, 8, 8);
 2670       __ strh(value, __ post(dest, 2));
 2671       __ bind(dont);
 2672     }
 2673     {
 2674       Label dont;
 2675       __ tbz(count, exact_log2(1), dont);
 2676       __ strb(value, Address(dest));
 2677       __ bind(dont);
 2678     }
 2679 
 2680     __ bind(finished);
 2681     __ leave();
 2682     __ ret(lr);
 2683 
 2684     return start;
 2685   }
 2686 
 2687   address generate_data_cache_writeback() {
 2688     const Register line        = c_rarg0;  // address of line to write back
 2689 
 2690     __ align(CodeEntryAlignment);
 2691 
 2692     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2693     StubCodeMark mark(this, stub_id);
 2694 
 2695     address start = __ pc();
 2696     __ enter();
 2697     __ cache_wb(Address(line, 0));
 2698     __ leave();
 2699     __ ret(lr);
 2700 
 2701     return start;
 2702   }
 2703 
 2704   address generate_data_cache_writeback_sync() {
 2705     const Register is_pre     = c_rarg0;  // pre or post sync
 2706 
 2707     __ align(CodeEntryAlignment);
 2708 
 2709     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2710     StubCodeMark mark(this, stub_id);
 2711 
 2712     // pre wbsync is a no-op
 2713     // post wbsync emits a memory barrier
 2714 
 2715     Label skip;
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cbnz(is_pre, skip);
 2719     __ cache_wbsync(false);
 2720     __ bind(skip);
 2721     __ leave();
 2722     __ ret(lr);
 2723 
 2724     return start;
 2725   }
 2726 
 2727   void generate_arraycopy_stubs() {
 2728     address entry;
 2729     address entry_jbyte_arraycopy;
 2730     address entry_jshort_arraycopy;
 2731     address entry_jint_arraycopy;
 2732     address entry_oop_arraycopy;
 2733     address entry_jlong_arraycopy;
 2734     address entry_checkcast_arraycopy;
 2735 
 2736     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
 2737     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
 2738 
 2739     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2740     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2741 
 2742     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2743     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2744 
 2745     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2746     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2747 
 2748     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2749 
 2750     //*** jbyte
 2751     // Always need aligned and unaligned versions
 2752     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2753     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2754     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2755     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2756 
 2757     //*** jshort
 2758     // Always need aligned and unaligned versions
 2759     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2760     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2761     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2762     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2763 
 2764     //*** jint
 2765     // Aligned versions
 2766     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2767     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2768     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2769     // entry_jint_arraycopy always points to the unaligned version
 2770     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2771     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2772 
 2773     //*** jlong
 2774     // It is always aligned
 2775     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2776     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2777     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2778     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2779 
 2780     //*** oops
 2781     {
 2782       // With compressed oops we need unaligned versions; notice that
 2783       // we overwrite entry_oop_arraycopy.
 2784       bool aligned = !UseCompressedOops;
 2785 
 2786       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2787         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2788       StubRoutines::_arrayof_oop_arraycopy
 2789         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2790       // Aligned versions without pre-barriers
 2791       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2792         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2793       StubRoutines::_arrayof_oop_arraycopy_uninit
 2794         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2795     }
 2796 
 2797     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2798     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2799     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2800     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2801 
 2802     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2803     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2804 
 2805     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2806                                                               entry_jshort_arraycopy,
 2807                                                               entry_jint_arraycopy,
 2808                                                               entry_jlong_arraycopy);
 2809 
 2810     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2811                                                                entry_jshort_arraycopy,
 2812                                                                entry_jint_arraycopy,
 2813                                                                entry_oop_arraycopy,
 2814                                                                entry_jlong_arraycopy,
 2815                                                                entry_checkcast_arraycopy);
 2816 
 2817     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2818     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2819     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2820     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2821     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2822     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2823   }
 2824 
 2825   void generate_math_stubs() { Unimplemented(); }
 2826 
 2827   // Arguments:
 2828   //
 2829   // Inputs:
 2830   //   c_rarg0   - source byte array address
 2831   //   c_rarg1   - destination byte array address
 2832   //   c_rarg2   - K (key) in little endian int array
 2833   //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
 2836     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2837     StubCodeMark mark(this, stub_id);
 2838 
 2839     const Register from        = c_rarg0;  // source array address
 2840     const Register to          = c_rarg1;  // destination array address
 2841     const Register key         = c_rarg2;  // key array address
 2842     const Register keylen      = rscratch1;
 2843 
 2844     address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
 2846 
 2847     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2848 
 2849     __ aesenc_loadkeys(key, keylen);
 2850     __ aesecb_encrypt(from, to, keylen);
 2851 
 2852     __ mov(r0, 0);
 2853 
 2854     __ leave();
 2855     __ ret(lr);
 2856 
 2857     return start;
 2858   }
 2859 
 2860   // Arguments:
 2861   //
 2862   // Inputs:
 2863   //   c_rarg0   - source byte array address
 2864   //   c_rarg1   - destination byte array address
 2865   //   c_rarg2   - K (key) in little endian int array
 2866   //
 2867   address generate_aescrypt_decryptBlock() {
 2868     assert(UseAES, "need AES cryptographic extension support");
 2869     __ align(CodeEntryAlignment);
 2870     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2871     StubCodeMark mark(this, stub_id);
 2872     Label L_doLast;
 2873 
 2874     const Register from        = c_rarg0;  // source array address
 2875     const Register to          = c_rarg1;  // destination array address
 2876     const Register key         = c_rarg2;  // key array address
 2877     const Register keylen      = rscratch1;
 2878 
 2879     address start = __ pc();
 2880     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2881 
 2882     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2883 
 2884     __ aesecb_decrypt(from, to, key, keylen);
 2885 
 2886     __ mov(r0, 0);
 2887 
 2888     __ leave();
 2889     __ ret(lr);
 2890 
 2891     return start;
 2892   }
 2893 
 2894   // Arguments:
 2895   //
 2896   // Inputs:
 2897   //   c_rarg0   - source byte array address
 2898   //   c_rarg1   - destination byte array address
 2899   //   c_rarg2   - K (key) in little endian int array
 2900   //   c_rarg3   - r vector byte array address
 2901   //   c_rarg4   - input length
 2902   //
 2903   // Output:
  //   r0        - input length
 2905   //
 2906   address generate_cipherBlockChaining_encryptAESCrypt() {
 2907     assert(UseAES, "need AES cryptographic extension support");
 2908     __ align(CodeEntryAlignment);
 2909     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2910     StubCodeMark mark(this, stub_id);
 2911 
 2912     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2913 
 2914     const Register from        = c_rarg0;  // source array address
 2915     const Register to          = c_rarg1;  // destination array address
 2916     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
                                           // and left holding the last encrypted block on exit
 2919     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2920     const Register keylen      = rscratch1;
 2921 
 2922     address start = __ pc();
 2923 
 2924       __ enter();
 2925 
 2926       __ movw(rscratch2, len_reg);
 2927 
 2928       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2929 
 2930       __ ld1(v0, __ T16B, rvec);
 2931 
 2932       __ cmpw(keylen, 52);
 2933       __ br(Assembler::CC, L_loadkeys_44);
 2934       __ br(Assembler::EQ, L_loadkeys_52);
 2935 
 2936       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2937       __ rev32(v17, __ T16B, v17);
 2938       __ rev32(v18, __ T16B, v18);
 2939     __ BIND(L_loadkeys_52);
 2940       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2941       __ rev32(v19, __ T16B, v19);
 2942       __ rev32(v20, __ T16B, v20);
 2943     __ BIND(L_loadkeys_44);
 2944       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2945       __ rev32(v21, __ T16B, v21);
 2946       __ rev32(v22, __ T16B, v22);
 2947       __ rev32(v23, __ T16B, v23);
 2948       __ rev32(v24, __ T16B, v24);
 2949       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2950       __ rev32(v25, __ T16B, v25);
 2951       __ rev32(v26, __ T16B, v26);
 2952       __ rev32(v27, __ T16B, v27);
 2953       __ rev32(v28, __ T16B, v28);
 2954       __ ld1(v29, v30, v31, __ T16B, key);
 2955       __ rev32(v29, __ T16B, v29);
 2956       __ rev32(v30, __ T16B, v30);
 2957       __ rev32(v31, __ T16B, v31);
 2958 
 2959     __ BIND(L_aes_loop);
 2960       __ ld1(v1, __ T16B, __ post(from, 16));
 2961       __ eor(v0, __ T16B, v0, v1);
 2962 
 2963       __ br(Assembler::CC, L_rounds_44);
 2964       __ br(Assembler::EQ, L_rounds_52);
 2965 
 2966       __ aese(v0, v17); __ aesmc(v0, v0);
 2967       __ aese(v0, v18); __ aesmc(v0, v0);
 2968     __ BIND(L_rounds_52);
 2969       __ aese(v0, v19); __ aesmc(v0, v0);
 2970       __ aese(v0, v20); __ aesmc(v0, v0);
 2971     __ BIND(L_rounds_44);
 2972       __ aese(v0, v21); __ aesmc(v0, v0);
 2973       __ aese(v0, v22); __ aesmc(v0, v0);
 2974       __ aese(v0, v23); __ aesmc(v0, v0);
 2975       __ aese(v0, v24); __ aesmc(v0, v0);
 2976       __ aese(v0, v25); __ aesmc(v0, v0);
 2977       __ aese(v0, v26); __ aesmc(v0, v0);
 2978       __ aese(v0, v27); __ aesmc(v0, v0);
 2979       __ aese(v0, v28); __ aesmc(v0, v0);
 2980       __ aese(v0, v29); __ aesmc(v0, v0);
 2981       __ aese(v0, v30);
 2982       __ eor(v0, __ T16B, v0, v31);
 2983 
 2984       __ st1(v0, __ T16B, __ post(to, 16));
 2985 
 2986       __ subw(len_reg, len_reg, 16);
 2987       __ cbnzw(len_reg, L_aes_loop);
 2988 
 2989       __ st1(v0, __ T16B, rvec);
 2990 
 2991       __ mov(r0, rscratch2);
 2992 
 2993       __ leave();
 2994       __ ret(lr);
 2995 
 2996       return start;
 2997   }
 2998 
 2999   // Arguments:
 3000   //
 3001   // Inputs:
 3002   //   c_rarg0   - source byte array address
 3003   //   c_rarg1   - destination byte array address
 3004   //   c_rarg2   - K (key) in little endian int array
 3005   //   c_rarg3   - r vector byte array address
 3006   //   c_rarg4   - input length
 3007   //
 3008   // Output:
 3009   //   r0        - input length
 3010   //
 3011   address generate_cipherBlockChaining_decryptAESCrypt() {
 3012     assert(UseAES, "need AES cryptographic extension support");
 3013     __ align(CodeEntryAlignment);
 3014     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 3015     StubCodeMark mark(this, stub_id);
 3016 
 3017     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3018 
 3019     const Register from        = c_rarg0;  // source array address
 3020     const Register to          = c_rarg1;  // destination array address
 3021     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
                                           // and left holding the last ciphertext block on exit
 3024     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3025     const Register keylen      = rscratch1;
 3026 
 3027     address start = __ pc();
 3028 
 3029       __ enter();
 3030 
 3031       __ movw(rscratch2, len_reg);
 3032 
 3033       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3034 
 3035       __ ld1(v2, __ T16B, rvec);
 3036 
 3037       __ ld1(v31, __ T16B, __ post(key, 16));
 3038       __ rev32(v31, __ T16B, v31);
 3039 
 3040       __ cmpw(keylen, 52);
 3041       __ br(Assembler::CC, L_loadkeys_44);
 3042       __ br(Assembler::EQ, L_loadkeys_52);
 3043 
 3044       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3045       __ rev32(v17, __ T16B, v17);
 3046       __ rev32(v18, __ T16B, v18);
 3047     __ BIND(L_loadkeys_52);
 3048       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3049       __ rev32(v19, __ T16B, v19);
 3050       __ rev32(v20, __ T16B, v20);
 3051     __ BIND(L_loadkeys_44);
 3052       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3053       __ rev32(v21, __ T16B, v21);
 3054       __ rev32(v22, __ T16B, v22);
 3055       __ rev32(v23, __ T16B, v23);
 3056       __ rev32(v24, __ T16B, v24);
 3057       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3058       __ rev32(v25, __ T16B, v25);
 3059       __ rev32(v26, __ T16B, v26);
 3060       __ rev32(v27, __ T16B, v27);
 3061       __ rev32(v28, __ T16B, v28);
 3062       __ ld1(v29, v30, __ T16B, key);
 3063       __ rev32(v29, __ T16B, v29);
 3064       __ rev32(v30, __ T16B, v30);
 3065 
 3066     __ BIND(L_aes_loop);
 3067       __ ld1(v0, __ T16B, __ post(from, 16));
      __ orr(v1, __ T16B, v0, v0); // keep a copy of the ciphertext block; it becomes the next chaining value
 3069 
 3070       __ br(Assembler::CC, L_rounds_44);
 3071       __ br(Assembler::EQ, L_rounds_52);
 3072 
 3073       __ aesd(v0, v17); __ aesimc(v0, v0);
 3074       __ aesd(v0, v18); __ aesimc(v0, v0);
 3075     __ BIND(L_rounds_52);
 3076       __ aesd(v0, v19); __ aesimc(v0, v0);
 3077       __ aesd(v0, v20); __ aesimc(v0, v0);
 3078     __ BIND(L_rounds_44);
 3079       __ aesd(v0, v21); __ aesimc(v0, v0);
 3080       __ aesd(v0, v22); __ aesimc(v0, v0);
 3081       __ aesd(v0, v23); __ aesimc(v0, v0);
 3082       __ aesd(v0, v24); __ aesimc(v0, v0);
 3083       __ aesd(v0, v25); __ aesimc(v0, v0);
 3084       __ aesd(v0, v26); __ aesimc(v0, v0);
 3085       __ aesd(v0, v27); __ aesimc(v0, v0);
 3086       __ aesd(v0, v28); __ aesimc(v0, v0);
 3087       __ aesd(v0, v29); __ aesimc(v0, v0);
 3088       __ aesd(v0, v30);
 3089       __ eor(v0, __ T16B, v0, v31);
 3090       __ eor(v0, __ T16B, v0, v2);
 3091 
 3092       __ st1(v0, __ T16B, __ post(to, 16));
      __ orr(v2, __ T16B, v1, v1); // this ciphertext block is the IV for the next iteration
 3094 
 3095       __ subw(len_reg, len_reg, 16);
 3096       __ cbnzw(len_reg, L_aes_loop);
 3097 
 3098       __ st1(v2, __ T16B, rvec);
 3099 
 3100       __ mov(r0, rscratch2);
 3101 
 3102       __ leave();
 3103       __ ret(lr);
 3104 
 3105     return start;
 3106   }
 3107 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (128 bits) and inc (the 64-bit increment); both are preserved.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // The lower dword of inc must be zero.
  // Output: result
 3113   void be_add_128_64(FloatRegister result, FloatRegister in,
 3114                      FloatRegister inc, FloatRegister tmp) {
 3115     assert_different_registers(result, tmp, inc);
 3116 
 3117     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3118                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3120     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3121                                            // MSD == 0 (must be!) to LSD
 3122     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3123   }
 3124 
 3125   // CTR AES crypt.
 3126   // Arguments:
 3127   //
 3128   // Inputs:
 3129   //   c_rarg0   - source byte array address
 3130   //   c_rarg1   - destination byte array address
 3131   //   c_rarg2   - K (key) in little endian int array
 3132   //   c_rarg3   - counter vector byte array address
 3133   //   c_rarg4   - input length
 3134   //   c_rarg5   - saved encryptedCounter start
 3135   //   c_rarg6   - saved used length
 3136   //
 3137   // Output:
 3138   //   r0       - input length
 3139   //
 3140   address generate_counterMode_AESCrypt() {
 3141     const Register in = c_rarg0;
 3142     const Register out = c_rarg1;
 3143     const Register key = c_rarg2;
 3144     const Register counter = c_rarg3;
 3145     const Register saved_len = c_rarg4, len = r10;
 3146     const Register saved_encrypted_ctr = c_rarg5;
 3147     const Register used_ptr = c_rarg6, used = r12;
 3148 
 3149     const Register offset = r7;
 3150     const Register keylen = r11;
 3151 
 3152     const unsigned char block_size = 16;
 3153     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly better
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until there are at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
 3160 
 3161     // Algorithm:
 3162     //
 3163     //    if (len == 0) {
 3164     //        goto DONE;
 3165     //    }
 3166     //    int result = len;
 3167     //    do {
 3168     //        if (used >= blockSize) {
 3169     //            if (len >= bulk_width * blockSize) {
 3170     //                CTR_large_block();
 3171     //                if (len == 0)
 3172     //                    goto DONE;
 3173     //            }
 3174     //            for (;;) {
 3175     //                16ByteVector v0 = counter;
 3176     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3177     //                used = 0;
 3178     //                if (len < blockSize)
 3179     //                    break;    /* goto NEXT */
 3180     //                16ByteVector v1 = load16Bytes(in, offset);
 3181     //                v1 = v1 ^ encryptedCounter;
 3182     //                store16Bytes(out, offset);
 3183     //                used = blockSize;
 3184     //                offset += blockSize;
 3185     //                len -= blockSize;
 3186     //                if (len == 0)
 3187     //                    goto DONE;
 3188     //            }
 3189     //        }
 3190     //      NEXT:
 3191     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3192     //        len--;
 3193     //    } while (len != 0);
 3194     //  DONE:
 3195     //    return result;
 3196     //
 3197     // CTR_large_block()
 3198     //    Wide bulk encryption of whole blocks.
 3199 
 3200     __ align(CodeEntryAlignment);
 3201     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3202     StubCodeMark mark(this, stub_id);
 3203     const address start = __ pc();
 3204     __ enter();
 3205 
 3206     Label DONE, CTR_large_block, large_block_return;
 3207     __ ldrw(used, Address(used_ptr));
 3208     __ cbzw(saved_len, DONE);
 3209 
 3210     __ mov(len, saved_len);
 3211     __ mov(offset, 0);
 3212 
 3213     // Compute #rounds for AES based on the length of the key array
 3214     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3215 
 3216     __ aesenc_loadkeys(key, keylen);
 3217 
 3218     {
 3219       Label L_CTR_loop, NEXT;
 3220 
 3221       __ bind(L_CTR_loop);
 3222 
 3223       __ cmp(used, block_size);
 3224       __ br(__ LO, NEXT);
 3225 
 3226       // Maybe we have a lot of data
 3227       __ subsw(rscratch1, len, bulk_width * block_size);
 3228       __ br(__ HS, CTR_large_block);
 3229       __ BIND(large_block_return);
 3230       __ cbzw(len, DONE);
 3231 
 3232       // Setup the counter
 3233       __ movi(v4, __ T4S, 0);
 3234       __ movi(v5, __ T4S, 1);
 3235       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3236 
 3237       // 128-bit big-endian increment
 3238       __ ld1(v0, __ T16B, counter);
 3239       __ rev64(v16, __ T16B, v0);
 3240       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3241       __ rev64(v16, __ T16B, v16);
 3242       __ st1(v16, __ T16B, counter);
 3243       // Previous counter value is in v0
 3244       // v4 contains { 0, 1 }
 3245 
 3246       {
 3247         // We have fewer than bulk_width blocks of data left. Encrypt
 3248         // them one by one until there is less than a full block
 3249         // remaining, being careful to save both the encrypted counter
 3250         // and the counter.
 3251 
 3252         Label inner_loop;
 3253         __ bind(inner_loop);
 3254         // Counter to encrypt is in v0
 3255         __ aesecb_encrypt(noreg, noreg, keylen);
 3256         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3257 
 3258         // Do we have a remaining full block?
 3259 
 3260         __ mov(used, 0);
 3261         __ cmp(len, block_size);
 3262         __ br(__ LO, NEXT);
 3263 
 3264         // Yes, we have a full block
 3265         __ ldrq(v1, Address(in, offset));
 3266         __ eor(v1, __ T16B, v1, v0);
 3267         __ strq(v1, Address(out, offset));
 3268         __ mov(used, block_size);
 3269         __ add(offset, offset, block_size);
 3270 
 3271         __ subw(len, len, block_size);
 3272         __ cbzw(len, DONE);
 3273 
 3274         // Increment the counter, store it back
 3275         __ orr(v0, __ T16B, v16, v16);
 3276         __ rev64(v16, __ T16B, v16);
 3277         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3278         __ rev64(v16, __ T16B, v16);
 3279         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3280 
 3281         __ b(inner_loop);
 3282       }
 3283 
 3284       __ BIND(NEXT);
 3285 
 3286       // Encrypt a single byte, and loop.
 3287       // We expect this to be a rare event.
 3288       __ ldrb(rscratch1, Address(in, offset));
 3289       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3290       __ eor(rscratch1, rscratch1, rscratch2);
 3291       __ strb(rscratch1, Address(out, offset));
 3292       __ add(offset, offset, 1);
 3293       __ add(used, used, 1);
      __ subw(len, len, 1);
 3295       __ cbnzw(len, L_CTR_loop);
 3296     }
 3297 
 3298     __ bind(DONE);
 3299     __ strw(used, Address(used_ptr));
 3300     __ mov(r0, saved_len);
 3301 
 3302     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3303     __ ret(lr);
 3304 
 3305     // Bulk encryption
 3306 
    __ BIND(CTR_large_block);
 3308     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3309 
 3310     if (bulk_width == 8) {
 3311       __ sub(sp, sp, 4 * 16);
 3312       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3313     }
 3314     __ sub(sp, sp, 4 * 16);
 3315     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3316     RegSet saved_regs = (RegSet::of(in, out, offset)
 3317                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3318     __ push(saved_regs, sp);
 3319     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3320     __ add(in, in, offset);
 3321     __ add(out, out, offset);
 3322 
 3323     // Keys should already be loaded into the correct registers
 3324 
 3325     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3326     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3327 
 3328     // AES/CTR loop
 3329     {
 3330       Label L_CTR_loop;
 3331       __ BIND(L_CTR_loop);
 3332 
 3333       // Setup the counters
 3334       __ movi(v8, __ T4S, 0);
 3335       __ movi(v9, __ T4S, 1);
 3336       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3337 
 3338       for (int i = 0; i < bulk_width; i++) {
 3339         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3340         __ rev64(v0_ofs, __ T16B, v16);
 3341         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3342       }
 3343 
 3344       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3345 
 3346       // Encrypt the counters
 3347       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3348 
 3349       if (bulk_width == 8) {
 3350         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3351       }
 3352 
 3353       // XOR the encrypted counters with the inputs
 3354       for (int i = 0; i < bulk_width; i++) {
 3355         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3356         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3357         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3358       }
 3359 
 3360       // Write the encrypted data
 3361       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3362       if (bulk_width == 8) {
 3363         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3364       }
 3365 
 3366       __ subw(len, len, 16 * bulk_width);
 3367       __ cbnzw(len, L_CTR_loop);
 3368     }
 3369 
 3370     // Save the counter back where it goes
 3371     __ rev64(v16, __ T16B, v16);
 3372     __ st1(v16, __ T16B, counter);
 3373 
 3374     __ pop(saved_regs, sp);
 3375 
 3376     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3377     if (bulk_width == 8) {
 3378       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3379     }
 3380 
 3381     __ andr(rscratch1, len, -16 * bulk_width);
 3382     __ sub(len, len, rscratch1);
 3383     __ add(offset, offset, rscratch1);
 3384     __ mov(used, 16);
 3385     __ strw(used, Address(used_ptr));
 3386     __ b(large_block_return);
 3387 
 3388     return start;
 3389   }
 3390 
 3391   // Vector AES Galois Counter Mode implementation. Parameters:
 3392   //
 3393   // in = c_rarg0
 3394   // len = c_rarg1
 3395   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3396   // out = c_rarg3
 3397   // key = c_rarg4
 3398   // state = c_rarg5 - GHASH.state
 3399   // subkeyHtbl = c_rarg6 - powers of H
 3400   // counter = c_rarg7 - 16 bytes of CTR
 3401   // return - number of processed bytes
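  //
  // Note: GCM here is AES/CTR encryption of the data followed by a GHASH
  // authentication pass over the ciphertext; this stub runs the bulk CTR
  // loop first and then calls ghash_processBlocks_wide over the same (ct)
  // buffer.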
 3402   address generate_galoisCounterMode_AESCrypt() {
 3403     address ghash_polynomial = __ pc();
 3404     __ emit_int64(0x87);  // The low-order bits of the field
 3405                           // polynomial (i.e. p = z^7+z^2+z+1)
 3406                           // repeated in the low and high parts of a
 3407                           // 128-bit vector
 3408     __ emit_int64(0x87);
 3409 
 3410     __ align(CodeEntryAlignment);
 3411     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3412     StubCodeMark mark(this, stub_id);
 3413     address start = __ pc();
 3414     __ enter();
 3415 
 3416     const Register in = c_rarg0;
 3417     const Register len = c_rarg1;
 3418     const Register ct = c_rarg2;
    const Register out = c_rarg3;
 3421 
 3422     const Register key = c_rarg4;
 3423     const Register state = c_rarg5;
 3424 
 3425     const Register subkeyHtbl = c_rarg6;
 3426 
    const Register counter = c_rarg7; // 16 bytes of CTR; updated with the
                                      // incremented counter on exit
 3428 
 3429     const Register keylen = r10;
 3430     // Save state before entering routine
 3431     __ sub(sp, sp, 4 * 16);
 3432     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3433     __ sub(sp, sp, 4 * 16);
 3434     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3435 
 3436     // __ andr(len, len, -512);
 3437     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3438     __ str(len, __ pre(sp, -2 * wordSize));
 3439 
 3440     Label DONE;
 3441     __ cbz(len, DONE);
 3442 
 3443     // Compute #rounds for AES based on the length of the key array
 3444     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3445 
 3446     __ aesenc_loadkeys(key, keylen);
 3447     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3448     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3449 
 3450     // AES/CTR loop
 3451     {
 3452       Label L_CTR_loop;
 3453       __ BIND(L_CTR_loop);
 3454 
 3455       // Setup the counters
 3456       __ movi(v8, __ T4S, 0);
 3457       __ movi(v9, __ T4S, 1);
 3458       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3459 
      assert(v0->encoding() < v8->encoding(), "counter registers must precede the input registers");
 3461       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3462         FloatRegister f = as_FloatRegister(i);
 3463         __ rev32(f, __ T16B, v16);
 3464         __ addv(v16, __ T4S, v16, v8);
 3465       }
 3466 
 3467       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3468 
 3469       // Encrypt the counters
 3470       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3471 
 3472       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3473 
 3474       // XOR the encrypted counters with the inputs
 3475       for (int i = 0; i < 8; i++) {
 3476         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3477         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3478         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3479       }
 3480       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3481       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3482 
 3483       __ subw(len, len, 16 * 8);
 3484       __ cbnzw(len, L_CTR_loop);
 3485     }
 3486 
 3487     __ rev32(v16, __ T16B, v16);
 3488     __ st1(v16, __ T16B, counter);
 3489 
 3490     __ ldr(len, Address(sp));
 3491     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3492 
 3493     // GHASH/CTR loop
 3494     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3495                                 len, /*unrolls*/4);
 3496 
 3497 #ifdef ASSERT
 3498     { Label L;
 3499       __ cmp(len, (unsigned char)0);
 3500       __ br(Assembler::EQ, L);
 3501       __ stop("stubGenerator: abort");
 3502       __ bind(L);
 3503   }
 3504 #endif
 3505 
    __ bind(DONE);
 3507     // Return the number of bytes processed
 3508     __ ldr(r0, __ post(sp, 2 * wordSize));
 3509 
 3510     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3511     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3512 
 3513     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3514     __ ret(lr);
    return start;
 3516   }
 3517 
 3518   class Cached64Bytes {
 3519   private:
 3520     MacroAssembler *_masm;
 3521     Register _regs[8];
 3522 
 3523   public:
 3524     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 64 bytes (16 4-byte words)", rs.size());
 3526       auto it = rs.begin();
 3527       for (auto &r: _regs) {
 3528         r = *it;
 3529         ++it;
 3530       }
 3531     }
 3532 
 3533     void gen_loads(Register base) {
 3534       for (int i = 0; i < 8; i += 2) {
 3535         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3536       }
 3537     }
 3538 
 3539     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3540     void extract_u32(Register dest, int i) {
 3541       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3542     }
 3543   };
 3544 
 3545   // Utility routines for md5.
 3546   // Clobbers r10 and r11.
 3547   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3548               int k, int s, int t) {
 3549     Register rscratch3 = r10;
 3550     Register rscratch4 = r11;
 3551 
 3552     __ eorw(rscratch3, r3, r4);
 3553     __ movw(rscratch2, t);
 3554     __ andw(rscratch3, rscratch3, r2);
 3555     __ addw(rscratch4, r1, rscratch2);
 3556     reg_cache.extract_u32(rscratch1, k);
 3557     __ eorw(rscratch3, rscratch3, r4);
 3558     __ addw(rscratch4, rscratch4, rscratch1);
 3559     __ addw(rscratch3, rscratch3, rscratch4);
 3560     __ rorw(rscratch2, rscratch3, 32 - s);
 3561     __ addw(r1, rscratch2, r2);
 3562   }
 3563 
 3564   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3565               int k, int s, int t) {
 3566     Register rscratch3 = r10;
 3567     Register rscratch4 = r11;
 3568 
 3569     reg_cache.extract_u32(rscratch1, k);
 3570     __ movw(rscratch2, t);
 3571     __ addw(rscratch4, r1, rscratch2);
 3572     __ addw(rscratch4, rscratch4, rscratch1);
 3573     __ bicw(rscratch2, r3, r4);
 3574     __ andw(rscratch3, r2, r4);
 3575     __ addw(rscratch2, rscratch2, rscratch4);
 3576     __ addw(rscratch2, rscratch2, rscratch3);
 3577     __ rorw(rscratch2, rscratch2, 32 - s);
 3578     __ addw(r1, rscratch2, r2);
 3579   }
 3580 
 3581   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3582               int k, int s, int t) {
 3583     Register rscratch3 = r10;
 3584     Register rscratch4 = r11;
 3585 
 3586     __ eorw(rscratch3, r3, r4);
 3587     __ movw(rscratch2, t);
 3588     __ addw(rscratch4, r1, rscratch2);
 3589     reg_cache.extract_u32(rscratch1, k);
 3590     __ eorw(rscratch3, rscratch3, r2);
 3591     __ addw(rscratch4, rscratch4, rscratch1);
 3592     __ addw(rscratch3, rscratch3, rscratch4);
 3593     __ rorw(rscratch2, rscratch3, 32 - s);
 3594     __ addw(r1, rscratch2, r2);
 3595   }
 3596 
 3597   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3598               int k, int s, int t) {
 3599     Register rscratch3 = r10;
 3600     Register rscratch4 = r11;
 3601 
 3602     __ movw(rscratch3, t);
 3603     __ ornw(rscratch2, r2, r4);
 3604     __ addw(rscratch4, r1, rscratch3);
 3605     reg_cache.extract_u32(rscratch1, k);
 3606     __ eorw(rscratch3, rscratch2, r3);
 3607     __ addw(rscratch4, rscratch4, rscratch1);
 3608     __ addw(rscratch3, rscratch3, rscratch4);
 3609     __ rorw(rscratch2, rscratch3, 32 - s);
 3610     __ addw(r1, rscratch2, r2);
 3611   }
 3612 
 3613   // Arguments:
 3614   //
 3615   // Inputs:
 3616   //   c_rarg0   - byte[]  source+offset
 3617   //   c_rarg1   - int[]   SHA.state
 3618   //   c_rarg2   - int     offset
 3619   //   c_rarg3   - int     limit
 3620   //
 3621   address generate_md5_implCompress(StubGenStubId stub_id) {
 3622     bool multi_block;
 3623     switch (stub_id) {
 3624     case md5_implCompress_id:
 3625       multi_block = false;
 3626       break;
 3627     case md5_implCompressMB_id:
 3628       multi_block = true;
 3629       break;
 3630     default:
 3631       ShouldNotReachHere();
 3632     }
 3633     __ align(CodeEntryAlignment);
 3634 
 3635     StubCodeMark mark(this, stub_id);
 3636     address start = __ pc();
 3637 
 3638     Register buf       = c_rarg0;
 3639     Register state     = c_rarg1;
 3640     Register ofs       = c_rarg2;
 3641     Register limit     = c_rarg3;
 3642     Register a         = r4;
 3643     Register b         = r5;
 3644     Register c         = r6;
 3645     Register d         = r7;
 3646     Register rscratch3 = r10;
 3647     Register rscratch4 = r11;
 3648 
 3649     Register state_regs[2] = { r12, r13 };
 3650     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3651     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3652 
 3653     __ push(saved_regs, sp);
 3654 
 3655     __ ldp(state_regs[0], state_regs[1], Address(state));
 3656     __ ubfx(a, state_regs[0],  0, 32);
 3657     __ ubfx(b, state_regs[0], 32, 32);
 3658     __ ubfx(c, state_regs[1],  0, 32);
 3659     __ ubfx(d, state_regs[1], 32, 32);
 3660 
 3661     Label md5_loop;
 3662     __ BIND(md5_loop);
 3663 
 3664     reg_cache.gen_loads(buf);
 3665 
 3666     // Round 1
 3667     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3668     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3669     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3670     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3671     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3672     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3673     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3674     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3675     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3676     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3677     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3678     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3679     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3680     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3681     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3682     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3683 
 3684     // Round 2
 3685     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3686     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3687     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3688     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3689     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3690     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3691     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3692     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3693     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3694     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3695     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3696     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3697     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3698     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3699     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3700     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3701 
 3702     // Round 3
 3703     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3704     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3705     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3706     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3707     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3708     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3709     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3710     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3711     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3712     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3713     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3714     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3715     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3716     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3717     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3718     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3719 
 3720     // Round 4
 3721     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3722     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3723     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3724     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3725     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3726     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3727     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3728     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3729     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3730     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3731     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3732     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3733     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3734     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3735     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3736     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3737 
 3738     __ addw(a, state_regs[0], a);
 3739     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3740     __ addw(b, rscratch2, b);
 3741     __ addw(c, state_regs[1], c);
 3742     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3743     __ addw(d, rscratch4, d);
 3744 
 3745     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3746     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3747 
 3748     if (multi_block) {
 3749       __ add(buf, buf, 64);
 3750       __ add(ofs, ofs, 64);
 3751       __ cmp(ofs, limit);
 3752       __ br(Assembler::LE, md5_loop);
 3753       __ mov(c_rarg0, ofs); // return ofs
 3754     }
 3755 
 3756     // write hash values back in the correct order
 3757     __ stp(state_regs[0], state_regs[1], Address(state));
 3758 
 3759     __ pop(saved_regs, sp);
 3760 
 3761     __ ret(lr);
 3762 
 3763     return start;
 3764   }
 3765 
 3766   // Arguments:
 3767   //
 3768   // Inputs:
 3769   //   c_rarg0   - byte[]  source+offset
 3770   //   c_rarg1   - int[]   SHA.state
 3771   //   c_rarg2   - int     offset
 3772   //   c_rarg3   - int     limit
 3773   //
 3774   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3775     bool multi_block;
 3776     switch (stub_id) {
 3777     case sha1_implCompress_id:
 3778       multi_block = false;
 3779       break;
 3780     case sha1_implCompressMB_id:
 3781       multi_block = true;
 3782       break;
 3783     default:
 3784       ShouldNotReachHere();
 3785     }
 3786 
 3787     __ align(CodeEntryAlignment);
 3788 
 3789     StubCodeMark mark(this, stub_id);
 3790     address start = __ pc();
 3791 
 3792     Register buf   = c_rarg0;
 3793     Register state = c_rarg1;
 3794     Register ofs   = c_rarg2;
 3795     Register limit = c_rarg3;
 3796 
 3797     Label keys;
 3798     Label sha1_loop;
 3799 
 3800     // load the keys into v0..v3
 3801     __ adr(rscratch1, keys);
 3802     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
 3804     __ ldrq(v6, Address(state, 0));
 3805     __ ldrs(v7, Address(state, 16));
 3806 
 3807 
 3808     __ BIND(sha1_loop);
 3809     // load 64 bytes of data into v16..v19
 3810     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3811     __ rev32(v16, __ T16B, v16);
 3812     __ rev32(v17, __ T16B, v17);
 3813     __ rev32(v18, __ T16B, v18);
 3814     __ rev32(v19, __ T16B, v19);
 3815 
 3816     // do the sha1
 3817     __ addv(v4, __ T4S, v16, v0);
 3818     __ orr(v20, __ T16B, v6, v6);
 3819 
 3820     FloatRegister d0 = v16;
 3821     FloatRegister d1 = v17;
 3822     FloatRegister d2 = v18;
 3823     FloatRegister d3 = v19;
 3824 
 3825     for (int round = 0; round < 20; round++) {
 3826       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3827       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3828       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3829       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3830       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3831 
 3832       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3833       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3834       __ sha1h(tmp2, __ T4S, v20);
 3835       if (round < 5)
 3836         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3837       else if (round < 10 || round >= 15)
 3838         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3839       else
 3840         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3841       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3842 
 3843       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3844     }
 3845 
 3846     __ addv(v7, __ T2S, v7, v21);
 3847     __ addv(v6, __ T4S, v6, v20);
 3848 
 3849     if (multi_block) {
 3850       __ add(ofs, ofs, 64);
 3851       __ cmp(ofs, limit);
 3852       __ br(Assembler::LE, sha1_loop);
 3853       __ mov(c_rarg0, ofs); // return ofs
 3854     }
 3855 
 3856     __ strq(v6, Address(state, 0));
 3857     __ strs(v7, Address(state, 16));
 3858 
 3859     __ ret(lr);
 3860 
    __ bind(keys);
    // SHA-1 round constants (K for rounds 0-19, 20-39, 40-59, 60-79)
    __ emit_int32(0x5a827999);
 3863     __ emit_int32(0x6ed9eba1);
 3864     __ emit_int32(0x8f1bbcdc);
 3865     __ emit_int32(0xca62c1d6);
 3866 
 3867     return start;
 3868   }
 3869 
 3870 
 3871   // Arguments:
 3872   //
 3873   // Inputs:
 3874   //   c_rarg0   - byte[]  source+offset
 3875   //   c_rarg1   - int[]   SHA.state
 3876   //   c_rarg2   - int     offset
 3877   //   c_rarg3   - int     limit
 3878   //
 3879   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3880     bool multi_block;
 3881     switch (stub_id) {
 3882     case sha256_implCompress_id:
 3883       multi_block = false;
 3884       break;
 3885     case sha256_implCompressMB_id:
 3886       multi_block = true;
 3887       break;
 3888     default:
 3889       ShouldNotReachHere();
 3890     }
 3891 
 3892     static const uint32_t round_consts[64] = {
 3893       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3894       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3895       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3896       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3897       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3898       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3899       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3900       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3901       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3902       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3903       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3904       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3905       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3906       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3907       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3908       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3909     };
 3910 
 3911     __ align(CodeEntryAlignment);
 3912 
 3913     StubCodeMark mark(this, stub_id);
 3914     address start = __ pc();
 3915 
 3916     Register buf   = c_rarg0;
 3917     Register state = c_rarg1;
 3918     Register ofs   = c_rarg2;
 3919     Register limit = c_rarg3;
 3920 
 3921     Label sha1_loop;
 3922 
 3923     __ stpd(v8, v9, __ pre(sp, -32));
 3924     __ stpd(v10, v11, Address(sp, 16));
 3925 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
 3933 
    // load the 64 round constants into v16..v31 (four per vector)
 3935     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3936     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3937     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3938     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3939     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3940 
 3941     // load 8 words (256 bits) state
 3942     __ ldpq(v0, v1, state);
 3943 
 3944     __ BIND(sha1_loop);
 3945     // load 64 bytes of data into v8..v11
 3946     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3947     __ rev32(v8, __ T16B, v8);
 3948     __ rev32(v9, __ T16B, v9);
 3949     __ rev32(v10, __ T16B, v10);
 3950     __ rev32(v11, __ T16B, v11);
 3951 
 3952     __ addv(v6, __ T4S, v8, v16);
 3953     __ orr(v2, __ T16B, v0, v0);
 3954     __ orr(v3, __ T16B, v1, v1);
 3955 
 3956     FloatRegister d0 = v8;
 3957     FloatRegister d1 = v9;
 3958     FloatRegister d2 = v10;
 3959     FloatRegister d3 = v11;
 3960 
 3961 
 3962     for (int round = 0; round < 16; round++) {
 3963       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3964       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3965       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3966       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3967 
 3968       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3969        __ orr(v4, __ T16B, v2, v2);
 3970       if (round < 15)
 3971         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3972       __ sha256h(v2, __ T4S, v3, tmp2);
 3973       __ sha256h2(v3, __ T4S, v4, tmp2);
 3974       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3975 
 3976       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3977     }
 3978 
 3979     __ addv(v0, __ T4S, v0, v2);
 3980     __ addv(v1, __ T4S, v1, v3);
 3981 
 3982     if (multi_block) {
 3983       __ add(ofs, ofs, 64);
 3984       __ cmp(ofs, limit);
 3985       __ br(Assembler::LE, sha1_loop);
 3986       __ mov(c_rarg0, ofs); // return ofs
 3987     }
 3988 
 3989     __ ldpd(v10, v11, Address(sp, 16));
 3990     __ ldpd(v8, v9, __ post(sp, 32));
 3991 
 3992     __ stpq(v0, v1, state);
 3993 
 3994     __ ret(lr);
 3995 
 3996     return start;
 3997   }
 3998 
  // One SHA-512 double round (two of the 80 rounds); dr is the double-round index, 0..39.
 4000   void sha512_dround(int dr,
 4001                      FloatRegister vi0, FloatRegister vi1,
 4002                      FloatRegister vi2, FloatRegister vi3,
 4003                      FloatRegister vi4, FloatRegister vrc0,
 4004                      FloatRegister vrc1, FloatRegister vin0,
 4005                      FloatRegister vin1, FloatRegister vin2,
 4006                      FloatRegister vin3, FloatRegister vin4) {
 4007       if (dr < 36) {
 4008         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4009       }
 4010       __ addv(v5, __ T2D, vrc0, vin0);
 4011       __ ext(v6, __ T16B, vi2, vi3, 8);
 4012       __ ext(v5, __ T16B, v5, v5, 8);
 4013       __ ext(v7, __ T16B, vi1, vi2, 8);
 4014       __ addv(vi3, __ T2D, vi3, v5);
 4015       if (dr < 32) {
 4016         __ ext(v5, __ T16B, vin3, vin4, 8);
 4017         __ sha512su0(vin0, __ T2D, vin1);
 4018       }
 4019       __ sha512h(vi3, __ T2D, v6, v7);
 4020       if (dr < 32) {
 4021         __ sha512su1(vin0, __ T2D, vin2, v5);
 4022       }
 4023       __ addv(vi4, __ T2D, vi1, vi3);
 4024       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4025   }
 4026 
 4027   // Arguments:
 4028   //
 4029   // Inputs:
 4030   //   c_rarg0   - byte[]  source+offset
 4031   //   c_rarg1   - int[]   SHA.state
 4032   //   c_rarg2   - int     offset
 4033   //   c_rarg3   - int     limit
 4034   //
 4035   address generate_sha512_implCompress(StubGenStubId stub_id) {
 4036     bool multi_block;
 4037     switch (stub_id) {
 4038     case sha512_implCompress_id:
 4039       multi_block = false;
 4040       break;
 4041     case sha512_implCompressMB_id:
 4042       multi_block = true;
 4043       break;
 4044     default:
 4045       ShouldNotReachHere();
 4046     }
 4047 
 4048     static const uint64_t round_consts[80] = {
 4049       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4050       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4051       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4052       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4053       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4054       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4055       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4056       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4057       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4058       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4059       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4060       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4061       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4062       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4063       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4064       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4065       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4066       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4067       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4068       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4069       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4070       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4071       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4072       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4073       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4074       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4075       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4076     };
 4077 
 4078     __ align(CodeEntryAlignment);
 4079 
 4080     StubCodeMark mark(this, stub_id);
 4081     address start = __ pc();
 4082 
 4083     Register buf   = c_rarg0;
 4084     Register state = c_rarg1;
 4085     Register ofs   = c_rarg2;
 4086     Register limit = c_rarg3;
 4087 
 4088     __ stpd(v8, v9, __ pre(sp, -64));
 4089     __ stpd(v10, v11, Address(sp, 16));
 4090     __ stpd(v12, v13, Address(sp, 32));
 4091     __ stpd(v14, v15, Address(sp, 48));
 4092 
 4093     Label sha512_loop;
 4094 
 4095     // load state
 4096     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4097 
 4098     // load first 4 round constants
 4099     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4100     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4101 
 4102     __ BIND(sha512_loop);
 4103     // load 128B of data into v12..v19
 4104     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4105     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4106     __ rev64(v12, __ T16B, v12);
 4107     __ rev64(v13, __ T16B, v13);
 4108     __ rev64(v14, __ T16B, v14);
 4109     __ rev64(v15, __ T16B, v15);
 4110     __ rev64(v16, __ T16B, v16);
 4111     __ rev64(v17, __ T16B, v17);
 4112     __ rev64(v18, __ T16B, v18);
 4113     __ rev64(v19, __ T16B, v19);
 4114 
 4115     __ mov(rscratch2, rscratch1);
 4116 
 4117     __ mov(v0, __ T16B, v8);
 4118     __ mov(v1, __ T16B, v9);
 4119     __ mov(v2, __ T16B, v10);
 4120     __ mov(v3, __ T16B, v11);
 4121 
 4122     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4123     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4124     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4125     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4126     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4127     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4128     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4129     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4130     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4131     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4132     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4133     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4134     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4135     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4136     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4137     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4138     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4139     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4140     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4141     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4142     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4143     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4144     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4145     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4146     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4147     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4148     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4149     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4150     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4151     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4152     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4153     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4154     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4155     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4156     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4157     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4158     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4159     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4160     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4161     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4162 
 4163     __ addv(v8, __ T2D, v8, v0);
 4164     __ addv(v9, __ T2D, v9, v1);
 4165     __ addv(v10, __ T2D, v10, v2);
 4166     __ addv(v11, __ T2D, v11, v3);
 4167 
 4168     if (multi_block) {
 4169       __ add(ofs, ofs, 128);
 4170       __ cmp(ofs, limit);
 4171       __ br(Assembler::LE, sha512_loop);
 4172       __ mov(c_rarg0, ofs); // return ofs
 4173     }
 4174 
 4175     __ st1(v8, v9, v10, v11, __ T2D, state);
 4176 
 4177     __ ldpd(v14, v15, Address(sp, 48));
 4178     __ ldpd(v12, v13, Address(sp, 32));
 4179     __ ldpd(v10, v11, Address(sp, 16));
 4180     __ ldpd(v8, v9, __ post(sp, 64));
 4181 
 4182     __ ret(lr);
 4183 
 4184     return start;
 4185   }
 4186 
 4187   // Execute one round of keccak of two computations in parallel.
 4188   // One of the states should be loaded into the lower halves of
 4189   // the vector registers v0-v24, the other should be loaded into
 4190   // the upper halves of those registers. The ld1r instruction loads
 4191   // the round constant into both halves of register v31.
 4192   // Intermediate results c0...c5 and d0...d5 are computed
 4193   // in registers v25...v30.
 4194   // All vector instructions that are used operate on both register
 4195   // halves in parallel.
  // If only a single computation is needed, one can load only the lower halves.
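  //
  // For reference, a scalar sketch of one round over the 25 64-bit lanes
  // a0..a24 (illustration only; the code below interleaves two such states
  // and renames registers as it goes):
  //   c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]          // theta
  //   d[x] = c[(x+4)%5] ^ rol64(c[(x+1)%5], 1)
  //   each lane is xored with its column's d, rotated by its fixed rho
  //   amount into its pi destination, combined with the next two lanes
  //   of its row as a ^ (~a' & a'')                               // chi
  //   and finally the round constant is xored into lane 0         // iota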
 4197   void keccak_round(Register rscratch1) {
 4198   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4201   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4202   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4203   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4204   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4205   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4206   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4207   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4208 
 4209   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4210   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4211   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4212   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4213   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4214 
 4215   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4216   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4218   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4219   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4220   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4221   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4222   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4223   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4224   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4225   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4226   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4227   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4228   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4229   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4230   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4231   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4232   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4233   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4234   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4235   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4236   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4237   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4238   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4239   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4240 
 4241   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4242   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4243   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4244   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4245   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4246 
 4247   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4248 
 4249   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4250   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4251   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4252   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4253   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4254 
 4255   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4256   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4257   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4258   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4259   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4260 
 4261   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4262   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4263   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4264   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4265   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4266 
 4267   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4268   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4269   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4270   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4271   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4272 
 4273   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4274   }
 4275 
 4276   // Arguments:
 4277   //
 4278   // Inputs:
 4279   //   c_rarg0   - byte[]  source+offset
 4280   //   c_rarg1   - byte[]  SHA.state
 4281   //   c_rarg2   - int     block_size
 4282   //   c_rarg3   - int     offset
 4283   //   c_rarg4   - int     limit
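  //
  //   block_size is the Keccak rate in bytes: 72 (SHA3-512), 104 (SHA3-384),
  //   136 (SHA3-256/SHAKE256), 144 (SHA3-224) or 168 (SHAKE128); these
  //   values drive the bit tests in the absorb phase below.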
 4284   //
 4285   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4286     bool multi_block;
 4287     switch (stub_id) {
 4288     case sha3_implCompress_id:
 4289       multi_block = false;
 4290       break;
 4291     case sha3_implCompressMB_id:
 4292       multi_block = true;
 4293       break;
 4294     default:
 4295       ShouldNotReachHere();
 4296     }
 4297 
 4298     static const uint64_t round_consts[24] = {
 4299       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4300       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4301       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4302       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4303       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4304       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4305       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4306       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4307     };
 4308 
 4309     __ align(CodeEntryAlignment);
 4310 
 4311     StubCodeMark mark(this, stub_id);
 4312     address start = __ pc();
 4313 
 4314     Register buf           = c_rarg0;
 4315     Register state         = c_rarg1;
 4316     Register block_size    = c_rarg2;
 4317     Register ofs           = c_rarg3;
 4318     Register limit         = c_rarg4;
 4319 
 4320     Label sha3_loop, rounds24_loop;
 4321     Label sha3_512_or_sha3_384, shake128;
 4322 
 4323     __ stpd(v8, v9, __ pre(sp, -64));
 4324     __ stpd(v10, v11, Address(sp, 16));
 4325     __ stpd(v12, v13, Address(sp, 32));
 4326     __ stpd(v14, v15, Address(sp, 48));
 4327 
 4328     // load state
 4329     __ add(rscratch1, state, 32);
 4330     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4331     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4332     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4333     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4334     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4335     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4336     __ ld1(v24, __ T1D, rscratch1);
 4337 
 4338     __ BIND(sha3_loop);
 4339 
 4340     // 24 keccak rounds
 4341     __ movw(rscratch2, 24);
 4342 
 4343     // load round_constants base
 4344     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4345 
 4346     // load input
 4347     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4348     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4349     __ eor(v0, __ T8B, v0, v25);
 4350     __ eor(v1, __ T8B, v1, v26);
 4351     __ eor(v2, __ T8B, v2, v27);
 4352     __ eor(v3, __ T8B, v3, v28);
 4353     __ eor(v4, __ T8B, v4, v29);
 4354     __ eor(v5, __ T8B, v5, v30);
 4355     __ eor(v6, __ T8B, v6, v31);
 4356 
 4357     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
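    // Only the rates 72 and 104 have bit 7 clear (136, 144 and 168 all set
    // it), so testing bit 7 separates those two cases from the rest.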
 4358     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4359 
 4360     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4361     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4362     __ eor(v7, __ T8B, v7, v25);
 4363     __ eor(v8, __ T8B, v8, v26);
 4364     __ eor(v9, __ T8B, v9, v27);
 4365     __ eor(v10, __ T8B, v10, v28);
 4366     __ eor(v11, __ T8B, v11, v29);
 4367     __ eor(v12, __ T8B, v12, v30);
 4368     __ eor(v13, __ T8B, v13, v31);
 4369 
 4370     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4371     __ eor(v14, __ T8B, v14, v25);
 4372     __ eor(v15, __ T8B, v15, v26);
 4373     __ eor(v16, __ T8B, v16, v27);
 4374 
 4375     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4376     __ andw(c_rarg5, block_size, 48);
 4377     __ cbzw(c_rarg5, rounds24_loop);
 4378 
 4379     __ tbnz(block_size, 5, shake128);
 4380     // block_size == 144, bit5 == 0, SHA3-224
 4381     __ ldrd(v28, __ post(buf, 8));
 4382     __ eor(v17, __ T8B, v17, v28);
 4383     __ b(rounds24_loop);
 4384 
 4385     __ BIND(shake128);
 4386     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4387     __ eor(v17, __ T8B, v17, v28);
 4388     __ eor(v18, __ T8B, v18, v29);
 4389     __ eor(v19, __ T8B, v19, v30);
 4390     __ eor(v20, __ T8B, v20, v31);
 4391     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4392 
 4393     __ BIND(sha3_512_or_sha3_384);
 4394     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4395     __ eor(v7, __ T8B, v7, v25);
 4396     __ eor(v8, __ T8B, v8, v26);
 4397     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4398 
 4399     // SHA3-384
 4400     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4401     __ eor(v9,  __ T8B, v9,  v27);
 4402     __ eor(v10, __ T8B, v10, v28);
 4403     __ eor(v11, __ T8B, v11, v29);
 4404     __ eor(v12, __ T8B, v12, v30);
 4405 
 4406     __ BIND(rounds24_loop);
 4407     __ subw(rscratch2, rscratch2, 1);
 4408 
 4409     keccak_round(rscratch1);
 4410 
 4411     __ cbnzw(rscratch2, rounds24_loop);
 4412 
 4413     if (multi_block) {
 4414       __ add(ofs, ofs, block_size);
 4415       __ cmp(ofs, limit);
 4416       __ br(Assembler::LE, sha3_loop);
 4417       __ mov(c_rarg0, ofs); // return ofs
 4418     }
 4419 
 4420     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4421     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4422     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4423     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4424     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4425     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4426     __ st1(v24, __ T1D, state);
 4427 
 4428     // restore callee-saved registers
 4429     __ ldpd(v14, v15, Address(sp, 48));
 4430     __ ldpd(v12, v13, Address(sp, 32));
 4431     __ ldpd(v10, v11, Address(sp, 16));
 4432     __ ldpd(v8, v9, __ post(sp, 64));
 4433 
 4434     __ ret(lr);
 4435 
 4436     return start;
 4437   }
 4438 
 4439   // Inputs:
 4440   //   c_rarg0   - long[]  state0
 4441   //   c_rarg1   - long[]  state1
 4442   address generate_double_keccak() {
 4443     static const uint64_t round_consts[24] = {
 4444       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4445       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4446       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4447       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4448       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4449       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4450       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4451       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4452     };
 4453 
 4454     // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
 4456     __ align(CodeEntryAlignment);
 4457     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4458     address start = __ pc();
 4459     __ enter();
 4460 
 4461     Register state0        = c_rarg0;
 4462     Register state1        = c_rarg1;
 4463 
 4464     Label rounds24_loop;
 4465 
 4466     // save callee-saved registers
 4467     __ stpd(v8, v9, __ pre(sp, -64));
 4468     __ stpd(v10, v11, Address(sp, 16));
 4469     __ stpd(v12, v13, Address(sp, 32));
 4470     __ stpd(v14, v15, Address(sp, 48));
 4471 
 4472     // load states
 4473     __ add(rscratch1, state0, 32);
 4474     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4475     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4476     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4477     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4478     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4479     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4480     __ ld1(v24, __ D, 0, rscratch1);
 4481     __ add(rscratch1, state1, 32);
 4482     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4483     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4484     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4485     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4486     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4487     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4488     __ ld1(v24, __ D, 1, rscratch1);
 4489 
 4490     // 24 keccak rounds
 4491     __ movw(rscratch2, 24);
 4492 
 4493     // load round_constants base
 4494     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4495 
 4496     __ BIND(rounds24_loop);
 4497     __ subw(rscratch2, rscratch2, 1);
 4498     keccak_round(rscratch1);
 4499     __ cbnzw(rscratch2, rounds24_loop);
 4500 
 4501     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4502     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4503     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4504     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4505     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4506     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4507     __ st1(v24, __ D, 0, state0);
 4508     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4509     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4510     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4511     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4512     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4513     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4514     __ st1(v24, __ D, 1, state1);
 4515 
 4516     // restore callee-saved vector registers
 4517     __ ldpd(v14, v15, Address(sp, 48));
 4518     __ ldpd(v12, v13, Address(sp, 32));
 4519     __ ldpd(v10, v11, Address(sp, 16));
 4520     __ ldpd(v8, v9, __ post(sp, 64));
 4521 
 4522     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4523     __ mov(r0, zr); // return 0
 4524     __ ret(lr);
 4525 
 4526     return start;
 4527   }
 4528 
 4529   // ChaCha20 block function.  This version parallelizes the 32-bit
 4530   // state elements on each of 16 vectors, producing 4 blocks of
 4531   // keystream at a time.
 4532   //
 4533   // state (int[16]) = c_rarg0
 4534   // keystream (byte[256]) = c_rarg1
 4535   // return - number of bytes of produced keystream (always 256)
 4536   //
 4537   // This implementation takes each 32-bit integer from the state
 4538   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4539   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4540   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4541   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4542   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4543   // operations represented by one QUARTERROUND function, we instead stack all
 4544   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4545   // and then do the same for the second set of 4 quarter rounds.  This removes
 4546   // some latency that would otherwise be incurred by waiting for an add to
 4547   // complete before performing an xor (which depends on the result of the
 4548   // add), etc. An adjustment happens between the first and second groups of 4
 4549   // quarter rounds, but this is done only in the inputs to the macro functions
 4550   // that generate the assembly instructions - these adjustments themselves are
 4551   // not part of the resulting assembly.
 4552   // The 4 registers v0-v3 are used during the quarter round operations as
 4553   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4554   // registers become the vectors involved in adding the start state back onto
 4555   // the post-QR working state.  After the adds are complete, each of the 16
 4556   // vectors write their first lane back to the keystream buffer, followed
 4557   // by the second lane from all vectors and so on.
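  //
  // For reference, one scalar quarter round from RFC 7539 (the loop below
  // performs four of these per bundle of vector instructions; sketch only):
  //   a += b; d ^= a; d <<<= 16;
  //   c += d; b ^= c; b <<<= 12;
  //   a += b; d ^= a; d <<<= 8;
  //   c += d; b ^= c; b <<<= 7;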
 4558   address generate_chacha20Block_blockpar() {
 4559     Label L_twoRounds, L_cc20_const;
 4560     // The constant data is broken into two 128-bit segments to be loaded
 4561     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4562     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4563     // The second 128-bits is a table constant used for 8-bit left rotations.
 4564     __ BIND(L_cc20_const);
 4565     __ emit_int64(0x0000000100000000UL);
 4566     __ emit_int64(0x0000000300000002UL);
 4567     __ emit_int64(0x0605040702010003UL);
 4568     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4569 
 4570     __ align(CodeEntryAlignment);
 4571     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4572     StubCodeMark mark(this, stub_id);
 4573     address start = __ pc();
 4574     __ enter();
 4575 
 4576     int i, j;
 4577     const Register state = c_rarg0;
 4578     const Register keystream = c_rarg1;
 4579     const Register loopCtr = r10;
 4580     const Register tmpAddr = r11;
 4581     const FloatRegister ctrAddOverlay = v28;
 4582     const FloatRegister lrot8Tbl = v29;
 4583 
 4584     // Organize SIMD registers in an array that facilitates
 4585     // putting repetitive opcodes into loop structures.  It is
 4586     // important that each grouping of 4 registers is monotonically
 4587     // increasing to support the requirements of multi-register
 4588     // instructions (e.g. ld4r, st4, etc.)
 4589     const FloatRegister workSt[16] = {
 4590          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4591         v20, v21, v22, v23, v24, v25, v26, v27
 4592     };
 4593 
 4594     // Pull in constant data.  The first 16 bytes are the add overlay
 4595     // which is applied to the vector holding the counter (state[12]).
 4596     // The second 16 bytes is the index register for the 8-bit left
 4597     // rotation tbl instruction.
 4598     __ adr(tmpAddr, L_cc20_const);
 4599     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4600 
 4601     // Load from memory and interlace across 16 SIMD registers,
    // with each word from memory being broadcast to all lanes of
 4603     // each successive SIMD register.
 4604     //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
 4606     __ mov(tmpAddr, state);
 4607     for (i = 0; i < 16; i += 4) {
 4608       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4609           __ post(tmpAddr, 16));
 4610     }
 4611     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4612 
 4613     // Before entering the loop, create 5 4-register arrays.  These
 4614     // will hold the 4 registers that represent the a/b/c/d fields
 4615     // in the quarter round operation.  For instance the "b" field
 4616     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4617     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4618     // since it is part of a diagonal organization.  The aSet and scratch
 4619     // register sets are defined at declaration time because they do not change
 4620     // organization at any point during the 20-round processing.
 4621     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4622     FloatRegister bSet[4];
 4623     FloatRegister cSet[4];
 4624     FloatRegister dSet[4];
 4625     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4626 
 4627     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4628     __ mov(loopCtr, 10);
 4629     __ BIND(L_twoRounds);
 4630 
 4631     // Set to columnar organization and do the following 4 quarter-rounds:
 4632     // QUARTERROUND(0, 4, 8, 12)
 4633     // QUARTERROUND(1, 5, 9, 13)
 4634     // QUARTERROUND(2, 6, 10, 14)
 4635     // QUARTERROUND(3, 7, 11, 15)
 4636     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4637     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4638     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4639 
 4640     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4641     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4642     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4643 
 4644     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4645     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4646     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4647 
 4648     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4649     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4650     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4651 
 4652     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4653     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4655 
 4656     // Set to diagonal organization and do the next 4 quarter-rounds:
 4657     // QUARTERROUND(0, 5, 10, 15)
 4658     // QUARTERROUND(1, 6, 11, 12)
 4659     // QUARTERROUND(2, 7, 8, 13)
 4660     // QUARTERROUND(3, 4, 9, 14)
 4661     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4662     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4663     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4664 
 4665     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4666     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4667     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4668 
 4669     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4670     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4671     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4672 
 4673     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4674     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4675     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4676 
 4677     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4678     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4680 
 4681     // Decrement and iterate
 4682     __ sub(loopCtr, loopCtr, 1);
 4683     __ cbnz(loopCtr, L_twoRounds);
 4684 
 4685     __ mov(tmpAddr, state);
 4686 
 4687     // Add the starting state back to the post-loop keystream
 4688     // state.  We read/interlace the state array from memory into
 4689     // 4 registers similar to what we did in the beginning.  Then
 4690     // add the counter overlay onto workSt[12] at the end.
 4691     for (i = 0; i < 16; i += 4) {
 4692       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4693       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4694       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4695       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4696       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4697     }
 4698     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4699 
 4700     // Write working state into the keystream buffer.  This is accomplished
 4701     // by taking the lane "i" from each of the four vectors and writing
 4702     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4703     // repeating with the next 4 vectors until all 16 vectors have been used.
 4704     // Then move to the next lane and repeat the process until all lanes have
 4705     // been written.
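    // (each st4 below stores lane "i" of four registers as 16 consecutive
    // bytes, so one pass over all 16 vectors emits 64 bytes of keystream
    // per lane, 256 bytes in total)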
 4706     for (i = 0; i < 4; i++) {
 4707       for (j = 0; j < 16; j += 4) {
 4708         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4709             __ post(keystream, 16));
 4710       }
 4711     }
 4712 
 4713     __ mov(r0, 256);             // Return length of output keystream
 4714     __ leave();
 4715     __ ret(lr);
 4716 
 4717     return start;
 4718   }
 4719 
 4720   // Helpers to schedule parallel operation bundles across vector
 4721   // register sequences of size 2, 4 or 8.
 4722 
 4723   // Implement various primitive computations across vector sequences
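  //
  // For example, a call such as
  //   vs_addv(VSeq<4>(0), __ T8H, VSeq<4>(4), VSeq<4>(8))
  // unrolls to the four instructions
  //   addv v0, T8H, v4, v8
  //   addv v1, T8H, v5, v9
  //   addv v2, T8H, v6, v10
  //   addv v3, T8H, v7, v11
  // (illustrative register choices only; the generators below use their own
  // sequences).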
 4724 
 4725   template<int N>
 4726   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4727                const VSeq<N>& v1, const VSeq<N>& v2) {
 4728     // output must not be constant
 4729     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4730     // output cannot overwrite pending inputs
 4731     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4732     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4733     for (int i = 0; i < N; i++) {
 4734       __ addv(v[i], T, v1[i], v2[i]);
 4735     }
 4736   }
 4737 
 4738   template<int N>
 4739   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4740                const VSeq<N>& v1, const VSeq<N>& v2) {
 4741     // output must not be constant
 4742     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4743     // output cannot overwrite pending inputs
 4744     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4745     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4746     for (int i = 0; i < N; i++) {
 4747       __ subv(v[i], T, v1[i], v2[i]);
 4748     }
 4749   }
 4750 
 4751   template<int N>
 4752   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4753                const VSeq<N>& v1, const VSeq<N>& v2) {
 4754     // output must not be constant
 4755     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4756     // output cannot overwrite pending inputs
 4757     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4758     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4759     for (int i = 0; i < N; i++) {
 4760       __ mulv(v[i], T, v1[i], v2[i]);
 4761     }
 4762   }
 4763 
 4764   template<int N>
 4765   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4766     // output must not be constant
 4767     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4768     // output cannot overwrite pending inputs
 4769     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4770     for (int i = 0; i < N; i++) {
 4771       __ negr(v[i], T, v1[i]);
 4772     }
 4773   }
 4774 
 4775   template<int N>
 4776   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4777                const VSeq<N>& v1, int shift) {
 4778     // output must not be constant
 4779     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4780     // output cannot overwrite pending inputs
 4781     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4782     for (int i = 0; i < N; i++) {
 4783       __ sshr(v[i], T, v1[i], shift);
 4784     }
 4785   }
 4786 
 4787   template<int N>
 4788   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4789     // output must not be constant
 4790     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4791     // output cannot overwrite pending inputs
 4792     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4793     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4794     for (int i = 0; i < N; i++) {
 4795       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4796     }
 4797   }
 4798 
 4799   template<int N>
 4800   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4801     // output must not be constant
 4802     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4803     // output cannot overwrite pending inputs
 4804     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4805     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4806     for (int i = 0; i < N; i++) {
 4807       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4808     }
 4809   }
 4810 
 4811   template<int N>
 4812   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4813     // output must not be constant
 4814     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4815     // output cannot overwrite pending inputs
 4816     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4817     for (int i = 0; i < N; i++) {
 4818       __ notr(v[i], __ T16B, v1[i]);
 4819     }
 4820   }
 4821 
 4822   template<int N>
 4823   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4824     // output must not be constant
 4825     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4826     // output cannot overwrite pending inputs
 4827     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4828     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4829     for (int i = 0; i < N; i++) {
 4830       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4831     }
 4832   }
 4833 
 4834   template<int N>
 4835   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4836     // output must not be constant
 4837     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4838     // output cannot overwrite pending inputs
 4839     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4840     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4841     for (int i = 0; i < N; i++) {
 4842       __ mlsv(v[i], T, v1[i], v2[i]);
 4843     }
 4844   }
 4845 
 4846   // load N/2 successive pairs of quadword values from memory in order
 4847   // into N successive vector registers of the sequence via the
 4848   // address supplied in base.
 4849   template<int N>
 4850   void vs_ldpq(const VSeq<N>& v, Register base) {
 4851     for (int i = 0; i < N; i += 2) {
 4852       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4853     }
 4854   }
 4855 
 4856   // load N/2 successive pairs of quadword values from memory in order
 4857   // into N vector registers of the sequence via the address supplied
 4858   // in base using post-increment addressing
 4859   template<int N>
 4860   void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
 4862     for (int i = 0; i < N; i += 2) {
 4863       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4864     }
 4865   }
 4866 
 4867   // store N successive vector registers of the sequence into N/2
 4868   // successive pairs of quadword memory locations via the address
 4869   // supplied in base using post-increment addressing
 4870   template<int N>
 4871   void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
 4873     for (int i = 0; i < N; i += 2) {
 4874       __ stpq(v[i], v[i+1], __ post(base, 32));
 4875     }
 4876   }
 4877 
 4878   // load N/2 pairs of quadword values from memory de-interleaved into
 4879   // N vector registers 2 at a time via the address supplied in base
 4880   // using post-increment addressing.
 4881   template<int N>
 4882   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
 4884     for (int i = 0; i < N; i += 2) {
 4885       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4886     }
 4887   }
 4888 
 4889   // store N vector registers interleaved into N/2 pairs of quadword
 4890   // memory locations via the address supplied in base using
 4891   // post-increment addressing.
 4892   template<int N>
 4893   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
 4895     for (int i = 0; i < N; i += 2) {
 4896       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4897     }
 4898   }
 4899 
 4900   // load N quadword values from memory de-interleaved into N vector
 4901   // registers 3 elements at a time via the address supplied in base.
 4902   template<int N>
 4903   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4904     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4905     for (int i = 0; i < N; i += 3) {
 4906       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4907     }
 4908   }
 4909 
 4910   // load N quadword values from memory de-interleaved into N vector
 4911   // registers 3 elements at a time via the address supplied in base
 4912   // using post-increment addressing.
 4913   template<int N>
 4914   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4915     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4916     for (int i = 0; i < N; i += 3) {
 4917       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4918     }
 4919   }
 4920 
 4921   // load N/2 pairs of quadword values from memory into N vector
 4922   // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4924   // offsets array
 4925   template<int N>
 4926   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4927     for (int i = 0; i < N/2; i++) {
 4928       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4929     }
 4930   }
 4931 
 4932   // store N vector registers into N/2 pairs of quadword memory
 4933   // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
 4935   // offsets array
 4936   template<int N>
 4937   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4938     for (int i = 0; i < N/2; i++) {
 4939       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4940     }
 4941   }
 4942 
 4943   // load N single quadword values from memory into N vector registers
 4944   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4946   // array
 4947   template<int N>
 4948   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4949                       int start, int (&offsets)[N]) {
 4950     for (int i = 0; i < N; i++) {
 4951       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4952     }
 4953   }
 4954 
 4955   // store N vector registers into N single quadword memory locations
 4956   // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
 4958   // array
 4959   template<int N>
 4960   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4961                       int start, int (&offsets)[N]) {
 4962     for (int i = 0; i < N; i++) {
 4963       __ str(v[i], T, Address(base, start + offsets[i]));
 4964     }
 4965   }
 4966 
 4967   // load N/2 pairs of quadword values from memory de-interleaved into
 4968   // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
 4970   // corresponding entry in the offsets array
 4971   template<int N>
 4972   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4973                       Register tmp, int start, int (&offsets)[N/2]) {
 4974     for (int i = 0; i < N/2; i++) {
 4975       __ add(tmp, base, start + offsets[i]);
 4976       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4977     }
 4978   }
 4979 
 4980   // store N vector registers 2 at a time interleaved into N/2 pairs
 4981   // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
 4983   // corresponding entry in the offsets array
 4984   template<int N>
 4985   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4986                       Register tmp, int start, int (&offsets)[N/2]) {
 4987     for (int i = 0; i < N/2; i++) {
 4988       __ add(tmp, base, start + offsets[i]);
 4989       __ st2(v[2*i], v[2*i+1], T, tmp);
 4990     }
 4991   }
 4992 
 4993   // Helper routines for various flavours of Montgomery multiply
 4994 
 4995   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 4996   // multiplications in parallel
 4997   //
 4998 
 4999   // See the montMul() method of the sun.security.provider.ML_DSA
 5000   // class.
 5001   //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 5004   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5005   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5006   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5007   // Outputs: va - 4x4S or 4x8H vector register sequences
 5008   // vb, vc, vtmp and vq must all be disjoint
 5009   // va must be disjoint from all other inputs/temps or must equal vc
 5010   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5011   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
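  //
  // For reference, the per-lane computation that the instruction streams
  // below schedule (a sketch using the names from the inline comments;
  // hi()/lo() take the upper/lower MONT_R_BITS bits of a product):
  //   aHigh = hi(2 * b * c)       // sqdmulh
  //   aLow  = lo(b * c)           // mulv
  //   m     = lo(aLow * q_inv)    // mulv by vq[0]
  //   n     = hi(2 * m * q)       // sqdmulh by vq[1]
  //   a     = (aHigh - n) / 2     // shsubv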
 5012   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5013                    Assembler::SIMD_Arrangement T,
 5014                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5015     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5016     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5017     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5018     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5019 
 5020     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5021     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5022 
 5023     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5024 
 5025     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5026     assert(vs_disjoint(va, vb), "va and vb overlap");
 5027     assert(vs_disjoint(va, vq), "va and vq overlap");
 5028     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5029     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5030 
 5031     // schedule 4 streams of instructions across the vector sequences
 5032     for (int i = 0; i < 4; i++) {
 5033       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5034       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5035     }
 5036 
 5037     for (int i = 0; i < 4; i++) {
 5038       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5039     }
 5040 
 5041     for (int i = 0; i < 4; i++) {
 5042       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5043     }
 5044 
 5045     for (int i = 0; i < 4; i++) {
 5046       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5047     }
 5048   }
 5049 
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5051   // multiplications in parallel
 5052   //
 5053 
 5054   // See the montMul() method of the sun.security.provider.ML_DSA
 5055   // class.
 5056   //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
 5063   // vb, vc, vtmp and vq must all be disjoint
 5064   // va must be disjoint from all other inputs/temps or must equal vc
 5065   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5066   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5067   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5068                    Assembler::SIMD_Arrangement T,
 5069                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5070     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5071     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5072     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5073     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5074 
 5075     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5076     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5077 
 5078     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5079 
 5080     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5081     assert(vs_disjoint(va, vb), "va and vb overlap");
 5082     assert(vs_disjoint(va, vq), "va and vq overlap");
 5083     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5084     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5085 
 5086     // schedule 2 streams of instructions across the vector sequences
 5087     for (int i = 0; i < 2; i++) {
 5088       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5089       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5090     }
 5091 
 5092     for (int i = 0; i < 2; i++) {
 5093       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5094     }
 5095 
 5096     for (int i = 0; i < 2; i++) {
 5097       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5098     }
 5099 
 5100     for (int i = 0; i < 2; i++) {
 5101       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5102     }
 5103   }
 5104 
 5105   // Perform 16 16-bit Montgomery multiplications in parallel.
 5106   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5107                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5108     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5109     // It will assert that the register use is valid
 5110     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5111   }
 5112 
 5113   // Perform 32 16-bit Montgomery multiplications in parallel.
 5114   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5115                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5116     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5117     // It will assert that the register use is valid
 5118     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5119   }
 5120 
 5121   // Perform 64 16-bit Montgomery multiplications in parallel.
 5122   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5123                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5124     // Schedule two successive 4x8H multiplies via the montmul helper
 5125     // on the front and back halves of va, vb and vc. The helper will
 5126     // assert that the register use has no overlap conflicts on each
 5127     // individual call but we also need to ensure that the necessary
 5128     // disjoint/equality constraints are met across both calls.
 5129 
 5130     // vb, vc, vtmp and vq must be disjoint. va must either be
 5131     // disjoint from all other registers or equal vc
 5132 
 5133     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5134     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5135     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5136 
 5137     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5138     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5139 
 5140     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5141 
 5142     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5143     assert(vs_disjoint(va, vb), "va and vb overlap");
 5144     assert(vs_disjoint(va, vq), "va and vq overlap");
 5145     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5146 
 5147     // we multiply the front and back halves of each sequence 4 at a
 5148     // time because
 5149     //
 5150     // 1) we are currently only able to get 4-way instruction
 5151     // parallelism at best
 5152     //
 5153     // 2) we need registers for the constants in vq and temporary
 5154     // scratch registers to hold intermediate results so vtmp can only
 5155     // be a VSeq<4> which means we only have 4 scratch slots
 5156 
 5157     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5158     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5159   }
 5160 
 5161   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5162                                const VSeq<4>& vc,
 5163                                const VSeq<4>& vtmp,
 5164                                const VSeq<2>& vq) {
 5165     // compute a = montmul(a1, c)
 5166     kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 5168     vs_subv(va1, __ T8H, va0, vc);
 5169     //    and a0 = a0 + a
 5170     vs_addv(va0, __ T8H, va0, vc);
 5171   }
 5172 
 5173   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5174                                const VSeq<4>& vb,
 5175                                const VSeq<4>& vtmp1,
 5176                                const VSeq<4>& vtmp2,
 5177                                const VSeq<2>& vq) {
 5178     // compute c = a0 - a1
 5179     vs_subv(vtmp1, __ T8H, va0, va1);
 5180     // output a0 = a0 + a1
 5181     vs_addv(va0, __ T8H, va0, va1);
 5182     // output a1 = b montmul c
 5183     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5184   }
 5185 
 5186   void load64shorts(const VSeq<8>& v, Register shorts) {
 5187     vs_ldpq_post(v, shorts);
 5188   }
 5189 
 5190   void load32shorts(const VSeq<4>& v, Register shorts) {
 5191     vs_ldpq_post(v, shorts);
 5192   }
 5193 
 5194   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5195     vs_stpq_post(v, tmpAddr);
 5196   }
 5197 
 5198   // Kyber NTT function.
 5199   // Implements
 5200   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5201   //
 5202   // coeffs (short[256]) = c_rarg0
 5203   // ntt_zetas (short[256]) = c_rarg1
 5204   address generate_kyberNtt() {
 5205 
 5206     __ align(CodeEntryAlignment);
 5207     StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
 5208     StubCodeMark mark(this, stub_id);
 5209     address start = __ pc();
 5210     __ enter();
 5211 
 5212     const Register coeffs = c_rarg0;
 5213     const Register zetas = c_rarg1;
 5214 
 5215     const Register kyberConsts = r10;
 5216     const Register tmpAddr = r11;
 5217 
 5218     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5219     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5220     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5221 
 5222     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5223     // load the montmul constants
 5224     vs_ldpq(vq, kyberConsts);
 5225 
 5226     // Each level corresponds to an iteration of the outermost loop of the
 5227     // Java method seilerNTT(int[] coeffs). There are some differences
 5228     // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (because that way there should not be any overflow
    // during the inverse NTT computation), whereas here we use R = 2^16 so
    // that we can use 16-bit arithmetic in the vector unit.
 5238     //
 5239     // On each level, we fill up the vector registers in such a way that the
 5240     // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need
    // to be multiplied go into another set.
 5243     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5244     // registers interleaving the steps of 4 identical computations,
 5245     // each done on 8 16-bit values per register.
 5246 
    // At levels 0-3 the coefficients that get multiplied by the zetas, and
    // the corresponding ones those products are added to or subtracted from,
    // occur in discrete blocks whose size is some multiple of 32.
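    //
    // Each level applies the standard Cooley-Tukey butterfly to pairs of
    // coefficients a fixed distance apart (scalar sketch only; the code
    // below performs these in batches of 64 using the vector helpers above):
    //   t                = montmul(zeta, coeffs[j + dist]);
    //   coeffs[j + dist] = coeffs[j] - t;
    //   coeffs[j]        = coeffs[j] + t;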
 5250 
 5251     // level 0
 5252     __ add(tmpAddr, coeffs, 256);
 5253     load64shorts(vs1, tmpAddr);
 5254     load64shorts(vs2, zetas);
 5255     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5256     __ add(tmpAddr, coeffs, 0);
 5257     load64shorts(vs1, tmpAddr);
 5258     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5259     vs_addv(vs1, __ T8H, vs1, vs2);
 5260     __ add(tmpAddr, coeffs, 0);
 5261     vs_stpq_post(vs1, tmpAddr);
 5262     __ add(tmpAddr, coeffs, 256);
 5263     vs_stpq_post(vs3, tmpAddr);
 5264     // restore montmul constants
 5265     vs_ldpq(vq, kyberConsts);
 5266     load64shorts(vs1, tmpAddr);
 5267     load64shorts(vs2, zetas);
 5268     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5269     __ add(tmpAddr, coeffs, 128);
 5270     load64shorts(vs1, tmpAddr);
 5271     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5272     vs_addv(vs1, __ T8H, vs1, vs2);
 5273     __ add(tmpAddr, coeffs, 128);
 5274     store64shorts(vs1, tmpAddr);
 5275     __ add(tmpAddr, coeffs, 384);
 5276     store64shorts(vs3, tmpAddr);
 5277 
 5278     // level 1
 5279     // restore montmul constants
 5280     vs_ldpq(vq, kyberConsts);
 5281     __ add(tmpAddr, coeffs, 128);
 5282     load64shorts(vs1, tmpAddr);
 5283     load64shorts(vs2, zetas);
 5284     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5285     __ add(tmpAddr, coeffs, 0);
 5286     load64shorts(vs1, tmpAddr);
 5287     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5288     vs_addv(vs1, __ T8H, vs1, vs2);
 5289     __ add(tmpAddr, coeffs, 0);
 5290     store64shorts(vs1, tmpAddr);
 5291     store64shorts(vs3, tmpAddr);
 5292     vs_ldpq(vq, kyberConsts);
 5293     __ add(tmpAddr, coeffs, 384);
 5294     load64shorts(vs1, tmpAddr);
 5295     load64shorts(vs2, zetas);
 5296     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5297     __ add(tmpAddr, coeffs, 256);
 5298     load64shorts(vs1, tmpAddr);
 5299     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5300     vs_addv(vs1, __ T8H, vs1, vs2);
 5301     __ add(tmpAddr, coeffs, 256);
 5302     store64shorts(vs1, tmpAddr);
 5303     store64shorts(vs3, tmpAddr);
 5304 
 5305     // level 2
 5306     vs_ldpq(vq, kyberConsts);
 5307     int offsets1[4] = { 0, 32, 128, 160 };
 5308     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5309     load64shorts(vs2, zetas);
 5310     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5311     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5312     // kyber_subv_addv64();
 5313     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5314     vs_addv(vs1, __ T8H, vs1, vs2);
 5315     __ add(tmpAddr, coeffs, 0);
 5316     vs_stpq_post(vs_front(vs1), tmpAddr);
 5317     vs_stpq_post(vs_front(vs3), tmpAddr);
 5318     vs_stpq_post(vs_back(vs1), tmpAddr);
 5319     vs_stpq_post(vs_back(vs3), tmpAddr);
 5320     vs_ldpq(vq, kyberConsts);
 5321     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5322     load64shorts(vs2, zetas);
 5323     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5324     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5325     // kyber_subv_addv64();
 5326     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5327     vs_addv(vs1, __ T8H, vs1, vs2);
 5328     __ add(tmpAddr, coeffs, 256);
 5329     vs_stpq_post(vs_front(vs1), tmpAddr);
 5330     vs_stpq_post(vs_front(vs3), tmpAddr);
 5331     vs_stpq_post(vs_back(vs1), tmpAddr);
 5332     vs_stpq_post(vs_back(vs3), tmpAddr);
 5333 
 5334     // level 3
 5335     vs_ldpq(vq, kyberConsts);
 5336     int offsets2[4] = { 0, 64, 128, 192 };
 5337     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5338     load64shorts(vs2, zetas);
 5339     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5340     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5341     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5342     vs_addv(vs1, __ T8H, vs1, vs2);
 5343     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5344     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5345 
 5346     vs_ldpq(vq, kyberConsts);
 5347     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5348     load64shorts(vs2, zetas);
 5349     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5350     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5351     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5352     vs_addv(vs1, __ T8H, vs1, vs2);
 5353     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5354     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5355 
 5356     // level 4
 5357     // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5359 
 5360     vs_ldpq(vq, kyberConsts);
 5361     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5362     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5363     load64shorts(vs2, zetas);
 5364     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5365     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5366     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5367     vs_addv(vs1, __ T8H, vs1, vs2);
 5368     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5369     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5370 
 5371     vs_ldpq(vq, kyberConsts);
 5372     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5373     load64shorts(vs2, zetas);
 5374     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5375     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5376     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5377     vs_addv(vs1, __ T8H, vs1, vs2);
 5378     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5379     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5380 
 5381     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5384 
 5385     vs_ldpq(vq, kyberConsts);
 5386     int offsets4[4] = { 0, 32, 64, 96 };
 5387     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5388     load32shorts(vs_front(vs2), zetas);
 5389     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5390     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5391     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5392     load32shorts(vs_front(vs2), zetas);
 5393     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5394     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5395     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5396     load32shorts(vs_front(vs2), zetas);
 5397     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5398     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5399 
 5400     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5401     load32shorts(vs_front(vs2), zetas);
 5402     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5403     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5404 
 5405     // level 6
 5406     // At level 6 related coefficients occur in discrete blocks of size 4 so
  5407     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5408 
 5409     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5410     load32shorts(vs_front(vs2), zetas);
 5411     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5412     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5413     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5415     load32shorts(vs_front(vs2), zetas);
 5416     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5417     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5418 
 5419     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5420     load32shorts(vs_front(vs2), zetas);
 5421     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5422     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5423 
 5424     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5425     load32shorts(vs_front(vs2), zetas);
 5426     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5427     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5428 
 5429     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5430     __ mov(r0, zr); // return 0
 5431     __ ret(lr);
 5432 
 5433     return start;
 5434   }
 5435 
 5436   // Kyber Inverse NTT function
 5437   // Implements
 5438   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5439   //
 5440   // coeffs (short[256]) = c_rarg0
 5441   // ntt_zetas (short[256]) = c_rarg1
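         //
         // Sketch of the per-pair arithmetic applied at each inverse NTT
         // level below (a Gentleman-Sande style butterfly):
         //   t = a + b; b = montmul(a - b, zeta); a = t
         // using the same Kyber Montgomery multiplication as the forward
         // transform. The vector code batches these butterflies and inserts
         // Barrett reductions at the points where intermediate values might
         // otherwise overflow 16 bits.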
 5442   address generate_kyberInverseNtt() {
 5443 
 5444     __ align(CodeEntryAlignment);
 5445     StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id;
 5446     StubCodeMark mark(this, stub_id);
 5447     address start = __ pc();
 5448     __ enter();
 5449 
 5450     const Register coeffs = c_rarg0;
 5451     const Register zetas = c_rarg1;
 5452 
 5453     const Register kyberConsts = r10;
 5454     const Register tmpAddr = r11;
 5455     const Register tmpAddr2 = c_rarg2;
 5456 
 5457     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5458     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5459     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5460 
 5461     __ lea(kyberConsts,
 5462              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5463 
 5464     // level 0
 5465     // At level 0 related coefficients occur in discrete blocks of size 4 so
  5466     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5467 
 5468     vs_ldpq(vq, kyberConsts);
 5469     int offsets4[4] = { 0, 32, 64, 96 };
 5470     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5471     load32shorts(vs_front(vs2), zetas);
 5472     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5473                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5474     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5475     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5476     load32shorts(vs_front(vs2), zetas);
 5477     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5478                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5479     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5480     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5481     load32shorts(vs_front(vs2), zetas);
 5482     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5483                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5484     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5485     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5486     load32shorts(vs_front(vs2), zetas);
 5487     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5488                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5489     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5490 
 5491     // level 1
 5492     // At level 1 related coefficients occur in discrete blocks of size 8 so
  5493     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5494 
 5495     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5496     load32shorts(vs_front(vs2), zetas);
 5497     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5498                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5499     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5500     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5501     load32shorts(vs_front(vs2), zetas);
 5502     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5503                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5504     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5505 
 5506     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5507     load32shorts(vs_front(vs2), zetas);
 5508     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5509                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5510     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5511     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5512     load32shorts(vs_front(vs2), zetas);
 5513     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5514                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5515     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5516 
 5517     // level 2
 5518     // At level 2 coefficients occur in 8 discrete blocks of size 16
  5519     // so they are loaded using an ldr at 8 distinct offsets.
 5520 
 5521     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5522     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5523     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5524     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5525     vs_subv(vs1, __ T8H, vs1, vs2);
 5526     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5527     load64shorts(vs2, zetas);
 5528     vs_ldpq(vq, kyberConsts);
 5529     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5530     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5531 
 5532     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5533     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5534     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5535     vs_subv(vs1, __ T8H, vs1, vs2);
 5536     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5537     load64shorts(vs2, zetas);
 5538     vs_ldpq(vq, kyberConsts);
 5539     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5540     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5541 
 5542     // Barrett reduction at indexes where overflow may happen
 5543 
 5544     // load q and the multiplier for the Barrett reduction
 5545     __ add(tmpAddr, kyberConsts, 16);
 5546     vs_ldpq(vq, tmpAddr);
 5547 
 5548     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5549     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5550     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
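           // Per 16-bit lane the next three steps compute the Barrett
           // reduction x -= ((x * barrettMultiplier) >> 26) * q using the
           // constant pair loaded above, with the >> 26 realised as a
           // doubling multiply-high (sqdmulh, i.e. >> 15) followed by a
           // further sshr by 11 (a sketch that ignores sqdmulh saturation).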
 5551     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5552     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5553     vs_sshr(vs2, __ T8H, vs2, 11);
 5554     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5555     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5556     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5557     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5558     vs_sshr(vs2, __ T8H, vs2, 11);
 5559     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5560     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5561 
 5562     // level 3
 5563     // From level 3 upwards coefficients occur in discrete blocks whose size is
  5564     // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 5565 
 5566     int offsets2[4] = { 0, 64, 128, 192 };
 5567     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5568     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5569     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5570     vs_subv(vs1, __ T8H, vs1, vs2);
 5571     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5572     load64shorts(vs2, zetas);
 5573     vs_ldpq(vq, kyberConsts);
 5574     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5575     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5576 
 5577     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5578     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5579     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5580     vs_subv(vs1, __ T8H, vs1, vs2);
 5581     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5582     load64shorts(vs2, zetas);
 5583     vs_ldpq(vq, kyberConsts);
 5584     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5585     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5586 
 5587     // level 4
 5588 
 5589     int offsets1[4] = { 0, 32, 128, 160 };
 5590     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5591     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5592     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5593     vs_subv(vs1, __ T8H, vs1, vs2);
 5594     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5595     load64shorts(vs2, zetas);
 5596     vs_ldpq(vq, kyberConsts);
 5597     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5598     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5599 
 5600     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5601     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5602     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5603     vs_subv(vs1, __ T8H, vs1, vs2);
 5604     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5605     load64shorts(vs2, zetas);
 5606     vs_ldpq(vq, kyberConsts);
 5607     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5608     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5609 
 5610     // level 5
 5611 
 5612     __ add(tmpAddr, coeffs, 0);
 5613     load64shorts(vs1, tmpAddr);
 5614     __ add(tmpAddr, coeffs, 128);
 5615     load64shorts(vs2, tmpAddr);
 5616     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5617     vs_subv(vs1, __ T8H, vs1, vs2);
 5618     __ add(tmpAddr, coeffs, 0);
 5619     store64shorts(vs3, tmpAddr);
 5620     load64shorts(vs2, zetas);
 5621     vs_ldpq(vq, kyberConsts);
 5622     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5623     __ add(tmpAddr, coeffs, 128);
 5624     store64shorts(vs2, tmpAddr);
 5625 
 5626     load64shorts(vs1, tmpAddr);
 5627     __ add(tmpAddr, coeffs, 384);
 5628     load64shorts(vs2, tmpAddr);
 5629     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5630     vs_subv(vs1, __ T8H, vs1, vs2);
 5631     __ add(tmpAddr, coeffs, 256);
 5632     store64shorts(vs3, tmpAddr);
 5633     load64shorts(vs2, zetas);
 5634     vs_ldpq(vq, kyberConsts);
 5635     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5636     __ add(tmpAddr, coeffs, 384);
 5637     store64shorts(vs2, tmpAddr);
 5638 
 5639     // Barrett reduction at indexes where overflow may happen
 5640 
 5641     // load q and the multiplier for the Barrett reduction
 5642     __ add(tmpAddr, kyberConsts, 16);
 5643     vs_ldpq(vq, tmpAddr);
 5644 
 5645     int offsets0[2] = { 0, 256 };
 5646     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5647     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5648     vs_sshr(vs2, __ T8H, vs2, 11);
 5649     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5650     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5651 
 5652     // level 6
 5653 
 5654     __ add(tmpAddr, coeffs, 0);
 5655     load64shorts(vs1, tmpAddr);
 5656     __ add(tmpAddr, coeffs, 256);
 5657     load64shorts(vs2, tmpAddr);
 5658     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5659     vs_subv(vs1, __ T8H, vs1, vs2);
 5660     __ add(tmpAddr, coeffs, 0);
 5661     store64shorts(vs3, tmpAddr);
 5662     load64shorts(vs2, zetas);
 5663     vs_ldpq(vq, kyberConsts);
 5664     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5665     __ add(tmpAddr, coeffs, 256);
 5666     store64shorts(vs2, tmpAddr);
 5667 
 5668     __ add(tmpAddr, coeffs, 128);
 5669     load64shorts(vs1, tmpAddr);
 5670     __ add(tmpAddr, coeffs, 384);
 5671     load64shorts(vs2, tmpAddr);
 5672     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5673     vs_subv(vs1, __ T8H, vs1, vs2);
 5674     __ add(tmpAddr, coeffs, 128);
 5675     store64shorts(vs3, tmpAddr);
 5676     load64shorts(vs2, zetas);
 5677     vs_ldpq(vq, kyberConsts);
 5678     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5679     __ add(tmpAddr, coeffs, 384);
 5680     store64shorts(vs2, tmpAddr);
 5681 
 5682     // multiply by 2^-n
 5683 
 5684     // load toMont(2^-n mod q)
 5685     __ add(tmpAddr, kyberConsts, 48);
 5686     __ ldr(v29, __ Q, tmpAddr);
 5687 
 5688     vs_ldpq(vq, kyberConsts);
 5689     __ add(tmpAddr, coeffs, 0);
 5690     load64shorts(vs1, tmpAddr);
 5691     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5692     __ add(tmpAddr, coeffs, 0);
 5693     store64shorts(vs2, tmpAddr);
 5694 
  5695     // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5696     load64shorts(vs1, tmpAddr);
 5697     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5698     __ add(tmpAddr, coeffs, 128);
 5699     store64shorts(vs2, tmpAddr);
 5700 
 5701     // now tmpAddr contains coeffs + 256
 5702     load64shorts(vs1, tmpAddr);
 5703     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5704     __ add(tmpAddr, coeffs, 256);
 5705     store64shorts(vs2, tmpAddr);
 5706 
 5707     // now tmpAddr contains coeffs + 384
 5708     load64shorts(vs1, tmpAddr);
 5709     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5710     __ add(tmpAddr, coeffs, 384);
 5711     store64shorts(vs2, tmpAddr);
 5712 
 5713     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5714     __ mov(r0, zr); // return 0
 5715     __ ret(lr);
 5716 
 5717     return start;
 5718   }
 5719 
 5720   // Kyber multiply polynomials in the NTT domain.
 5721   // Implements
 5722   // static int implKyberNttMult(
 5723   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5724   //
 5725   // result (short[256]) = c_rarg0
 5726   // ntta (short[256]) = c_rarg1
 5727   // nttb (short[256]) = c_rarg2
 5728   // zetas (short[128]) = c_rarg3
 5729   address generate_kyberNttMult() {
 5730 
 5731     __ align(CodeEntryAlignment);
 5732     StubGenStubId stub_id = StubGenStubId::kyberNttMult_id;
 5733     StubCodeMark mark(this, stub_id);
 5734     address start = __ pc();
 5735     __ enter();
 5736 
 5737     const Register result = c_rarg0;
 5738     const Register ntta = c_rarg1;
 5739     const Register nttb = c_rarg2;
 5740     const Register zetas = c_rarg3;
 5741 
 5742     const Register kyberConsts = r10;
 5743     const Register limit = r11;
 5744 
  5745     VSeq<4> vs1(0), vs2(4);  // 4 sets of 4x8H inputs/outputs/tmps
 5746     VSeq<4> vs3(16), vs4(20);
 5747     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5748     VSeq<2> vz(28);          // pair of zetas
 5749     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5750 
 5751     __ lea(kyberConsts,
 5752              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5753 
 5754     Label kyberNttMult_loop;
 5755 
 5756     __ add(limit, result, 512);
 5757 
 5758     // load q and qinv
 5759     vs_ldpq(vq, kyberConsts);
 5760 
 5761     // load R^2 mod q (to convert back from Montgomery representation)
 5762     __ add(kyberConsts, kyberConsts, 64);
 5763     __ ldr(v27, __ Q, kyberConsts);
 5764 
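           // Each pass through the loop below multiplies pairs of degree-1
           // polynomials modulo (X^2 - zeta), i.e. for inputs a0 + a1*X and
           // b0 + b1*X it computes, roughly,
           //   c0 = a0*b0 + a1*b1*zeta
           //   c1 = a0*b1 + a1*b0
           // with each product taken through a Montgomery multiplication and
           // a final montmul by montRSquareModQ (loaded above) compensating
           // for the accumulated Montgomery factors. This is a sketch of the
           // arithmetic; the register scheduling is documented step by step
           // below.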
 5765     __ BIND(kyberNttMult_loop);
 5766 
 5767     // load 16 zetas
 5768     vs_ldpq_post(vz, zetas);
 5769 
 5770     // load 2 sets of 32 coefficients from the two input arrays
 5771     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5772     // are striped across pairs of vector registers
 5773     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5774     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5775     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5776     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5777 
 5778     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5779     // i.e. montmul the first and second halves of vs1 in order and
 5780     // then with one sequence reversed storing the two results in vs3
 5781     //
 5782     // vs3[0] <- montmul(a0, b0)
 5783     // vs3[1] <- montmul(a1, b1)
 5784     // vs3[2] <- montmul(a0, b1)
 5785     // vs3[3] <- montmul(a1, b0)
 5786     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5787     kyber_montmul16(vs_back(vs3),
 5788                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5789 
 5790     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5791     // i.e. montmul the first and second halves of vs4 in order and
 5792     // then with one sequence reversed storing the two results in vs1
 5793     //
 5794     // vs1[0] <- montmul(a2, b2)
 5795     // vs1[1] <- montmul(a3, b3)
 5796     // vs1[2] <- montmul(a2, b3)
 5797     // vs1[3] <- montmul(a3, b2)
 5798     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5799     kyber_montmul16(vs_back(vs1),
 5800                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5801 
 5802     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5803     // We can schedule two montmuls at a time if we use a suitable vector
 5804     // sequence <vs3[1], vs1[1]>.
 5805     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5806     VSeq<2> vs5(vs3[1], delta);
 5807 
 5808     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5809     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5810     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5811 
 5812     // add results in pairs storing in vs3
 5813     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5814     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5815     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5816 
 5817     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5818     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5819     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5820 
 5821     // vs1 <- montmul(vs3, montRSquareModQ)
 5822     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5823 
 5824     // store back the two pairs of result vectors de-interleaved as 8H elements
  5825     // i.e. each pair of shorts striped across a register pair is stored
  5826     // adjacent in memory
 5827     vs_st2_post(vs1, __ T8H, result);
 5828 
 5829     __ cmp(result, limit);
 5830     __ br(Assembler::NE, kyberNttMult_loop);
 5831 
 5832     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5833     __ mov(r0, zr); // return 0
 5834     __ ret(lr);
 5835 
 5836     return start;
 5837   }
 5838 
 5839   // Kyber add 2 polynomials.
 5840   // Implements
 5841   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5842   //
 5843   // result (short[256]) = c_rarg0
 5844   // a (short[256]) = c_rarg1
 5845   // b (short[256]) = c_rarg2
 5846   address generate_kyberAddPoly_2() {
 5847 
 5848     __ align(CodeEntryAlignment);
 5849     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id;
 5850     StubCodeMark mark(this, stub_id);
 5851     address start = __ pc();
 5852     __ enter();
 5853 
 5854     const Register result = c_rarg0;
 5855     const Register a = c_rarg1;
 5856     const Register b = c_rarg2;
 5857 
 5858     const Register kyberConsts = r11;
 5859 
 5860     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5861     // So, we can load, add and store the data in 3 groups of 11,
 5862     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5863     // registers. A further constraint is that the mapping needs
 5864     // to skip callee saves. So, we allocate the register
 5865     // sequences using two 8 sequences, two 2 sequences and two
 5866     // single registers.
 5867     VSeq<8> vs1_1(0);
 5868     VSeq<2> vs1_2(16);
 5869     FloatRegister vs1_3 = v28;
 5870     VSeq<8> vs2_1(18);
 5871     VSeq<2> vs2_2(26);
 5872     FloatRegister vs2_3 = v29;
 5873 
 5874     // two constant vector sequences
 5875     VSeq<8> vc_1(31, 0);
 5876     VSeq<2> vc_2(31, 0);
 5877 
 5878     FloatRegister vc_3 = v31;
 5879     __ lea(kyberConsts,
 5880              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5881 
 5882     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
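           // Per 16-bit lane the loop below computes result = a + b + q,
           // with q replicated across the lanes of the constant just loaded
           // (adding q here presumably keeps the stored sums nonnegative for
           // reductions performed elsewhere).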
 5883     for (int i = 0; i < 3; i++) {
 5884       // load 80 or 88 values from a into vs1_1/2/3
 5885       vs_ldpq_post(vs1_1, a);
 5886       vs_ldpq_post(vs1_2, a);
 5887       if (i < 2) {
 5888         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5889       }
 5890       // load 80 or 88 values from b into vs2_1/2/3
 5891       vs_ldpq_post(vs2_1, b);
 5892       vs_ldpq_post(vs2_2, b);
 5893       if (i < 2) {
 5894         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5895       }
 5896       // sum 80 or 88 values across vs1 and vs2 into vs1
 5897       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5898       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5899       if (i < 2) {
 5900         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5901       }
 5902       // add constant to all 80 or 88 results
 5903       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5904       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5905       if (i < 2) {
 5906         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5907       }
 5908       // store 80 or 88 values
 5909       vs_stpq_post(vs1_1, result);
 5910       vs_stpq_post(vs1_2, result);
 5911       if (i < 2) {
 5912         __ str(vs1_3, __ Q, __ post(result, 16));
 5913       }
 5914     }
 5915 
 5916     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5917     __ mov(r0, zr); // return 0
 5918     __ ret(lr);
 5919 
 5920     return start;
 5921   }
 5922 
 5923   // Kyber add 3 polynomials.
 5924   // Implements
 5925   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5926   //
 5927   // result (short[256]) = c_rarg0
 5928   // a (short[256]) = c_rarg1
 5929   // b (short[256]) = c_rarg2
 5930   // c (short[256]) = c_rarg3
 5931   address generate_kyberAddPoly_3() {
 5932 
 5933     __ align(CodeEntryAlignment);
 5934     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id;
 5935     StubCodeMark mark(this, stub_id);
 5936     address start = __ pc();
 5937     __ enter();
 5938 
 5939     const Register result = c_rarg0;
 5940     const Register a = c_rarg1;
 5941     const Register b = c_rarg2;
 5942     const Register c = c_rarg3;
 5943 
 5944     const Register kyberConsts = r11;
 5945 
 5946     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5947     // quadwords.  So, we can load, add and store the data in 3
 5948     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5949     // of 10 or 11 registers. A further constraint is that the
 5950     // mapping needs to skip callee saves. So, we allocate the
 5951     // register sequences using two 8 sequences, two 2 sequences
 5952     // and two single registers.
 5953     VSeq<8> vs1_1(0);
 5954     VSeq<2> vs1_2(16);
 5955     FloatRegister vs1_3 = v28;
 5956     VSeq<8> vs2_1(18);
 5957     VSeq<2> vs2_2(26);
 5958     FloatRegister vs2_3 = v29;
 5959 
 5960     // two constant vector sequences
 5961     VSeq<8> vc_1(31, 0);
 5962     VSeq<2> vc_2(31, 0);
 5963 
 5964     FloatRegister vc_3 = v31;
 5965 
 5966     __ lea(kyberConsts,
 5967              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5968 
 5969     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5970     for (int i = 0; i < 3; i++) {
 5971       // load 80 or 88 values from a into vs1_1/2/3
 5972       vs_ldpq_post(vs1_1, a);
 5973       vs_ldpq_post(vs1_2, a);
 5974       if (i < 2) {
 5975         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5976       }
 5977       // load 80 or 88 values from b into vs2_1/2/3
 5978       vs_ldpq_post(vs2_1, b);
 5979       vs_ldpq_post(vs2_2, b);
 5980       if (i < 2) {
 5981         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5982       }
 5983       // sum 80 or 88 values across vs1 and vs2 into vs1
 5984       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5985       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5986       if (i < 2) {
 5987         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5988       }
 5989       // load 80 or 88 values from c into vs2_1/2/3
 5990       vs_ldpq_post(vs2_1, c);
 5991       vs_ldpq_post(vs2_2, c);
 5992       if (i < 2) {
 5993         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5994       }
 5995       // sum 80 or 88 values across vs1 and vs2 into vs1
 5996       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5997       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5998       if (i < 2) {
 5999         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6000       }
 6001       // add constant to all 80 or 88 results
 6002       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6003       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6004       if (i < 2) {
 6005         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6006       }
 6007       // store 80 or 88 values
 6008       vs_stpq_post(vs1_1, result);
 6009       vs_stpq_post(vs1_2, result);
 6010       if (i < 2) {
 6011         __ str(vs1_3, __ Q, __ post(result, 16));
 6012       }
 6013     }
 6014 
 6015     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6016     __ mov(r0, zr); // return 0
 6017     __ ret(lr);
 6018 
 6019     return start;
 6020   }
 6021 
 6022   // Kyber parse XOF output to polynomial coefficient candidates
 6023   // or decodePoly(12, ...).
 6024   // Implements
 6025   // static int implKyber12To16(
 6026   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6027   //
 6028   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6029   //
 6030   // condensed (byte[]) = c_rarg0
 6031   // condensedIndex = c_rarg1
 6032   // parsed (short[112 or 256]) = c_rarg2
 6033   // parsedLength (112 or 256) = c_rarg3
 6034   address generate_kyber12To16() {
 6035     Label L_F00, L_loop, L_end;
 6036 
 6037     __ BIND(L_F00);
 6038     __ emit_int64(0x0f000f000f000f00);
 6039     __ emit_int64(0x0f000f000f000f00);
 6040 
 6041     __ align(CodeEntryAlignment);
 6042     StubGenStubId stub_id = StubGenStubId::kyber12To16_id;
 6043     StubCodeMark mark(this, stub_id);
 6044     address start = __ pc();
 6045     __ enter();
 6046 
 6047     const Register condensed = c_rarg0;
 6048     const Register condensedOffs = c_rarg1;
 6049     const Register parsed = c_rarg2;
 6050     const Register parsedLength = c_rarg3;
 6051 
 6052     const Register tmpAddr = r11;
 6053 
 6054     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6055     // quadwords so we need a 6 vector sequence for the inputs.
 6056     // Parsing produces 64 shorts, employing two 8 vector
 6057     // sequences to store and combine the intermediate data.
 6058     VSeq<6> vin(24);
 6059     VSeq<8> va(0), vb(16);
 6060 
 6061     __ adr(tmpAddr, L_F00);
 6062     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6063     __ add(condensed, condensed, condensedOffs);
 6064 
 6065     __ BIND(L_loop);
 6066     // load 96 (6 x 16B) byte values
 6067     vs_ld3_post(vin, __ T16B, condensed);
 6068 
 6069     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6070     // holds 48 (16x3) contiguous bytes from memory striped
 6071     // horizontally across each of the 16 byte lanes. Equivalently,
 6072     // that is 16 pairs of 12-bit integers. Likewise the back half
 6073     // holds the next 48 bytes in the same arrangement.
 6074 
 6075     // Each vector in the front half can also be viewed as a vertical
 6076     // strip across the 16 pairs of 12 bit integers. Each byte in
 6077     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6078     // byte in vin[1] stores the high 4 bits of the first int and the
 6079     // low 4 bits of the second int. Each byte in vin[2] stores the
 6080     // high 8 bits of the second int. Likewise the vectors in second
 6081     // half.
 6082 
 6083     // Converting the data to 16-bit shorts requires first of all
 6084     // expanding each of the 6 x 16B vectors into 6 corresponding
 6085     // pairs of 8H vectors. Mask, shift and add operations on the
 6086     // resulting vector pairs can be used to combine 4 and 8 bit
 6087     // parts of related 8H vector elements.
 6088     //
  6089     // The middle vectors (vin[1] and vin[4]) are actually expanded
  6090     // twice, one copy manipulated to provide the high 4 bits of the
  6091     // first short in a pair and another copy manipulated to provide
  6092     // the low 4 bits of the second short in a pair. This is why the
  6093     // vector sequences va and vb used to hold the expanded 8H
  6094     // elements are of length 8.
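
           // In scalar terms, each consecutive byte triple (b0, b1, b2)
           // decodes to two 12-bit values as
           //   s0 = b0 | ((b1 & 0x0f) << 8)
           //   s1 = (b1 >> 4) | (b2 << 4)
           // (a sketch of the intent; the shifts, masks and adds below do
           // this for 16 byte triples at a time in each half of vin).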
 6095 
 6096     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6097     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6098     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6099     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6100     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6101     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6102     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6103     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6104 
 6105     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6106     // and vb[4:5]
 6107     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6108     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6109     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6110     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6111     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6112     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6113 
 6114     // shift lo byte of copy 1 of the middle stripe into the high byte
 6115     __ shl(va[2], __ T8H, va[2], 8);
 6116     __ shl(va[3], __ T8H, va[3], 8);
 6117     __ shl(vb[2], __ T8H, vb[2], 8);
 6118     __ shl(vb[3], __ T8H, vb[3], 8);
 6119 
 6120     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6121     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6122     // are in bit positions [4..11].
 6123     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6124     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6125     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6126     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6127 
 6128     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6129     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6130     // copy2
 6131     __ andr(va[2], __ T16B, va[2], v31);
 6132     __ andr(va[3], __ T16B, va[3], v31);
 6133     __ ushr(va[4], __ T8H, va[4], 4);
 6134     __ ushr(va[5], __ T8H, va[5], 4);
 6135     __ andr(vb[2], __ T16B, vb[2], v31);
 6136     __ andr(vb[3], __ T16B, vb[3], v31);
 6137     __ ushr(vb[4], __ T8H, vb[4], 4);
 6138     __ ushr(vb[5], __ T8H, vb[5], 4);
 6139 
 6140     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6141     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6142     // n.b. the ordering ensures: i) inputs are consumed before they
 6143     // are overwritten ii) the order of 16-bit results across successive
 6144     // pairs of vectors in va and then vb reflects the order of the
 6145     // corresponding 12-bit inputs
 6146     __ addv(va[0], __ T8H, va[0], va[2]);
 6147     __ addv(va[2], __ T8H, va[1], va[3]);
 6148     __ addv(va[1], __ T8H, va[4], va[6]);
 6149     __ addv(va[3], __ T8H, va[5], va[7]);
 6150     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6151     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6152     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6153     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6154 
 6155     // store 64 results interleaved as shorts
 6156     vs_st2_post(vs_front(va), __ T8H, parsed);
 6157     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6158 
 6159     __ sub(parsedLength, parsedLength, 64);
 6160     __ cmp(parsedLength, (u1)64);
 6161     __ br(Assembler::GE, L_loop);
 6162     __ cbz(parsedLength, L_end);
 6163 
  6164     // if anything is left it should be a final 72 bytes of input
  6165     // i.e. a final 48 12-bit values. So we handle this by loading
  6166     // 48 bytes into all 16B lanes of front(vin) and only 24
  6167     // bytes into the lower 8B lanes of back(vin)
 6168     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6169     vs_ld3(vs_back(vin), __ T8B, condensed);
 6170 
 6171     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6172     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6173     // 5 and target element 2 of vb duplicates element 4.
 6174     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6175     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6176     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6177     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6178     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6179     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6180 
 6181     // This time expand just the lower 8 lanes
 6182     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6183     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6184     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6185 
 6186     // shift lo byte of copy 1 of the middle stripe into the high byte
 6187     __ shl(va[2], __ T8H, va[2], 8);
 6188     __ shl(va[3], __ T8H, va[3], 8);
 6189     __ shl(vb[2], __ T8H, vb[2], 8);
 6190 
 6191     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6192     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6193     // int are in bit positions [4..11].
 6194     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6195     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6196     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6197 
 6198     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6199     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6200     // copy2
 6201     __ andr(va[2], __ T16B, va[2], v31);
 6202     __ andr(va[3], __ T16B, va[3], v31);
 6203     __ ushr(va[4], __ T8H, va[4], 4);
 6204     __ ushr(va[5], __ T8H, va[5], 4);
 6205     __ andr(vb[2], __ T16B, vb[2], v31);
 6206     __ ushr(vb[4], __ T8H, vb[4], 4);
  6207 
 6210     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6211     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6212 
 6213     // n.b. ordering ensures: i) inputs are consumed before they are
  6214     // overwritten ii) order of 16-bit results across successive
 6215     // pairs of vectors in va and then lower half of vb reflects order
 6216     // of corresponding 12-bit inputs
 6217     __ addv(va[0], __ T8H, va[0], va[2]);
 6218     __ addv(va[2], __ T8H, va[1], va[3]);
 6219     __ addv(va[1], __ T8H, va[4], va[6]);
 6220     __ addv(va[3], __ T8H, va[5], va[7]);
 6221     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6222     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6223 
 6224     // store 48 results interleaved as shorts
 6225     vs_st2_post(vs_front(va), __ T8H, parsed);
 6226     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6227 
 6228     __ BIND(L_end);
 6229 
 6230     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6231     __ mov(r0, zr); // return 0
 6232     __ ret(lr);
 6233 
 6234     return start;
 6235   }
 6236 
 6237   // Kyber Barrett reduce function.
 6238   // Implements
 6239   // static int implKyberBarrettReduce(short[] coeffs) {}
 6240   //
 6241   // coeffs (short[256]) = c_rarg0
 6242   address generate_kyberBarrettReduce() {
 6243 
 6244     __ align(CodeEntryAlignment);
 6245     StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
 6246     StubCodeMark mark(this, stub_id);
 6247     address start = __ pc();
 6248     __ enter();
 6249 
 6250     const Register coeffs = c_rarg0;
 6251 
 6252     const Register kyberConsts = r10;
 6253     const Register result = r11;
 6254 
 6255     // As above we process 256 sets of values in total i.e. 32 x
 6256     // 8H quadwords. So, we can load, add and store the data in 3
 6257     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6258     // of 10 or 11 registers. A further constraint is that the
 6259     // mapping needs to skip callee saves. So, we allocate the
 6260     // register sequences using two 8 sequences, two 2 sequences
 6261     // and two single registers.
 6262     VSeq<8> vs1_1(0);
 6263     VSeq<2> vs1_2(16);
 6264     FloatRegister vs1_3 = v28;
 6265     VSeq<8> vs2_1(18);
 6266     VSeq<2> vs2_2(26);
 6267     FloatRegister vs2_3 = v29;
 6268 
 6269     // we also need a pair of corresponding constant sequences
 6270 
 6271     VSeq<8> vc1_1(30, 0);
 6272     VSeq<2> vc1_2(30, 0);
 6273     FloatRegister vc1_3 = v30; // for kyber_q
 6274 
 6275     VSeq<8> vc2_1(31, 0);
 6276     VSeq<2> vc2_2(31, 0);
 6277     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6278 
 6279     __ add(result, coeffs, 0);
 6280     __ lea(kyberConsts,
 6281              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6282 
 6283     // load q and the multiplier for the Barrett reduction
 6284     __ add(kyberConsts, kyberConsts, 16);
 6285     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6286 
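           // Per 16-bit lane each iteration of the loop below computes
           //   x -= ((x * kyberBarrettMultiplier) >> 26) * kyber_q
           // as documented step by step on the individual operations.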
 6287     for (int i = 0; i < 3; i++) {
 6288       // load 80 or 88 coefficients
 6289       vs_ldpq_post(vs1_1, coeffs);
 6290       vs_ldpq_post(vs1_2, coeffs);
 6291       if (i < 2) {
 6292         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6293       }
 6294 
 6295       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6296       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6297       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6298       if (i < 2) {
 6299         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6300       }
 6301 
 6302       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6303       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6304       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6305       if (i < 2) {
 6306         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6307       }
 6308 
 6309       // vs1 <- vs1 - vs2 * kyber_q
 6310       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6311       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6312       if (i < 2) {
 6313         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6314       }
 6315 
 6316       vs_stpq_post(vs1_1, result);
 6317       vs_stpq_post(vs1_2, result);
 6318       if (i < 2) {
 6319         __ str(vs1_3, __ Q, __ post(result, 16));
 6320       }
 6321     }
 6322 
 6323     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6324     __ mov(r0, zr); // return 0
 6325     __ ret(lr);
 6326 
 6327     return start;
 6328   }
 6329 
 6330 
 6331   // Dilithium-specific montmul helper routines that generate parallel
 6332   // code for, respectively, a single 4x4s vector sequence montmul or
 6333   // two such multiplies in a row.
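         //
         // As a sketch of the intent: per 32-bit lane both helpers compute
         // montmul(a, b) = a * b * R^-1 mod q for the Dilithium modulus q,
         // where R is assumed to be 2^32 (matching the 4S lane width and
         // the q/qinv constants passed in vq).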
 6334 
 6335   // Perform 16 32-bit Montgomery multiplications in parallel
 6336   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6337                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6338     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6339     // It will assert that the register use is valid
 6340     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6341   }
 6342 
 6343   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6344   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6345                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6346     // Schedule two successive 4x4S multiplies via the montmul helper
 6347     // on the front and back halves of va, vb and vc. The helper will
 6348     // assert that the register use has no overlap conflicts on each
 6349     // individual call but we also need to ensure that the necessary
 6350     // disjoint/equality constraints are met across both calls.
 6351 
 6352     // vb, vc, vtmp and vq must be disjoint. va must either be
 6353     // disjoint from all other registers or equal vc
 6354 
 6355     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6356     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6357     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6358 
 6359     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6360     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6361 
 6362     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6363 
 6364     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6365     assert(vs_disjoint(va, vb), "va and vb overlap");
 6366     assert(vs_disjoint(va, vq), "va and vq overlap");
 6367     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6368 
 6369     // We multiply the front and back halves of each sequence 4 at a
 6370     // time because
 6371     //
 6372     // 1) we are currently only able to get 4-way instruction
 6373     // parallelism at best
 6374     //
 6375     // 2) we need registers for the constants in vq and temporary
 6376     // scratch registers to hold intermediate results so vtmp can only
 6377     // be a VSeq<4> which means we only have 4 scratch slots.
 6378 
 6379     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6380     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6381   }
 6382 
 6383   // Perform combined montmul then add/sub on 4x4S vectors.
 6384   void dilithium_montmul16_sub_add(
 6385           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6386           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6387     // compute a = montmul(a1, c)
 6388     dilithium_montmul16(vc, va1, vc, vtmp, vq);
  6389     // output a1 = a0 - a
 6390     vs_subv(va1, __ T4S, va0, vc);
 6391     //    and a0 = a0 + a
 6392     vs_addv(va0, __ T4S, va0, vc);
 6393   }
 6394 
  6395   // Perform combined add/sub then montmul on 4x4S vectors.
 6396   void dilithium_sub_add_montmul16(
 6397           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6398           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6399     // compute c = a0 - a1
 6400     vs_subv(vtmp1, __ T4S, va0, va1);
 6401     // output a0 = a0 + a1
 6402     vs_addv(va0, __ T4S, va0, va1);
 6403     // output a1 = b montmul c
 6404     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6405   }
 6406 
 6407   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6408   // in the Java implementation come in sequences of at least 8, so we
 6409   // can use ldpq to collect the corresponding data into pairs of vector
 6410   // registers.
 6411   // We collect the coefficients corresponding to the 'j+l' indexes into
 6412   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6413   // then we do the (Montgomery) multiplications by the zetas in parallel
 6414   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6415   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6416   // v0-v7 and finally save the results back to the coeffs array.
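         //
         // In scalar terms each (j, j+l) pair is updated as
         //   t             = montmul(zetas[k], coeffs[j + l]);
         //   coeffs[j + l] = coeffs[j] - t;
         //   coeffs[j]     = coeffs[j] + t;
         // (a sketch of the per-element arithmetic; the code below batches
         // it 32 elements at a time).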
 6417   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6418     const Register coeffs, const Register zetas) {
 6419     int c1 = 0;
 6420     int c2 = 512;
 6421     int startIncr;
 6422     // don't use callee save registers v8 - v15
 6423     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6424     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6425     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6426     int offsets[4] = { 0, 32, 64, 96 };
 6427 
 6428     for (int level = 0; level < 5; level++) {
 6429       int c1Start = c1;
 6430       int c2Start = c2;
 6431       if (level == 3) {
 6432         offsets[1] = 32;
 6433         offsets[2] = 128;
 6434         offsets[3] = 160;
 6435       } else if (level == 4) {
 6436         offsets[1] = 64;
 6437         offsets[2] = 128;
 6438         offsets[3] = 192;
 6439       }
 6440 
  6441       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6442       // time at 4 different offsets and multiply them in order by the
 6443       // next set of input values. So we employ indexed load and store
 6444       // pair instructions with arrangement 4S.
 6445       for (int i = 0; i < 4; i++) {
 6446         // reload q and qinv
 6447         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6448         // load 8x4S coefficients via second start pos == c2
 6449         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6450         // load next 8x4S inputs == b
 6451         vs_ldpq_post(vs2, zetas);
 6452         // compute a == c2 * b mod MONT_Q
 6453         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6454         // load 8x4s coefficients via first start pos == c1
 6455         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6456         // compute a1 =  c1 + a
 6457         vs_addv(vs3, __ T4S, vs1, vs2);
 6458         // compute a2 =  c1 - a
 6459         vs_subv(vs1, __ T4S, vs1, vs2);
 6460         // output a1 and a2
 6461         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6462         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6463 
 6464         int k = 4 * level + i;
 6465 
 6466         if (k > 7) {
 6467           startIncr = 256;
 6468         } else if (k == 5) {
 6469           startIncr = 384;
 6470         } else {
 6471           startIncr = 128;
 6472         }
 6473 
 6474         c1Start += startIncr;
 6475         c2Start += startIncr;
 6476       }
 6477 
 6478       c2 /= 2;
 6479     }
 6480   }
 6481 
 6482   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6483   // Implements the method
  6484   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  6485   // of the Java class sun.security.provider.ML_DSA
 6486   //
 6487   // coeffs (int[256]) = c_rarg0
 6488   // zetas (int[256]) = c_rarg1
 6489   address generate_dilithiumAlmostNtt() {
 6490 
 6491     __ align(CodeEntryAlignment);
 6492     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 6493     StubCodeMark mark(this, stub_id);
 6494     address start = __ pc();
 6495     __ enter();
 6496 
 6497     const Register coeffs = c_rarg0;
 6498     const Register zetas = c_rarg1;
 6499 
 6500     const Register tmpAddr = r9;
 6501     const Register dilithiumConsts = r10;
 6502     const Register result = r11;
 6503     // don't use callee save registers v8 - v15
 6504     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6505     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6506     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6507     int offsets[4] = { 0, 32, 64, 96};
 6508     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6509     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6510     __ add(result, coeffs, 0);
 6511     __ lea(dilithiumConsts,
 6512              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6513 
 6514     // Each level represents one iteration of the outer for loop of the Java version.
 6515 
 6516     // level 0-4
 6517     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6518 
 6519     // level 5
 6520 
 6521     // At level 5 the coefficients we need to combine with the zetas
 6522     // are grouped in memory in blocks of size 4. So, for both sets of
 6523     // coefficients we load 4 adjacent values at 8 different offsets
 6524     // using an indexed ldr with register variant Q and multiply them
 6525     // in sequence order by the next set of inputs. Likewise we store
  6526     // the results using an indexed str with register variant Q.
 6527     for (int i = 0; i < 1024; i += 256) {
 6528       // reload constants q, qinv each iteration as they get clobbered later
 6529       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6530       // load 32 (8x4S) coefficients via first offsets = c1
 6531       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6532       // load next 32 (8x4S) inputs = b
 6533       vs_ldpq_post(vs2, zetas);
  6534       // a = b montmul c1
 6535       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6536       // load 32 (8x4S) coefficients via second offsets = c2
 6537       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6538       // add/sub with result of multiply
  6539       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
  6540       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6541       // write back new coefficients using same offsets
 6542       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6543       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6544     }
 6545 
 6546     // level 6
  6547     // At level 6 the coefficients we need to combine with the zetas
  6548     // are grouped in memory in pairs, the first two being add/sub
  6549     // inputs and the second two the montmul inputs. We can still implement
 6550     // the montmul+sub+add using 4-way parallelism but only if we
 6551     // combine the coefficients with the zetas 16 at a time. We load 8
 6552     // adjacent values at 4 different offsets using an ld2 load with
 6553     // arrangement 2D. That interleaves the lower and upper halves of
 6554     // each pair of quadwords into successive vector registers. We
  6555     // then need to montmul the 4 odd elements of the coefficients
  6556     // register sequence by the zetas in order and then add/sub the 4
  6557     // even elements of the coefficients register sequence. We use an
 6558     // equivalent st2 operation to store the results back into memory
 6559     // de-interleaved.
 6560     for (int i = 0; i < 1024; i += 128) {
 6561       // reload constants q, qinv each iteration as they get clobbered later
 6562       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6563       // load interleaved 16 (4x2D) coefficients via offsets
 6564       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6565       // load next 16 (4x4S) inputs
 6566       vs_ldpq_post(vs_front(vs2), zetas);
 6567       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6568       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6569                                   vs_front(vs2), vtmp, vq);
 6570       // store interleaved 16 (4x2D) coefficients via offsets
 6571       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6572     }
 6573 
 6574     // level 7
 6575     // At level 7 the coefficients we need to combine with the zetas
  6576     // occur singly with montmul inputs alternating with add/sub
 6577     // inputs. Once again we can use 4-way parallelism to combine 16
 6578     // zetas at a time. However, we have to load 8 adjacent values at
 6579     // 4 different offsets using an ld2 load with arrangement 4S. That
  6580     // interleaves the even words of each pair into one
  6581     // coefficients vector register and the odd words of the pair
  6582     // into the next register. We then need to montmul the 4 odd
  6583     // elements of the coefficients register sequence by the zetas in
  6584     // order and then add/sub the 4 even elements of the coefficients
 6585     // register sequence. We use an equivalent st2 operation to store
 6586     // the results back into memory de-interleaved.
 6587 
 6588     for (int i = 0; i < 1024; i += 128) {
 6589       // reload constants q, qinv each iteration as they get clobbered later
 6590       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6591       // load interleaved 16 (4x4S) coefficients via offsets
 6592       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6593       // load next 16 (4x4S) inputs
 6594       vs_ldpq_post(vs_front(vs2), zetas);
 6595       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6596       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6597                                   vs_front(vs2), vtmp, vq);
 6598       // store interleaved 16 (4x4S) coefficients via offsets
 6599       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6600     }
 6601     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6602     __ mov(r0, zr); // return 0
 6603     __ ret(lr);
 6604 
 6605     return start;
 6606   }
 6607 
 6608   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6609   // in the Java implementation come in sequences of at least 8, so we
 6610   // can use ldpq to collect the corresponding data into pairs of vector
 6611   // registers
  6612   // We collect the coefficients that correspond to the 'j's into vs1,
  6613   // the coefficients that correspond to the 'j+l's into vs2, then
 6614   // do the additions into vs3 and the subtractions into vs1 then
 6615   // save the result of the additions, load the zetas into vs2
 6616   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6617   // finally save the results back to the coeffs array
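         //
         // In scalar terms each (j, j+l) pair is updated as
         //   t             = coeffs[j];
         //   coeffs[j]     = t + coeffs[j + l];
         //   coeffs[j + l] = montmul(t - coeffs[j + l], zetas[k]);
         // (again a sketch; the code below batches it 32 elements at a time).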
 6618   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6619     const Register coeffs, const Register zetas) {
 6620     int c1 = 0;
 6621     int c2 = 32;
 6622     int startIncr;
 6623     int offsets[4];
 6624     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6625     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6626     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6627 
 6628     offsets[0] = 0;
 6629 
 6630     for (int level = 3; level < 8; level++) {
 6631       int c1Start = c1;
 6632       int c2Start = c2;
 6633       if (level == 3) {
 6634         offsets[1] = 64;
 6635         offsets[2] = 128;
 6636         offsets[3] = 192;
 6637       } else if (level == 4) {
 6638         offsets[1] = 32;
 6639         offsets[2] = 128;
 6640         offsets[3] = 160;
 6641       } else {
 6642         offsets[1] = 32;
 6643         offsets[2] = 64;
 6644         offsets[3] = 96;
 6645       }
 6646 
 6647       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6648       // time at 4 different offsets and multiply them in order by the
 6649       // next set of input values. So we employ indexed load and store
 6650       // pair instructions with arrangement 4S.
 6651       for (int i = 0; i < 4; i++) {
 6652         // load v1 32 (8x4S) coefficients relative to first start index
 6653         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6654         // load v2 32 (8x4S) coefficients relative to second start index
 6655         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
  6656         // a0 = v1 + v2 -- n.b. clobbers vq
 6657         vs_addv(vs3, __ T4S, vs1, vs2);
 6658         // a1 = v1 - v2
 6659         vs_subv(vs1, __ T4S, vs1, vs2);
 6660         // save a1 relative to first start index
 6661         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6662         // load constants q, qinv each iteration as they get clobbered above
 6663         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6664         // load b next 32 (8x4S) inputs
 6665         vs_ldpq_post(vs2, zetas);
 6666         // a = a1 montmul b
 6667         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6668         // save a relative to second start index
 6669         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6670 
 6671         int k = 4 * level + i;
 6672 
 6673         if (k < 24) {
 6674           startIncr = 256;
 6675         } else if (k == 25) {
 6676           startIncr = 384;
 6677         } else {
 6678           startIncr = 128;
 6679         }
 6680 
 6681         c1Start += startIncr;
 6682         c2Start += startIncr;
 6683       }
 6684 
 6685       c2 *= 2;
 6686     }
 6687   }
 6688 
 6689   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6690   // Implements the method
 6691   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6692   // the sun.security.provider.ML_DSA class.
 6693   //
 6694   // coeffs (int[256]) = c_rarg0
 6695   // zetas (int[256]) = c_rarg1
 6696   address generate_dilithiumAlmostInverseNtt() {
 6697 
 6698     __ align(CodeEntryAlignment);
 6699     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 6700     StubCodeMark mark(this, stub_id);
 6701     address start = __ pc();
 6702     __ enter();
 6703 
 6704     const Register coeffs = c_rarg0;
 6705     const Register zetas = c_rarg1;
 6706 
 6707     const Register tmpAddr = r9;
 6708     const Register dilithiumConsts = r10;
 6709     const Register result = r11;
 6710     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6711     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6712     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6713     int offsets[4] = { 0, 32, 64, 96 };
 6714     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6715     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6716 
 6717     __ add(result, coeffs, 0);
 6718     __ lea(dilithiumConsts,
 6719              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6720 
 6721     // Each level represents one iteration of the outer for loop of the Java version
 6722 
 6723     // level 0
 6724     // At level 0 we need to interleave adjacent quartets of
 6725     // coefficients before we multiply and add/sub by the next 16
 6726     // zetas just as we did for level 7 in the multiply code. So we
 6727     // load and store the values using an ld2/st2 with arrangement 4S.
 6728     for (int i = 0; i < 1024; i += 128) {
 6729       // load constants q, qinv
 6730       // n.b. this can be moved out of the loop as they do not get
 6731       // clobbered by first two loops
 6732       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6733       // a0/a1 load interleaved 32 (8x4S) coefficients
 6734       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6735       // b load next 32 (8x4S) inputs
 6736       vs_ldpq_post(vs_front(vs2), zetas);
 6737       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6738       // n.b. second half of vs2 provides temporary register storage
 6739       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6740                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6741       // a0/a1 store interleaved 32 (8x4S) coefficients
 6742       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6743     }
 6744 
 6745     // level 1
 6746     // At level 1 we need to interleave pairs of adjacent pairs of
 6747     // coefficients before we multiply by the next 16 zetas just as we
 6748     // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
 6750     for (int i = 0; i < 1024; i += 128) {
 6751       // a0/a1 load interleaved 32 (8x2D) coefficients
 6752       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6753       // b load next 16 (4x4S) inputs
 6754       vs_ldpq_post(vs_front(vs2), zetas);
 6755       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6756       // n.b. second half of vs2 provides temporary register storage
 6757       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6758                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6759       // a0/a1 store interleaved 32 (8x2D) coefficients
 6760       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6761     }
 6762 
 6763     // level 2
 6764     // At level 2 coefficients come in blocks of 4. So, we load 4
 6765     // adjacent coefficients at 8 distinct offsets for both the first
 6766     // and second coefficient sequences, using an ldr with register
    // variant Q, then combine them with the next set of 32 zetas. Likewise
 6768     // we store the results using an str with register variant Q.
 6769     for (int i = 0; i < 1024; i += 256) {
 6770       // c0 load 32 (8x4S) coefficients via first offsets
 6771       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6772       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6774       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6775       vs_addv(vs3, __ T4S, vs1, vs2);
 6776       // c = c0 - c1
 6777       vs_subv(vs1, __ T4S, vs1, vs2);
 6778       // store a0 32 (8x4S) coefficients via first offsets
 6779       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6780       // b load 32 (8x4S) next inputs
 6781       vs_ldpq_post(vs2, zetas);
 6782       // reload constants q, qinv -- they were clobbered earlier
 6783       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6784       // compute a1 = b montmul c
 6785       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6786       // store a1 32 (8x4S) coefficients via second offsets
 6787       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6788     }
 6789 
 6790     // level 3-7
 6791     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6792 
 6793     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6794     __ mov(r0, zr); // return 0
 6795     __ ret(lr);
 6796 
 6797     return start;
 6798   }
 6799 
 6800   // Dilithium multiply polynomials in the NTT domain.
 6801   // Straightforward implementation of the method
 6802   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 6804   // the sun.security.provider.ML_DSA class.
 6805   //
 6806   // result (int[256]) = c_rarg0
 6807   // poly1 (int[256]) = c_rarg1
 6808   // poly2 (int[256]) = c_rarg2
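  //
  // n.b. montmul(a, b) as used in these stubs computes a * b * R^-1 mod q
  // with R = 2^32 and q = 8380417. The product of two Montgomery inputs
  // therefore carries a stray R^-1 factor, which the extra montmul by the
  // constant R^2 mod q (rSquare, loaded below) cancels:
  //   montmul(montmul(a, b), R^2) = (a * b * R^-1) * R^2 * R^-1 = a * b mod q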
 6809   address generate_dilithiumNttMult() {
 6810 
    __ align(CodeEntryAlignment);
 6812     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 6813     StubCodeMark mark(this, stub_id);
 6814     address start = __ pc();
 6815     __ enter();
 6816 
 6817     Label L_loop;
 6818 
 6819     const Register result = c_rarg0;
 6820     const Register poly1 = c_rarg1;
 6821     const Register poly2 = c_rarg2;
 6822 
 6823     const Register dilithiumConsts = r10;
 6824     const Register len = r11;
 6825 
 6826     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6828     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6829     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6830 
 6831     __ lea(dilithiumConsts,
 6832              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6833 
 6834     // load constants q, qinv
 6835     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6836     // load constant rSquare into v29
 6837     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6838 
 6839     __ mov(len, zr);
 6840     __ add(len, len, 1024);
 6841 
 6842     __ BIND(L_loop);
 6843 
 6844     // b load 32 (8x4S) next inputs from poly1
 6845     vs_ldpq_post(vs1, poly1);
 6846     // c load 32 (8x4S) next inputs from poly2
 6847     vs_ldpq_post(vs2, poly2);
 6848     // compute a = b montmul c
 6849     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6850     // compute a = rsquare montmul a
 6851     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6852     // save a 32 (8x4S) results
 6853     vs_stpq_post(vs2, result);
 6854 
 6855     __ sub(len, len, 128);
 6856     __ cmp(len, (u1)128);
 6857     __ br(Assembler::GE, L_loop);
 6858 
 6859     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6860     __ mov(r0, zr); // return 0
 6861     __ ret(lr);
 6862 
 6863     return start;
 6864   }
 6865 
  // Dilithium Montgomery multiply an array by a constant.
 6867   // A straightforward implementation of the method
 6868   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class.
 6870   //
 6871   // coeffs (int[256]) = c_rarg0
 6872   // constant (int) = c_rarg1
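  //
  // n.b. montmul here is the same Montgomery product used above, so this
  // stub computes, for all 256 coefficients,
  //   coeffs[i] = coeffs[i] * constant * R^-1 mod q   (R = 2^32, q = 8380417)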
 6873   address generate_dilithiumMontMulByConstant() {
 6874 
 6875     __ align(CodeEntryAlignment);
 6876     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 6877     StubCodeMark mark(this, stub_id);
 6878     address start = __ pc();
 6879     __ enter();
 6880 
 6881     Label L_loop;
 6882 
 6883     const Register coeffs = c_rarg0;
 6884     const Register constant = c_rarg1;
 6885 
 6886     const Register dilithiumConsts = r10;
 6887     const Register result = r11;
 6888     const Register len = r12;
 6889 
 6890     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6891     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6892     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6893     VSeq<8> vconst(29, 0);             // for montmul by constant
 6894 
 6895     // results track inputs
 6896     __ add(result, coeffs, 0);
 6897     __ lea(dilithiumConsts,
 6898              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6899 
    // load constants q, qinv -- they are not clobbered by the loop below
 6901     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6902     // copy caller supplied constant across vconst
 6903     __ dup(vconst[0], __ T4S, constant);
 6904     __ mov(len, zr);
 6905     __ add(len, len, 1024);
 6906 
 6907     __ BIND(L_loop);
 6908 
 6909     // load next 32 inputs
 6910     vs_ldpq_post(vs2, coeffs);
 6911     // mont mul by constant
 6912     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6913     // write next 32 results
 6914     vs_stpq_post(vs2, result);
 6915 
 6916     __ sub(len, len, 128);
 6917     __ cmp(len, (u1)128);
 6918     __ br(Assembler::GE, L_loop);
 6919 
 6920     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6921     __ mov(r0, zr); // return 0
 6922     __ ret(lr);
 6923 
 6924     return start;
 6925   }
 6926 
 6927   // Dilithium decompose poly.
 6928   // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //              int[] highPart, int twoGamma2, int multiplier) {}
 6930   // of the sun.security.provider.ML_DSA class
 6931   //
 6932   // input (int[256]) = c_rarg0
 6933   // lowPart (int[256]) = c_rarg1
 6934   // highPart (int[256]) = c_rarg2
 6935   // twoGamma2  (int) = c_rarg3
 6936   // multiplier (int) = c_rarg4
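  //
  // For reference, a rough per-coefficient sketch of the decompose step
  // performed by the loop below (following the usual ML-DSA decompose
  // definition; names are illustrative):
  //
  //   rplus = r mod q                    // reduction into [0, q)
  //   r0    = rplus mod+- (2 * gamma2)   // centered remainder
  //   r1    = (rplus - r0) / (2 * gamma2)
  //   if (rplus - r0 == q - 1) { r1 = 0; r0 = r0 - 1; }
  //   lowPart[m] = r0; highPart[m] = r1;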
 6937   address generate_dilithiumDecomposePoly() {
 6938 
 6939     __ align(CodeEntryAlignment);
 6940     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 6941     StubCodeMark mark(this, stub_id);
 6942     address start = __ pc();
 6943     Label L_loop;
 6944 
 6945     const Register input = c_rarg0;
 6946     const Register lowPart = c_rarg1;
 6947     const Register highPart = c_rarg2;
 6948     const Register twoGamma2 = c_rarg3;
 6949     const Register multiplier = c_rarg4;
 6950 
 6951     const Register len = r9;
 6952     const Register dilithiumConsts = r10;
 6953     const Register tmp = r11;
 6954 
 6955     // 6 independent sets of 4x4s values
 6956     VSeq<4> vs1(0), vs2(4), vs3(8);
 6957     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6958 
 6959     // 7 constants for cross-multiplying
 6960     VSeq<4> one(25, 0);
 6961     VSeq<4> qminus1(26, 0);
 6962     VSeq<4> g2(27, 0);
 6963     VSeq<4> twog2(28, 0);
 6964     VSeq<4> mult(29, 0);
 6965     VSeq<4> q(30, 0);
 6966     VSeq<4> qadd(31, 0);
 6967 
 6968     __ enter();
 6969 
 6970     __ lea(dilithiumConsts,
 6971              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6972 
 6973     // save callee-saved registers
 6974     __ stpd(v8, v9, __ pre(sp, -64));
 6975     __ stpd(v10, v11, Address(sp, 16));
 6976     __ stpd(v12, v13, Address(sp, 32));
 6977     __ stpd(v14, v15, Address(sp, 48));
 6978 
 6979     // populate constant registers
 6980     __ mov(tmp, zr);
 6981     __ add(tmp, tmp, 1);
 6982     __ dup(one[0], __ T4S, tmp); // 1
 6983     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6984     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6985     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6986     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6987     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6988     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6989 
 6990     __ mov(len, zr);
 6991     __ add(len, len, 1024);
 6992 
 6993     __ BIND(L_loop);
 6994 
 6995     // load next 4x4S inputs interleaved: rplus --> vs1
 6996     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 6997 
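    // n.b. q = 8380417 = 2^23 - 2^13 + 1, so (x >> 23) cheaply approximates
    // x / q; the qadd bias loaded above makes the estimate accurate enough
    // that the remainder computed below lands in (-q, q), and the following
    // conditional add of q folds it back into [0, q).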
 6998     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 6999     vs_addv(vtmp, __ T4S, vs1, qadd);
 7000     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7001     vs_mulv(vtmp, __ T4S, vtmp, q);
 7002     vs_subv(vs1, __ T4S, vs1, vtmp);
 7003 
 7004     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7005     vs_sshr(vtmp, __ T4S, vs1, 31);
 7006     vs_andr(vtmp, vtmp, q);
 7007     vs_addv(vs1, __ T4S, vs1, vtmp);
 7008 
 7009     // quotient --> vs2
 7010     // int quotient = (rplus * multiplier) >> 22;
 7011     vs_mulv(vtmp, __ T4S, vs1, mult);
 7012     vs_sshr(vs2, __ T4S, vtmp, 22);
 7013 
 7014     // r0 --> vs3
 7015     // int r0 = rplus - quotient * twoGamma2;
 7016     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7017     vs_subv(vs3, __ T4S, vs1, vtmp);
 7018 
 7019     // mask --> vs4
 7020     // int mask = (twoGamma2 - r0) >> 22;
 7021     vs_subv(vtmp, __ T4S, twog2, vs3);
 7022     vs_sshr(vs4, __ T4S, vtmp, 22);
 7023 
 7024     // r0 -= (mask & twoGamma2);
 7025     vs_andr(vtmp, vs4, twog2);
 7026     vs_subv(vs3, __ T4S, vs3, vtmp);
 7027 
 7028     //  quotient += (mask & 1);
 7029     vs_andr(vtmp, vs4, one);
 7030     vs_addv(vs2, __ T4S, vs2, vtmp);
 7031 
 7032     // mask = (twoGamma2 / 2 - r0) >> 31;
 7033     vs_subv(vtmp, __ T4S, g2, vs3);
 7034     vs_sshr(vs4, __ T4S, vtmp, 31);
 7035 
 7036     // r0 -= (mask & twoGamma2);
 7037     vs_andr(vtmp, vs4, twog2);
 7038     vs_subv(vs3, __ T4S, vs3, vtmp);
 7039 
 7040     // quotient += (mask & 1);
 7041     vs_andr(vtmp, vs4, one);
 7042     vs_addv(vs2, __ T4S, vs2, vtmp);
 7043 
 7044     // r1 --> vs5
 7045     // int r1 = rplus - r0 - (dilithium_q - 1);
 7046     vs_subv(vtmp, __ T4S, vs1, vs3);
 7047     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7048 
 7049     // r1 --> vs1 (overwriting rplus)
 7050     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7051     vs_negr(vtmp, __ T4S, vs5);
 7052     vs_orr(vtmp, vs5, vtmp);
 7053     vs_sshr(vs1, __ T4S, vtmp, 31);
 7054 
 7055     // r0 += ~r1;
 7056     vs_notr(vtmp, vs1);
 7057     vs_addv(vs3, __ T4S, vs3, vtmp);
 7058 
 7059     // r1 = r1 & quotient;
 7060     vs_andr(vs1, vs2, vs1);
 7061 
    // store results interleaved
 7063     // lowPart[m] = r0;
 7064     // highPart[m] = r1;
 7065     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7066     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7067 
 7068     __ sub(len, len, 64);
 7069     __ cmp(len, (u1)64);
 7070     __ br(Assembler::GE, L_loop);
 7071 
 7072     // restore callee-saved vector registers
 7073     __ ldpd(v14, v15, Address(sp, 48));
 7074     __ ldpd(v12, v13, Address(sp, 32));
 7075     __ ldpd(v10, v11, Address(sp, 16));
 7076     __ ldpd(v8, v9, __ post(sp, 64));
 7077 
 7078     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7079     __ mov(r0, zr); // return 0
 7080     __ ret(lr);
 7081 
 7082     return start;
 7083   }
 7084 
 7085   /**
 7086    *  Arguments:
 7087    *
 7088    * Inputs:
 7089    *   c_rarg0   - int crc
 7090    *   c_rarg1   - byte* buf
 7091    *   c_rarg2   - int length
 7092    *
 7093    * Output:
   *       r0   - int crc result
 7095    */
 7096   address generate_updateBytesCRC32() {
 7097     assert(UseCRC32Intrinsics, "what are we doing here?");
 7098 
 7099     __ align(CodeEntryAlignment);
 7100     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 7101     StubCodeMark mark(this, stub_id);
 7102 
 7103     address start = __ pc();
 7104 
 7105     const Register crc   = c_rarg0;  // crc
 7106     const Register buf   = c_rarg1;  // source java byte array address
 7107     const Register len   = c_rarg2;  // length
 7108     const Register table0 = c_rarg3; // crc_table address
 7109     const Register table1 = c_rarg4;
 7110     const Register table2 = c_rarg5;
 7111     const Register table3 = c_rarg6;
 7112     const Register tmp3 = c_rarg7;
 7113 
 7114     BLOCK_COMMENT("Entry:");
 7115     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7116 
 7117     __ kernel_crc32(crc, buf, len,
 7118               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7119 
 7120     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7121     __ ret(lr);
 7122 
 7123     return start;
 7124   }
 7125 
 7126   /**
 7127    *  Arguments:
 7128    *
 7129    * Inputs:
 7130    *   c_rarg0   - int crc
 7131    *   c_rarg1   - byte* buf
 7132    *   c_rarg2   - int length
 7133    *   c_rarg3   - int* table
 7134    *
 7135    * Output:
 7136    *       r0   - int crc result
 7137    */
 7138   address generate_updateBytesCRC32C() {
 7139     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7140 
 7141     __ align(CodeEntryAlignment);
 7142     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 7143     StubCodeMark mark(this, stub_id);
 7144 
 7145     address start = __ pc();
 7146 
 7147     const Register crc   = c_rarg0;  // crc
 7148     const Register buf   = c_rarg1;  // source java byte array address
 7149     const Register len   = c_rarg2;  // length
 7150     const Register table0 = c_rarg3; // crc_table address
 7151     const Register table1 = c_rarg4;
 7152     const Register table2 = c_rarg5;
 7153     const Register table3 = c_rarg6;
 7154     const Register tmp3 = c_rarg7;
 7155 
 7156     BLOCK_COMMENT("Entry:");
 7157     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7158 
 7159     __ kernel_crc32c(crc, buf, len,
 7160               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7161 
 7162     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7163     __ ret(lr);
 7164 
 7165     return start;
 7166   }
 7167 
  /**
 7169    *  Arguments:
 7170    *
 7171    *  Inputs:
 7172    *   c_rarg0   - int   adler
 7173    *   c_rarg1   - byte* buff
 7174    *   c_rarg2   - int   len
 7175    *
 7176    * Output:
 7177    *   c_rarg0   - int adler result
 7178    */
 7179   address generate_updateBytesAdler32() {
 7180     __ align(CodeEntryAlignment);
 7181     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 7182     StubCodeMark mark(this, stub_id);
 7183     address start = __ pc();
 7184 
 7185     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7186 
 7187     // Aliases
 7188     Register adler  = c_rarg0;
 7189     Register s1     = c_rarg0;
 7190     Register s2     = c_rarg3;
 7191     Register buff   = c_rarg1;
 7192     Register len    = c_rarg2;
 7193     Register nmax  = r4;
 7194     Register base  = r5;
 7195     Register count = r6;
 7196     Register temp0 = rscratch1;
 7197     Register temp1 = rscratch2;
 7198     FloatRegister vbytes = v0;
 7199     FloatRegister vs1acc = v1;
 7200     FloatRegister vs2acc = v2;
 7201     FloatRegister vtable = v3;
 7202 
 7203     // Max number of bytes we can process before having to take the mod
 7204     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7205     uint64_t BASE = 0xfff1;
 7206     uint64_t NMAX = 0x15B0;
 7207 
 7208     __ mov(base, BASE);
 7209     __ mov(nmax, NMAX);
 7210 
 7211     // Load accumulation coefficients for the upper 16 bits
 7212     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7213     __ ld1(vtable, __ T16B, Address(temp0));
 7214 
 7215     // s1 is initialized to the lower 16 bits of adler
 7216     // s2 is initialized to the upper 16 bits of adler
 7217     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7218     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7219 
    // The pipelined loop needs at least 16 elements for one iteration. It
    // checks this itself, but it is cheaper to branch straight to the cleanup
    // loop for short inputs.
 7222     __ cmp(len, (u1)16);
 7223     __ br(Assembler::HS, L_nmax);
 7224     __ cbz(len, L_combine);
 7225 
 7226     __ bind(L_simple_by1_loop);
 7227     __ ldrb(temp0, Address(__ post(buff, 1)));
 7228     __ add(s1, s1, temp0);
 7229     __ add(s2, s2, s1);
 7230     __ subs(len, len, 1);
 7231     __ br(Assembler::HI, L_simple_by1_loop);
 7232 
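    // n.b. the mod-BASE reductions in this stub avoid a division: since
    // 2^16 = 65536 is congruent to 15 (mod BASE), x mod BASE can be folded
    // as (x & 0xffff) + 15 * (x >> 16), with 15 * (x >> 16) computed as
    // ((x >> 16) << 4) - (x >> 16). One or two folds plus a conditional
    // subtract of BASE complete the reduction.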
 7233     // s1 = s1 % BASE
 7234     __ subs(temp0, s1, base);
 7235     __ csel(s1, temp0, s1, Assembler::HS);
 7236 
 7237     // s2 = s2 % BASE
 7238     __ lsr(temp0, s2, 16);
 7239     __ lsl(temp1, temp0, 4);
 7240     __ sub(temp1, temp1, temp0);
 7241     __ add(s2, temp1, s2, ext::uxth);
 7242 
 7243     __ subs(temp0, s2, base);
 7244     __ csel(s2, temp0, s2, Assembler::HS);
 7245 
 7246     __ b(L_combine);
 7247 
 7248     __ bind(L_nmax);
 7249     __ subs(len, len, nmax);
 7250     __ sub(count, nmax, 16);
 7251     __ br(Assembler::LO, L_by16);
 7252 
 7253     __ bind(L_nmax_loop);
 7254 
 7255     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7256                                       vbytes, vs1acc, vs2acc, vtable);
 7257 
 7258     __ subs(count, count, 16);
 7259     __ br(Assembler::HS, L_nmax_loop);
 7260 
 7261     // s1 = s1 % BASE
 7262     __ lsr(temp0, s1, 16);
 7263     __ lsl(temp1, temp0, 4);
 7264     __ sub(temp1, temp1, temp0);
 7265     __ add(temp1, temp1, s1, ext::uxth);
 7266 
 7267     __ lsr(temp0, temp1, 16);
 7268     __ lsl(s1, temp0, 4);
 7269     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7271 
 7272     __ subs(temp0, s1, base);
 7273     __ csel(s1, temp0, s1, Assembler::HS);
 7274 
 7275     // s2 = s2 % BASE
 7276     __ lsr(temp0, s2, 16);
 7277     __ lsl(temp1, temp0, 4);
 7278     __ sub(temp1, temp1, temp0);
 7279     __ add(temp1, temp1, s2, ext::uxth);
 7280 
 7281     __ lsr(temp0, temp1, 16);
 7282     __ lsl(s2, temp0, 4);
 7283     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7285 
 7286     __ subs(temp0, s2, base);
 7287     __ csel(s2, temp0, s2, Assembler::HS);
 7288 
 7289     __ subs(len, len, nmax);
 7290     __ sub(count, nmax, 16);
 7291     __ br(Assembler::HS, L_nmax_loop);
 7292 
 7293     __ bind(L_by16);
 7294     __ adds(len, len, count);
 7295     __ br(Assembler::LO, L_by1);
 7296 
 7297     __ bind(L_by16_loop);
 7298 
 7299     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7300                                       vbytes, vs1acc, vs2acc, vtable);
 7301 
 7302     __ subs(len, len, 16);
 7303     __ br(Assembler::HS, L_by16_loop);
 7304 
 7305     __ bind(L_by1);
 7306     __ adds(len, len, 15);
 7307     __ br(Assembler::LO, L_do_mod);
 7308 
 7309     __ bind(L_by1_loop);
 7310     __ ldrb(temp0, Address(__ post(buff, 1)));
 7311     __ add(s1, temp0, s1);
 7312     __ add(s2, s2, s1);
 7313     __ subs(len, len, 1);
 7314     __ br(Assembler::HS, L_by1_loop);
 7315 
 7316     __ bind(L_do_mod);
 7317     // s1 = s1 % BASE
 7318     __ lsr(temp0, s1, 16);
 7319     __ lsl(temp1, temp0, 4);
 7320     __ sub(temp1, temp1, temp0);
 7321     __ add(temp1, temp1, s1, ext::uxth);
 7322 
 7323     __ lsr(temp0, temp1, 16);
 7324     __ lsl(s1, temp0, 4);
 7325     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7327 
 7328     __ subs(temp0, s1, base);
 7329     __ csel(s1, temp0, s1, Assembler::HS);
 7330 
 7331     // s2 = s2 % BASE
 7332     __ lsr(temp0, s2, 16);
 7333     __ lsl(temp1, temp0, 4);
 7334     __ sub(temp1, temp1, temp0);
 7335     __ add(temp1, temp1, s2, ext::uxth);
 7336 
 7337     __ lsr(temp0, temp1, 16);
 7338     __ lsl(s2, temp0, 4);
 7339     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7341 
 7342     __ subs(temp0, s2, base);
 7343     __ csel(s2, temp0, s2, Assembler::HS);
 7344 
 7345     // Combine lower bits and higher bits
 7346     __ bind(L_combine);
 7347     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7348 
 7349     __ ret(lr);
 7350 
 7351     return start;
 7352   }
 7353 
 7354   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7355           Register temp0, Register temp1, FloatRegister vbytes,
 7356           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7357     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7358     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7359     // In non-vectorized code, we update s1 and s2 as:
 7360     //   s1 <- s1 + b1
 7361     //   s2 <- s2 + s1
 7362     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 7364     //   ...
 7365     //   s1 <- s1 + b16
 7366     //   s2 <- s2 + s1
 7367     // Putting above assignments together, we have:
 7368     //   s1_new = s1 + b1 + b2 + ... + b16
 7369     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7370     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7371     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
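    //
    // n.b. vtable holds the weights 16 down to 1, so the umull/umlal pair
    // below computes the dot product (b1, ..., b16) . (16, ..., 1) in two
    // halves, and the uaddlv instructions fold that (and the plain byte sum
    // for vs1acc) down to scalars.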
 7372     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7373 
 7374     // s2 = s2 + s1 * 16
 7375     __ add(s2, s2, s1, Assembler::LSL, 4);
 7376 
 7377     // vs1acc = b1 + b2 + b3 + ... + b16
 7378     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7379     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7380     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7381     __ uaddlv(vs1acc, __ T16B, vbytes);
 7382     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7383 
 7384     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7385     __ fmovd(temp0, vs1acc);
 7386     __ fmovd(temp1, vs2acc);
 7387     __ add(s1, s1, temp0);
 7388     __ add(s2, s2, temp1);
 7389   }
 7390 
 7391   /**
 7392    *  Arguments:
 7393    *
 7394    *  Input:
 7395    *    c_rarg0   - x address
 7396    *    c_rarg1   - x length
 7397    *    c_rarg2   - y address
 7398    *    c_rarg3   - y length
 7399    *    c_rarg4   - z address
 7400    */
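  // n.b. this computes z = x * y, where x and y are big-endian int-limb
  // magnitudes of xlen and ylen words and z provides xlen + ylen words of
  // result space, matching BigInteger's multiplyToLen.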
 7401   address generate_multiplyToLen() {
 7402     __ align(CodeEntryAlignment);
 7403     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 7404     StubCodeMark mark(this, stub_id);
 7405 
 7406     address start = __ pc();
 7407  
 7408     if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 7409       return start;
 7410     }
 7411     const Register x     = r0;
 7412     const Register xlen  = r1;
 7413     const Register y     = r2;
 7414     const Register ylen  = r3;
 7415     const Register z     = r4;
 7416 
 7417     const Register tmp0  = r5;
 7418     const Register tmp1  = r10;
 7419     const Register tmp2  = r11;
 7420     const Register tmp3  = r12;
 7421     const Register tmp4  = r13;
 7422     const Register tmp5  = r14;
 7423     const Register tmp6  = r15;
 7424     const Register tmp7  = r16;
 7425 
 7426     BLOCK_COMMENT("Entry:");
 7427     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7428     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7429     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7430     __ ret(lr);
 7431 
 7432     AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 7433     return start;
 7434   }
 7435 
 7436   address generate_squareToLen() {
 7437     // squareToLen algorithm for sizes 1..127 described in java code works
 7438     // faster than multiply_to_len on some CPUs and slower on others, but
 7439     // multiply_to_len shows a bit better overall results
 7440     __ align(CodeEntryAlignment);
 7441     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 7442     StubCodeMark mark(this, stub_id);
 7443     address start = __ pc();
 7444 
 7445     if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 7446       return start;
 7447     }
 7448     const Register x     = r0;
 7449     const Register xlen  = r1;
 7450     const Register z     = r2;
 7451     const Register y     = r4; // == x
 7452     const Register ylen  = r5; // == xlen
 7453 
 7454     const Register tmp0  = r3;
 7455     const Register tmp1  = r10;
 7456     const Register tmp2  = r11;
 7457     const Register tmp3  = r12;
 7458     const Register tmp4  = r13;
 7459     const Register tmp5  = r14;
 7460     const Register tmp6  = r15;
 7461     const Register tmp7  = r16;
 7462 
 7463     RegSet spilled_regs = RegSet::of(y, ylen);
 7464     BLOCK_COMMENT("Entry:");
 7465     __ enter();
 7466     __ push(spilled_regs, sp);
 7467     __ mov(y, x);
 7468     __ mov(ylen, xlen);
 7469     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7470     __ pop(spilled_regs, sp);
 7471     __ leave();
 7472     __ ret(lr);
 7473 
 7474     AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 7475     return start;
 7476   }
 7477 
 7478   address generate_mulAdd() {
 7479     __ align(CodeEntryAlignment);
 7480     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 7481     StubCodeMark mark(this, stub_id);
 7482 
 7483     address start = __ pc();
 7484 
 7485     if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 7486       return start;
 7487     }
 7488     const Register out     = r0;
 7489     const Register in      = r1;
 7490     const Register offset  = r2;
 7491     const Register len     = r3;
 7492     const Register k       = r4;
 7493 
 7494     BLOCK_COMMENT("Entry:");
 7495     __ enter();
 7496     __ mul_add(out, in, offset, len, k);
 7497     __ leave();
 7498     __ ret(lr);
 7499 
 7500     AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 7501     return start;
 7502   }
 7503 
 7504   // Arguments:
 7505   //
 7506   // Input:
 7507   //   c_rarg0   - newArr address
 7508   //   c_rarg1   - oldArr address
 7509   //   c_rarg2   - newIdx
 7510   //   c_rarg3   - shiftCount
 7511   //   c_rarg4   - numIter
 7512   //
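  // For reference, a rough scalar equivalent of the shift performed below
  // (in the spirit of BigInteger.shiftRightImplWorker; index handling is
  // illustrative):
  //
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] =
  //         (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount));
  //   }
  //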
 7513   address generate_bigIntegerRightShift() {
 7514     __ align(CodeEntryAlignment);
 7515     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 7516     StubCodeMark mark(this, stub_id);
 7517     address start = __ pc();
 7518 
 7519     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7520 
 7521     Register newArr        = c_rarg0;
 7522     Register oldArr        = c_rarg1;
 7523     Register newIdx        = c_rarg2;
 7524     Register shiftCount    = c_rarg3;
 7525     Register numIter       = c_rarg4;
 7526     Register idx           = numIter;
 7527 
 7528     Register newArrCur     = rscratch1;
 7529     Register shiftRevCount = rscratch2;
 7530     Register oldArrCur     = r13;
 7531     Register oldArrNext    = r14;
 7532 
 7533     FloatRegister oldElem0        = v0;
 7534     FloatRegister oldElem1        = v1;
 7535     FloatRegister newElem         = v2;
 7536     FloatRegister shiftVCount     = v3;
 7537     FloatRegister shiftVRevCount  = v4;
 7538 
 7539     __ cbz(idx, Exit);
 7540 
 7541     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7542 
 7543     // left shift count
 7544     __ movw(shiftRevCount, 32);
 7545     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7546 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 7548     __ cmp(numIter, (u1)4);
 7549     __ br(Assembler::LT, ShiftThree);
 7550 
 7551     __ dup(shiftVCount,    __ T4S, shiftCount);
 7552     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7553     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7554 
 7555     __ BIND(ShiftSIMDLoop);
 7556 
 7557     // Calculate the load addresses
 7558     __ sub(idx, idx, 4);
 7559     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7560     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7561     __ add(oldArrCur,  oldArrNext, 4);
 7562 
 7563     // Load 4 words and process
 7564     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7565     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7566     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7567     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7568     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7569     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7570 
 7571     __ cmp(idx, (u1)4);
 7572     __ br(Assembler::LT, ShiftTwoLoop);
 7573     __ b(ShiftSIMDLoop);
 7574 
 7575     __ BIND(ShiftTwoLoop);
 7576     __ cbz(idx, Exit);
 7577     __ cmp(idx, (u1)1);
 7578     __ br(Assembler::EQ, ShiftOne);
 7579 
 7580     // Calculate the load addresses
 7581     __ sub(idx, idx, 2);
 7582     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7583     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7584     __ add(oldArrCur,  oldArrNext, 4);
 7585 
 7586     // Load 2 words and process
 7587     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7588     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7589     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7590     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7591     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7592     __ st1(newElem,   __ T2S, Address(newArrCur));
 7593     __ b(ShiftTwoLoop);
 7594 
 7595     __ BIND(ShiftThree);
 7596     __ tbz(idx, 1, ShiftOne);
 7597     __ tbz(idx, 0, ShiftTwo);
 7598     __ ldrw(r10,  Address(oldArr, 12));
 7599     __ ldrw(r11,  Address(oldArr, 8));
 7600     __ lsrvw(r10, r10, shiftCount);
 7601     __ lslvw(r11, r11, shiftRevCount);
 7602     __ orrw(r12,  r10, r11);
 7603     __ strw(r12,  Address(newArr, 8));
 7604 
 7605     __ BIND(ShiftTwo);
 7606     __ ldrw(r10,  Address(oldArr, 8));
 7607     __ ldrw(r11,  Address(oldArr, 4));
 7608     __ lsrvw(r10, r10, shiftCount);
 7609     __ lslvw(r11, r11, shiftRevCount);
 7610     __ orrw(r12,  r10, r11);
 7611     __ strw(r12,  Address(newArr, 4));
 7612 
 7613     __ BIND(ShiftOne);
 7614     __ ldrw(r10,  Address(oldArr, 4));
 7615     __ ldrw(r11,  Address(oldArr));
 7616     __ lsrvw(r10, r10, shiftCount);
 7617     __ lslvw(r11, r11, shiftRevCount);
 7618     __ orrw(r12,  r10, r11);
 7619     __ strw(r12,  Address(newArr));
 7620 
 7621     __ BIND(Exit);
 7622     __ ret(lr);
 7623 
 7624     return start;
 7625   }
 7626 
 7627   // Arguments:
 7628   //
 7629   // Input:
 7630   //   c_rarg0   - newArr address
 7631   //   c_rarg1   - oldArr address
 7632   //   c_rarg2   - newIdx
 7633   //   c_rarg3   - shiftCount
 7634   //   c_rarg4   - numIter
 7635   //
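  // For reference, a rough scalar equivalent of the shift performed below
  // (in the spirit of BigInteger.shiftLeftImplWorker; index handling is
  // illustrative):
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] =
  //         (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount));
  //   }
  //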
 7636   address generate_bigIntegerLeftShift() {
 7637     __ align(CodeEntryAlignment);
 7638     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 7639     StubCodeMark mark(this, stub_id);
 7640     address start = __ pc();
 7641 
 7642     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7643 
 7644     Register newArr        = c_rarg0;
 7645     Register oldArr        = c_rarg1;
 7646     Register newIdx        = c_rarg2;
 7647     Register shiftCount    = c_rarg3;
 7648     Register numIter       = c_rarg4;
 7649 
 7650     Register shiftRevCount = rscratch1;
 7651     Register oldArrNext    = rscratch2;
 7652 
 7653     FloatRegister oldElem0        = v0;
 7654     FloatRegister oldElem1        = v1;
 7655     FloatRegister newElem         = v2;
 7656     FloatRegister shiftVCount     = v3;
 7657     FloatRegister shiftVRevCount  = v4;
 7658 
 7659     __ cbz(numIter, Exit);
 7660 
 7661     __ add(oldArrNext, oldArr, 4);
 7662     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7663 
 7664     // right shift count
 7665     __ movw(shiftRevCount, 32);
 7666     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7667 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
 7669     __ cmp(numIter, (u1)4);
 7670     __ br(Assembler::LT, ShiftThree);
 7671 
 7672     __ dup(shiftVCount,     __ T4S, shiftCount);
 7673     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 7674     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 7675 
 7676     __ BIND(ShiftSIMDLoop);
 7677 
 7678     // load 4 words and process
 7679     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 7680     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 7681     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7682     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7683     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7684     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 7685     __ sub(numIter,   numIter, 4);
 7686 
 7687     __ cmp(numIter, (u1)4);
 7688     __ br(Assembler::LT, ShiftTwoLoop);
 7689     __ b(ShiftSIMDLoop);
 7690 
 7691     __ BIND(ShiftTwoLoop);
 7692     __ cbz(numIter, Exit);
 7693     __ cmp(numIter, (u1)1);
 7694     __ br(Assembler::EQ, ShiftOne);
 7695 
 7696     // load 2 words and process
 7697     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 7698     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 7699     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 7700     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 7701     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 7702     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 7703     __ sub(numIter,   numIter, 2);
 7704     __ b(ShiftTwoLoop);
 7705 
 7706     __ BIND(ShiftThree);
 7707     __ ldrw(r10,  __ post(oldArr, 4));
 7708     __ ldrw(r11,  __ post(oldArrNext, 4));
 7709     __ lslvw(r10, r10, shiftCount);
 7710     __ lsrvw(r11, r11, shiftRevCount);
 7711     __ orrw(r12,  r10, r11);
 7712     __ strw(r12,  __ post(newArr, 4));
 7713     __ tbz(numIter, 1, Exit);
 7714     __ tbz(numIter, 0, ShiftOne);
 7715 
 7716     __ BIND(ShiftTwo);
 7717     __ ldrw(r10,  __ post(oldArr, 4));
 7718     __ ldrw(r11,  __ post(oldArrNext, 4));
 7719     __ lslvw(r10, r10, shiftCount);
 7720     __ lsrvw(r11, r11, shiftRevCount);
 7721     __ orrw(r12,  r10, r11);
 7722     __ strw(r12,  __ post(newArr, 4));
 7723 
 7724     __ BIND(ShiftOne);
 7725     __ ldrw(r10,  Address(oldArr));
 7726     __ ldrw(r11,  Address(oldArrNext));
 7727     __ lslvw(r10, r10, shiftCount);
 7728     __ lsrvw(r11, r11, shiftRevCount);
 7729     __ orrw(r12,  r10, r11);
 7730     __ strw(r12,  Address(newArr));
 7731 
 7732     __ BIND(Exit);
 7733     __ ret(lr);
 7734 
 7735     return start;
 7736   }
 7737 
 7738   address generate_count_positives(address &count_positives_long) {
 7739     const u1 large_loop_size = 64;
 7740     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 7741     int dcache_line = VM_Version::dcache_line_size();
 7742 
 7743     Register ary1 = r1, len = r2, result = r0;
 7744 
 7745     __ align(CodeEntryAlignment);
 7746 
 7747     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 7748     StubCodeMark mark(this, stub_id);
 7749 
 7750     address entry = __ pc();
 7751 
 7752     __ enter();
 7753     // precondition: a copy of len is already in result
 7754     // __ mov(result, len);
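    // The stub returns, in result, the number of leading bytes of ary1 that
    // are known to be non-negative (top bit clear). When a negative byte is
    // found the count may be rounded down to the start of the chunk that
    // contained it; callers of countPositives tolerate this conservative
    // answer.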
 7755 
 7756   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 7757         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 7758 
 7759   __ cmp(len, (u1)15);
 7760   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 7763   __ add(ary1, ary1, len);
 7764   __ subs(len, len, 8);
 7765   __ br(Assembler::GT, LEN_OVER_8);
 7766   __ ldr(rscratch2, Address(ary1, -8));
 7767   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 7768   __ lsrv(rscratch2, rscratch2, rscratch1);
 7769   __ tst(rscratch2, UPPER_BIT_MASK);
 7770   __ csel(result, zr, result, Assembler::NE);
 7771   __ leave();
 7772   __ ret(lr);
 7773   __ bind(LEN_OVER_8);
 7774   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 7775   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 7776   __ tst(rscratch2, UPPER_BIT_MASK);
 7777   __ br(Assembler::NE, RET_NO_POP);
 7778   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 7779   __ lsrv(rscratch1, rscratch1, rscratch2);
 7780   __ tst(rscratch1, UPPER_BIT_MASK);
 7781   __ bind(RET_NO_POP);
 7782   __ csel(result, zr, result, Assembler::NE);
 7783   __ leave();
 7784   __ ret(lr);
 7785 
 7786   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 7787   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 7788 
 7789   count_positives_long = __ pc(); // 2nd entry point
 7790 
 7791   __ enter();
 7792 
 7793   __ bind(LEN_OVER_15);
 7794     __ push(spilled_regs, sp);
 7795     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 7796     __ cbz(rscratch2, ALIGNED);
 7797     __ ldp(tmp6, tmp1, Address(ary1));
 7798     __ mov(tmp5, 16);
 7799     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 7800     __ add(ary1, ary1, rscratch1);
 7801     __ orr(tmp6, tmp6, tmp1);
 7802     __ tst(tmp6, UPPER_BIT_MASK);
 7803     __ br(Assembler::NE, RET_ADJUST);
 7804     __ sub(len, len, rscratch1);
 7805 
 7806   __ bind(ALIGNED);
 7807     __ cmp(len, large_loop_size);
 7808     __ br(Assembler::LT, CHECK_16);
 7809     // Perform 16-byte load as early return in pre-loop to handle situation
 7810     // when initially aligned large array has negative values at starting bytes,
 7811     // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
 7812     // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to the early loads, fewer instructions
    // and fewer branches in LARGE_LOOP.
 7815     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 7816     __ sub(len, len, 16);
 7817     __ orr(tmp6, tmp6, tmp1);
 7818     __ tst(tmp6, UPPER_BIT_MASK);
 7819     __ br(Assembler::NE, RET_ADJUST_16);
 7820     __ cmp(len, large_loop_size);
 7821     __ br(Assembler::LT, CHECK_16);
 7822 
 7823     if (SoftwarePrefetchHintDistance >= 0
 7824         && SoftwarePrefetchHintDistance >= dcache_line) {
 7825       // initial prefetch
 7826       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 7827     }
 7828   __ bind(LARGE_LOOP);
 7829     if (SoftwarePrefetchHintDistance >= 0) {
 7830       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 7831     }
    // Issue the load instructions first, since this can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);"
    // (one for each ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches. The downside is that this disables early return, so all 64
    // bytes are loaded and checked every time.
 7837     __ ldp(tmp2, tmp3, Address(ary1));
 7838     __ ldp(tmp4, tmp5, Address(ary1, 16));
 7839     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 7840     __ ldp(tmp6, tmp1, Address(ary1, 48));
 7841     __ add(ary1, ary1, large_loop_size);
 7842     __ sub(len, len, large_loop_size);
 7843     __ orr(tmp2, tmp2, tmp3);
 7844     __ orr(tmp4, tmp4, tmp5);
 7845     __ orr(rscratch1, rscratch1, rscratch2);
 7846     __ orr(tmp6, tmp6, tmp1);
 7847     __ orr(tmp2, tmp2, tmp4);
 7848     __ orr(rscratch1, rscratch1, tmp6);
 7849     __ orr(tmp2, tmp2, rscratch1);
 7850     __ tst(tmp2, UPPER_BIT_MASK);
 7851     __ br(Assembler::NE, RET_ADJUST_LONG);
 7852     __ cmp(len, large_loop_size);
 7853     __ br(Assembler::GE, LARGE_LOOP);
 7854 
 7855   __ bind(CHECK_16); // small 16-byte load pre-loop
 7856     __ cmp(len, (u1)16);
 7857     __ br(Assembler::LT, POST_LOOP16);
 7858 
 7859   __ bind(LOOP16); // small 16-byte load loop
 7860     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 7861     __ sub(len, len, 16);
 7862     __ orr(tmp2, tmp2, tmp3);
 7863     __ tst(tmp2, UPPER_BIT_MASK);
 7864     __ br(Assembler::NE, RET_ADJUST_16);
 7865     __ cmp(len, (u1)16);
 7866     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 7867 
 7868   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 7869     __ cmp(len, (u1)8);
 7870     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 7871     __ ldr(tmp3, Address(__ post(ary1, 8)));
 7872     __ tst(tmp3, UPPER_BIT_MASK);
 7873     __ br(Assembler::NE, RET_ADJUST);
 7874     __ sub(len, len, 8);
 7875 
 7876   __ bind(POST_LOOP16_LOAD_TAIL);
 7877     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 7878     __ ldr(tmp1, Address(ary1));
 7879     __ mov(tmp2, 64);
 7880     __ sub(tmp4, tmp2, len, __ LSL, 3);
 7881     __ lslv(tmp1, tmp1, tmp4);
 7882     __ tst(tmp1, UPPER_BIT_MASK);
 7883     __ br(Assembler::NE, RET_ADJUST);
 7884     // Fallthrough
 7885 
 7886   __ bind(RET_LEN);
 7887     __ pop(spilled_regs, sp);
 7888     __ leave();
 7889     __ ret(lr);
 7890 
    // the difference (result - len) is the count of bytes that are
    // guaranteed to be positive
 7893 
 7894   __ bind(RET_ADJUST_LONG);
 7895     __ add(len, len, (u1)(large_loop_size - 16));
 7896   __ bind(RET_ADJUST_16);
 7897     __ add(len, len, 16);
 7898   __ bind(RET_ADJUST);
 7899     __ pop(spilled_regs, sp);
 7900     __ leave();
 7901     __ sub(result, result, len);
 7902     __ ret(lr);
 7903 
 7904     return entry;
 7905   }
 7906 
 7907   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 7908         bool usePrefetch, Label &NOT_EQUAL) {
 7909     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7910         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7911         tmp7 = r12, tmp8 = r13;
 7912     Label LOOP;
 7913 
 7914     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7915     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7916     __ bind(LOOP);
 7917     if (usePrefetch) {
 7918       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7919       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7920     }
 7921     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7922     __ eor(tmp1, tmp1, tmp2);
 7923     __ eor(tmp3, tmp3, tmp4);
 7924     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7925     __ orr(tmp1, tmp1, tmp3);
 7926     __ cbnz(tmp1, NOT_EQUAL);
 7927     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7928     __ eor(tmp5, tmp5, tmp6);
 7929     __ eor(tmp7, tmp7, tmp8);
 7930     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 7931     __ orr(tmp5, tmp5, tmp7);
 7932     __ cbnz(tmp5, NOT_EQUAL);
 7933     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 7934     __ eor(tmp1, tmp1, tmp2);
 7935     __ eor(tmp3, tmp3, tmp4);
 7936     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 7937     __ orr(tmp1, tmp1, tmp3);
 7938     __ cbnz(tmp1, NOT_EQUAL);
 7939     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 7940     __ eor(tmp5, tmp5, tmp6);
 7941     __ sub(cnt1, cnt1, 8 * wordSize);
 7942     __ eor(tmp7, tmp7, tmp8);
 7943     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
 7946     __ subs(tmp6, cnt1, loopThreshold);
 7947     __ orr(tmp5, tmp5, tmp7);
 7948     __ cbnz(tmp5, NOT_EQUAL);
 7949     __ br(__ GE, LOOP);
 7950     // post-loop
 7951     __ eor(tmp1, tmp1, tmp2);
 7952     __ eor(tmp3, tmp3, tmp4);
 7953     __ orr(tmp1, tmp1, tmp3);
 7954     __ sub(cnt1, cnt1, 2 * wordSize);
 7955     __ cbnz(tmp1, NOT_EQUAL);
 7956   }
 7957 
 7958   void generate_large_array_equals_loop_simd(int loopThreshold,
 7959         bool usePrefetch, Label &NOT_EQUAL) {
 7960     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7961         tmp2 = rscratch2;
 7962     Label LOOP;
 7963 
 7964     __ bind(LOOP);
 7965     if (usePrefetch) {
 7966       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 7967       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 7968     }
 7969     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 7970     __ sub(cnt1, cnt1, 8 * wordSize);
 7971     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 7972     __ subs(tmp1, cnt1, loopThreshold);
 7973     __ eor(v0, __ T16B, v0, v4);
 7974     __ eor(v1, __ T16B, v1, v5);
 7975     __ eor(v2, __ T16B, v2, v6);
 7976     __ eor(v3, __ T16B, v3, v7);
 7977     __ orr(v0, __ T16B, v0, v1);
 7978     __ orr(v1, __ T16B, v2, v3);
 7979     __ orr(v0, __ T16B, v0, v1);
 7980     __ umov(tmp1, v0, __ D, 0);
 7981     __ umov(tmp2, v0, __ D, 1);
 7982     __ orr(tmp1, tmp1, tmp2);
 7983     __ cbnz(tmp1, NOT_EQUAL);
 7984     __ br(__ GE, LOOP);
 7985   }
 7986 
 7987   // a1 = r1 - array1 address
 7988   // a2 = r2 - array2 address
 7989   // result = r0 - return value. Already contains "false"
 7990   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 7991   // r3-r5 are reserved temporary registers
 7992   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 7993   address generate_large_array_equals() {
 7994     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 7995         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 7996         tmp7 = r12, tmp8 = r13;
 7997     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 7998         SMALL_LOOP, POST_LOOP;
 7999     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8000     // calculate if at least 32 prefetched bytes are used
 8001     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8002     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8003     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8004     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8005         tmp5, tmp6, tmp7, tmp8);
 8006 
 8007     __ align(CodeEntryAlignment);
 8008 
 8009     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 8010     StubCodeMark mark(this, stub_id);
 8011 
 8012     address entry = __ pc();
 8013     __ enter();
 8014     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8015     // also advance pointers to use post-increment instead of pre-increment
 8016     __ add(a1, a1, wordSize);
 8017     __ add(a2, a2, wordSize);
 8018     if (AvoidUnalignedAccesses) {
      // both implementations (SIMD/nonSIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so we can do one additional
      // 8-byte load, if needed, for the first address to make it 16-byte
      // aligned.
 8024       Label ALIGNED16;
 8025       __ tbz(a1, 3, ALIGNED16);
 8026       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8027       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8028       __ sub(cnt1, cnt1, wordSize);
 8029       __ eor(tmp1, tmp1, tmp2);
 8030       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8031       __ bind(ALIGNED16);
 8032     }
 8033     if (UseSIMDForArrayEquals) {
 8034       if (SoftwarePrefetchHintDistance >= 0) {
 8035         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8036         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8037         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8038             /* prfm = */ true, NOT_EQUAL);
 8039         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8040         __ br(__ LT, TAIL);
 8041       }
 8042       __ bind(NO_PREFETCH_LARGE_LOOP);
 8043       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8044           /* prfm = */ false, NOT_EQUAL);
 8045     } else {
 8046       __ push(spilled_regs, sp);
 8047       if (SoftwarePrefetchHintDistance >= 0) {
 8048         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8049         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8050         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8051             /* prfm = */ true, NOT_EQUAL);
 8052         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8053         __ br(__ LT, TAIL);
 8054       }
 8055       __ bind(NO_PREFETCH_LARGE_LOOP);
 8056       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8057           /* prfm = */ false, NOT_EQUAL);
 8058     }
 8059     __ bind(TAIL);
 8060       __ cbz(cnt1, EQUAL);
 8061       __ subs(cnt1, cnt1, wordSize);
 8062       __ br(__ LE, POST_LOOP);
 8063     __ bind(SMALL_LOOP);
 8064       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8065       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8066       __ subs(cnt1, cnt1, wordSize);
 8067       __ eor(tmp1, tmp1, tmp2);
 8068       __ cbnz(tmp1, NOT_EQUAL);
 8069       __ br(__ GT, SMALL_LOOP);
 8070     __ bind(POST_LOOP);
 8071       __ ldr(tmp1, Address(a1, cnt1));
 8072       __ ldr(tmp2, Address(a2, cnt1));
 8073       __ eor(tmp1, tmp1, tmp2);
 8074       __ cbnz(tmp1, NOT_EQUAL);
 8075     __ bind(EQUAL);
 8076       __ mov(result, true);
 8077     __ bind(NOT_EQUAL);
 8078       if (!UseSIMDForArrayEquals) {
 8079         __ pop(spilled_regs, sp);
 8080       }
 8081     __ bind(NOT_EQUAL_NO_POP);
 8082     __ leave();
 8083     __ ret(lr);
 8084     return entry;
 8085   }
 8086 
 8087   // result = r0 - return value. Contains initial hashcode value on entry.
 8088   // ary = r1 - array address
 8089   // cnt = r2 - elements count
 8090   // Clobbers: v0-v13, rscratch1, rscratch2
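  //
  // The scalar hash being computed is
  //   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1]
  // The loops below keep partial sums in the four 4S lanes of the vmul
  // accumulators, scaling them by a power of 31 (vpowm) as each chunk of
  // elements is folded in; the epilogues then weight the lanes by
  // <31^3, 31^2, 31^1, 31^0> (vpow) and add across lanes to recover the
  // scalar result.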
 8091   address generate_large_arrays_hashcode(BasicType eltype) {
 8092     const Register result = r0, ary = r1, cnt = r2;
 8093     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8094     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8095     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8096     const FloatRegister vpowm = v13;
 8097 
 8098     ARRAYS_HASHCODE_REGISTERS;
 8099 
 8100     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8101 
 8102     unsigned int vf; // vectorization factor
 8103     bool multiply_by_halves;
 8104     Assembler::SIMD_Arrangement load_arrangement;
 8105     switch (eltype) {
 8106     case T_BOOLEAN:
 8107     case T_BYTE:
 8108       load_arrangement = Assembler::T8B;
 8109       multiply_by_halves = true;
 8110       vf = 8;
 8111       break;
 8112     case T_CHAR:
 8113     case T_SHORT:
 8114       load_arrangement = Assembler::T8H;
 8115       multiply_by_halves = true;
 8116       vf = 8;
 8117       break;
 8118     case T_INT:
 8119       load_arrangement = Assembler::T4S;
 8120       multiply_by_halves = false;
 8121       vf = 4;
 8122       break;
 8123     default:
 8124       ShouldNotReachHere();
 8125     }
 8126 
 8127     // Unroll factor
 8128     const unsigned uf = 4;
 8129 
 8130     // Effective vectorization factor
 8131     const unsigned evf = vf * uf;
 8132 
 8133     __ align(CodeEntryAlignment);
 8134 
 8135     StubGenStubId stub_id;
 8136     switch (eltype) {
 8137     case T_BOOLEAN:
 8138       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 8139       break;
 8140     case T_BYTE:
 8141       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 8142       break;
 8143     case T_CHAR:
 8144       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 8145       break;
 8146     case T_SHORT:
 8147       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 8148       break;
 8149     case T_INT:
 8150       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 8151       break;
 8152     default:
 8153       stub_id = StubGenStubId::NO_STUBID;
 8154       ShouldNotReachHere();
 8155     };
 8156 
 8157     StubCodeMark mark(this, stub_id);
 8158 
 8159     address entry = __ pc();
 8160     __ enter();
 8161 
    // Pack the powers 31^3 .. 31^0 into the four S lanes of a single SIMD register. The register
    // is used in the epilogues of both the SMALL and LARGE loops; its initialization is hoisted
    // here and its value must not change while either loop runs.
 8165     __ movw(rscratch1, intpow(31U, 3));
 8166     __ mov(vpow, Assembler::S, 0, rscratch1);
 8167     __ movw(rscratch1, intpow(31U, 2));
 8168     __ mov(vpow, Assembler::S, 1, rscratch1);
 8169     __ movw(rscratch1, intpow(31U, 1));
 8170     __ mov(vpow, Assembler::S, 2, rscratch1);
 8171     __ movw(rscratch1, intpow(31U, 0));
 8172     __ mov(vpow, Assembler::S, 3, rscratch1);
 8173 
 8174     __ mov(vmul0, Assembler::T16B, 0);
 8175     __ mov(vmul0, Assembler::S, 3, result);
 8176 
 8177     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8178     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8179 
 8180     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8181     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8182 
 8183     // SMALL LOOP
 8184     __ bind(SMALL_LOOP);
 8185 
 8186     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8187     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8188     __ subsw(rscratch2, rscratch2, vf);
 8189 
 8190     if (load_arrangement == Assembler::T8B) {
 8191       // Extend 8B to 8H to be able to use vector multiply
 8192       // instructions
 8193       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8194       if (is_signed_subword_type(eltype)) {
 8195         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8196       } else {
 8197         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8198       }
 8199     }
 8200 
 8201     switch (load_arrangement) {
 8202     case Assembler::T4S:
 8203       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8204       break;
 8205     case Assembler::T8B:
 8206     case Assembler::T8H:
 8207       assert(is_subword_type(eltype), "subword type expected");
 8208       if (is_signed_subword_type(eltype)) {
 8209         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8210       } else {
 8211         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8212       }
 8213       break;
 8214     default:
 8215       __ should_not_reach_here();
 8216     }
 8217 
 8218     // Process the upper half of a vector
 8219     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8220       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8221       if (is_signed_subword_type(eltype)) {
 8222         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8223       } else {
 8224         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8225       }
 8226     }
 8227 
 8228     __ br(Assembler::HI, SMALL_LOOP);
 8229 
    // SMALL LOOP'S EPILOGUE
 8231     __ lsr(rscratch2, cnt, exact_log2(evf));
 8232     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8233 
 8234     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8235     __ addv(vmul0, Assembler::T4S, vmul0);
 8236     __ umov(result, vmul0, Assembler::S, 0);
 8237 
 8238     // TAIL
 8239     __ bind(TAIL);
 8240 
    // The andr computes cnt % vf. The subtract, shifted left by 3 (or by 4 on Cortex-A53), skips
    // the first vf - 1 - (cnt % vf) load + madd pairs below, i.e. only cnt % vf load + madd pairs
    // are executed.
 8243     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8244     __ andr(rscratch2, cnt, vf - 1);
 8245     __ bind(TAIL_SHORTCUT);
 8246     __ adr(rscratch1, BR_BASE);
    // On Cortex-A53 the shift is 4 because 2 nops are generated per iteration.
 8248     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8249     __ movw(rscratch2, 0x1f);
 8250     __ br(rscratch1);
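    // The branch lands inside the unrolled tail below. For example, with vf == 8 and
    // cnt % vf == 3 the target is 3 pairs (3 * 8 bytes, or 3 * 16 bytes on Cortex-A53) before
    // BR_BASE, so only the last 3 load + maddw pairs run.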
 8251 
 8252     for (size_t i = 0; i < vf - 1; ++i) {
 8253       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8254                                    eltype);
 8255       __ maddw(result, result, rscratch2, rscratch1);
 8256       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8257       // Generate 2nd nop to have 4 instructions per iteration.
 8258       if (VM_Version::supports_a53mac()) {
 8259         __ nop();
 8260       }
 8261     }
 8262     __ bind(BR_BASE);
 8263 
 8264     __ leave();
 8265     __ ret(lr);
 8266 
 8267     // LARGE LOOP
 8268     __ bind(LARGE_LOOP_PREHEADER);
 8269 
 8270     __ lsr(rscratch2, cnt, exact_log2(evf));
 8271 
 8272     if (multiply_by_halves) {
 8273       // 31^4 - multiplier between lower and upper parts of a register
 8274       __ movw(rscratch1, intpow(31U, vf / 2));
 8275       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8277       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8278       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8279     } else {
 8280       // 31^16
 8281       __ movw(rscratch1, intpow(31U, evf));
 8282       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8283     }
 8284 
 8285     __ mov(vmul3, Assembler::T16B, 0);
 8286     __ mov(vmul2, Assembler::T16B, 0);
 8287     __ mov(vmul1, Assembler::T16B, 0);
 8288 
 8289     __ bind(LARGE_LOOP);
 8290 
 8291     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8292     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8293     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8294     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8295 
 8296     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8297            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8298 
 8299     if (load_arrangement == Assembler::T8B) {
 8300       // Extend 8B to 8H to be able to use vector multiply
 8301       // instructions
 8302       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8303       if (is_signed_subword_type(eltype)) {
 8304         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8305         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8306         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8307         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8308       } else {
 8309         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8310         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8311         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8312         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8313       }
 8314     }
 8315 
 8316     switch (load_arrangement) {
 8317     case Assembler::T4S:
 8318       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8319       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8320       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8321       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8322       break;
 8323     case Assembler::T8B:
 8324     case Assembler::T8H:
 8325       assert(is_subword_type(eltype), "subword type expected");
 8326       if (is_signed_subword_type(eltype)) {
 8327         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8328         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8329         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8330         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8331       } else {
 8332         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8333         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8334         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8335         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8336       }
 8337       break;
 8338     default:
 8339       __ should_not_reach_here();
 8340     }
 8341 
 8342     // Process the upper half of a vector
 8343     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8344       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8345       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8346       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8347       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8348       if (is_signed_subword_type(eltype)) {
 8349         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8350         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8351         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8352         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8353       } else {
 8354         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8355         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8356         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8357         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8358       }
 8359     }
 8360 
 8361     __ subsw(rscratch2, rscratch2, 1);
 8362     __ br(Assembler::HI, LARGE_LOOP);
 8363 
 8364     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8365     __ addv(vmul3, Assembler::T4S, vmul3);
 8366     __ umov(result, vmul3, Assembler::S, 0);
 8367 
 8368     __ mov(rscratch2, intpow(31U, vf));
 8369 
 8370     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8371     __ addv(vmul2, Assembler::T4S, vmul2);
 8372     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8373     __ maddw(result, result, rscratch2, rscratch1);
 8374 
 8375     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8376     __ addv(vmul1, Assembler::T4S, vmul1);
 8377     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8378     __ maddw(result, result, rscratch2, rscratch1);
 8379 
 8380     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8381     __ addv(vmul0, Assembler::T4S, vmul0);
 8382     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8383     __ maddw(result, result, rscratch2, rscratch1);
 8384 
 8385     __ andr(rscratch2, cnt, vf - 1);
 8386     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8387 
 8388     __ leave();
 8389     __ ret(lr);
 8390 
 8391     return entry;
 8392   }
 8393 
 8394   address generate_dsin_dcos(bool isCos) {
 8395     __ align(CodeEntryAlignment);
 8396     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 8397     StubCodeMark mark(this, stub_id);
 8398     address start = __ pc();
 8399     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8400         (address)StubRoutines::aarch64::_two_over_pi,
 8401         (address)StubRoutines::aarch64::_pio2,
 8402         (address)StubRoutines::aarch64::_dsin_coef,
 8403         (address)StubRoutines::aarch64::_dcos_coef);
 8404     return start;
 8405   }
 8406 
 8407   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
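  // Consumes 16 Latin1 bytes via tmp2 (post-incremented by 16) and 32 UTF-16 bytes via cnt1
  // (post-incremented by 32). Expects the first 8 UTF-16 bytes pre-loaded in tmp3 and leaves the
  // next 8 bytes pre-loaded there on exit; branches to DIFF1/DIFF2 on mismatch.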
 8408   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8409       Label &DIFF2) {
 8410     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8411     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8412 
 8413     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8414     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8415     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8416     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8417 
 8418     __ fmovd(tmpL, vtmp3);
 8419     __ eor(rscratch2, tmp3, tmpL);
 8420     __ cbnz(rscratch2, DIFF2);
 8421 
 8422     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8423     __ umov(tmpL, vtmp3, __ D, 1);
 8424     __ eor(rscratch2, tmpU, tmpL);
 8425     __ cbnz(rscratch2, DIFF1);
 8426 
 8427     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8428     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8429     __ fmovd(tmpL, vtmp);
 8430     __ eor(rscratch2, tmp3, tmpL);
 8431     __ cbnz(rscratch2, DIFF2);
 8432 
 8433     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8434     __ umov(tmpL, vtmp, __ D, 1);
 8435     __ eor(rscratch2, tmpU, tmpL);
 8436     __ cbnz(rscratch2, DIFF1);
 8437   }
 8438 
 8439   // r0  = result
 8440   // r1  = str1
 8441   // r2  = cnt1
 8442   // r3  = str2
 8443   // r4  = cnt2
 8444   // r10 = tmp1
 8445   // r11 = tmp2
 8446   address generate_compare_long_string_different_encoding(bool isLU) {
 8447     __ align(CodeEntryAlignment);
 8448     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 8449     StubCodeMark mark(this, stub_id);
 8450     address entry = __ pc();
 8451     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8452         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8453         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8454     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8455         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8456     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8457     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8458 
 8459     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8460 
 8461     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8464     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8465     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8466     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8467     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // 4 symbols already loaded; the last 4 are a special case.
 8469     __ eor(rscratch2, tmp1, tmp2);
 8470     __ mov(rscratch1, tmp2);
 8471     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8472     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8473              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8474     __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer for the next Latin1 (L) load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer for the next UTF-16 (U) load
 8477 
 8478     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8479 
 8480     if (SoftwarePrefetchHintDistance >= 0) {
 8481       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8482       __ br(__ LT, NO_PREFETCH);
 8483       __ bind(LARGE_LOOP_PREFETCH);
 8484         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8485         __ mov(tmp4, 2);
 8486         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8487         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8488           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8489           __ subs(tmp4, tmp4, 1);
 8490           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8491           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8492           __ mov(tmp4, 2);
 8493         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8494           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8495           __ subs(tmp4, tmp4, 1);
 8496           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8497           __ sub(cnt2, cnt2, 64);
 8498           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8499           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8500     }
 8501     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8502     __ bind(NO_PREFETCH);
 8503     __ subs(cnt2, cnt2, 16);
 8504     __ br(__ LT, TAIL);
 8505     __ align(OptoLoopAlignment);
 8506     __ bind(SMALL_LOOP); // smaller loop
 8507       __ subs(cnt2, cnt2, 16);
 8508       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8509       __ br(__ GE, SMALL_LOOP);
 8510       __ cmn(cnt2, (u1)16);
 8511       __ br(__ EQ, LOAD_LAST);
 8512     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8513       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8514       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8515       __ ldr(tmp3, Address(cnt1, -8));
 8516       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8517       __ b(LOAD_LAST);
 8518     __ bind(DIFF2);
 8519       __ mov(tmpU, tmp3);
 8520     __ bind(DIFF1);
 8521       __ pop(spilled_regs, sp);
 8522       __ b(CALCULATE_DIFFERENCE);
 8523     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
      // so there is no need to load them again.
 8526       __ mov(tmpU, tmp3);
 8527       __ pop(spilled_regs, sp);
 8528 
 8529       // tmp2 points to the address of the last 4 Latin1 characters right now
 8530       __ ldrs(vtmp, Address(tmp2));
 8531       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8532       __ fmovd(tmpL, vtmp);
 8533 
 8534       __ eor(rscratch2, tmpU, tmpL);
 8535       __ cbz(rscratch2, DONE);
 8536 
 8537     // Find the first different characters in the longwords and
 8538     // compute their difference.
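    // rev + clz give the bit index of the lowest-addressed differing byte; rounding it down to a
    // multiple of 16 yields the shift that brings the first differing UTF-16 character into the
    // low 16 bits of tmp1 and rscratch1, where uxthw isolates it.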
 8539     __ bind(CALCULATE_DIFFERENCE);
 8540       __ rev(rscratch2, rscratch2);
 8541       __ clz(rscratch2, rscratch2);
 8542       __ andr(rscratch2, rscratch2, -16);
 8543       __ lsrv(tmp1, tmp1, rscratch2);
 8544       __ uxthw(tmp1, tmp1);
 8545       __ lsrv(rscratch1, rscratch1, rscratch2);
 8546       __ uxthw(rscratch1, rscratch1);
 8547       __ subw(result, tmp1, rscratch1);
 8548     __ bind(DONE);
 8549       __ ret(lr);
 8550     return entry;
 8551   }
 8552 
 8553   // r0 = input (float16)
 8554   // v0 = result (float)
 8555   // v1 = temporary float register
 8556   address generate_float16ToFloat() {
 8557     __ align(CodeEntryAlignment);
 8558     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 8559     StubCodeMark mark(this, stub_id);
 8560     address entry = __ pc();
 8561     BLOCK_COMMENT("Entry:");
 8562     __ flt16_to_flt(v0, r0, v1);
 8563     __ ret(lr);
 8564     return entry;
 8565   }
 8566 
 8567   // v0 = input (float)
 8568   // r0 = result (float16)
 8569   // v1 = temporary float register
 8570   address generate_floatToFloat16() {
 8571     __ align(CodeEntryAlignment);
 8572     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 8573     StubCodeMark mark(this, stub_id);
 8574     address entry = __ pc();
 8575     BLOCK_COMMENT("Entry:");
 8576     __ flt_to_flt16(r0, v0, v1);
 8577     __ ret(lr);
 8578     return entry;
 8579   }
 8580 
 8581   address generate_method_entry_barrier() {
 8582     __ align(CodeEntryAlignment);
 8583     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 8584     StubCodeMark mark(this, stub_id);
 8585 
 8586     Label deoptimize_label;
 8587 
 8588     address start = __ pc();
 8589 
 8590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8591 
 8592     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8593       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8594       // We can get here despite the nmethod being good, if we have not
 8595       // yet applied our cross modification fence (or data fence).
 8596       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8597       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8598       __ ldrw(rscratch2, rscratch2);
 8599       __ strw(rscratch2, thread_epoch_addr);
 8600       __ isb();
 8601       __ membar(__ LoadLoad);
 8602     }
 8603 
 8604     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8605 
 8606     __ enter();
 8607     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8608 
 8609     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8610 
 8611     __ push_call_clobbered_registers();
 8612 
 8613     __ mov(c_rarg0, rscratch2);
 8614     __ call_VM_leaf
 8615          (CAST_FROM_FN_PTR
 8616           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8617 
 8618     __ reset_last_Java_frame(true);
 8619 
 8620     __ mov(rscratch1, r0);
 8621 
 8622     __ pop_call_clobbered_registers();
 8623 
 8624     __ cbnz(rscratch1, deoptimize_label);
 8625 
 8626     __ leave();
 8627     __ ret(lr);
 8628 
 8629     __ BIND(deoptimize_label);
 8630 
 8631     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8632     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8633 
 8634     __ mov(sp, rscratch1);
 8635     __ br(rscratch2);
 8636 
 8637     return start;
 8638   }
 8639 
 8640   // r0  = result
 8641   // r1  = str1
 8642   // r2  = cnt1
 8643   // r3  = str2
 8644   // r4  = cnt2
 8645   // r10 = tmp1
 8646   // r11 = tmp2
 8647   address generate_compare_long_string_same_encoding(bool isLL) {
 8648     __ align(CodeEntryAlignment);
 8649     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 8650     StubCodeMark mark(this, stub_id);
 8651     address entry = __ pc();
 8652     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8653         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 8654 
 8655     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 8656 
    // exit the large loop when fewer than 64 bytes are left to read, or when the next
    // prefetch would reach beyond the end of the array
 8659     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 8660 
    // the caller pre-loads 8 bytes from each string before jumping to the stub, so compare them directly
 8662     __ eor(rscratch2, tmp1, tmp2);
 8663     __ cbnz(rscratch2, CAL_DIFFERENCE);
 8664 
 8665     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 8666     // update pointers, because of previous read
 8667     __ add(str1, str1, wordSize);
 8668     __ add(str2, str2, wordSize);
 8669     if (SoftwarePrefetchHintDistance >= 0) {
 8670       __ align(OptoLoopAlignment);
 8671       __ bind(LARGE_LOOP_PREFETCH);
 8672         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 8673         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 8674 
 8675         for (int i = 0; i < 4; i++) {
 8676           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 8677           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 8678           __ cmp(tmp1, tmp2);
 8679           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8680           __ br(Assembler::NE, DIFF);
 8681         }
 8682         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 8683         __ add(str1, str1, 64);
 8684         __ add(str2, str2, 64);
 8685         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 8686         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 8687         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 8688     }
 8689 
 8690     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 8691     __ br(Assembler::LE, LESS16);
 8692     __ align(OptoLoopAlignment);
 8693     __ bind(LOOP_COMPARE16);
 8694       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8695       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8696       __ cmp(tmp1, tmp2);
 8697       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8698       __ br(Assembler::NE, DIFF);
 8699       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8700       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8701       __ br(Assembler::LT, LESS16);
 8702 
 8703       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 8704       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 8705       __ cmp(tmp1, tmp2);
 8706       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 8707       __ br(Assembler::NE, DIFF);
 8708       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 8709       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 8710       __ br(Assembler::GE, LOOP_COMPARE16);
 8711       __ cbz(cnt2, LENGTH_DIFF);
 8712 
 8713     __ bind(LESS16);
      // compare one more 8-byte chunk if enough characters remain
 8715       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 8716       __ br(Assembler::LE, LESS8);
 8717       __ ldr(tmp1, Address(__ post(str1, 8)));
 8718       __ ldr(tmp2, Address(__ post(str2, 8)));
 8719       __ eor(rscratch2, tmp1, tmp2);
 8720       __ cbnz(rscratch2, CAL_DIFFERENCE);
 8721       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 8722 
 8723     __ bind(LESS8); // directly load last 8 bytes
 8724       if (!isLL) {
 8725         __ add(cnt2, cnt2, cnt2);
 8726       }
 8727       __ ldr(tmp1, Address(str1, cnt2));
 8728       __ ldr(tmp2, Address(str2, cnt2));
 8729       __ eor(rscratch2, tmp1, tmp2);
 8730       __ cbz(rscratch2, LENGTH_DIFF);
 8731       __ b(CAL_DIFFERENCE);
 8732 
 8733     __ bind(DIFF);
 8734       __ cmp(tmp1, tmp2);
 8735       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 8736       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 8737       // reuse rscratch2 register for the result of eor instruction
 8738       __ eor(rscratch2, tmp1, tmp2);
 8739 
 8740     __ bind(CAL_DIFFERENCE);
 8741       __ rev(rscratch2, rscratch2);
 8742       __ clz(rscratch2, rscratch2);
 8743       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 8744       __ lsrv(tmp1, tmp1, rscratch2);
 8745       __ lsrv(tmp2, tmp2, rscratch2);
 8746       if (isLL) {
 8747         __ uxtbw(tmp1, tmp1);
 8748         __ uxtbw(tmp2, tmp2);
 8749       } else {
 8750         __ uxthw(tmp1, tmp1);
 8751         __ uxthw(tmp2, tmp2);
 8752       }
 8753       __ subw(result, tmp1, tmp2);
 8754 
 8755     __ bind(LENGTH_DIFF);
 8756       __ ret(lr);
 8757     return entry;
 8758   }
 8759 
 8760   enum string_compare_mode {
 8761     LL,
 8762     LU,
 8763     UL,
 8764     UU,
 8765   };
 8766 
 8767   // The following registers are declared in aarch64.ad
 8768   // r0  = result
 8769   // r1  = str1
 8770   // r2  = cnt1
 8771   // r3  = str2
 8772   // r4  = cnt2
 8773   // r10 = tmp1
 8774   // r11 = tmp2
 8775   // z0  = ztmp1
 8776   // z1  = ztmp2
 8777   // p0  = pgtmp1
 8778   // p1  = pgtmp2
 8779   address generate_compare_long_string_sve(string_compare_mode mode) {
 8780     StubGenStubId stub_id;
 8781     switch (mode) {
 8782       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 8783       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 8784       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 8785       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 8786       default: ShouldNotReachHere();
 8787     }
 8788 
 8789     __ align(CodeEntryAlignment);
 8790     address entry = __ pc();
 8791     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8792              tmp1 = r10, tmp2 = r11;
 8793 
 8794     Label LOOP, DONE, MISMATCH;
 8795     Register vec_len = tmp1;
 8796     Register idx = tmp2;
 8797     // The minimum of the string lengths has been stored in cnt2.
 8798     Register cnt = cnt2;
 8799     FloatRegister ztmp1 = z0, ztmp2 = z1;
 8800     PRegister pgtmp1 = p0, pgtmp2 = p1;
 8801 
 8802 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 8803     switch (mode) {                                                            \
 8804       case LL:                                                                 \
 8805         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 8806         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 8807         break;                                                                 \
 8808       case LU:                                                                 \
 8809         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 8810         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8811         break;                                                                 \
 8812       case UL:                                                                 \
 8813         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8814         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 8815         break;                                                                 \
 8816       case UU:                                                                 \
 8817         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 8818         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 8819         break;                                                                 \
 8820       default:                                                                 \
 8821         ShouldNotReachHere();                                                  \
 8822     }
 8823 
 8824     StubCodeMark mark(this, stub_id);
 8825 
 8826     __ mov(idx, 0);
 8827     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8828 
 8829     if (mode == LL) {
 8830       __ sve_cntb(vec_len);
 8831     } else {
 8832       __ sve_cnth(vec_len);
 8833     }
 8834 
 8835     __ sub(rscratch1, cnt, vec_len);
 8836 
 8837     __ bind(LOOP);
 8838 
 8839       // main loop
 8840       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8841       __ add(idx, idx, vec_len);
 8842       // Compare strings.
 8843       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8844       __ br(__ NE, MISMATCH);
 8845       __ cmp(idx, rscratch1);
 8846       __ br(__ LT, LOOP);
 8847 
 8848     // post loop, last iteration
 8849     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 8850 
 8851     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 8852     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 8853     __ br(__ EQ, DONE);
 8854 
 8855     __ bind(MISMATCH);
 8856 
    // Crop the predicate to the lanes before the first mismatch; lasta then extracts the first differing characters.
 8858     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 8859     // Extract the first different characters of each string.
 8860     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 8861     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 8862 
 8863     // Compute the difference of the first different characters.
 8864     __ sub(result, rscratch1, rscratch2);
 8865 
 8866     __ bind(DONE);
 8867     __ ret(lr);
 8868 #undef LOAD_PAIR
 8869     return entry;
 8870   }
 8871 
 8872   void generate_compare_long_strings() {
 8873     if (UseSVE == 0) {
 8874       StubRoutines::aarch64::_compare_long_string_LL
 8875           = generate_compare_long_string_same_encoding(true);
 8876       StubRoutines::aarch64::_compare_long_string_UU
 8877           = generate_compare_long_string_same_encoding(false);
 8878       StubRoutines::aarch64::_compare_long_string_LU
 8879           = generate_compare_long_string_different_encoding(true);
 8880       StubRoutines::aarch64::_compare_long_string_UL
 8881           = generate_compare_long_string_different_encoding(false);
 8882     } else {
 8883       StubRoutines::aarch64::_compare_long_string_LL
 8884           = generate_compare_long_string_sve(LL);
 8885       StubRoutines::aarch64::_compare_long_string_UU
 8886           = generate_compare_long_string_sve(UU);
 8887       StubRoutines::aarch64::_compare_long_string_LU
 8888           = generate_compare_long_string_sve(LU);
 8889       StubRoutines::aarch64::_compare_long_string_UL
 8890           = generate_compare_long_string_sve(UL);
 8891     }
 8892   }
 8893 
 8894   // R0 = result
 8895   // R1 = str2
 8896   // R2 = cnt1
 8897   // R3 = str1
 8898   // R4 = cnt2
 8899   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 8900   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least its first register's worth
  // of characters loaded, skipping repeated initial loads (helps on systems with a single load
  // pipeline)
  // 2) we use a "fast" single-character search to locate the first pattern symbol with fewer
  // branches (one branch per loaded register instead of one per symbol); this is where constants
  // like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from (the sketch
  // below illustrates the trick)
  // 3) once the first register of the source string is loaded and analyzed, it can be reused to
  // search for every occurrence of the first character, saving a few loads compared to a
  // simpler-but-slower implementation
  // 4) to avoid many push/pop operations, the code below heavily re-uses, re-initializes and
  // compresses register values, which makes it larger and a bit less readable; however, most of
  // the extra operations are issued during loads or branches, so the penalty is minimal
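  //
  // The constants in (2) implement the classic SWAR zero-byte test. An illustrative C sketch for
  // the Latin1 case (not the generated code, which forms the same values with eor/sub/orr/bics):
  //
  //   uint64_t x    = chunk ^ (first * 0x0101010101010101ull); // byte is 0x00 where chars match
  //   uint64_t hits = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
  //
  // hits != 0 iff some byte of x is zero; the lowest set bit (found via rbit + clz) locates the
  // first candidate position, which is then verified character by character. ~x & 0x80.. is
  // computed as "bics" against (x | 0x7f7f..); the UTF-16 flavour uses 0x0001../0x7fff.. instead.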
 8915   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 8916     StubGenStubId stub_id;
 8917     if (str1_isL) {
 8918       if (str2_isL) {
 8919         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 8920       } else {
 8921         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 8922       }
 8923     } else {
 8924       if (str2_isL) {
 8925         ShouldNotReachHere();
 8926       } else {
 8927         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 8928       }
 8929     }
 8930     __ align(CodeEntryAlignment);
 8931     StubCodeMark mark(this, stub_id);
 8932     address entry = __ pc();
 8933 
 8934     int str1_chr_size = str1_isL ? 1 : 2;
 8935     int str2_chr_size = str2_isL ? 1 : 2;
 8936     int str1_chr_shift = str1_isL ? 0 : 1;
 8937     int str2_chr_shift = str2_isL ? 0 : 1;
 8938     bool isL = str1_isL && str2_isL;
    // parameters
 8940     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 8941     // temporary registers
 8942     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 8943     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 8944     // redefinitions
 8945     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 8946 
 8947     __ push(spilled_regs, sp);
 8948     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 8949         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 8950         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 8951         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 8952         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 8953         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because the length is >= 8 here
 8955     __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because the length is >= 8 here
 8957     __ ldr(ch2, Address(str2));
 8958     __ sub(cnt2, cnt2, cnt1);
 8959     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 8960     if (str1_isL != str2_isL) {
 8961       __ eor(v0, __ T16B, v0, v0);
 8962     }
 8963     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 8964     __ mul(first, first, tmp1);
    // check if fewer than one register's worth of characters is left to check
 8966     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 8967     if (str1_isL != str2_isL) {
 8968       __ fmovd(v1, ch1);
 8969     }
 8970     __ br(__ LE, L_SMALL);
 8971     __ eor(ch2, first, ch2);
 8972     if (str1_isL != str2_isL) {
 8973       __ zip1(v1, __ T16B, v1, v0);
 8974     }
 8975     __ sub(tmp2, ch2, tmp1);
 8976     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8977     __ bics(tmp2, tmp2, ch2);
 8978     if (str1_isL != str2_isL) {
 8979       __ fmovd(ch1, v1);
 8980     }
 8981     __ br(__ NE, L_HAS_ZERO);
 8982     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8983     __ add(result, result, wordSize/str2_chr_size);
 8984     __ add(str2, str2, wordSize);
 8985     __ br(__ LT, L_POST_LOOP);
 8986     __ BIND(L_LOOP);
 8987       __ ldr(ch2, Address(str2));
 8988       __ eor(ch2, first, ch2);
 8989       __ sub(tmp2, ch2, tmp1);
 8990       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 8991       __ bics(tmp2, tmp2, ch2);
 8992       __ br(__ NE, L_HAS_ZERO);
 8993     __ BIND(L_LOOP_PROCEED);
 8994       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 8995       __ add(str2, str2, wordSize);
 8996       __ add(result, result, wordSize/str2_chr_size);
 8997       __ br(__ GE, L_LOOP);
 8998     __ BIND(L_POST_LOOP);
 8999       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9000       __ br(__ LE, NOMATCH);
 9001       __ ldr(ch2, Address(str2));
 9002       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9003       __ eor(ch2, first, ch2);
 9004       __ sub(tmp2, ch2, tmp1);
 9005       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9006       __ mov(tmp4, -1); // all bits set
 9007       __ b(L_SMALL_PROCEED);
 9008     __ align(OptoLoopAlignment);
 9009     __ BIND(L_SMALL);
 9010       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9011       __ eor(ch2, first, ch2);
 9012       if (str1_isL != str2_isL) {
 9013         __ zip1(v1, __ T16B, v1, v0);
 9014       }
 9015       __ sub(tmp2, ch2, tmp1);
 9016       __ mov(tmp4, -1); // all bits set
 9017       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9018       if (str1_isL != str2_isL) {
 9019         __ fmovd(ch1, v1); // move converted 4 symbols
 9020       }
 9021     __ BIND(L_SMALL_PROCEED);
 9022       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9023       __ bic(tmp2, tmp2, ch2);
 9024       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9025       __ rbit(tmp2, tmp2);
 9026       __ br(__ EQ, NOMATCH);
 9027     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
 9029       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9030       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9031       if (str2_isL) { // LL
 9032         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9033         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9034         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9035         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9036         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9037       } else {
 9038         __ mov(ch2, 0xE); // all bits in byte set except last one
 9039         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9040         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9041         __ lslv(tmp2, tmp2, tmp4);
 9042         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9043         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9044         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9045         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9046       }
 9047       __ cmp(ch1, ch2);
 9048       __ mov(tmp4, wordSize/str2_chr_size);
 9049       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9050     __ BIND(L_SMALL_CMP_LOOP);
 9051       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9052                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9053       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9054                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9055       __ add(tmp4, tmp4, 1);
 9056       __ cmp(tmp4, cnt1);
 9057       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9058       __ cmp(first, ch2);
 9059       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9060     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9061       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9062       __ clz(tmp4, tmp2);
 9063       __ add(result, result, 1); // advance index
 9064       __ add(str2, str2, str2_chr_size); // advance pointer
 9065       __ b(L_SMALL_HAS_ZERO_LOOP);
 9066     __ align(OptoLoopAlignment);
 9067     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9068       __ cmp(first, ch2);
 9069       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9070       __ b(DONE);
 9071     __ align(OptoLoopAlignment);
 9072     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9073       if (str2_isL) { // LL
 9074         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9075         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9076         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9077         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9078         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9079       } else {
 9080         __ mov(ch2, 0xE); // all bits in byte set except last one
 9081         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9082         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9083         __ lslv(tmp2, tmp2, tmp4);
 9084         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9085         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9086         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9087         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9088       }
 9089       __ cmp(ch1, ch2);
 9090       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9091       __ b(DONE);
 9092     __ align(OptoLoopAlignment);
 9093     __ BIND(L_HAS_ZERO);
 9094       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
      // Now compress the two counters (cnt2 and cnt1) into one register. This is fine because
      // both counters are 32-bit and are not changed in this loop; they are restored on exit,
      // so cnt1 can be re-used inside the loop.
 9099       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9100       __ sub(result, result, 1);
 9101     __ BIND(L_HAS_ZERO_LOOP);
 9102       __ mov(cnt1, wordSize/str2_chr_size);
 9103       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9104       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9105       if (str2_isL) {
 9106         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9107         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9108         __ lslv(tmp2, tmp2, tmp4);
 9109         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9110         __ add(tmp4, tmp4, 1);
 9111         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9112         __ lsl(tmp2, tmp2, 1);
 9113         __ mov(tmp4, wordSize/str2_chr_size);
 9114       } else {
 9115         __ mov(ch2, 0xE);
 9116         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9117         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9118         __ lslv(tmp2, tmp2, tmp4);
 9119         __ add(tmp4, tmp4, 1);
 9120         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9121         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9122         __ lsl(tmp2, tmp2, 1);
 9123         __ mov(tmp4, wordSize/str2_chr_size);
 9124         __ sub(str2, str2, str2_chr_size);
 9125       }
 9126       __ cmp(ch1, ch2);
 9127       __ mov(tmp4, wordSize/str2_chr_size);
 9128       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9129     __ BIND(L_CMP_LOOP);
 9130       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9131                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9132       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9133                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9134       __ add(tmp4, tmp4, 1);
 9135       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9136       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9137       __ cmp(cnt1, ch2);
 9138       __ br(__ EQ, L_CMP_LOOP);
 9139     __ BIND(L_CMP_LOOP_NOMATCH);
      // the current candidate position did not match
 9141       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9142       __ clz(tmp4, tmp2);
 9143       __ add(str2, str2, str2_chr_size); // advance pointer
 9144       __ b(L_HAS_ZERO_LOOP);
 9145     __ align(OptoLoopAlignment);
 9146     __ BIND(L_CMP_LOOP_LAST_CMP);
 9147       __ cmp(cnt1, ch2);
 9148       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9149       __ b(DONE);
 9150     __ align(OptoLoopAlignment);
 9151     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9152       if (str2_isL) {
 9153         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9154         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9155         __ lslv(tmp2, tmp2, tmp4);
 9156         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9157         __ add(tmp4, tmp4, 1);
 9158         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9159         __ lsl(tmp2, tmp2, 1);
 9160       } else {
 9161         __ mov(ch2, 0xE);
 9162         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9163         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9164         __ lslv(tmp2, tmp2, tmp4);
 9165         __ add(tmp4, tmp4, 1);
 9166         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9167         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9168         __ lsl(tmp2, tmp2, 1);
 9169         __ sub(str2, str2, str2_chr_size);
 9170       }
 9171       __ cmp(ch1, ch2);
 9172       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9173       __ b(DONE);
 9174     __ align(OptoLoopAlignment);
 9175     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
 9176       // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
 9177       // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
 9178       // so, result was increased at max by wordSize/str2_chr_size - 1, so,
 9179       // respective high bit wasn't changed. L_LOOP_PROCEED will increase
 9180       // result by analyzed characters value, so, we can just reset lower bits
 9181       // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
 9182       // 2) restore cnt1 and cnt2 values from "compressed" cnt2
 9183       // 3) advance str2 value to represent next str2 octet. result & 7/3 is
 9184       // index of last analyzed substring inside current octet. So, str2 in at
 9185       // respective start address. We need to advance it to next octet
 9186       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9187       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9188       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9189       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9190       __ movw(cnt2, cnt2);
 9191       __ b(L_LOOP_PROCEED);
 9192     __ align(OptoLoopAlignment);
 9193     __ BIND(NOMATCH);
 9194       __ mov(result, -1);
 9195     __ BIND(DONE);
 9196       __ pop(spilled_regs, sp);
 9197       __ ret(lr);
 9198     return entry;
 9199   }
 9200 
 9201   void generate_string_indexof_stubs() {
 9202     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9203     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9204     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9205   }
 9206 
 9207   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9208       FloatRegister src1, FloatRegister src2) {
 9209     Register dst = r1;
 9210     __ zip1(v1, __ T16B, src1, v0);
 9211     __ zip2(v2, __ T16B, src1, v0);
 9212     if (generatePrfm) {
 9213       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9214     }
 9215     __ zip1(v3, __ T16B, src2, v0);
 9216     __ zip2(v4, __ T16B, src2, v0);
 9217     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9218   }
 9219 
 9220   // R0 = src
 9221   // R1 = dst
 9222   // R2 = len
 9223   // R3 = len >> 3
 9224   // V0 = 0
 9225   // v1 = loaded 8 bytes
 9226   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
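  // "Inflate" widens Latin1 bytes to UTF-16 chars: each source byte is interleaved with a zero
  // byte (zip1/zip2 against v0 == 0), so the data doubles in size on the way from src to dst.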
 9227   address generate_large_byte_array_inflate() {
 9228     __ align(CodeEntryAlignment);
 9229     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 9230     StubCodeMark mark(this, stub_id);
 9231     address entry = __ pc();
 9232     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9233     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9234     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9235 
    // do one extra 8-byte read so that the address is 16-byte aligned in most cases,
    // and so that a single store instruction can be used
 9238     __ ldrd(v2, __ post(src, 8));
 9239     __ sub(octetCounter, octetCounter, 2);
 9240     __ zip1(v1, __ T16B, v1, v0);
 9241     __ zip1(v2, __ T16B, v2, v0);
 9242     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9243     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9244     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9245     __ br(__ LE, LOOP_START);
 9246     __ b(LOOP_PRFM_START);
 9247     __ bind(LOOP_PRFM);
 9248       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9249     __ bind(LOOP_PRFM_START);
 9250       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9251       __ sub(octetCounter, octetCounter, 8);
 9252       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9253       inflate_and_store_2_fp_registers(true, v3, v4);
 9254       inflate_and_store_2_fp_registers(true, v5, v6);
 9255       __ br(__ GT, LOOP_PRFM);
 9256       __ cmp(octetCounter, (u1)8);
 9257       __ br(__ LT, DONE);
 9258     __ bind(LOOP);
 9259       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9260       __ bind(LOOP_START);
 9261       __ sub(octetCounter, octetCounter, 8);
 9262       __ cmp(octetCounter, (u1)8);
 9263       inflate_and_store_2_fp_registers(false, v3, v4);
 9264       inflate_and_store_2_fp_registers(false, v5, v6);
 9265       __ br(__ GE, LOOP);
 9266     __ bind(DONE);
 9267       __ ret(lr);
 9268     return entry;
 9269   }
 9270 
 9271   /**
 9272    *  Arguments:
 9273    *
 9274    *  Input:
 9275    *  c_rarg0   - current state address
 9276    *  c_rarg1   - H key address
 9277    *  c_rarg2   - data address
 9278    *  c_rarg3   - number of blocks
 9279    *
 9280    *  Output:
 9281    *  Updated state at c_rarg0
 9282    */
 9283   address generate_ghash_processBlocks() {
 9284     // Bafflingly, GCM uses little-endian for the byte order, but
 9285     // big-endian for the bit order.  For example, the polynomial 1 is
 9286     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9287     //
 9288     // So, we must either reverse the bytes in each word and do
 9289     // everything big-endian or reverse the bits in each byte and do
 9290     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9291     // the bits in each byte (we have an instruction, RBIT, to do
 9292     // that) and keep the data in little-endian bit order through the
 9293     // calculation, bit-reversing the inputs and outputs.
 9294 
 9295     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 9296     StubCodeMark mark(this, stub_id);
 9297     __ align(wordSize * 2);
 9298     address p = __ pc();
 9299     __ emit_int64(0x87);  // The low-order bits of the field
 9300                           // polynomial (i.e. p = z^7+z^2+z+1)
 9301                           // repeated in the low and high parts of a
 9302                           // 128-bit vector
 9303     __ emit_int64(0x87);
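    // 0x87 == z^7 + z^2 + z + 1; together with the implicit z^128 term this is the GHASH
    // reduction polynomial z^128 + z^7 + z^2 + z + 1 over GF(2).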
 9304 
 9305     __ align(CodeEntryAlignment);
 9306     address start = __ pc();
 9307 
 9308     Register state   = c_rarg0;
 9309     Register subkeyH = c_rarg1;
 9310     Register data    = c_rarg2;
 9311     Register blocks  = c_rarg3;
 9312 
 9313     FloatRegister vzr = v30;
 9314     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9315 
 9316     __ ldrq(v24, p);    // The field polynomial
 9317 
 9318     __ ldrq(v0, Address(state));
 9319     __ ldrq(v1, Address(subkeyH));
 9320 
 9321     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9322     __ rbit(v0, __ T16B, v0);
 9323     __ rev64(v1, __ T16B, v1);
 9324     __ rbit(v1, __ T16B, v1);
 9325 
 9326     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9327     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9328 
 9329     {
 9330       Label L_ghash_loop;
 9331       __ bind(L_ghash_loop);
 9332 
 9333       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9334                                                  // reversing each byte
 9335       __ rbit(v2, __ T16B, v2);
 9336       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9337 
 9338       // Multiply state in v2 by subkey in v1
 9339       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9340                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9341                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9342       // Reduce v7:v5 by the field polynomial
 9343       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9344 
 9345       __ sub(blocks, blocks, 1);
 9346       __ cbnz(blocks, L_ghash_loop);
 9347     }
 9348 
 9349     // The bit-reversed result is at this point in v0
 9350     __ rev64(v0, __ T16B, v0);
 9351     __ rbit(v0, __ T16B, v0);
 9352 
 9353     __ st1(v0, __ T16B, state);
 9354     __ ret(lr);
 9355 
 9356     return start;
 9357   }
 9358 
 9359   address generate_ghash_processBlocks_wide() {
 9360     address small = generate_ghash_processBlocks();
 9361 
 9362     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 9363     StubCodeMark mark(this, stub_id);
 9364     __ align(wordSize * 2);
 9365     address p = __ pc();
 9366     __ emit_int64(0x87);  // The low-order bits of the field
 9367                           // polynomial (i.e. p = z^7+z^2+z+1)
 9368                           // repeated in the low and high parts of a
 9369                           // 128-bit vector
 9370     __ emit_int64(0x87);
 9371 
 9372     __ align(CodeEntryAlignment);
 9373     address start = __ pc();
 9374 
 9375     Register state   = c_rarg0;
 9376     Register subkeyH = c_rarg1;
 9377     Register data    = c_rarg2;
 9378     Register blocks  = c_rarg3;
 9379 
 9380     const int unroll = 4;
 9381 
 9382     __ cmp(blocks, (unsigned char)(unroll * 2));
 9383     __ br(__ LT, small);
 9384 
 9385     if (unroll > 1) {
      // Save state before entering the routine
 9387       __ sub(sp, sp, 4 * 16);
 9388       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9389       __ sub(sp, sp, 4 * 16);
 9390       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9391     }
 9392 
 9393     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9394 
 9395     if (unroll > 1) {
 9396       // And restore state
 9397       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9398       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9399     }
 9400 
 9401     __ cmp(blocks, (unsigned char)0);
 9402     __ br(__ GT, small);
 9403 
 9404     __ ret(lr);
 9405 
 9406     return start;
 9407   }
 9408 
 9409   void generate_base64_encode_simdround(Register src, Register dst,
 9410         FloatRegister codec, u8 size) {
 9411 
 9412     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9413     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9414     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9415 
 9416     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9417 
 9418     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
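    // Split each group of 3 input bytes (b0, b1, b2) into four 6-bit indices:
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3f
    // The shift/or sequence below computes these lane-wise; tbl then maps each index through the
    // 64-entry codec table.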
 9419 
 9420     __ ushr(ind0, arrangement, in0,  2);
 9421 
 9422     __ ushr(ind1, arrangement, in1,  2);
 9423     __ shl(in0,   arrangement, in0,  6);
 9424     __ orr(ind1,  arrangement, ind1, in0);
 9425     __ ushr(ind1, arrangement, ind1, 2);
 9426 
 9427     __ ushr(ind2, arrangement, in2,  4);
 9428     __ shl(in1,   arrangement, in1,  4);
 9429     __ orr(ind2,  arrangement, in1,  ind2);
 9430     __ ushr(ind2, arrangement, ind2, 2);
 9431 
 9432     __ shl(ind3,  arrangement, in2,  2);
 9433     __ ushr(ind3, arrangement, ind3, 2);
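          // Scalar model (comment only) of the index extraction above, per
          // lane, with b0, b1, b2 the three loaded source bytes:
          //
          //   ind0 =   b0 >> 2;
          //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
          //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
          //   ind3 =   b2 & 0x3f;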
 9434 
 9435     __ tbl(out0,  arrangement, codec,  4, ind0);
 9436     __ tbl(out1,  arrangement, codec,  4, ind1);
 9437     __ tbl(out2,  arrangement, codec,  4, ind2);
 9438     __ tbl(out3,  arrangement, codec,  4, ind3);
 9439 
 9440     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9441   }
 9442 
 9443    /**
 9444    *  Arguments:
 9445    *
 9446    *  Input:
 9447    *  c_rarg0   - src_start
 9448    *  c_rarg1   - src_offset
 9449    *  c_rarg2   - src_end (exclusive end offset in the source array)
 9450    *  c_rarg3   - dest_start
 9451    *  c_rarg4   - dest_offset
 9452    *  c_rarg5   - isURL
 9453    *
 9454    */
 9455   address generate_base64_encodeBlock() {
 9456 
 9457     static const char toBase64[64] = {
 9458       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9459       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9460       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9461       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9462       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9463     };
 9464 
 9465     static const char toBase64URL[64] = {
 9466       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9467       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9468       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9469       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9470       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9471     };
 9472 
 9473     __ align(CodeEntryAlignment);
 9474     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 9475     StubCodeMark mark(this, stub_id);
 9476     address start = __ pc();
 9477 
 9478     Register src   = c_rarg0;  // source array
 9479     Register soff  = c_rarg1;  // source start offset
 9480     Register send  = c_rarg2;  // source end offset
 9481     Register dst   = c_rarg3;  // dest array
 9482     Register doff  = c_rarg4;  // position for writing to dest array
 9483     Register isURL = c_rarg5;  // Base64 or URL character set
 9484 
 9485     // c_rarg6 and c_rarg7 are free to use as temps
 9486     Register codec  = c_rarg6;
 9487     Register length = c_rarg7;
 9488 
 9489     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9490 
 9491     __ add(src, src, soff);
 9492     __ add(dst, dst, doff);
 9493     __ sub(length, send, soff);
 9494 
 9495     // load the codec base address
 9496     __ lea(codec, ExternalAddress((address) toBase64));
 9497     __ cbz(isURL, ProcessData);
 9498     __ lea(codec, ExternalAddress((address) toBase64URL));
 9499 
 9500     __ BIND(ProcessData);
 9501 
 9502     // too short to form a SIMD loop; fall back to the byte-by-byte loop
 9503     __ cmp(length, (u1)24);
 9504     __ br(Assembler::LT, Process3B);
 9505 
 9506     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9507 
 9508     __ BIND(Process48B);
 9509     __ cmp(length, (u1)48);
 9510     __ br(Assembler::LT, Process24B);
 9511     generate_base64_encode_simdround(src, dst, v0, 16);
 9512     __ sub(length, length, 48);
 9513     __ b(Process48B);
 9514 
 9515     __ BIND(Process24B);
 9516     __ cmp(length, (u1)24);
 9517     __ br(Assembler::LT, SIMDExit);
 9518     generate_base64_encode_simdround(src, dst, v0, 8);
 9519     __ sub(length, length, 24);
 9520 
 9521     __ BIND(SIMDExit);
 9522     __ cbz(length, Exit);
 9523 
 9524     __ BIND(Process3B);
 9525     //  3 src bytes, 24 bits
 9526     __ ldrb(r10, __ post(src, 1));
 9527     __ ldrb(r11, __ post(src, 1));
 9528     __ ldrb(r12, __ post(src, 1));
 9529     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9530     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9531     // codec index
 9532     __ ubfmw(r15, r12, 18, 23);
 9533     __ ubfmw(r14, r12, 12, 17);
 9534     __ ubfmw(r13, r12, 6,  11);
 9535     __ andw(r12,  r12, 63);
 9536     // look up the encoded characters in the codec table
 9537     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9538     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9539     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9540     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9541     __ strb(r15, __ post(dst, 1));
 9542     __ strb(r14, __ post(dst, 1));
 9543     __ strb(r13, __ post(dst, 1));
 9544     __ strb(r12, __ post(dst, 1));
 9545     __ sub(length, length, 3);
 9546     __ cbnz(length, Process3B);
 9547 
 9548     __ BIND(Exit);
 9549     __ ret(lr);
 9550 
 9551     return start;
 9552   }
 9553 
 9554   void generate_base64_decode_simdround(Register src, Register dst,
 9555         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9556 
 9557     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9558     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9559 
 9560     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9561     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9562 
 9563     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9564 
 9565     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9566 
 9567     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9568 
 9569     // We need an unsigned saturating subtract so that every input value in
 9570     // the range [0, 63] yields a 0 index for the higher-half lookup
 9571     __ uqsubv(decH0, __ T16B, in0, v27);
 9572     __ uqsubv(decH1, __ T16B, in1, v27);
 9573     __ uqsubv(decH2, __ T16B, in2, v27);
 9574     __ uqsubv(decH3, __ T16B, in3, v27);
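          // For example 'z' (122) maps to index 59 in the higher-half table,
          // while '0' (48) saturates to index 0, whose table entry is 0, so
          // the lower-half result survives the orr below.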
 9575 
 9576     // lower half lookup
 9577     __ tbl(decL0, arrangement, codecL, 4, in0);
 9578     __ tbl(decL1, arrangement, codecL, 4, in1);
 9579     __ tbl(decL2, arrangement, codecL, 4, in2);
 9580     __ tbl(decL3, arrangement, codecL, 4, in3);
 9581 
 9582     // higher half lookup
 9583     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9584     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9585     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9586     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9587 
 9588     // combine lower and higher
 9589     __ orr(decL0, arrangement, decL0, decH0);
 9590     __ orr(decL1, arrangement, decL1, decH1);
 9591     __ orr(decL2, arrangement, decL2, decH2);
 9592     __ orr(decL3, arrangement, decL3, decH3);
 9593 
 9594     // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
 9595     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9596     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9597     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9598     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9599     __ orr(in0, arrangement, decH0, decH1);
 9600     __ orr(in1, arrangement, decH2, decH3);
 9601     __ orr(in2, arrangement, in0,   in1);
 9602     __ umaxv(in3, arrangement, in2);
 9603     __ umov(rscratch2, in3, __ B, 0);
 9604 
 9605     // get the data to output
 9606     __ shl(out0,  arrangement, decL0, 2);
 9607     __ ushr(out1, arrangement, decL1, 4);
 9608     __ orr(out0,  arrangement, out0,  out1);
 9609     __ shl(out1,  arrangement, decL1, 4);
 9610     __ ushr(out2, arrangement, decL2, 2);
 9611     __ orr(out1,  arrangement, out1,  out2);
 9612     __ shl(out2,  arrangement, decL2, 6);
 9613     __ orr(out2,  arrangement, out2,  decL3);
 9614 
 9615     __ cbz(rscratch2, NoIllegalData);
 9616 
 9617     // handle illegal input
 9618     __ umov(r10, in2, __ D, 0);
 9619     if (size == 16) {
 9620       __ cbnz(r10, ErrorInLowerHalf);
 9621 
 9622       // illegal input is in higher half, store the lower half now.
 9623       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9624 
 9625       __ umov(r10, in2,  __ D, 1);
 9626       __ umov(r11, out0, __ D, 1);
 9627       __ umov(r12, out1, __ D, 1);
 9628       __ umov(r13, out2, __ D, 1);
 9629       __ b(StoreLegalData);
 9630 
 9631       __ BIND(ErrorInLowerHalf);
 9632     }
 9633     __ umov(r11, out0, __ D, 0);
 9634     __ umov(r12, out1, __ D, 0);
 9635     __ umov(r13, out2, __ D, 0);
 9636 
 9637     __ BIND(StoreLegalData);
 9638     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 9639     __ strb(r11, __ post(dst, 1));
 9640     __ strb(r12, __ post(dst, 1));
 9641     __ strb(r13, __ post(dst, 1));
 9642     __ lsr(r10, r10, 8);
 9643     __ lsr(r11, r11, 8);
 9644     __ lsr(r12, r12, 8);
 9645     __ lsr(r13, r13, 8);
 9646     __ b(StoreLegalData);
 9647 
 9648     __ BIND(NoIllegalData);
 9649     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 9650   }
 9651 
 9652 
 9653    /**
 9654    *  Arguments:
 9655    *
 9656    *  Input:
 9657    *  c_rarg0   - src_start
 9658    *  c_rarg1   - src_offset
 9659    *  c_rarg2   - src_end (exclusive end offset in the source array)
 9660    *  c_rarg3   - dest_start
 9661    *  c_rarg4   - dest_offset
 9662    *  c_rarg5   - isURL
 9663    *  c_rarg6   - isMIME
 9664    *
 9665    */
 9666   address generate_base64_decodeBlock() {
 9667 
 9668     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
 9669     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
 9670     // titled "Base64 decoding".
 9671 
 9672     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
 9673     // java.util.Base64, except that '=' is also treated as an illegal value here:
 9674     // java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255.
 9675     static const uint8_t fromBase64ForNoSIMD[256] = {
 9676       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9677       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9678       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9679        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9680       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9681        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 9682       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9683        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9684       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9685       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9686       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9687       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9688       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9689       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9690       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9691       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9692     };
 9693 
 9694     static const uint8_t fromBase64URLForNoSIMD[256] = {
 9695       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9696       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9697       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9698        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9699       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 9700        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 9701       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 9702        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 9703       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9704       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9705       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9706       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9707       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9708       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9709       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9710       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9711     };
 9712 
 9713     // A legal base64 code value is in the range [0, 127].  We need two table
 9714     // lookups with tbl/tbx and combine their results to get the decoded data.
 9715     // The 1st lookup uses tbl, which sets out-of-range destination bytes to 0.
 9716     // The 2nd lookup uses tbx, which leaves out-of-range destination bytes
 9717     // unchanged. Input values [64, 126] map to table indices [65, 127] in the
 9718     // second lookup. The entry at index 64 is 0, so inputs already decoded by
 9719     // the 1st lookup are left intact by the OR that combines the two.
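          // Scalar model (comment only) of the combined lookup for one input
          // byte c, with codecL holding entries [0, 63] and codecH holding
          // entries [64, 127] of the tables below:
          //
          //   lo  = (c <= 63) ? codecL[c] : 0;        // tbl
          //   idx = (c <= 63) ? 0 : c - 63;           // uqsub against 63
          //   hi  = (idx <= 63) ? codecH[idx] : idx;  // tbx keeps 64 as-is
          //   dec = lo | hi;
          //
          // Any dec value larger than 63 is then flagged as illegal input.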
 9720     static const uint8_t fromBase64ForSIMD[128] = {
 9721       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9722       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9723       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 9724        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9725         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9726        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9727       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9728        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9729     };
 9730 
 9731     static const uint8_t fromBase64URLForSIMD[128] = {
 9732       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9733       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 9734       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 9735        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 9736         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 9737        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 9738        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 9739        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 9740     };
 9741 
 9742     __ align(CodeEntryAlignment);
 9743     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 9744     StubCodeMark mark(this, stub_id);
 9745     address start = __ pc();
 9746 
 9747     Register src    = c_rarg0;  // source array
 9748     Register soff   = c_rarg1;  // source start offset
 9749     Register send   = c_rarg2;  // source end offset
 9750     Register dst    = c_rarg3;  // dest array
 9751     Register doff   = c_rarg4;  // position for writing to dest array
 9752     Register isURL  = c_rarg5;  // Base64 or URL character set
 9753     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 9754 
 9755     Register length = send;    // reuse send as length of source data to process
 9756 
 9757     Register simd_codec   = c_rarg6;
 9758     Register nosimd_codec = c_rarg7;
 9759 
 9760     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 9761 
 9762     __ enter();
 9763 
 9764     __ add(src, src, soff);
 9765     __ add(dst, dst, doff);
 9766 
 9767     __ mov(doff, dst);
 9768 
 9769     __ sub(length, send, soff);
 9770     __ bfm(length, zr, 0, 1);  // clear the two low bits: round length down to a multiple of 4
 9771 
 9772     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 9773     __ cbz(isURL, ProcessData);
 9774     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 9775 
 9776     __ BIND(ProcessData);
 9777     __ mov(rscratch1, length);
 9778     __ cmp(length, (u1)144); // 144 = 80 + 64
 9779     __ br(Assembler::LT, Process4B);
 9780 
 9781     // In the MIME case, the line length cannot be more than 76
 9782     // bytes (see RFC 2045). This is too short a block for SIMD
 9783     // to be worthwhile, so we use non-SIMD here.
 9784     __ movw(rscratch1, 79);
 9785 
 9786     __ BIND(Process4B);
 9787     __ ldrw(r14, __ post(src, 4));
 9788     __ ubfxw(r10, r14, 0,  8);
 9789     __ ubfxw(r11, r14, 8,  8);
 9790     __ ubfxw(r12, r14, 16, 8);
 9791     __ ubfxw(r13, r14, 24, 8);
 9792     // look up the decoded values in the codec table
 9793     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 9794     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 9795     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 9796     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 9797     // error detection, 255u indicates an illegal input
 9798     __ orrw(r14, r10, r11);
 9799     __ orrw(r15, r12, r13);
 9800     __ orrw(r14, r14, r15);
 9801     __ tbnz(r14, 7, Exit);
 9802     // recover the data
 9803     __ lslw(r14, r10, 10);
 9804     __ bfiw(r14, r11, 4, 6);
 9805     __ bfmw(r14, r12, 2, 5);
 9806     __ rev16w(r14, r14);
 9807     __ bfiw(r13, r12, 6, 2);
 9808     __ strh(r14, __ post(dst, 2));
 9809     __ strb(r13, __ post(dst, 1));
 9810     // non-simd loop
 9811     __ subsw(rscratch1, rscratch1, 4);
 9812     __ br(Assembler::GT, Process4B);
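          // Scalar model (comment only) of the 4B -> 3B step above, with
          // d0..d3 the decoded 6-bit values of four consecutive source bytes:
          //
          //   out[0] = (d0 << 2) | (d1 >> 4);
          //   out[1] = (d1 << 4) | (d2 >> 2);
          //   out[2] = (d2 << 6) |  d3;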
 9813 
 9814     // If we got here via the 80-byte pre-processing path, rscratch1 == -1;
 9815     // otherwise the whole input was consumed above and rscratch1 == 0.
 9816     __ cbzw(rscratch1, Exit);
 9817     __ sub(length, length, 80);
 9818 
 9819     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 9820     __ cbz(isURL, SIMDEnter);
 9821     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 9822 
 9823     __ BIND(SIMDEnter);
 9824     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 9825     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 9826     __ mov(rscratch1, 63);
 9827     __ dup(v27, __ T16B, rscratch1);
 9828 
 9829     __ BIND(Process64B);
 9830     __ cmp(length, (u1)64);
 9831     __ br(Assembler::LT, Process32B);
 9832     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 9833     __ sub(length, length, 64);
 9834     __ b(Process64B);
 9835 
 9836     __ BIND(Process32B);
 9837     __ cmp(length, (u1)32);
 9838     __ br(Assembler::LT, SIMDExit);
 9839     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 9840     __ sub(length, length, 32);
 9841     __ b(Process32B);
 9842 
 9843     __ BIND(SIMDExit);
 9844     __ cbz(length, Exit);
 9845     __ movw(rscratch1, length);
 9846     __ b(Process4B);
 9847 
 9848     __ BIND(Exit);
 9849     __ sub(c_rarg0, dst, doff);
 9850 
 9851     __ leave();
 9852     __ ret(lr);
 9853 
 9854     return start;
 9855   }
 9856 
 9857   // Support for spin waits.
 9858   address generate_spin_wait() {
 9859     __ align(CodeEntryAlignment);
 9860     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 9861     StubCodeMark mark(this, stub_id);
 9862     address start = __ pc();
 9863 
 9864     __ spin_wait();
 9865     __ ret(lr);
 9866 
 9867     return start;
 9868   }
 9869 
 9870   void generate_lookup_secondary_supers_table_stub() {
 9871     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 9872     StubCodeMark mark(this, stub_id);
 9873 
 9874     const Register
 9875       r_super_klass  = r0,
 9876       r_array_base   = r1,
 9877       r_array_length = r2,
 9878       r_array_index  = r3,
 9879       r_sub_klass    = r4,
 9880       r_bitmap       = rscratch2,
 9881       result         = r5;
 9882     const FloatRegister
 9883       vtemp          = v0;
 9884 
 9885     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 9886       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 9887       Label L_success;
 9888       __ enter();
 9889       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 9890                                              r_array_base, r_array_length, r_array_index,
 9891                                              vtemp, result, slot,
 9892                                              /*stub_is_near*/true);
 9893       __ leave();
 9894       __ ret(lr);
 9895     }
 9896   }
 9897 
 9898   // Slow path implementation for UseSecondarySupersTable.
 9899   address generate_lookup_secondary_supers_table_slow_path_stub() {
 9900     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 9901     StubCodeMark mark(this, stub_id);
 9902 
 9903     address start = __ pc();
 9904     const Register
 9905       r_super_klass  = r0,        // argument
 9906       r_array_base   = r1,        // argument
 9907       temp1          = r2,        // temp
 9908       r_array_index  = r3,        // argument
 9909       r_bitmap       = rscratch2, // argument
 9910       result         = r5;        // argument
 9911 
 9912     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 9913     __ ret(lr);
 9914 
 9915     return start;
 9916   }
 9917 
 9918 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9919 
 9920   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 9921   //
 9922   // If LSE is in use, generate LSE versions of all the stubs. The
 9923   // non-LSE versions are in atomic_aarch64.S.
 9924 
 9925   // class AtomicStubMark records the entry point of a stub and the
 9926   // stub pointer which will point to it. The stub pointer is set to
 9927   // the entry point when ~AtomicStubMark() is called, which must be
 9928   // after ICache::invalidate_range. This ensures safe publication of
 9929   // the generated code.
 9930   class AtomicStubMark {
 9931     address _entry_point;
 9932     aarch64_atomic_stub_t *_stub;
 9933     MacroAssembler *_masm;
 9934   public:
 9935     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 9936       _masm = masm;
 9937       __ align(32);
 9938       _entry_point = __ pc();
 9939       _stub = stub;
 9940     }
 9941     ~AtomicStubMark() {
 9942       *_stub = (aarch64_atomic_stub_t)_entry_point;
 9943     }
 9944   };
 9945 
 9946   // NB: For memory_order_conservative we need a trailing membar after
 9947   // LSE atomic operations but not a leading membar.
 9948   //
 9949   // We don't need a leading membar because a clause in the Arm ARM
 9950   // says:
 9951   //
 9952   //   Barrier-ordered-before
 9953   //
 9954   //   Barrier instructions order prior Memory effects before subsequent
 9955   //   Memory effects generated by the same Observer. A read or a write
 9956   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
 9957   //   Observer if and only if RW1 appears in program order before RW2
 9958   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 9959   //   instruction with both Acquire and Release semantics.
 9960   //
 9961   // All the atomic instructions {ldaddal, swpal, casal} have Acquire
 9962   // and Release semantics, therefore we don't need a leading
 9963   // barrier. However, there is no corresponding Barrier-ordered-after
 9964   // relationship, therefore we need a trailing membar to prevent a
 9965   // later store or load from being reordered with the store in an
 9966   // atomic instruction.
 9967   //
 9968   // This was checked by using the herd7 consistency model simulator
 9969   // (http://diy.inria.fr/) with this test case:
 9970   //
 9971   // AArch64 LseCas
 9972   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 9973   // P0 | P1;
 9974   // LDR W4, [X2] | MOV W3, #0;
 9975   // DMB LD       | MOV W4, #1;
 9976   // LDR W3, [X1] | CASAL W3, W4, [X1];
 9977   //              | DMB ISH;
 9978   //              | STR W4, [X2];
 9979   // exists
 9980   // (0:X3=0 /\ 0:X4=1)
 9981   //
 9982   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 9983   // with the store to x in P1. Without the DMB in P1 this may happen.
 9984   //
 9985   // At the time of writing we don't know of any AArch64 hardware that
 9986   // reorders stores in this way, but the Reference Manual permits it.
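        // Sketch (comment only): at the C level the conservative CAS stub
        // generated by gen_cas_entry below behaves roughly as
        //
        //   uint64_t cas(uint64_t* ptr, uint64_t expected, uint64_t desired) {
        //     uint64_t prev = casal(ptr, expected, desired); // Acquire+Release
        //     full_barrier();                                // trailing membar
        //     return prev;
        //   }
        //
        // where casal and full_barrier are stand-ins for the CASAL instruction
        // and the trailing membar; the other memory orders relax the
        // acquire/release flags and/or omit the trailing barrier.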
 9987 
 9988   void gen_cas_entry(Assembler::operand_size size,
 9989                      atomic_memory_order order) {
 9990     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 9991       exchange_val = c_rarg2;
 9992     bool acquire, release;
 9993     switch (order) {
 9994       case memory_order_relaxed:
 9995         acquire = false;
 9996         release = false;
 9997         break;
 9998       case memory_order_release:
 9999         acquire = false;
10000         release = true;
10001         break;
10002       default:
10003         acquire = true;
10004         release = true;
10005         break;
10006     }
10007     __ mov(prev, compare_val);
10008     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10009     if (order == memory_order_conservative) {
10010       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10011     }
10012     if (size == Assembler::xword) {
10013       __ mov(r0, prev);
10014     } else {
10015       __ movw(r0, prev);
10016     }
10017     __ ret(lr);
10018   }
10019 
10020   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10021     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10022     // If not relaxed, then default to conservative.  Relaxed is the only
10023     // case we use enough to be worth specializing.
10024     if (order == memory_order_relaxed) {
10025       __ ldadd(size, incr, prev, addr);
10026     } else {
10027       __ ldaddal(size, incr, prev, addr);
10028       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10029     }
10030     if (size == Assembler::xword) {
10031       __ mov(r0, prev);
10032     } else {
10033       __ movw(r0, prev);
10034     }
10035     __ ret(lr);
10036   }
10037 
10038   void gen_swpal_entry(Assembler::operand_size size) {
10039     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10040     __ swpal(size, incr, prev, addr);
10041     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10042     if (size == Assembler::xword) {
10043       __ mov(r0, prev);
10044     } else {
10045       __ movw(r0, prev);
10046     }
10047     __ ret(lr);
10048   }
10049 
10050   void generate_atomic_entry_points() {
10051     if (! UseLSE) {
10052       return;
10053     }
10054     __ align(CodeEntryAlignment);
10055     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
10056     StubCodeMark mark(this, stub_id);
10057     address first_entry = __ pc();
10058 
10059     // ADD, memory_order_conservative
10060     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10061     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10062     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10063     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10064 
10065     // ADD, memory_order_relaxed
10066     AtomicStubMark mark_fetch_add_4_relaxed
10067       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10068     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10069     AtomicStubMark mark_fetch_add_8_relaxed
10070       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10071     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10072 
10073     // XCHG, memory_order_conservative
10074     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10075     gen_swpal_entry(Assembler::word);
10076     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10077     gen_swpal_entry(Assembler::xword);
10078 
10079     // CAS, memory_order_conservative
10080     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10081     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10082     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10083     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10084     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10085     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10086 
10087     // CAS, memory_order_relaxed
10088     AtomicStubMark mark_cmpxchg_1_relaxed
10089       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10090     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10091     AtomicStubMark mark_cmpxchg_4_relaxed
10092       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10093     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10094     AtomicStubMark mark_cmpxchg_8_relaxed
10095       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10096     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10097 
10098     AtomicStubMark mark_cmpxchg_4_release
10099       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10100     gen_cas_entry(MacroAssembler::word, memory_order_release);
10101     AtomicStubMark mark_cmpxchg_8_release
10102       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10103     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10104 
10105     AtomicStubMark mark_cmpxchg_4_seq_cst
10106       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10107     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10108     AtomicStubMark mark_cmpxchg_8_seq_cst
10109       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10110     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10111 
10112     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10113   }
10114 #endif // LINUX && !__ARM_FEATURE_ATOMICS
10115 
10116   address generate_cont_thaw(Continuation::thaw_kind kind) {
10117     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10118     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10119 
10120     address start = __ pc();
10121 
10122     if (return_barrier) {
10123       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10124       __ mov(sp, rscratch1);
10125     }
10126     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10127 
10128     if (return_barrier) {
10129       // preserve possible return value from a method returning to the return barrier
10130       __ fmovd(rscratch1, v0);
10131       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10132     }
10133 
10134     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10135     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10136     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10137 
10138     if (return_barrier) {
10139       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10140       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10141       __ fmovd(v0, rscratch1);
10142     }
10143     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10144 
10145 
10146     Label thaw_success;
10147     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10148     __ cbnz(rscratch2, thaw_success);
10149     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10150     __ br(rscratch1);
10151     __ bind(thaw_success);
10152 
10153     // make room for the thawed frames
10154     __ sub(rscratch1, sp, rscratch2);
10155     __ andr(rscratch1, rscratch1, -16); // align
10156     __ mov(sp, rscratch1);
10157 
10158     if (return_barrier) {
10159       // save original return value -- again
10160       __ fmovd(rscratch1, v0);
10161       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10162     }
10163 
10164     // If we want, we can templatize thaw by kind, and have three different entries
10165     __ movw(c_rarg1, (uint32_t)kind);
10166 
10167     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10168     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10169 
10170     if (return_barrier) {
10171       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10172       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10173       __ fmovd(v0, rscratch1);
10174     } else {
10175       __ mov(r0, zr); // return 0 (success) from doYield
10176     }
10177 
10178     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10179     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10180     __ mov(rfp, sp);
10181 
10182     if (return_barrier_exception) {
10183       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10184       __ authenticate_return_address(c_rarg1);
10185       __ verify_oop(r0);
10186       // save return value containing the exception oop in callee-saved R19
10187       __ mov(r19, r0);
10188 
10189       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10190 
10191       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10192       // __ reinitialize_ptrue();
10193 
10194       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10195 
10196       __ mov(r1, r0); // the exception handler
10197       __ mov(r0, r19); // restore return value containing the exception oop
10198       __ verify_oop(r0);
10199 
10200       __ leave();
10201       __ mov(r3, lr);
10202       __ br(r1); // the exception handler
10203     } else {
10204       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10205       __ leave();
10206       __ ret(lr);
10207     }
10208 
10209     return start;
10210   }
10211 
10212   address generate_cont_thaw() {
10213     if (!Continuations::enabled()) return nullptr;
10214 
10215     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
10216     StubCodeMark mark(this, stub_id);
10217     address start = __ pc();
10218     generate_cont_thaw(Continuation::thaw_top);
10219     return start;
10220   }
10221 
10222   address generate_cont_returnBarrier() {
10223     if (!Continuations::enabled()) return nullptr;
10224 
10225     // TODO: will probably need multiple return barriers depending on return type
10226     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
10227     StubCodeMark mark(this, stub_id);
10228     address start = __ pc();
10229 
10230     generate_cont_thaw(Continuation::thaw_return_barrier);
10231 
10232     return start;
10233   }
10234 
10235   address generate_cont_returnBarrier_exception() {
10236     if (!Continuations::enabled()) return nullptr;
10237 
10238     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
10239     StubCodeMark mark(this, stub_id);
10240     address start = __ pc();
10241 
10242     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10243 
10244     return start;
10245   }
10246 
10247   address generate_cont_preempt_stub() {
10248     if (!Continuations::enabled()) return nullptr;
10249     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
10250     StubCodeMark mark(this, stub_id);
10251     address start = __ pc();
10252 
10253     __ reset_last_Java_frame(true);
10254 
10255     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10256     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10257     __ mov(sp, rscratch2);
10258 
10259     Label preemption_cancelled;
10260     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10261     __ cbnz(rscratch1, preemption_cancelled);
10262 
10263     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10264     SharedRuntime::continuation_enter_cleanup(_masm);
10265     __ leave();
10266     __ ret(lr);
10267 
10268     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10269     __ bind(preemption_cancelled);
10270     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10271     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10272     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10273     __ ldr(rscratch1, Address(rscratch1));
10274     __ br(rscratch1);
10275 
10276     return start;
10277   }
10278 
10279   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10280   // are represented as long[5], with BITS_PER_LIMB = 26.
10281   // Pack five 26-bit limbs into three 64-bit registers.
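        // Scalar model (comment only), for limbs l0..l4, each < 2^26:
        //
        //   dest0 =  l0         | (l1 << 26) | (l2 << 52);  // bits   0..63
        //   dest1 = (l2 >> 12)  | (l3 << 14) | (l4 << 40);  // bits  64..127
        //   dest2 = (l4 >> 24);                             // bits 128..129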
10282   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10283     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10284     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10285     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10286     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10287 
10288     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10289     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10290     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10291     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10292 
10293     if (dest2->is_valid()) {
10294       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10295     } else {
10296 #ifdef ASSERT
10297       Label OK;
10298       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10299       __ br(__ EQ, OK);
10300       __ stop("high bits of Poly1305 integer should be zero");
10301       __ should_not_reach_here();
10302       __ bind(OK);
10303 #endif
10304     }
10305   }
10306 
10307   // As above, but return only a 128-bit integer, packed into two
10308   // 64-bit registers.
10309   void pack_26(Register dest0, Register dest1, Register src) {
10310     pack_26(dest0, dest1, noreg, src);
10311   }
10312 
10313   // Multiply and multiply-accumulate unsigned 64-bit registers.
10314   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10315     __ mul(prod_lo, n, m);
10316     __ umulh(prod_hi, n, m);
10317   }
10318   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10319     wide_mul(rscratch1, rscratch2, n, m);
10320     __ adds(sum_lo, sum_lo, rscratch1);
10321     __ adc(sum_hi, sum_hi, rscratch2);
10322   }
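        // i.e. (prod_hi:prod_lo)  = (uint128_t)n * m   and
        //      (sum_hi:sum_lo)   += (uint128_t)n * m, with any carry out of
        // the 128-bit sum discarded; callers rely on the Poly1305 key
        // properties noted below to guarantee that no such carry occurs.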
10323 
10324   // Poly1305, RFC 7539
10325 
10326   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10327   // description of the tricks used to simplify and accelerate this
10328   // computation.
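        // Sketch (comment only): for each 16-byte block the code below computes
        //
        //   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
        //
        // keeping acc in the three registers U_2:U_1:U_0 and performing only a
        // partial reduction inside the loop.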
10329 
10330   address generate_poly1305_processBlocks() {
10331     __ align(CodeEntryAlignment);
10332     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
10333     StubCodeMark mark(this, stub_id);
10334     address start = __ pc();
10335     Label here;
10336     __ enter();
10337     RegSet callee_saved = RegSet::range(r19, r28);
10338     __ push(callee_saved, sp);
10339 
10340     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10341 
10342     // Arguments
10343     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10344 
10345     // R_n is the 128-bit randomly-generated key, packed into two
10346     // registers.  The caller passes this key to us as long[5], with
10347     // BITS_PER_LIMB = 26.
10348     const Register R_0 = *++regs, R_1 = *++regs;
10349     pack_26(R_0, R_1, r_start);
10350 
10351     // RR_n is (R_n >> 2) * 5
10352     const Register RR_0 = *++regs, RR_1 = *++regs;
10353     __ lsr(RR_0, R_0, 2);
10354     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10355     __ lsr(RR_1, R_1, 2);
10356     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
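          // Since 2^130 == 5 (mod 2^130 - 5), a partial product of weight
          // 2^130 can be folded back to weight 1 by multiplying by 5;
          // RR_n == (R_n >> 2) * 5 precomputes that fold (the leftover
          // R_0 & 3 is handled separately in the loop below).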
10357 
10358     // U_n is the current checksum
10359     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10360     pack_26(U_0, U_1, U_2, acc_start);
10361 
10362     static constexpr int BLOCK_LENGTH = 16;
10363     Label DONE, LOOP;
10364 
10365     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10366     __ br(Assembler::LT, DONE); {
10367       __ bind(LOOP);
10368 
10369       // S_n is to be the sum of U_n and the next block of data
10370       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10371       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10372       __ adds(S_0, U_0, S_0);
10373       __ adcs(S_1, U_1, S_1);
10374       __ adc(S_2, U_2, zr);
10375       __ add(S_2, S_2, 1);
10376 
10377       const Register U_0HI = *++regs, U_1HI = *++regs;
10378 
10379       // NB: this logic depends on some of the special properties of
10380       // Poly1305 keys. In particular, because we know that the top
10381       // four bits of R_0 and R_1 are zero, we can add together
10382       // partial products without any risk of needing to propagate a
10383       // carry out.
10384       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10385       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10386       __ andr(U_2, R_0, 3);
10387       __ mul(U_2, S_2, U_2);
10388 
10389       // Recycle registers S_0, S_1, S_2
10390       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10391 
10392       // Partial reduction mod 2**130 - 5
10393       __ adds(U_1, U_0HI, U_1);
10394       __ adc(U_2, U_1HI, U_2);
10395       // Sum now in U_2:U_1:U_0.
10396       // Dead: U_0HI, U_1HI.
10397       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10398 
10399       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10400 
10401       // First, U_2:U_1:U_0 += (U_2 >> 2)
10402       __ lsr(rscratch1, U_2, 2);
10403       __ andr(U_2, U_2, (u8)3);
10404       __ adds(U_0, U_0, rscratch1);
10405       __ adcs(U_1, U_1, zr);
10406       __ adc(U_2, U_2, zr);
10407       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10408       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10409       __ adcs(U_1, U_1, zr);
10410       __ adc(U_2, U_2, zr);
10411 
10412       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10413       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10414       __ br(~ Assembler::LT, LOOP);
10415     }
10416 
10417     // Further reduce modulo 2^130 - 5
10418     __ lsr(rscratch1, U_2, 2);
10419     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10420     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10421     __ adcs(U_1, U_1, zr);
10422     __ andr(U_2, U_2, (u1)3);
10423     __ adc(U_2, U_2, zr);
10424 
10425     // Unpack the sum into five 26-bit limbs and write to memory.
10426     __ ubfiz(rscratch1, U_0, 0, 26);
10427     __ ubfx(rscratch2, U_0, 26, 26);
10428     __ stp(rscratch1, rscratch2, Address(acc_start));
10429     __ ubfx(rscratch1, U_0, 52, 12);
10430     __ bfi(rscratch1, U_1, 12, 14);
10431     __ ubfx(rscratch2, U_1, 14, 26);
10432     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10433     __ ubfx(rscratch1, U_1, 40, 24);
10434     __ bfi(rscratch1, U_2, 24, 3);
10435     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10436 
10437     __ bind(DONE);
10438     __ pop(callee_saved, sp);
10439     __ leave();
10440     __ ret(lr);
10441 
10442     return start;
10443   }
10444 
10445   // exception handler for upcall stubs
10446   address generate_upcall_stub_exception_handler() {
10447     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
10448     StubCodeMark mark(this, stub_id);
10449     address start = __ pc();
10450 
10451     // The native caller has no idea how to handle exceptions, so we just
10452     // crash here. It is up to the callee to catch exceptions.
10453     __ verify_oop(r0);
10454     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10455     __ blr(rscratch1);
10456     __ should_not_reach_here();
10457 
10458     return start;
10459   }
10460 
10461   // load Method* target of MethodHandle
10462   // j_rarg0 = jobject receiver
10463   // rmethod = result
10464   address generate_upcall_stub_load_target() {
10465     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
10466     StubCodeMark mark(this, stub_id);
10467     address start = __ pc();
10468 
10469     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10470     // Load target method from receiver
10471     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10472     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10473     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10474     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10475                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10476                       noreg, noreg);
10477     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10478 
10479     __ ret(lr);
10480 
10481     return start;
10482   }
10483 
10484 #undef __
10485 #define __ masm->
10486 
10487   class MontgomeryMultiplyGenerator : public MacroAssembler {
10488 
10489     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10490       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10491 
10492     RegSet _toSave;
10493     bool _squaring;
10494 
10495   public:
10496     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10497       : MacroAssembler(as->code()), _squaring(squaring) {
10498 
10499       // Register allocation
10500 
10501       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10502       Pa_base = *regs;       // Argument registers
10503       if (squaring)
10504         Pb_base = Pa_base;
10505       else
10506         Pb_base = *++regs;
10507       Pn_base = *++regs;
10508       Rlen= *++regs;
10509       inv = *++regs;
10510       Pm_base = *++regs;
10511 
10512                           // Working registers:
10513       Ra =  *++regs;        // The current digit of a, b, n, and m.
10514       Rb =  *++regs;
10515       Rm =  *++regs;
10516       Rn =  *++regs;
10517 
10518       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10519       Pb =  *++regs;
10520       Pm =  *++regs;
10521       Pn =  *++regs;
10522 
10523       t0 =  *++regs;        // Three registers which form a
10524       t1 =  *++regs;        // triple-precision accumulator.
10525       t2 =  *++regs;
10526 
10527       Ri =  *++regs;        // Inner and outer loop indexes.
10528       Rj =  *++regs;
10529 
10530       Rhi_ab = *++regs;     // Product registers: low and high parts
10531       Rlo_ab = *++regs;     // of a*b and m*n.
10532       Rhi_mn = *++regs;
10533       Rlo_mn = *++regs;
10534 
10535       // r19 and up are callee-saved.
10536       _toSave = RegSet::range(r19, *regs) + Pm_base;
10537     }
10538 
10539   private:
10540     void save_regs() {
10541       push(_toSave, sp);
10542     }
10543 
10544     void restore_regs() {
10545       pop(_toSave, sp);
10546     }
10547 
10548     template <typename T>
10549     void unroll_2(Register count, T block) {
10550       Label loop, end, odd;
10551       tbnz(count, 0, odd);
10552       cbz(count, end);
10553       align(16);
10554       bind(loop);
10555       (this->*block)();
10556       bind(odd);
10557       (this->*block)();
10558       subs(count, count, 2);
10559       br(Assembler::GT, loop);
10560       bind(end);
10561     }
10562 
10563     template <typename T>
10564     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10565       Label loop, end, odd;
10566       tbnz(count, 0, odd);
10567       cbz(count, end);
10568       align(16);
10569       bind(loop);
10570       (this->*block)(d, s, tmp);
10571       bind(odd);
10572       (this->*block)(d, s, tmp);
10573       subs(count, count, 2);
10574       br(Assembler::GT, loop);
10575       bind(end);
10576     }
10577 
10578     void pre1(RegisterOrConstant i) {
10579       block_comment("pre1");
10580       // Pa = Pa_base;
10581       // Pb = Pb_base + i;
10582       // Pm = Pm_base;
10583       // Pn = Pn_base + i;
10584       // Ra = *Pa;
10585       // Rb = *Pb;
10586       // Rm = *Pm;
10587       // Rn = *Pn;
10588       ldr(Ra, Address(Pa_base));
10589       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10590       ldr(Rm, Address(Pm_base));
10591       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10592       lea(Pa, Address(Pa_base));
10593       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10594       lea(Pm, Address(Pm_base));
10595       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10596 
10597       // Zero the m*n result.
10598       mov(Rhi_mn, zr);
10599       mov(Rlo_mn, zr);
10600     }
10601 
10602     // The core multiply-accumulate step of a Montgomery
10603     // multiplication.  The idea is to schedule operations as a
10604     // pipeline so that instructions with long latencies (loads and
10605     // multiplies) have time to complete before their results are
10606     // used.  This benefits in-order implementations of the architecture
10607     // the most, but out-of-order ones also benefit.
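          // In the commented pseudocode below, MACC(a, b, t0, t1, t2) denotes
          // the triple-precision multiply-accumulate
          //
          //   (t2:t1:t0) += (uint128_t)a * b
          //
          // realized as umulh/mul followed by acc().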
10608     void step() {
10609       block_comment("step");
10610       // MACC(Ra, Rb, t0, t1, t2);
10611       // Ra = *++Pa;
10612       // Rb = *--Pb;
10613       umulh(Rhi_ab, Ra, Rb);
10614       mul(Rlo_ab, Ra, Rb);
10615       ldr(Ra, pre(Pa, wordSize));
10616       ldr(Rb, pre(Pb, -wordSize));
10617       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10618                                        // previous iteration.
10619       // MACC(Rm, Rn, t0, t1, t2);
10620       // Rm = *++Pm;
10621       // Rn = *--Pn;
10622       umulh(Rhi_mn, Rm, Rn);
10623       mul(Rlo_mn, Rm, Rn);
10624       ldr(Rm, pre(Pm, wordSize));
10625       ldr(Rn, pre(Pn, -wordSize));
10626       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10627     }
10628 
10629     void post1() {
10630       block_comment("post1");
10631 
10632       // MACC(Ra, Rb, t0, t1, t2);
10633       // Ra = *++Pa;
10634       // Rb = *--Pb;
10635       umulh(Rhi_ab, Ra, Rb);
10636       mul(Rlo_ab, Ra, Rb);
10637       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10638       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10639 
10640       // *Pm = Rm = t0 * inv;
10641       mul(Rm, t0, inv);
10642       str(Rm, Address(Pm));
10643 
10644       // MACC(Rm, Rn, t0, t1, t2);
10645       // t0 = t1; t1 = t2; t2 = 0;
10646       umulh(Rhi_mn, Rm, Rn);
10647 
10648 #ifndef PRODUCT
10649       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10650       {
10651         mul(Rlo_mn, Rm, Rn);
10652         add(Rlo_mn, t0, Rlo_mn);
10653         Label ok;
10654         cbz(Rlo_mn, ok); {
10655           stop("broken Montgomery multiply");
10656         } bind(ok);
10657       }
10658 #endif
10659       // We have very carefully set things up so that
10660       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10661       // the lower half of Rm * Rn because we know the result already:
10662       // it must be -t0.  t0 + (-t0) must generate a carry iff
10663       // t0 != 0.  So, rather than do a mul and an adds we just set
10664       // the carry flag iff t0 is nonzero.
10665       //
10666       // mul(Rlo_mn, Rm, Rn);
10667       // adds(zr, t0, Rlo_mn);
10668       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10669       adcs(t0, t1, Rhi_mn);
10670       adc(t1, t2, zr);
10671       mov(t2, zr);
10672     }
10673 
10674     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
10675       block_comment("pre2");
10676       // Pa = Pa_base + i-len;
10677       // Pb = Pb_base + len;
10678       // Pm = Pm_base + i-len;
10679       // Pn = Pn_base + len;
10680 
10681       if (i.is_register()) {
10682         sub(Rj, i.as_register(), len);
10683       } else {
10684         mov(Rj, i.as_constant());
10685         sub(Rj, Rj, len);
10686       }
10687       // Rj == i-len
10688 
10689       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
10690       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
10691       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10692       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
10693 
10694       // Ra = *++Pa;
10695       // Rb = *--Pb;
10696       // Rm = *++Pm;
10697       // Rn = *--Pn;
10698       ldr(Ra, pre(Pa, wordSize));
10699       ldr(Rb, pre(Pb, -wordSize));
10700       ldr(Rm, pre(Pm, wordSize));
10701       ldr(Rn, pre(Pn, -wordSize));
10702 
10703       mov(Rhi_mn, zr);
10704       mov(Rlo_mn, zr);
10705     }
10706 
10707     void post2(RegisterOrConstant i, RegisterOrConstant len) {
10708       block_comment("post2");
10709       if (i.is_constant()) {
10710         mov(Rj, i.as_constant()-len.as_constant());
10711       } else {
10712         sub(Rj, i.as_register(), len);
10713       }
10714 
10715       adds(t0, t0, Rlo_mn); // The pending m*n, low part
10716 
10717       // As soon as we know the least significant digit of our result,
10718       // store it.
10719       // Pm_base[i-len] = t0;
10720       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
10721 
10722       // t0 = t1; t1 = t2; t2 = 0;
10723       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
10724       adc(t1, t2, zr);
10725       mov(t2, zr);
10726     }
10727 
10728     // A carry in t0 after Montgomery multiplication means that we
10729     // should subtract multiples of n from our result in m.  We'll
10730     // keep doing that until there is no carry.
10731     void normalize(RegisterOrConstant len) {
10732       block_comment("normalize");
10733       // while (t0)
10734       //   t0 = sub(Pm_base, Pn_base, t0, len);
10735       Label loop, post, again;
10736       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
10737       cbz(t0, post); {
10738         bind(again); {
10739           mov(i, zr);
10740           mov(cnt, len);
10741           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10742           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10743           subs(zr, zr, zr); // set carry flag, i.e. no borrow
10744           align(16);
10745           bind(loop); {
10746             sbcs(Rm, Rm, Rn);
10747             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10748             add(i, i, 1);
10749             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
10750             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10751             sub(cnt, cnt, 1);
10752           } cbnz(cnt, loop);
10753           sbc(t0, t0, zr);
10754         } cbnz(t0, again);
10755       } bind(post);
10756     }
10757 
10758     // Move memory at s to d, reversing words.
10759     //    Increments d to end of copied memory
10760     //    Destroys tmp1, tmp2
10761     //    Preserves len
10762     //    Leaves s pointing to the address which was in d at start
10763     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
10764       assert(tmp1->encoding() < r19->encoding(), "register corruption");
10765       assert(tmp2->encoding() < r19->encoding(), "register corruption");
10766 
10767       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
10768       mov(tmp1, len);
10769       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
10770       sub(s, d, len, ext::uxtw, LogBytesPerWord);
10771     }
10772     // where
10773     void reverse1(Register d, Register s, Register tmp) {
10774       ldr(tmp, pre(s, -wordSize));
10775       ror(tmp, tmp, 32);
10776       str(tmp, post(d, wordSize));
10777     }
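
          // In C, approximately (a sketch of the memory effect only; the
          // register side effects on d and s are described in the comment
          // before reverse()):
          //
          //   void reverse(julong *d, const julong *s, int len) {
          //     s += len;
          //     while (len--) {
          //       julong x = *--s;
          //       *d++ = (x << 32) | (x >> 32); // swap the two 32-bit halves
          //     }
          //   }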
10778 
10779     void step_squaring() {
10780       // An extra acc(): accumulate the a*b product a second time (the MACC2 of the C code below)
10781       step();
10782       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10783     }
10784 
10785     void last_squaring(RegisterOrConstant i) {
10786       Label dont;
10787       // if ((i & 1) == 0) {
10788       tbnz(i.as_register(), 0, dont); {
10789         // MACC(Ra, Rb, t0, t1, t2);
10790         // Ra = *++Pa;
10791         // Rb = *--Pb;
10792         umulh(Rhi_ab, Ra, Rb);
10793         mul(Rlo_ab, Ra, Rb);
10794         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10795       } bind(dont);
10796     }
10797 
10798     void extra_step_squaring() {
10799       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10800 
10801       // MACC(Rm, Rn, t0, t1, t2);
10802       // Rm = *++Pm;
10803       // Rn = *--Pn;
10804       umulh(Rhi_mn, Rm, Rn);
10805       mul(Rlo_mn, Rm, Rn);
10806       ldr(Rm, pre(Pm, wordSize));
10807       ldr(Rn, pre(Pn, -wordSize));
10808     }
10809 
10810     void post1_squaring() {
10811       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
10812 
10813       // *Pm = Rm = t0 * inv;
10814       mul(Rm, t0, inv);
10815       str(Rm, Address(Pm));
10816 
10817       // MACC(Rm, Rn, t0, t1, t2);
10818       // t0 = t1; t1 = t2; t2 = 0;
10819       umulh(Rhi_mn, Rm, Rn);
10820 
10821 #ifndef PRODUCT
10822       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10823       {
10824         mul(Rlo_mn, Rm, Rn);
10825         add(Rlo_mn, t0, Rlo_mn);
10826         Label ok;
10827         cbz(Rlo_mn, ok); {
10828           stop("broken Montgomery multiply");
10829         } bind(ok);
10830       }
10831 #endif
10832       // We have very carefully set things up so that
10833       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
10834       // the lower half of Rm * Rn because we know the result already:
10835       // it must be -t0.  t0 + (-t0) must generate a carry iff
10836       // t0 != 0.  So, rather than do a mul and an adds we just set
10837       // the carry flag iff t0 is nonzero.
10838       //
10839       // mul(Rlo_mn, Rm, Rn);
10840       // adds(zr, t0, Rlo_mn);
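            //
            // Sanity check of the trick (SUBS sets C = NOT borrow):
            //   t0 == 0:  0 - 1 borrows, so C == 0; 0 + (-0) == 0 would also
            //             leave the carry clear.
            //   t0 != 0:  t0 - 1 does not borrow, so C == 1; t0 + (2^64 - t0)
            //             wraps past 2^64, so the adds would also set the carry.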
10841       subs(zr, t0, 1); // Set carry iff t0 is nonzero
10842       adcs(t0, t1, Rhi_mn);
10843       adc(t1, t2, zr);
10844       mov(t2, zr);
10845     }
10846 
10847     void acc(Register Rhi, Register Rlo,
10848              Register t0, Register t1, Register t2) {
10849       adds(t0, t0, Rlo);
10850       adcs(t1, t1, Rhi);
10851       adc(t2, t2, zr);
10852     }
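
          // In C, acc() adds the 128-bit value (Rhi:Rlo) into the 192-bit
          // accumulator (t2:t1:t0).  A sketch, assuming a compiler that
          // provides unsigned __int128 (an assumption of the sketch, not a
          // HotSpot requirement):
          //
          //   void acc(julong Rhi, julong Rlo, julong &t0, julong &t1, julong &t2) {
          //     unsigned __int128 s = (unsigned __int128)t0 + Rlo;
          //     t0 = (julong)s;
          //     s = (unsigned __int128)t1 + Rhi + (julong)(s >> 64);
          //     t1 = (julong)s;
          //     t2 += (julong)(s >> 64);
          //   }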
10853 
10854   public:
10855     /**
10856      * Fast Montgomery multiplication.  The derivation of the
10857      * algorithm is in A Cryptographic Library for the Motorola
10858      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
10859      *
10860      * Arguments:
10861      *
10862      * Inputs for multiplication:
10863      *   c_rarg0   - int array elements a
10864      *   c_rarg1   - int array elements b
10865      *   c_rarg2   - int array elements n (the modulus)
10866      *   c_rarg3   - int length
10867      *   c_rarg4   - int inv
10868      *   c_rarg5   - int array elements m (the result)
10869      *
10870      * Inputs for squaring:
10871      *   c_rarg0   - int array elements a
10872      *   c_rarg1   - int array elements n (the modulus)
10873      *   c_rarg2   - int length
10874      *   c_rarg3   - int inv
10875      *   c_rarg4   - int array elements m (the result)
10876      *
10877      */
10878     address generate_multiply() {
10879       Label argh, nothing;
10880       bind(argh);
10881       stop("MontgomeryMultiply total_allocation must be <= 8192");
10882 
10883       align(CodeEntryAlignment);
10884       address entry = pc();
10885 
10886       cbzw(Rlen, nothing);
10887 
10888       enter();
10889 
10890       // Make room.
10891       cmpw(Rlen, 512);
10892       br(Assembler::HI, argh);
10893       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
10894       andr(sp, Ra, -2 * wordSize);
10895 
10896       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
10897 
10898       {
10899         // Copy input args, reversing as we go.  We use Ra as a
10900         // temporary variable.
10901         reverse(Ra, Pa_base, Rlen, t0, t1);
10902         if (!_squaring)
10903           reverse(Ra, Pb_base, Rlen, t0, t1);
10904         reverse(Ra, Pn_base, Rlen, t0, t1);
10905       }
10906 
10907       // Push all call-saved registers and also Pm_base which we'll need
10908       // at the end.
10909       save_regs();
10910 
10911 #ifndef PRODUCT
10912       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
10913       {
10914         ldr(Rn, Address(Pn_base, 0));
10915         mul(Rlo_mn, Rn, inv);
10916         subs(zr, Rlo_mn, -1);
10917         Label ok;
10918         br(EQ, ok); {
10919           stop("broken inverse in Montgomery multiply");
10920         } bind(ok);
10921       }
10922 #endif
10923 
10924       mov(Pm_base, Ra);
10925 
10926       mov(t0, zr);
10927       mov(t1, zr);
10928       mov(t2, zr);
10929 
10930       block_comment("for (int i = 0; i < len; i++) {");
10931       mov(Ri, zr); {
10932         Label loop, end;
10933         cmpw(Ri, Rlen);
10934         br(Assembler::GE, end);
10935 
10936         bind(loop);
10937         pre1(Ri);
10938 
10939         block_comment("  for (j = i; j; j--) {"); {
10940           movw(Rj, Ri);
10941           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10942         } block_comment("  } // j");
10943 
10944         post1();
10945         addw(Ri, Ri, 1);
10946         cmpw(Ri, Rlen);
10947         br(Assembler::LT, loop);
10948         bind(end);
10949         block_comment("} // i");
10950       }
10951 
10952       block_comment("for (int i = len; i < 2*len; i++) {");
10953       mov(Ri, Rlen); {
10954         Label loop, end;
10955         cmpw(Ri, Rlen, Assembler::LSL, 1);
10956         br(Assembler::GE, end);
10957 
10958         bind(loop);
10959         pre2(Ri, Rlen);
10960 
10961         block_comment("  for (j = len*2-i-1; j; j--) {"); {
10962           lslw(Rj, Rlen, 1);
10963           subw(Rj, Rj, Ri);
10964           subw(Rj, Rj, 1);
10965           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
10966         } block_comment("  } // j");
10967 
10968         post2(Ri, Rlen);
10969         addw(Ri, Ri, 1);
10970         cmpw(Ri, Rlen, Assembler::LSL, 1);
10971         br(Assembler::LT, loop);
10972         bind(end);
10973       }
10974       block_comment("} // i");
10975 
10976       normalize(Rlen);
10977 
10978       mov(Ra, Pm_base);  // Save Pm_base in Ra
10979       restore_regs();  // Restore caller's Pm_base
10980 
10981       // Copy our result into caller's Pm_base
10982       reverse(Pm_base, Ra, Rlen, t0, t1);
10983 
10984       leave();
10985       bind(nothing);
10986       ret(lr);
10987 
10988       return entry;
10989     }
10990     // In C, approximately:
10991 
10992     // void
10993     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
10994     //                     julong Pn_base[], julong Pm_base[],
10995     //                     julong inv, int len) {
10996     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
10997     //   julong *Pa, *Pb, *Pn, *Pm;
10998     //   julong Ra, Rb, Rn, Rm;
10999 
11000     //   int i;
11001 
11002     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11003 
11004     //   for (i = 0; i < len; i++) {
11005     //     int j;
11006 
11007     //     Pa = Pa_base;
11008     //     Pb = Pb_base + i;
11009     //     Pm = Pm_base;
11010     //     Pn = Pn_base + i;
11011 
11012     //     Ra = *Pa;
11013     //     Rb = *Pb;
11014     //     Rm = *Pm;
11015     //     Rn = *Pn;
11016 
11017     //     int iters = i;
11018     //     for (j = 0; iters--; j++) {
11019     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11020     //       MACC(Ra, Rb, t0, t1, t2);
11021     //       Ra = *++Pa;
11022     //       Rb = *--Pb;
11023     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11024     //       MACC(Rm, Rn, t0, t1, t2);
11025     //       Rm = *++Pm;
11026     //       Rn = *--Pn;
11027     //     }
11028 
11029     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11030     //     MACC(Ra, Rb, t0, t1, t2);
11031     //     *Pm = Rm = t0 * inv;
11032     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11033     //     MACC(Rm, Rn, t0, t1, t2);
11034 
11035     //     assert(t0 == 0, "broken Montgomery multiply");
11036 
11037     //     t0 = t1; t1 = t2; t2 = 0;
11038     //   }
11039 
11040     //   for (i = len; i < 2*len; i++) {
11041     //     int j;
11042 
11043     //     Pa = Pa_base + i-len;
11044     //     Pb = Pb_base + len;
11045     //     Pm = Pm_base + i-len;
11046     //     Pn = Pn_base + len;
11047 
11048     //     Ra = *++Pa;
11049     //     Rb = *--Pb;
11050     //     Rm = *++Pm;
11051     //     Rn = *--Pn;
11052 
11053     //     int iters = len*2-i-1;
11054     //     for (j = i-len+1; iters--; j++) {
11055     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11056     //       MACC(Ra, Rb, t0, t1, t2);
11057     //       Ra = *++Pa;
11058     //       Rb = *--Pb;
11059     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11060     //       MACC(Rm, Rn, t0, t1, t2);
11061     //       Rm = *++Pm;
11062     //       Rn = *--Pn;
11063     //     }
11064 
11065     //     Pm_base[i-len] = t0;
11066     //     t0 = t1; t1 = t2; t2 = 0;
11067     //   }
11068 
11069     //   while (t0)
11070     //     t0 = sub(Pm_base, Pn_base, t0, len);
11071     // }
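
          // The MACC(A, B, t0, t1, t2) used above is the usual 64x64->128
          // multiply-accumulate into the triple-precision accumulator:
          // compute hi:lo = A*B (umulh/mul in the generated code) and then
          // acc(hi, lo, t0, t1, t2) as sketched after acc() above.  This is a
          // description of the intent, not the exact macro text used
          // elsewhere in HotSpot.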
11072 
11073     /**
11074      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11075      * multiplies than Montgomery multiplication so it should be up to
11076      * 25% faster.  However, its loop control is more complex and it
11077      * may actually run slower on some machines.
11078      *
11079      * Arguments:
11080      *
11081      * Inputs:
11082      *   c_rarg0   - int array elements a
11083      *   c_rarg1   - int array elements n (the modulus)
11084      *   c_rarg2   - int length
11085      *   c_rarg3   - int inv
11086      *   c_rarg4   - int array elements m (the result)
11087      *
11088      */
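          // Where the saving comes from, informally: in a*a the cross terms
          // a[j]*a[i-j] and a[i-j]*a[j] are equal, so each such product is
          // computed once and accumulated twice (MACC2 / step_squaring),
          // roughly halving the a*a multiplies, while the m*n reduction
          // multiplies are unchanged.  That halving of half the work is the
          // asymptotic ~25% reduction mentioned above.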
11089     address generate_square() {
11090       Label argh;
11091       bind(argh);
11092       stop("MontgomeryMultiply total_allocation must be <= 8192");
11093 
11094       align(CodeEntryAlignment);
11095       address entry = pc();
11096 
11097       enter();
11098 
11099       // Make room.
11100       cmpw(Rlen, 512);
11101       br(Assembler::HI, argh);
11102       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11103       andr(sp, Ra, -2 * wordSize);
11104 
11105       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11106 
11107       {
11108         // Copy input args, reversing as we go.  We use Ra as a
11109         // temporary variable.
11110         reverse(Ra, Pa_base, Rlen, t0, t1);
11111         reverse(Ra, Pn_base, Rlen, t0, t1);
11112       }
11113 
11114       // Push all call-saved registers and also Pm_base which we'll need
11115       // at the end.
11116       save_regs();
11117 
11118       mov(Pm_base, Ra);
11119 
11120       mov(t0, zr);
11121       mov(t1, zr);
11122       mov(t2, zr);
11123 
11124       block_comment("for (int i = 0; i < len; i++) {");
11125       mov(Ri, zr); {
11126         Label loop, end;
11127         bind(loop);
11128         cmp(Ri, Rlen);
11129         br(Assembler::GE, end);
11130 
11131         pre1(Ri);
11132 
11133         block_comment("  for (j = (i+1)/2; j; j--) {"); {
11134           add(Rj, Ri, 1);
11135           lsr(Rj, Rj, 1);
11136           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11137         } block_comment("  } // j");
11138 
11139         last_squaring(Ri);
11140 
11141         block_comment("  for (j = i/2; j; j--) {"); {
11142           lsr(Rj, Ri, 1);
11143           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11144         } block_comment("  } // j");
11145 
11146         post1_squaring();
11147         add(Ri, Ri, 1);
11148         cmp(Ri, Rlen);
11149         br(Assembler::LT, loop);
11150 
11151         bind(end);
11152         block_comment("} // i");
11153       }
11154 
11155       block_comment("for (int i = len; i < 2*len; i++) {");
11156       mov(Ri, Rlen); {
11157         Label loop, end;
11158         bind(loop);
11159         cmp(Ri, Rlen, Assembler::LSL, 1);
11160         br(Assembler::GE, end);
11161 
11162         pre2(Ri, Rlen);
11163 
11164         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11165           lsl(Rj, Rlen, 1);
11166           sub(Rj, Rj, Ri);
11167           sub(Rj, Rj, 1);
11168           lsr(Rj, Rj, 1);
11169           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11170         } block_comment("  } // j");
11171 
11172         last_squaring(Ri);
11173 
11174         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11175           lsl(Rj, Rlen, 1);
11176           sub(Rj, Rj, Ri);
11177           lsr(Rj, Rj, 1);
11178           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11179         } block_comment("  } // j");
11180 
11181         post2(Ri, Rlen);
11182         add(Ri, Ri, 1);
11183         cmp(Ri, Rlen, Assembler::LSL, 1);
11184 
11185         br(Assembler::LT, loop);
11186         bind(end);
11187         block_comment("} // i");
11188       }
11189 
11190       normalize(Rlen);
11191 
11192       mov(Ra, Pm_base);  // Save Pm_base in Ra
11193       restore_regs();  // Restore caller's Pm_base
11194 
11195       // Copy our result into caller's Pm_base
11196       reverse(Pm_base, Ra, Rlen, t0, t1);
11197 
11198       leave();
11199       ret(lr);
11200 
11201       return entry;
11202     }
11203     // In C, approximately:
11204 
11205     // void
11206     // montgomery_square(julong Pa_base[], julong Pn_base[],
11207     //                   julong Pm_base[], julong inv, int len) {
11208     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11209     //   julong *Pa, *Pb, *Pn, *Pm;
11210     //   julong Ra, Rb, Rn, Rm;
11211 
11212     //   int i;
11213 
11214     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11215 
11216     //   for (i = 0; i < len; i++) {
11217     //     int j;
11218 
11219     //     Pa = Pa_base;
11220     //     Pb = Pa_base + i;
11221     //     Pm = Pm_base;
11222     //     Pn = Pn_base + i;
11223 
11224     //     Ra = *Pa;
11225     //     Rb = *Pb;
11226     //     Rm = *Pm;
11227     //     Rn = *Pn;
11228 
11229     //     int iters = (i+1)/2;
11230     //     for (j = 0; iters--; j++) {
11231     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11232     //       MACC2(Ra, Rb, t0, t1, t2);
11233     //       Ra = *++Pa;
11234     //       Rb = *--Pb;
11235     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11236     //       MACC(Rm, Rn, t0, t1, t2);
11237     //       Rm = *++Pm;
11238     //       Rn = *--Pn;
11239     //     }
11240     //     if ((i & 1) == 0) {
11241     //       assert(Ra == Pa_base[j], "must be");
11242     //       MACC(Ra, Ra, t0, t1, t2);
11243     //     }
11244     //     iters = i/2;
11245     //     assert(iters == i-j, "must be");
11246     //     for (; iters--; j++) {
11247     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11248     //       MACC(Rm, Rn, t0, t1, t2);
11249     //       Rm = *++Pm;
11250     //       Rn = *--Pn;
11251     //     }
11252 
11253     //     *Pm = Rm = t0 * inv;
11254     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11255     //     MACC(Rm, Rn, t0, t1, t2);
11256 
11257     //     assert(t0 == 0, "broken Montgomery multiply");
11258 
11259     //     t0 = t1; t1 = t2; t2 = 0;
11260     //   }
11261 
11262     //   for (i = len; i < 2*len; i++) {
11263     //     int start = i-len+1;
11264     //     int end = start + (len - start)/2;
11265     //     int j;
11266 
11267     //     Pa = Pa_base + i-len;
11268     //     Pb = Pa_base + len;
11269     //     Pm = Pm_base + i-len;
11270     //     Pn = Pn_base + len;
11271 
11272     //     Ra = *++Pa;
11273     //     Rb = *--Pb;
11274     //     Rm = *++Pm;
11275     //     Rn = *--Pn;
11276 
11277     //     int iters = (2*len-i-1)/2;
11278     //     assert(iters == end-start, "must be");
11279     //     for (j = start; iters--; j++) {
11280     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11281     //       MACC2(Ra, Rb, t0, t1, t2);
11282     //       Ra = *++Pa;
11283     //       Rb = *--Pb;
11284     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11285     //       MACC(Rm, Rn, t0, t1, t2);
11286     //       Rm = *++Pm;
11287     //       Rn = *--Pn;
11288     //     }
11289     //     if ((i & 1) == 0) {
11290     //       assert(Ra == Pa_base[j], "must be");
11291     //       MACC(Ra, Ra, t0, t1, t2);
11292     //     }
11293     //     iters =  (2*len-i)/2;
11294     //     assert(iters == len-j, "must be");
11295     //     for (; iters--; j++) {
11296     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11297     //       MACC(Rm, Rn, t0, t1, t2);
11298     //       Rm = *++Pm;
11299     //       Rn = *--Pn;
11300     //     }
11301     //     Pm_base[i-len] = t0;
11302     //     t0 = t1; t1 = t2; t2 = 0;
11303     //   }
11304 
11305     //   while (t0)
11306     //     t0 = sub(Pm_base, Pn_base, t0, len);
11307     // }
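
          // MACC2(A, B, t0, t1, t2) above accumulates the product twice,
          // i.e. it adds 2*A*B into the accumulator; that is what
          // step_squaring() does with its extra acc().  The sub() in the
          // final while loop is the multi-precision subtraction sketched
          // after normalize() above.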
11308   };
11309 
11310   // Initialization
11311   void generate_initial_stubs() {
11312     // Generate initial stubs and initialize the entry points
11313 
11314     // Entry points that exist on all platforms. Note: This is code
11315     // that could be shared among different platforms - however the
11316     // benefit seems to be smaller than the disadvantage of having a
11317     // much more complicated generator structure. See also comment in
11318     // stubRoutines.hpp.
11319 
11320     StubRoutines::_forward_exception_entry = generate_forward_exception();
11321 
11322     StubRoutines::_call_stub_entry =
11323       generate_call_stub(StubRoutines::_call_stub_return_address);
11324 
11325     // Referenced by megamorphic calls.
11326     StubRoutines::_catch_exception_entry = generate_catch_exception();
11327 
11328     // Initialize table for copy memory (arraycopy) check.
11329     if (UnsafeMemoryAccess::_table == nullptr) {
11330       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11331     }
11332 
11333     if (UseCRC32Intrinsics) {
11334       // Set the table address before generating the stubs that use it.
11335       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
11336       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11337     }
11338 
11339     if (UseCRC32CIntrinsics) {
11340       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11341     }
11342 
11343     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11344       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11345     }
11346 
11347     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11348       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11349     }
11350 
11351     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11352         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11353       StubRoutines::_hf2f = generate_float16ToFloat();
11354       StubRoutines::_f2hf = generate_floatToFloat16();
11355     }
11356   }
11357 
11358   void generate_continuation_stubs() {
11359     // Continuation stubs:
11360     StubRoutines::_cont_thaw          = generate_cont_thaw();
11361     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11362     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11363     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11364   }
11365 
11366   void generate_final_stubs() {
11367     // support for verify_oop (must happen after universe_init)
11368     if (VerifyOops) {
11369       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11370     }
11371 
11372     // arraycopy stubs used by compilers
11373     generate_arraycopy_stubs();
11374 
11375     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11376 
11377     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11378 
11379     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11380     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11381 
11382 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11383 
11384     generate_atomic_entry_points();
11385 
11386 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11387 
11388 #ifdef COMPILER2
11389     if (UseSecondarySupersTable) {
11390       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11391       if (! InlineSecondarySupersTest) {
11392         generate_lookup_secondary_supers_table_stub();
11393       }
11394     }
11395 #endif
11396 
11397     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11398 
11399     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11400   }
11401 
11402   void generate_compiler_stubs() {
11403 #if COMPILER2_OR_JVMCI
11404 
11405     if (UseSVE == 0) {
11406       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
11407     }
11408 
11409     // array equals stub for large arrays.
11410     if (!UseSimpleArrayEquals) {
11411       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11412     }
11413 
11414     // arrays_hashcode stubs for large arrays.
11415     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11416     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11417     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11418     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11419     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11420 
11421     // byte_array_inflate stub for large arrays.
11422     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11423 
11424     // countPositives stub for large arrays.
11425     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11426 
11427     generate_compare_long_strings();
11428 
11429     generate_string_indexof_stubs();
11430 
11431 #ifdef COMPILER2
11432     if (UseMultiplyToLenIntrinsic) {
11433       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11434     }
11435 
11436     if (UseSquareToLenIntrinsic) {
11437       StubRoutines::_squareToLen = generate_squareToLen();
11438     }
11439 
11440     if (UseMulAddIntrinsic) {
11441       StubRoutines::_mulAdd = generate_mulAdd();
11442     }
11443 
11444     if (UseSIMDForBigIntegerShiftIntrinsics) {
11445       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11446       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11447     }
11448 
11449     if (UseMontgomeryMultiplyIntrinsic) {
11450       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
11451       StubCodeMark mark(this, stub_id);
11452       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11453       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11454     }
11455 
11456     if (UseMontgomerySquareIntrinsic) {
11457       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
11458       StubCodeMark mark(this, stub_id);
11459       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11460       // We use generate_multiply() rather than generate_square()
11461       // because it's faster for the sizes of modulus we care about.
11462       StubRoutines::_montgomerySquare = g.generate_multiply();
11463     }
11464 
11465 #endif // COMPILER2
11466 
11467     if (UseChaCha20Intrinsics) {
11468       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11469     }
11470 
11471     if (UseKyberIntrinsics) {
11472       StubRoutines::_kyberNtt = generate_kyberNtt();
11473       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11474       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11475       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11476       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11477       StubRoutines::_kyber12To16 = generate_kyber12To16();
11478       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11479     }
11480 
11481     if (UseDilithiumIntrinsics) {
11482       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11483       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11484       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11485       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11486       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11487     }
11488 
11489     if (UseBASE64Intrinsics) {
11490         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11491         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11492     }
11493 
11494     // data cache line writeback
11495     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11496     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11497 
11498     if (UseAESIntrinsics) {
11499       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11500       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11501       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11502       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11503       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11504     }
11505     if (UseGHASHIntrinsics) {
11506       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11507       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11508     }
11509     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11510       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11511     }
11512 
11513     if (UseMD5Intrinsics) {
11514       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
11515       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
11516     }
11517     if (UseSHA1Intrinsics) {
11518       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
11519       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
11520     }
11521     if (UseSHA256Intrinsics) {
11522       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
11523       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
11524     }
11525     if (UseSHA512Intrinsics) {
11526       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
11527       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
11528     }
11529     if (UseSHA3Intrinsics) {
11530       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
11531       StubRoutines::_double_keccak         = generate_double_keccak();
11532       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
11533     }
11534 
11535     if (UsePoly1305Intrinsics) {
11536       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11537     }
11538 
11539     // generate Adler32 intrinsics code
11540     if (UseAdler32Intrinsics) {
11541       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11542     }
11543 
11544 #endif // COMPILER2_OR_JVMCI
11545   }
11546 
11547  public:
11548   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
11549     switch(blob_id) {
11550     case initial_id:
11551       generate_initial_stubs();
11552       break;
11553     case continuation_id:
11554       generate_continuation_stubs();
11555       break;
11556     case compiler_id:
11557       generate_compiler_stubs();
11558       break;
11559     case final_id:
11560       generate_final_stubs();
11561       break;
11562     default:
11563       fatal("unexpected blob id: %d", blob_id);
11564       break;
11565     }
11566   }
11567 }; // end class declaration
11568 
11569 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
11570   StubGenerator g(code, blob_id);
11571 }
11572 
11573 
11574 #if defined (LINUX)
11575 
11576 // Define pointers to atomic stubs and initialize them to point to the
11577 // code in atomic_aarch64.S.
11578 
11579 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11580   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11581     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11582   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11583     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
11584 
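      // For example, the first instantiation below expands (modulo
      // whitespace) to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
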
11585 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11586 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11587 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11588 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11589 DEFAULT_ATOMIC_OP(xchg, 4, )
11590 DEFAULT_ATOMIC_OP(xchg, 8, )
11591 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11592 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11593 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11594 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11595 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11596 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11597 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11598 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11599 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11600 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11601 
11602 #undef DEFAULT_ATOMIC_OP
11603 
11604 #endif // LINUX