1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
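        //
        // For illustration only (the authoritative signature is the CallStub
        // typedef in stubRoutines.hpp), the VM-side caller in
        // JavaCalls::call_helper invokes this stub roughly as
        //
        //   StubRoutines::call_stub()(link, result_val, result_type, method,
        //                             entry_point, parameters, parameter_size,
        //                             thread);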
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  144   // -28 [ saved Floating-point Control Register ]
  145   // -26 [ saved v15            ] <--- sp_after_call
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
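        // n.b. for the register pairs only the lower (even) offset is named;
        // each stp/ldp fills that slot and the word above it, e.g. the store
        // to r20_save puts r20 at -10 and r19 at -9, matching the diagram.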
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
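          // sp now sits sp_after_call_off (28) words below the new frame
          // pointer, i.e. at the bottom of the register save area shown in
          // the layout diagram above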
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  291     __ andr(sp, rscratch1, -2 * wordSize);
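          // rounding down to a 2-word boundary keeps sp 16-byte aligned as
          // required by the AArch64 ABI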
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
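          // each parameter is loaded and pushed in order, so parameter 1
          // ends up at the highest address of the pushed block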
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
  307     // call Java entry -- passing the Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
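          // i.e. esp is reset to the bottom of the register save area, the
          // value it was given before any parameters were pushed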
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to set up the
  436   // sp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubId stub_id = StubId::stubgen_catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  494   // so it just needs to be generated code with no prologue
  495 
  496   address generate_forward_exception() {
  497     StubId stub_id = StubId::stubgen_forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
  543     // frame and then call into the VM, and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubId stub_id = StubId::stubgen_verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
  636   address generate_iota_indices(StubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
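          // each pair of emit_data64 calls below lays down one 128-bit constant
          // holding the per-lane indices 0, 1, 2, ... for the given element
          // size (B/H/S/D), followed by the floating-point variants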
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubId stub_id = StubId::stubgen_zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
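            // zva_length is the size in bytes of the block cleared by a
            // single DC ZVA instruction on this CPU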
  687 
  688       // Ensure ZVA length can be divided by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case StubId::stubgen_copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case StubId::stubgen_copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case StubId::stubgen_copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case StubId::stubgen_copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case StubId::stubgen_copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case StubId::stubgen_copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
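          // unit is the signed per-word step in bytes (+8 forwards, -8
          // backwards); the bias pre-adjustment of s and d (applied below for
          // the forward copy only) lets the same pre-indexed offsets serve
          // both directions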
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     __ bind(start);
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
  891       __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
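          // fewer than 16 words remain: skip the main loop and let the drain
          // below store the 8 words just loaded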
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913        use_stride = prefetch > 256;
  914        prefetch = -prefetch;
  915        if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
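            // copy any remaining 4-word and/or 2-word tail; bits 2 and 1 of
            // count indicate which tails are present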
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
  987       // source address is even aligned, target odd aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  998       // zero offset we adjust the destination by -1, which means we
  999       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 1000       //
 1001       // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045          use_stride = prefetch > 256;
 1046          prefetch = -prefetch;
 1047          if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056        // allowing for the offset of -8 the store instructions place
 1057        // registers into the target 64 bit block at the following
 1058        // offsets
 1059        //
 1060        // t0 at offset 0
 1061        // t1 at offset 8,  t2 at offset 16
 1062        // t3 at offset 24, t4 at offset 32
 1063        // t5 at offset 40, t6 at offset 48
 1064        // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076        // d was not offset when we started so the registers are
 1077        // written into the 64 bit block preceding d with the following
 1078        // offsets
 1079        //
 1080        // t1 at offset -8
 1081        // t3 at offset -24, t0 at offset -16
 1082        // t5 at offset -40, t2 at offset -32
 1083        // t7 at offset -56, t4 at offset -48
 1084        //                   t6 at offset -64
 1085        //
 1086        // note that this matches the offsets previously noted for the
 1087        // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
 1122       // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128        // this is the same as above but copying only 4 longs hence
 1129        // with only one intervening stp between the str instructions
 1130        // but note that the offsets and registers still follow the
 1131        // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146        // this is the same as above but copying only 2 longs hence
 1147        // there is no intervening stp between the str instructions
 1148        // but note that the offset and register patterns are still
 1149        // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160        // for forwards copy we need to re-adjust the offsets we
 1161        // applied so that s and d follow the last words written
 1162 
 1163        if (direction == copy_forwards) {
 1164          __ add(s, s, 16);
 1165          __ add(d, d, 8);
 1166        }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171       }
 1172   }
 1173 
 1174   // Small copy: less than 16 bytes.
 1175   //
 1176   // NB: Ignores all of the bits of count which represent more than 15
 1177   // bytes, so a caller doesn't have to mask them.
 1178 
 1179   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1180     bool is_backwards = step < 0;
 1181     size_t granularity = g_uabs(step);
 1182     int direction = is_backwards ? -1 : 1;
 1183 
 1184     Label Lword, Lint, Lshort, Lbyte;
 1185 
 1186     assert(granularity
 1187            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1188 
 1189     const Register t0 = r3;
 1190     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1191     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1192 
 1193     // ??? I don't know if this bit-test-and-branch is the right thing
 1194     // to do.  It does a lot of jumping, resulting in several
 1195     // mispredicted branches.  It might make more sense to do this
 1196     // with something like Duff's device with a single computed branch.
 1197 
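          // each tbz below tests one bit of the element count; when the bit is
          // set we copy a single chunk of the corresponding size, from word
          // down to byte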
 1198     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1199     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1200     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1201     __ bind(Lword);
 1202 
 1203     if (granularity <= sizeof (jint)) {
 1204       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1205       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1206       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1207       __ bind(Lint);
 1208     }
 1209 
 1210     if (granularity <= sizeof (jshort)) {
 1211       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1212       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1213       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1214       __ bind(Lshort);
 1215     }
 1216 
 1217     if (granularity <= sizeof (jbyte)) {
 1218       __ tbz(count, 0, Lbyte);
 1219       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1220       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1221       __ bind(Lbyte);
 1222     }
 1223   }
 1224 
 1225   Label copy_f, copy_b;
 1226   Label copy_obj_f, copy_obj_b;
 1227   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1228 
 1229   // All-singing all-dancing memory copy.
 1230   //
 1231   // Copy count units of memory from s to d.  The size of a unit is
 1232   // step, which can be positive or negative depending on the direction
 1233   // of copy.  If is_aligned is false, we align the source address.
 1234   //
 1235 
 1236   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1237                    Register s, Register d, Register count, int step) {
 1238     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1239     bool is_backwards = step < 0;
 1240     unsigned int granularity = g_uabs(step);
 1241     const Register t0 = r3, t1 = r4;
 1242 
 1243     // For <= 80 (or 96 for SIMD) bytes we copy inline. Direction doesn't matter because we always
 1244     // load all the data before writing anything
 1245     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1246     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1247     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1248     const Register send = r17, dend = r16;
 1249     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1250     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1251     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1252 
 1253     if (PrefetchCopyIntervalInBytes > 0)
 1254       __ prfm(Address(s, 0), PLDL1KEEP);
 1255     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1256     __ br(Assembler::HI, copy_big);
 1257 
 1258     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1259     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
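          // send and dend point just past the last source/destination element;
          // the tail loads/stores below use negative offsets from them and may
          // overlap the accesses made through s and d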
 1260 
 1261     __ cmp(count, u1(16/granularity));
 1262     __ br(Assembler::LS, copy16);
 1263 
 1264     __ cmp(count, u1(64/granularity));
 1265     __ br(Assembler::HI, copy80);
 1266 
 1267     __ cmp(count, u1(32/granularity));
 1268     __ br(Assembler::LS, copy32);
 1269 
 1270     // 33..64 bytes
 1271     if (UseSIMDForMemoryOps) {
 1272       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1273       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1274       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1275       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1276     } else {
 1277       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1278       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1279       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1280       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1281 
 1282       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1283       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1284       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1285       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1286     }
 1287     __ b(finish);
 1288 
 1289     // 17..32 bytes
 1290     __ bind(copy32);
 1291     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1292     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1293 
 1294     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1295     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1296     __ b(finish);
 1297 
 1298     // 65..80/96 bytes
 1299     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1300     __ bind(copy80);
 1301     if (UseSIMDForMemoryOps) {
 1302       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1303       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1304       // Unaligned pointers can be an issue for copying.
 1305       // The issue has more chances to happen when granularity of data is
 1306       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
 1307       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1308       // The most performance drop has been seen for the range 65-80 bytes.
 1309       // For such cases using the pair of ldp/stp instead of the third pair of
 1310       // ldpq/stpq fixes the performance issue.
 1311       if (granularity < sizeof (jint)) {
 1312         Label copy96;
 1313         __ cmp(count, u1(80/granularity));
 1314         __ br(Assembler::HI, copy96);
 1315         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1316 
 1317         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1318         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1319 
 1320         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1321         __ b(finish);
 1322 
 1323         __ bind(copy96);
 1324       }
 1325       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1326 
 1327       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1328       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1329 
 1330       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1331     } else {
 1332       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1333       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1334       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1335       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1336       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1337 
 1338       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1339       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1340       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1341       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1342       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1343     }
 1344     __ b(finish);
 1345 
 1346     // 0..16 bytes
 1347     __ bind(copy16);
 1348     __ cmp(count, u1(8/granularity));
 1349     __ br(Assembler::LO, copy8);
 1350 
 1351     // 8..16 bytes
 1352     bs.copy_load_at_8(t0, Address(s, 0));
 1353     bs.copy_load_at_8(t1, Address(send, -8));
 1354     bs.copy_store_at_8(Address(d, 0), t0);
 1355     bs.copy_store_at_8(Address(dend, -8), t1);
 1356     __ b(finish);
 1357 
 1358     if (granularity < 8) {
 1359       // 4..7 bytes
 1360       __ bind(copy8);
 1361       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1362       __ ldrw(t0, Address(s, 0));
 1363       __ ldrw(t1, Address(send, -4));
 1364       __ strw(t0, Address(d, 0));
 1365       __ strw(t1, Address(dend, -4));
 1366       __ b(finish);
 1367       if (granularity < 4) {
 1368         // 0..3 bytes
 1369         __ bind(copy4);
 1370         __ cbz(count, finish); // get rid of 0 case
 1371         if (granularity == 2) {
 1372           __ ldrh(t0, Address(s, 0));
 1373           __ strh(t0, Address(d, 0));
 1374         } else { // granularity == 1
 1375           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1376           // the first and last byte.
 1377           // Handle the 3 byte case by loading and storing base + count/2
 1378           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1379           // This does mean that in the 1 byte case we load/store the
 1380           // same byte 3 times.
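                // Roughly, in C terms (for 1 <= count <= 3):
                //   d[0]         = s[0];
                //   d[count - 1] = s[count - 1];
                //   d[count / 2] = s[count / 2];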
 1381           __ lsr(count, count, 1);
 1382           __ ldrb(t0, Address(s, 0));
 1383           __ ldrb(t1, Address(send, -1));
 1384           __ ldrb(t2, Address(s, count));
 1385           __ strb(t0, Address(d, 0));
 1386           __ strb(t1, Address(dend, -1));
 1387           __ strb(t2, Address(d, count));
 1388         }
 1389         __ b(finish);
 1390       }
 1391     }
 1392 
 1393     __ bind(copy_big);
 1394     if (is_backwards) {
 1395       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1396       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1397     }
 1398 
 1399     // Now that we've got the small case out of the way, we can align the
 1400     // source address on a 2-word boundary.
 1401 
 1402     // Here we will materialize a count in r15, which is used by copy_memory_small
 1403     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1404     // Up until here, we have used t9, which aliases r15, but from here on that register
 1405     // cannot be used as a temp register, as it contains the count.
 1406 
 1407     Label aligned;
 1408 
 1409     if (is_aligned) {
 1410       // We may have to adjust by 1 word to get s 2-word-aligned.
 1411       __ tbz(s, exact_log2(wordSize), aligned);
 1412       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1413       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1414       __ sub(count, count, wordSize/granularity);
 1415     } else {
 1416       if (is_backwards) {
 1417         __ andr(r15, s, 2 * wordSize - 1);
 1418       } else {
 1419         __ neg(r15, s);
 1420         __ andr(r15, r15, 2 * wordSize - 1);
 1421       }
 1422       // r15 is the byte adjustment needed to align s.
 1423       __ cbz(r15, aligned);
 1424       int shift = exact_log2(granularity);
 1425       if (shift > 0) {
 1426         __ lsr(r15, r15, shift);
 1427       }
 1428       __ sub(count, count, r15);
 1429 
 1430 #if 0
 1431       // ?? This code is only correct for a disjoint copy.  It may or
 1432       // may not make sense to use it in that case.
 1433 
 1434       // Copy the first pair; s and d may not be aligned.
 1435       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1436       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1437 
 1438       // Align s and d, adjust count
 1439       if (is_backwards) {
 1440         __ sub(s, s, r15);
 1441         __ sub(d, d, r15);
 1442       } else {
 1443         __ add(s, s, r15);
 1444         __ add(d, d, r15);
 1445       }
 1446 #else
 1447       copy_memory_small(decorators, type, s, d, r15, step);
 1448 #endif
 1449     }
 1450 
 1451     __ bind(aligned);
 1452 
 1453     // s is now 2-word-aligned.
 1454 
 1455     // We have a count of units and some trailing bytes. Adjust the
 1456     // count and do a bulk copy of words. If the shift is zero,
 1457     // perform a move instead to benefit from zero-latency moves.
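          // For example, for a jshort copy (granularity 2) the shift is
          // exact_log2(8 / 2) == 2, so r15 = count >> 2 is the number of
          // 8-byte words to bulk-copy, and the low two bits of count are the
          // trailing elements picked up by copy_memory_small below.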
 1458     int shift = exact_log2(wordSize/granularity);
 1459     if (shift > 0) {
 1460       __ lsr(r15, count, shift);
 1461     } else {
 1462       __ mov(r15, count);
 1463     }
 1464     if (direction == copy_forwards) {
 1465       if (type != T_OBJECT) {
 1466         __ bl(copy_f);
 1467       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1468         __ bl(copy_obj_uninit_f);
 1469       } else {
 1470         __ bl(copy_obj_f);
 1471       }
 1472     } else {
 1473       if (type != T_OBJECT) {
 1474         __ bl(copy_b);
 1475       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1476         __ bl(copy_obj_uninit_b);
 1477       } else {
 1478         __ bl(copy_obj_b);
 1479       }
 1480     }
 1481 
 1482     // And the tail.
 1483     copy_memory_small(decorators, type, s, d, count, step);
 1484 
 1485     if (granularity >= 8) __ bind(copy8);
 1486     if (granularity >= 4) __ bind(copy4);
 1487     __ bind(finish);
 1488   }
 1489 
 1490 
 1491   void clobber_registers() {
 1492 #ifdef ASSERT
 1493     RegSet clobbered
 1494       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1495     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1496     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1497     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1498       __ mov(*it, rscratch1);
 1499     }
 1500 #endif
 1501 
 1502   }
 1503 
 1504   // Scan over array at a for count oops, verifying each one.
 1505   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1506   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1507     Label loop, end;
 1508     __ mov(rscratch1, a);
 1509     __ mov(rscratch2, zr);
 1510     __ bind(loop);
 1511     __ cmp(rscratch2, count);
 1512     __ br(Assembler::HS, end);
 1513     if (size == wordSize) {
 1514       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1515       __ verify_oop(temp);
 1516     } else {
 1517       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1518       __ decode_heap_oop(temp); // calls verify_oop
 1519     }
 1520     __ add(rscratch2, rscratch2, 1);
 1521     __ b(loop);
 1522     __ bind(end);
 1523   }
 1524 
 1525   // Arguments:
 1526   //   stub_id - is used to name the stub and identify all details of
 1527   //             how to perform the copy.
 1528   //
 1529   //   entry - is assigned to the stub's post-push entry point unless
 1530   //           it is null
 1531   //
 1532   // Inputs:
 1533   //   c_rarg0   - source array address
 1534   //   c_rarg1   - destination array address
 1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1536   //
 1537   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1538   // the hardware handle it.  The two dwords within qwords that span
 1539   // cache line boundaries will still be loaded and stored atomically.
 1540   //
 1541   // Side Effects: entry is set to the (post-push) entry point so it
 1542   //               can be used by the corresponding conjoint copy
 1543   //               method
 1544   //
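        // For example, generate_arraycopy_stubs() below requests the post-push
        // entry of each disjoint stub via 'entry' and then feeds it to the
        // matching conjoint stub as its nooverlap_target.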
 1545   address generate_disjoint_copy(StubId stub_id, address *entry) {
 1546     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1547     RegSet saved_reg = RegSet::of(s, d, count);
 1548     int size;
 1549     bool aligned;
 1550     bool is_oop;
 1551     bool dest_uninitialized;
 1552     switch (stub_id) {
 1553     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1554       size = sizeof(jbyte);
 1555       aligned = false;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1560       size = sizeof(jbyte);
 1561       aligned = true;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1566       size = sizeof(jshort);
 1567       aligned = false;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1572       size = sizeof(jshort);
 1573       aligned = true;
 1574       is_oop = false;
 1575       dest_uninitialized = false;
 1576       break;
 1577     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1578       size = sizeof(jint);
 1579       aligned = false;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1584       size = sizeof(jint);
 1585       aligned = true;
 1586       is_oop = false;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1590       // since this is always aligned we can (should!) use the same
 1591       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1592       ShouldNotReachHere();
 1593       break;
 1594     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1595       size = sizeof(jlong);
 1596       aligned = true;
 1597       is_oop = false;
 1598       dest_uninitialized = false;
 1599       break;
 1600     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = false;
 1605       break;
 1606     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1607       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1608       aligned = !UseCompressedOops;
 1609       is_oop = true;
 1610       dest_uninitialized = false;
 1611       break;
 1612     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1613       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1614       aligned = !UseCompressedOops;
 1615       is_oop = true;
 1616       dest_uninitialized = true;
 1617       break;
 1618     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1619       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1620       aligned = !UseCompressedOops;
 1621       is_oop = true;
 1622       dest_uninitialized = true;
 1623       break;
 1624     default:
 1625       ShouldNotReachHere();
 1626       break;
 1627     }
 1628 
 1629     __ align(CodeEntryAlignment);
 1630     StubCodeMark mark(this, stub_id);
 1631     address start = __ pc();
 1632     __ enter();
 1633 
 1634     if (entry != nullptr) {
 1635       *entry = __ pc();
 1636       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1637       BLOCK_COMMENT("Entry:");
 1638     }
 1639 
 1640     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1641     if (dest_uninitialized) {
 1642       decorators |= IS_DEST_UNINITIALIZED;
 1643     }
 1644     if (aligned) {
 1645       decorators |= ARRAYCOPY_ALIGNED;
 1646     }
 1647 
 1648     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1649     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1650 
 1651     if (is_oop) {
 1652       // save regs before copy_memory
 1653       __ push(RegSet::of(d, count), sp);
 1654     }
 1655     {
 1656       // UnsafeMemoryAccess page error: continue after unsafe access
 1657       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1658       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1659       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1660     }
 1661 
 1662     if (is_oop) {
 1663       __ pop(RegSet::of(d, count), sp);
 1664       if (VerifyOops)
 1665         verify_oop_array(size, d, count, r16);
 1666     }
 1667 
 1668     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1669 
 1670     __ leave();
 1671     __ mov(r0, zr); // return 0
 1672     __ ret(lr);
 1673     return start;
 1674   }
 1675 
 1676   // Arguments:
 1677   //   stub_id - is used to name the stub and identify all details of
 1678   //             how to perform the copy.
 1679   //
 1680   //   nooverlap_target - identifies the (post-push) entry for the
 1681   //             corresponding disjoint copy routine which can be
 1682   //             jumped to if the ranges do not actually overlap
 1683   //
 1684   //   entry - is assigned to the stub's post-push entry point unless
 1685   //           it is null
 1686   //
 1687   //
 1688   // Inputs:
 1689   //   c_rarg0   - source array address
 1690   //   c_rarg1   - destination array address
 1691   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1692   //
 1693   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1694   // the hardware handle it.  The two dwords within qwords that span
 1695   // cache line boundaries will still be loaded and stored atomically.
 1696   //
 1697   // Side Effects:
 1698   //   entry is set to the no-overlap entry point so it can be used by
 1699   //   some other conjoint copy method
 1700   //
 1701   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
 1702     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1703     RegSet saved_regs = RegSet::of(s, d, count);
 1704     int size;
 1705     bool aligned;
 1706     bool is_oop;
 1707     bool dest_uninitialized;
 1708     switch (stub_id) {
 1709     case StubId::stubgen_jbyte_arraycopy_id:
 1710       size = sizeof(jbyte);
 1711       aligned = false;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1716       size = sizeof(jbyte);
 1717       aligned = true;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case StubId::stubgen_jshort_arraycopy_id:
 1722       size = sizeof(jshort);
 1723       aligned = false;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1728       size = sizeof(jshort);
 1729       aligned = true;
 1730       is_oop = false;
 1731       dest_uninitialized = false;
 1732       break;
 1733     case StubId::stubgen_jint_arraycopy_id:
 1734       size = sizeof(jint);
 1735       aligned = false;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1740       size = sizeof(jint);
 1741       aligned = true;
 1742       is_oop = false;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case StubId::stubgen_jlong_arraycopy_id:
 1746       // since this is always aligned we can (should!) use the same
 1747       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1748       ShouldNotReachHere();
 1749       break;
 1750     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1751       size = sizeof(jlong);
 1752       aligned = true;
 1753       is_oop = false;
 1754       dest_uninitialized = false;
 1755       break;
 1756     case StubId::stubgen_oop_arraycopy_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = false;
 1761       break;
 1762     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1763       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1764       aligned = !UseCompressedOops;
 1765       is_oop = true;
 1766       dest_uninitialized = false;
 1767       break;
 1768     case StubId::stubgen_oop_arraycopy_uninit_id:
 1769       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1770       aligned = !UseCompressedOops;
 1771       is_oop = true;
 1772       dest_uninitialized = true;
 1773       break;
 1774     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1775       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1776       aligned = !UseCompressedOops;
 1777       is_oop = true;
 1778       dest_uninitialized = true;
 1779       break;
 1780     default:
 1781       ShouldNotReachHere();
 1782     }
 1783 
 1784     StubCodeMark mark(this, stub_id);
 1785     address start = __ pc();
 1786     __ enter();
 1787 
 1788     if (entry != nullptr) {
 1789       *entry = __ pc();
 1790       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1791       BLOCK_COMMENT("Entry:");
 1792     }
 1793 
 1794     // use fwd copy when (d-s) above_equal (count*size)
 1795     __ sub(rscratch1, d, s);
 1796     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1797     __ br(Assembler::HS, nooverlap_target);
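          // In C terms: if ((uint64_t)(d - s) >= (uint64_t)count << log2(size))
          // the regions cannot overlap destructively, so the disjoint (forward)
          // copy is used. The unsigned compare also covers d < s, where copying
          // forwards is always safe.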
 1798 
 1799     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1800     if (dest_uninitialized) {
 1801       decorators |= IS_DEST_UNINITIALIZED;
 1802     }
 1803     if (aligned) {
 1804       decorators |= ARRAYCOPY_ALIGNED;
 1805     }
 1806 
 1807     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1808     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1809 
 1810     if (is_oop) {
 1811       // save regs before copy_memory
 1812       __ push(RegSet::of(d, count), sp);
 1813     }
 1814     {
 1815       // UnsafeMemoryAccess page error: continue after unsafe access
 1816       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1817       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1818       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1819     }
 1820     if (is_oop) {
 1821       __ pop(RegSet::of(d, count), sp);
 1822       if (VerifyOops)
 1823         verify_oop_array(size, d, count, r16);
 1824     }
 1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1826     __ leave();
 1827     __ mov(r0, zr); // return 0
 1828     __ ret(lr);
 1829     return start;
 1830   }
 1831 
 1832   // Helper for generating a dynamic type check.
 1833   // Smashes rscratch1, rscratch2.
 1834   void generate_type_check(Register sub_klass,
 1835                            Register super_check_offset,
 1836                            Register super_klass,
 1837                            Register temp1,
 1838                            Register temp2,
 1839                            Register result,
 1840                            Label& L_success) {
 1841     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1842 
 1843     BLOCK_COMMENT("type_check:");
 1844 
 1845     Label L_miss;
 1846 
 1847     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1848                                      super_check_offset);
 1849     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1850 
 1851     // Fall through on failure!
 1852     __ BIND(L_miss);
 1853   }
 1854 
 1855   //
 1856   //  Generate checkcasting array copy stub
 1857   //
 1858   //  Input:
 1859   //    c_rarg0   - source array address
 1860   //    c_rarg1   - destination array address
 1861   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1862   //    c_rarg3   - size_t ckoff (super_check_offset)
 1863   //    c_rarg4   - oop ckval (super_klass)
 1864   //
 1865   //  Output:
 1866   //    r0 ==  0  -  success
 1867   //    r0 == -1^K - failure, where K is partial transfer count
 1868   //
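        //  The failure value is the bitwise complement of the number of
        //  elements already copied: r0 == ~K == -1 - K, so the caller can
        //  recover K as ~r0.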
 1869   address generate_checkcast_copy(StubId stub_id, address *entry) {
 1870     bool dest_uninitialized;
 1871     switch (stub_id) {
 1872     case StubId::stubgen_checkcast_arraycopy_id:
 1873       dest_uninitialized = false;
 1874       break;
 1875     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1876       dest_uninitialized = true;
 1877       break;
 1878     default:
 1879       ShouldNotReachHere();
 1880     }
 1881 
 1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1883 
 1884     // Input registers (after setup_arg_regs)
 1885     const Register from        = c_rarg0;   // source array address
 1886     const Register to          = c_rarg1;   // destination array address
 1887     const Register count       = c_rarg2;   // elements count
 1888     const Register ckoff       = c_rarg3;   // super_check_offset
 1889     const Register ckval       = c_rarg4;   // super_klass
 1890 
 1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1892     RegSet wb_post_saved_regs = RegSet::of(count);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (entry != nullptr) {
 1931       *entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935      // Empty array:  Nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_save = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
 2015     __ br(Assembler::EQ, L_done_pop);
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp and rscratch1, but nothing else.
 2034   // Also cleans the upper 32 bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
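        // For example, if the source address, destination address and byte
        // count are all multiples of 8, control transfers to the long copy
        // stub with the count rescaled to longs; a single odd byte in any of
        // the three sends the call to the byte copy stub with the count
        // unscaled.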
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temps
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
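          // For example, a jint[] is a typeArray (array_tag 0x3) with
          // log2_element_size == 2, while every objArray shares the single
          // objArray_lh pattern that is tested for first below.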
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // 'from', 'to', 'count' registers should be set in this order
 2290     // since they are the same as 'src', 'src_pos', 'dst'.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
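          // i.e. the two low bits of the layout helper select the stub:
          //   bit1 bit0
          //    0    0   -> byte copy
          //    0    1   -> short copy
          //    1    0   -> int copy
          //    1    1   -> long copy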
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case StubId::stubgen_jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case StubId::stubgen_jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case StubId::stubgen_jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case StubId::stubgen_arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case StubId::stubgen_arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case StubId::stubgen_arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // destination array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
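          // At this point 'value' holds the fill pattern replicated to 32 bits
          // and 'count' covers at least 8 bytes (8 >> shift elements); shorter
          // requests were diverted to L_fill_elements above.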
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two bytes misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
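          // cnt_words is the number of 8-byte words to fill; 'count' now holds
          // the remaining elements (fewer than 8 bytes' worth), which are
          // finished off after the bulk fill.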
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_unsafecopy_common_error_exit() {
 2592     address start_pc = __ pc();
 2593       __ leave();
 2594       __ mov(r0, 0);
 2595       __ ret(lr);
 2596     return start_pc;
 2597   }
 2598 
 2599   //
 2600   //  Generate 'unsafe' set memory stub
 2601   //  Though just as safe as the other stubs, it takes an unscaled
 2602   //  size_t (# bytes) argument instead of an element count.
 2603   //
 2604   //  This fill operation is atomicity preserving: as long as the
 2605   //  address supplied is sufficiently aligned, all writes of up to 64
 2606   //  bits in size are single-copy atomic.
 2607   //
 2608   //  Input:
 2609   //    c_rarg0   - destination array address
 2610   //    c_rarg1   - byte count (size_t)
 2611   //    c_rarg2   - byte value
 2612   //
 2613   address generate_unsafe_setmemory() {
 2614     __ align(CodeEntryAlignment);
 2615     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2616     address start = __ pc();
 2617 
 2618     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2619     Label tail;
 2620 
 2621     UnsafeMemoryAccessMark umam(this, true, false);
 2622 
 2623     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2624 
 2625     __ dup(v0, __ T16B, value);
 2626 
 2627     if (AvoidUnalignedAccesses) {
 2628       __ cmp(count, (u1)16);
 2629       __ br(__ LO, tail);
 2630 
 2631       __ mov(rscratch1, 16);
 2632       __ andr(rscratch2, dest, 15);
 2633       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2634       __ strq(v0, Address(dest));
 2635       __ sub(count, count, rscratch1);
 2636       __ add(dest, dest, rscratch1);
 2637     }
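          // The unaligned 16-byte store above may overlap the aligned stores
          // that follow; that is harmless because every byte is written with
          // the same fill value.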
 2638 
 2639     __ subs(count, count, (u1)64);
 2640     __ br(__ LO, tail);
 2641     {
 2642       Label again;
 2643       __ bind(again);
 2644       __ stpq(v0, v0, Address(dest));
 2645       __ stpq(v0, v0, Address(dest, 32));
 2646 
 2647       __ subs(count, count, 64);
 2648       __ add(dest, dest, 64);
 2649       __ br(__ HS, again);
 2650     }
 2651 
 2652     __ bind(tail);
 2653     // The count of bytes is off by 64, but we don't need to correct
 2654     // it because we're only going to use the least-significant few
 2655     // count bits from here on.
 2656     // __ add(count, count, 64);
 2657 
 2658     {
 2659       Label dont;
 2660       __ tbz(count, exact_log2(32), dont);
 2661       __ stpq(v0, v0, __ post(dest, 32));
 2662       __ bind(dont);
 2663     }
 2664     {
 2665       Label dont;
 2666       __ tbz(count, exact_log2(16), dont);
 2667       __ strq(v0, __ post(dest, 16));
 2668       __ bind(dont);
 2669     }
 2670     {
 2671       Label dont;
 2672       __ tbz(count, exact_log2(8), dont);
 2673       __ strd(v0, __ post(dest, 8));
 2674       __ bind(dont);
 2675     }
 2676 
 2677     Label finished;
 2678     __ tst(count, 7);
 2679     __ br(__ EQ, finished);
 2680 
 2681     {
 2682       Label dont;
 2683       __ tbz(count, exact_log2(4), dont);
 2684       __ strs(v0, __ post(dest, 4));
 2685       __ bind(dont);
 2686     }
 2687     {
 2688       Label dont;
 2689       __ tbz(count, exact_log2(2), dont);
 2690       __ bfi(value, value, 8, 8);
 2691       __ strh(value, __ post(dest, 2));
 2692       __ bind(dont);
 2693     }
 2694     {
 2695       Label dont;
 2696       __ tbz(count, exact_log2(1), dont);
 2697       __ strb(value, Address(dest));
 2698       __ bind(dont);
 2699     }
 2700 
 2701     __ bind(finished);
 2702     __ leave();
 2703     __ ret(lr);
 2704 
 2705     return start;
 2706   }
 2707 
 2708   address generate_data_cache_writeback() {
 2709     const Register line        = c_rarg0;  // address of line to write back
 2710 
 2711     __ align(CodeEntryAlignment);
 2712 
 2713     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2714     StubCodeMark mark(this, stub_id);
 2715 
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cache_wb(Address(line, 0));
 2719     __ leave();
 2720     __ ret(lr);
 2721 
 2722     return start;
 2723   }
 2724 
 2725   address generate_data_cache_writeback_sync() {
 2726     const Register is_pre     = c_rarg0;  // pre or post sync
 2727 
 2728     __ align(CodeEntryAlignment);
 2729 
 2730     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2731     StubCodeMark mark(this, stub_id);
 2732 
 2733     // pre wbsync is a no-op
 2734     // post wbsync translates to a full memory barrier
 2735 
 2736     Label skip;
 2737     address start = __ pc();
 2738     __ enter();
 2739     __ cbnz(is_pre, skip);
 2740     __ cache_wbsync(false);
 2741     __ bind(skip);
 2742     __ leave();
 2743     __ ret(lr);
 2744 
 2745     return start;
 2746   }
 2747 
 2748   void generate_arraycopy_stubs() {
 2749     address entry;
 2750     address entry_jbyte_arraycopy;
 2751     address entry_jshort_arraycopy;
 2752     address entry_jint_arraycopy;
 2753     address entry_oop_arraycopy;
 2754     address entry_jlong_arraycopy;
 2755     address entry_checkcast_arraycopy;
 2756 
 2757     // generate the common exit first so later stubs can rely on it if
 2758     // they want an UnsafeMemoryAccess exit non-local to the stub
 2759     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2760     // register the stub as the default exit with class UnsafeMemoryAccess
 2761     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2762 
 2763     generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2764     generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2765 
 2766     generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2767     generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2768 
 2769     generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2770     generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2771 
 2772     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2773 
 2774     //*** jbyte
 2775     // Always need aligned and unaligned versions
 2776     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
 2777     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2778     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2779     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
 2780 
 2781     //*** jshort
 2782     // Always need aligned and unaligned versions
 2783     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
 2784     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2785     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
 2786     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
 2787 
 2788     //*** jint
 2789     // Aligned versions
 2790     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
 2791     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
  2792     // On 64-bit we need both aligned and unaligned versions of jint arraycopy;
  2793     // entry_jint_arraycopy always points to the unaligned version.
 2794     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
 2795     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2796 
 2797     //*** jlong
 2798     // It is always aligned
 2799     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
 2800     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2801     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2802     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2803 
 2804     //*** oops
 2805     {
 2806       // With compressed oops we need unaligned versions; notice that
 2807       // we overwrite entry_oop_arraycopy.
 2808       bool aligned = !UseCompressedOops;
 2809 
 2810       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2811         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
 2812       StubRoutines::_arrayof_oop_arraycopy
 2813         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2814       // Aligned versions without pre-barriers
 2815       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2816         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2817       StubRoutines::_arrayof_oop_arraycopy_uninit
 2818         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2819     }
 2820 
 2821     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2822     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2823     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2824     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2825 
 2826     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2827     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2828 
 2829     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2830                                                               entry_jshort_arraycopy,
 2831                                                               entry_jint_arraycopy,
 2832                                                               entry_jlong_arraycopy);
 2833 
 2834     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2835                                                                entry_jshort_arraycopy,
 2836                                                                entry_jint_arraycopy,
 2837                                                                entry_oop_arraycopy,
 2838                                                                entry_jlong_arraycopy,
 2839                                                                entry_checkcast_arraycopy);
 2840 
 2841     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2842     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2843     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2844     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2845     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2846     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2847   }
 2848 
 2849   void generate_math_stubs() { Unimplemented(); }
 2850 
 2851   // Arguments:
 2852   //
 2853   // Inputs:
 2854   //   c_rarg0   - source byte array address
 2855   //   c_rarg1   - destination byte array address
 2856   //   c_rarg2   - K (key) in little endian int array
 2857   //
 2858   address generate_aescrypt_encryptBlock() {
 2859     __ align(CodeEntryAlignment);
 2860     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2861     StubCodeMark mark(this, stub_id);
 2862 
 2863     const Register from        = c_rarg0;  // source array address
 2864     const Register to          = c_rarg1;  // destination array address
 2865     const Register key         = c_rarg2;  // key array address
 2866     const Register keylen      = rscratch1;
 2867 
 2868     address start = __ pc();
 2869     __ enter();
 2870 
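           // The length of the int[] key array encodes the AES variant:
           // the expanded key holds 44, 52 or 60 4-byte words for AES-128,
           // AES-192 and AES-256 respectively, i.e. 10, 12 or 14 rounds.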
 2871     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2872 
 2873     __ aesenc_loadkeys(key, keylen);
 2874     __ aesecb_encrypt(from, to, keylen);
 2875 
 2876     __ mov(r0, 0);
 2877 
 2878     __ leave();
 2879     __ ret(lr);
 2880 
 2881     return start;
 2882   }
 2883 
 2884   // Arguments:
 2885   //
 2886   // Inputs:
 2887   //   c_rarg0   - source byte array address
 2888   //   c_rarg1   - destination byte array address
 2889   //   c_rarg2   - K (key) in little endian int array
 2890   //
 2891   address generate_aescrypt_decryptBlock() {
 2892     assert(UseAES, "need AES cryptographic extension support");
 2893     __ align(CodeEntryAlignment);
 2894     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2895     StubCodeMark mark(this, stub_id);
 2896     Label L_doLast;
 2897 
 2898     const Register from        = c_rarg0;  // source array address
 2899     const Register to          = c_rarg1;  // destination array address
 2900     const Register key         = c_rarg2;  // key array address
 2901     const Register keylen      = rscratch1;
 2902 
 2903     address start = __ pc();
 2904     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2905 
 2906     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2907 
 2908     __ aesecb_decrypt(from, to, key, keylen);
 2909 
 2910     __ mov(r0, 0);
 2911 
 2912     __ leave();
 2913     __ ret(lr);
 2914 
 2915     return start;
 2916   }
 2917 
 2918   // Arguments:
 2919   //
 2920   // Inputs:
 2921   //   c_rarg0   - source byte array address
 2922   //   c_rarg1   - destination byte array address
 2923   //   c_rarg2   - K (key) in little endian int array
 2924   //   c_rarg3   - r vector byte array address
 2925   //   c_rarg4   - input length
 2926   //
 2927   // Output:
 2928   //   x0        - input length
 2929   //
 2930   address generate_cipherBlockChaining_encryptAESCrypt() {
 2931     assert(UseAES, "need AES cryptographic extension support");
 2932     __ align(CodeEntryAlignment);
 2933     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2934     StubCodeMark mark(this, stub_id);
 2935 
 2936     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2937 
 2938     const Register from        = c_rarg0;  // source array address
 2939     const Register to          = c_rarg1;  // destination array address
 2940     const Register key         = c_rarg2;  // key array address
 2941     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2942                                            // and left with the results of the last encryption block
 2943     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2944     const Register keylen      = rscratch1;
 2945 
 2946     address start = __ pc();
 2947 
 2948       __ enter();
 2949 
 2950       __ movw(rscratch2, len_reg);
 2951 
 2952       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2953 
 2954       __ ld1(v0, __ T16B, rvec);
 2955 
 2956       __ cmpw(keylen, 52);
 2957       __ br(Assembler::CC, L_loadkeys_44);
 2958       __ br(Assembler::EQ, L_loadkeys_52);
 2959 
 2960       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2961       __ rev32(v17, __ T16B, v17);
 2962       __ rev32(v18, __ T16B, v18);
 2963     __ BIND(L_loadkeys_52);
 2964       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2965       __ rev32(v19, __ T16B, v19);
 2966       __ rev32(v20, __ T16B, v20);
 2967     __ BIND(L_loadkeys_44);
 2968       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2969       __ rev32(v21, __ T16B, v21);
 2970       __ rev32(v22, __ T16B, v22);
 2971       __ rev32(v23, __ T16B, v23);
 2972       __ rev32(v24, __ T16B, v24);
 2973       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2974       __ rev32(v25, __ T16B, v25);
 2975       __ rev32(v26, __ T16B, v26);
 2976       __ rev32(v27, __ T16B, v27);
 2977       __ rev32(v28, __ T16B, v28);
 2978       __ ld1(v29, v30, v31, __ T16B, key);
 2979       __ rev32(v29, __ T16B, v29);
 2980       __ rev32(v30, __ T16B, v30);
 2981       __ rev32(v31, __ T16B, v31);
 2982 
 2983     __ BIND(L_aes_loop);
 2984       __ ld1(v1, __ T16B, __ post(from, 16));
 2985       __ eor(v0, __ T16B, v0, v1);
 2986 
 2987       __ br(Assembler::CC, L_rounds_44);
 2988       __ br(Assembler::EQ, L_rounds_52);
 2989 
 2990       __ aese(v0, v17); __ aesmc(v0, v0);
 2991       __ aese(v0, v18); __ aesmc(v0, v0);
 2992     __ BIND(L_rounds_52);
 2993       __ aese(v0, v19); __ aesmc(v0, v0);
 2994       __ aese(v0, v20); __ aesmc(v0, v0);
 2995     __ BIND(L_rounds_44);
 2996       __ aese(v0, v21); __ aesmc(v0, v0);
 2997       __ aese(v0, v22); __ aesmc(v0, v0);
 2998       __ aese(v0, v23); __ aesmc(v0, v0);
 2999       __ aese(v0, v24); __ aesmc(v0, v0);
 3000       __ aese(v0, v25); __ aesmc(v0, v0);
 3001       __ aese(v0, v26); __ aesmc(v0, v0);
 3002       __ aese(v0, v27); __ aesmc(v0, v0);
 3003       __ aese(v0, v28); __ aesmc(v0, v0);
 3004       __ aese(v0, v29); __ aesmc(v0, v0);
 3005       __ aese(v0, v30);
 3006       __ eor(v0, __ T16B, v0, v31);
 3007 
 3008       __ st1(v0, __ T16B, __ post(to, 16));
 3009 
 3010       __ subw(len_reg, len_reg, 16);
 3011       __ cbnzw(len_reg, L_aes_loop);
 3012 
 3013       __ st1(v0, __ T16B, rvec);
 3014 
 3015       __ mov(r0, rscratch2);
 3016 
 3017       __ leave();
 3018       __ ret(lr);
 3019 
 3020       return start;
 3021   }
 3022 
 3023   // Arguments:
 3024   //
 3025   // Inputs:
 3026   //   c_rarg0   - source byte array address
 3027   //   c_rarg1   - destination byte array address
 3028   //   c_rarg2   - K (key) in little endian int array
 3029   //   c_rarg3   - r vector byte array address
 3030   //   c_rarg4   - input length
 3031   //
 3032   // Output:
 3033   //   r0        - input length
 3034   //
 3035   address generate_cipherBlockChaining_decryptAESCrypt() {
 3036     assert(UseAES, "need AES cryptographic extension support");
 3037     __ align(CodeEntryAlignment);
 3038     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3039     StubCodeMark mark(this, stub_id);
 3040 
 3041     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3042 
 3043     const Register from        = c_rarg0;  // source array address
 3044     const Register to          = c_rarg1;  // destination array address
 3045     const Register key         = c_rarg2;  // key array address
  3046     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
  3047                                            // and left with the last ciphertext (input) block
 3048     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3049     const Register keylen      = rscratch1;
 3050 
 3051     address start = __ pc();
 3052 
 3053       __ enter();
 3054 
 3055       __ movw(rscratch2, len_reg);
 3056 
 3057       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3058 
 3059       __ ld1(v2, __ T16B, rvec);
 3060 
 3061       __ ld1(v31, __ T16B, __ post(key, 16));
 3062       __ rev32(v31, __ T16B, v31);
 3063 
 3064       __ cmpw(keylen, 52);
 3065       __ br(Assembler::CC, L_loadkeys_44);
 3066       __ br(Assembler::EQ, L_loadkeys_52);
 3067 
 3068       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3069       __ rev32(v17, __ T16B, v17);
 3070       __ rev32(v18, __ T16B, v18);
 3071     __ BIND(L_loadkeys_52);
 3072       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3073       __ rev32(v19, __ T16B, v19);
 3074       __ rev32(v20, __ T16B, v20);
 3075     __ BIND(L_loadkeys_44);
 3076       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3077       __ rev32(v21, __ T16B, v21);
 3078       __ rev32(v22, __ T16B, v22);
 3079       __ rev32(v23, __ T16B, v23);
 3080       __ rev32(v24, __ T16B, v24);
 3081       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3082       __ rev32(v25, __ T16B, v25);
 3083       __ rev32(v26, __ T16B, v26);
 3084       __ rev32(v27, __ T16B, v27);
 3085       __ rev32(v28, __ T16B, v28);
 3086       __ ld1(v29, v30, __ T16B, key);
 3087       __ rev32(v29, __ T16B, v29);
 3088       __ rev32(v30, __ T16B, v30);
 3089 
 3090     __ BIND(L_aes_loop);
 3091       __ ld1(v0, __ T16B, __ post(from, 16));
 3092       __ orr(v1, __ T16B, v0, v0);
 3093 
 3094       __ br(Assembler::CC, L_rounds_44);
 3095       __ br(Assembler::EQ, L_rounds_52);
 3096 
 3097       __ aesd(v0, v17); __ aesimc(v0, v0);
 3098       __ aesd(v0, v18); __ aesimc(v0, v0);
 3099     __ BIND(L_rounds_52);
 3100       __ aesd(v0, v19); __ aesimc(v0, v0);
 3101       __ aesd(v0, v20); __ aesimc(v0, v0);
 3102     __ BIND(L_rounds_44);
 3103       __ aesd(v0, v21); __ aesimc(v0, v0);
 3104       __ aesd(v0, v22); __ aesimc(v0, v0);
 3105       __ aesd(v0, v23); __ aesimc(v0, v0);
 3106       __ aesd(v0, v24); __ aesimc(v0, v0);
 3107       __ aesd(v0, v25); __ aesimc(v0, v0);
 3108       __ aesd(v0, v26); __ aesimc(v0, v0);
 3109       __ aesd(v0, v27); __ aesimc(v0, v0);
 3110       __ aesd(v0, v28); __ aesimc(v0, v0);
 3111       __ aesd(v0, v29); __ aesimc(v0, v0);
 3112       __ aesd(v0, v30);
 3113       __ eor(v0, __ T16B, v0, v31);
 3114       __ eor(v0, __ T16B, v0, v2);
 3115 
 3116       __ st1(v0, __ T16B, __ post(to, 16));
 3117       __ orr(v2, __ T16B, v1, v1);
 3118 
 3119       __ subw(len_reg, len_reg, 16);
 3120       __ cbnzw(len_reg, L_aes_loop);
 3121 
 3122       __ st1(v2, __ T16B, rvec);
 3123 
 3124       __ mov(r0, rscratch2);
 3125 
 3126       __ leave();
 3127       __ ret(lr);
 3128 
 3129     return start;
 3130   }
 3131 
  3132   // Big-endian 128-bit + 64-bit -> 128-bit addition.
  3133   // Inputs: the 128-bit value 'in' (preserved); the least-significant
  3134   // 64-bit word lives in the upper dword of each vector.
  3135   // inc (the 64-bit increment) is also preserved; its lower dword must be zero.
  3136   // Output: result
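         //
         // For reference, a scalar sketch of the same operation (not used by
         // the stub), with the counter viewed as two unsigned 64-bit words
         // hi:lo:
         //
         //   lo += inc;
         //   if (lo < inc)   // unsigned overflow of the low word
         //     hi += 1;      // propagate the carry
         //
         // The vector code below derives the carry from an unsigned-higher
         // compare (all-ones, i.e. -1, on overflow) and subtracts it from
         // the most-significant dword.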
 3137   void be_add_128_64(FloatRegister result, FloatRegister in,
 3138                      FloatRegister inc, FloatRegister tmp) {
 3139     assert_different_registers(result, tmp, inc);
 3140 
 3141     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3142                                            // input
  3143     __ cm(__ HI, tmp, __ T2D, inc, result); // Check whether the addition overflowed
 3144     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3145                                            // MSD == 0 (must be!) to LSD
 3146     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3147   }
 3148 
 3149   // CTR AES crypt.
 3150   // Arguments:
 3151   //
 3152   // Inputs:
 3153   //   c_rarg0   - source byte array address
 3154   //   c_rarg1   - destination byte array address
 3155   //   c_rarg2   - K (key) in little endian int array
 3156   //   c_rarg3   - counter vector byte array address
 3157   //   c_rarg4   - input length
 3158   //   c_rarg5   - saved encryptedCounter start
 3159   //   c_rarg6   - saved used length
 3160   //
 3161   // Output:
 3162   //   r0       - input length
 3163   //
 3164   address generate_counterMode_AESCrypt() {
 3165     const Register in = c_rarg0;
 3166     const Register out = c_rarg1;
 3167     const Register key = c_rarg2;
 3168     const Register counter = c_rarg3;
 3169     const Register saved_len = c_rarg4, len = r10;
 3170     const Register saved_encrypted_ctr = c_rarg5;
 3171     const Register used_ptr = c_rarg6, used = r12;
 3172 
 3173     const Register offset = r7;
 3174     const Register keylen = r11;
 3175 
 3176     const unsigned char block_size = 16;
 3177     const int bulk_width = 4;
 3178     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3179     // performance with larger data sizes, but it also means that the
 3180     // fast path isn't used until you have at least 8 blocks, and up
  3181     // to 127 bytes of data will be processed on the slow path. For
 3182     // that reason, and also so as not to blow away too much icache, 4
 3183     // blocks seems like a sensible compromise.
 3184 
 3185     // Algorithm:
 3186     //
 3187     //    if (len == 0) {
 3188     //        goto DONE;
 3189     //    }
 3190     //    int result = len;
 3191     //    do {
 3192     //        if (used >= blockSize) {
 3193     //            if (len >= bulk_width * blockSize) {
 3194     //                CTR_large_block();
 3195     //                if (len == 0)
 3196     //                    goto DONE;
 3197     //            }
 3198     //            for (;;) {
 3199     //                16ByteVector v0 = counter;
 3200     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3201     //                used = 0;
 3202     //                if (len < blockSize)
 3203     //                    break;    /* goto NEXT */
 3204     //                16ByteVector v1 = load16Bytes(in, offset);
 3205     //                v1 = v1 ^ encryptedCounter;
  3206     //                store16Bytes(v1, out, offset);
 3207     //                used = blockSize;
 3208     //                offset += blockSize;
 3209     //                len -= blockSize;
 3210     //                if (len == 0)
 3211     //                    goto DONE;
 3212     //            }
 3213     //        }
 3214     //      NEXT:
 3215     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3216     //        len--;
 3217     //    } while (len != 0);
 3218     //  DONE:
 3219     //    return result;
 3220     //
 3221     // CTR_large_block()
 3222     //    Wide bulk encryption of whole blocks.
 3223 
 3224     __ align(CodeEntryAlignment);
 3225     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3226     StubCodeMark mark(this, stub_id);
 3227     const address start = __ pc();
 3228     __ enter();
 3229 
 3230     Label DONE, CTR_large_block, large_block_return;
 3231     __ ldrw(used, Address(used_ptr));
 3232     __ cbzw(saved_len, DONE);
 3233 
 3234     __ mov(len, saved_len);
 3235     __ mov(offset, 0);
 3236 
 3237     // Compute #rounds for AES based on the length of the key array
 3238     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3239 
 3240     __ aesenc_loadkeys(key, keylen);
 3241 
 3242     {
 3243       Label L_CTR_loop, NEXT;
 3244 
 3245       __ bind(L_CTR_loop);
 3246 
 3247       __ cmp(used, block_size);
 3248       __ br(__ LO, NEXT);
 3249 
 3250       // Maybe we have a lot of data
 3251       __ subsw(rscratch1, len, bulk_width * block_size);
 3252       __ br(__ HS, CTR_large_block);
 3253       __ BIND(large_block_return);
 3254       __ cbzw(len, DONE);
 3255 
 3256       // Setup the counter
 3257       __ movi(v4, __ T4S, 0);
 3258       __ movi(v5, __ T4S, 1);
 3259       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
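             // Viewed as 2D, v4 = { 0, 1 }: the lower dword is zero and the
             // increment of 1 sits in the upper dword, which is where
             // be_add_128_64 expects the least-significant counter word.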
 3260 
 3261       // 128-bit big-endian increment
 3262       __ ld1(v0, __ T16B, counter);
 3263       __ rev64(v16, __ T16B, v0);
 3264       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3265       __ rev64(v16, __ T16B, v16);
 3266       __ st1(v16, __ T16B, counter);
 3267       // Previous counter value is in v0
 3268       // v4 contains { 0, 1 }
 3269 
 3270       {
 3271         // We have fewer than bulk_width blocks of data left. Encrypt
 3272         // them one by one until there is less than a full block
 3273         // remaining, being careful to save both the encrypted counter
 3274         // and the counter.
 3275 
 3276         Label inner_loop;
 3277         __ bind(inner_loop);
 3278         // Counter to encrypt is in v0
 3279         __ aesecb_encrypt(noreg, noreg, keylen);
 3280         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3281 
 3282         // Do we have a remaining full block?
 3283 
 3284         __ mov(used, 0);
 3285         __ cmp(len, block_size);
 3286         __ br(__ LO, NEXT);
 3287 
 3288         // Yes, we have a full block
 3289         __ ldrq(v1, Address(in, offset));
 3290         __ eor(v1, __ T16B, v1, v0);
 3291         __ strq(v1, Address(out, offset));
 3292         __ mov(used, block_size);
 3293         __ add(offset, offset, block_size);
 3294 
 3295         __ subw(len, len, block_size);
 3296         __ cbzw(len, DONE);
 3297 
 3298         // Increment the counter, store it back
 3299         __ orr(v0, __ T16B, v16, v16);
 3300         __ rev64(v16, __ T16B, v16);
 3301         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3302         __ rev64(v16, __ T16B, v16);
 3303         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3304 
 3305         __ b(inner_loop);
 3306       }
 3307 
 3308       __ BIND(NEXT);
 3309 
 3310       // Encrypt a single byte, and loop.
 3311       // We expect this to be a rare event.
 3312       __ ldrb(rscratch1, Address(in, offset));
 3313       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3314       __ eor(rscratch1, rscratch1, rscratch2);
 3315       __ strb(rscratch1, Address(out, offset));
 3316       __ add(offset, offset, 1);
 3317       __ add(used, used, 1);
  3318       __ subw(len, len, 1);
 3319       __ cbnzw(len, L_CTR_loop);
 3320     }
 3321 
 3322     __ bind(DONE);
 3323     __ strw(used, Address(used_ptr));
 3324     __ mov(r0, saved_len);
 3325 
 3326     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3327     __ ret(lr);
 3328 
 3329     // Bulk encryption
 3330 
  3331     __ BIND(CTR_large_block);
 3332     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3333 
 3334     if (bulk_width == 8) {
 3335       __ sub(sp, sp, 4 * 16);
 3336       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3337     }
 3338     __ sub(sp, sp, 4 * 16);
 3339     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3340     RegSet saved_regs = (RegSet::of(in, out, offset)
 3341                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3342     __ push(saved_regs, sp);
 3343     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3344     __ add(in, in, offset);
 3345     __ add(out, out, offset);
 3346 
 3347     // Keys should already be loaded into the correct registers
 3348 
 3349     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3350     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3351 
 3352     // AES/CTR loop
 3353     {
 3354       Label L_CTR_loop;
 3355       __ BIND(L_CTR_loop);
 3356 
 3357       // Setup the counters
 3358       __ movi(v8, __ T4S, 0);
 3359       __ movi(v9, __ T4S, 1);
 3360       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3361 
 3362       for (int i = 0; i < bulk_width; i++) {
 3363         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3364         __ rev64(v0_ofs, __ T16B, v16);
 3365         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3366       }
 3367 
 3368       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3369 
 3370       // Encrypt the counters
 3371       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3372 
 3373       if (bulk_width == 8) {
 3374         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3375       }
 3376 
 3377       // XOR the encrypted counters with the inputs
 3378       for (int i = 0; i < bulk_width; i++) {
 3379         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3380         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3381         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3382       }
 3383 
 3384       // Write the encrypted data
 3385       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3386       if (bulk_width == 8) {
 3387         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3388       }
 3389 
 3390       __ subw(len, len, 16 * bulk_width);
 3391       __ cbnzw(len, L_CTR_loop);
 3392     }
 3393 
 3394     // Save the counter back where it goes
 3395     __ rev64(v16, __ T16B, v16);
 3396     __ st1(v16, __ T16B, counter);
 3397 
 3398     __ pop(saved_regs, sp);
 3399 
 3400     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3401     if (bulk_width == 8) {
 3402       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3403     }
 3404 
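           // Work out how many bytes the bulk loop consumed, advance offset
           // past them, and leave the remainder in len for the byte-at-a-time
           // path. used is set to 16 because the last encrypted counter block
           // produced by the bulk loop has been fully consumed.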
 3405     __ andr(rscratch1, len, -16 * bulk_width);
 3406     __ sub(len, len, rscratch1);
 3407     __ add(offset, offset, rscratch1);
 3408     __ mov(used, 16);
 3409     __ strw(used, Address(used_ptr));
 3410     __ b(large_block_return);
 3411 
 3412     return start;
 3413   }
 3414 
 3415   // Vector AES Galois Counter Mode implementation. Parameters:
 3416   //
 3417   // in = c_rarg0
 3418   // len = c_rarg1
 3419   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3420   // out = c_rarg3
 3421   // key = c_rarg4
 3422   // state = c_rarg5 - GHASH.state
 3423   // subkeyHtbl = c_rarg6 - powers of H
 3424   // counter = c_rarg7 - 16 bytes of CTR
 3425   // return - number of processed bytes
 3426   address generate_galoisCounterMode_AESCrypt() {
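           // GHASH works in GF(2^128) reduced by x^128 + x^7 + x^2 + x + 1;
           // only the low-order terms (0x87) are needed for the reduction,
           // so they are materialized here and handed to
           // ghash_processBlocks_wide below.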
 3427     address ghash_polynomial = __ pc();
 3428     __ emit_int64(0x87);  // The low-order bits of the field
 3429                           // polynomial (i.e. p = z^7+z^2+z+1)
 3430                           // repeated in the low and high parts of a
 3431                           // 128-bit vector
 3432     __ emit_int64(0x87);
 3433 
 3434     __ align(CodeEntryAlignment);
 3435     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3436     StubCodeMark mark(this, stub_id);
 3437     address start = __ pc();
 3438     __ enter();
 3439 
 3440     const Register in = c_rarg0;
 3441     const Register len = c_rarg1;
 3442     const Register ct = c_rarg2;
  3443     const Register out = c_rarg3;
 3445 
 3446     const Register key = c_rarg4;
 3447     const Register state = c_rarg5;
 3448 
 3449     const Register subkeyHtbl = c_rarg6;
 3450 
  3451     const Register counter = c_rarg7; // 16 bytes of CTR, updated with the incremented counter at the end
 3452 
 3453     const Register keylen = r10;
 3454     // Save state before entering routine
 3455     __ sub(sp, sp, 4 * 16);
 3456     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3457     __ sub(sp, sp, 4 * 16);
 3458     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3459 
 3460     // __ andr(len, len, -512);
 3461     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3462     __ str(len, __ pre(sp, -2 * wordSize));
 3463 
 3464     Label DONE;
 3465     __ cbz(len, DONE);
 3466 
 3467     // Compute #rounds for AES based on the length of the key array
 3468     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3469 
 3470     __ aesenc_loadkeys(key, keylen);
 3471     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3472     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3473 
 3474     // AES/CTR loop
 3475     {
 3476       Label L_CTR_loop;
 3477       __ BIND(L_CTR_loop);
 3478 
 3479       // Setup the counters
 3480       __ movi(v8, __ T4S, 0);
 3481       __ movi(v9, __ T4S, 1);
 3482       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3483 
  3484       assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
 3485       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3486         FloatRegister f = as_FloatRegister(i);
 3487         __ rev32(f, __ T16B, v16);
 3488         __ addv(v16, __ T4S, v16, v8);
 3489       }
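             // Note: unlike the generic CTR stub above, GCM increments only
             // the low 32 bits of the counter block (inc32), so a lane-wise
             // 4S add of { 0, 0, 0, 1 } is sufficient here.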
 3490 
 3491       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3492 
 3493       // Encrypt the counters
 3494       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3495 
 3496       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3497 
 3498       // XOR the encrypted counters with the inputs
 3499       for (int i = 0; i < 8; i++) {
 3500         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3501         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3502         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3503       }
 3504       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3505       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3506 
 3507       __ subw(len, len, 16 * 8);
 3508       __ cbnzw(len, L_CTR_loop);
 3509     }
 3510 
 3511     __ rev32(v16, __ T16B, v16);
 3512     __ st1(v16, __ T16B, counter);
 3513 
 3514     __ ldr(len, Address(sp));
 3515     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3516 
 3517     // GHASH/CTR loop
 3518     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3519                                 len, /*unrolls*/4);
 3520 
 3521 #ifdef ASSERT
 3522     { Label L;
 3523       __ cmp(len, (unsigned char)0);
 3524       __ br(Assembler::EQ, L);
 3525       __ stop("stubGenerator: abort");
 3526       __ bind(L);
  3527     }
 3528 #endif
 3529 
  3530     __ bind(DONE);
 3531     // Return the number of bytes processed
 3532     __ ldr(r0, __ post(sp, 2 * wordSize));
 3533 
 3534     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3535     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3536 
 3537     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3538     __ ret(lr);
  3539     return start;
 3540   }
 3541 
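         // Cached64Bytes keeps one 64-byte input block in eight 64-bit GPRs so
         // that the MD5 loop below can read each 4-byte message word with a
         // ubfx instead of reloading it from memory. Typical use: construct it
         // with a RegSet of 8 registers, call gen_loads(buf) once per block,
         // then extract_u32(dst, i) to fetch message word i.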
 3542   class Cached64Bytes {
 3543   private:
 3544     MacroAssembler *_masm;
 3545     Register _regs[8];
 3546 
 3547   public:
 3548     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
  3549       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3550       auto it = rs.begin();
 3551       for (auto &r: _regs) {
 3552         r = *it;
 3553         ++it;
 3554       }
 3555     }
 3556 
 3557     void gen_loads(Register base) {
 3558       for (int i = 0; i < 8; i += 2) {
 3559         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3560       }
 3561     }
 3562 
 3563     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3564     void extract_u32(Register dest, int i) {
 3565       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3566     }
 3567   };
 3568 
  3569   // Utility routines for MD5. Each helper implements one MD5 step.
  3570   // They clobber r10 and r11.
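         //
         // For reference, the corresponding round functions from RFC 1321
         // (the helpers compute logically equivalent forms):
         //   F(x, y, z) = (x & y) | (~x & z)  ==  z ^ (x & (y ^ z))
         //   G(x, y, z) = (x & z) | (y & ~z)
         //   H(x, y, z) = x ^ y ^ z
         //   I(x, y, z) = y ^ (x | ~z)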
 3571   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3572               int k, int s, int t) {
 3573     Register rscratch3 = r10;
 3574     Register rscratch4 = r11;
 3575 
 3576     __ eorw(rscratch3, r3, r4);
 3577     __ movw(rscratch2, t);
 3578     __ andw(rscratch3, rscratch3, r2);
 3579     __ addw(rscratch4, r1, rscratch2);
 3580     reg_cache.extract_u32(rscratch1, k);
 3581     __ eorw(rscratch3, rscratch3, r4);
 3582     __ addw(rscratch4, rscratch4, rscratch1);
 3583     __ addw(rscratch3, rscratch3, rscratch4);
 3584     __ rorw(rscratch2, rscratch3, 32 - s);
 3585     __ addw(r1, rscratch2, r2);
 3586   }
 3587 
 3588   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3589               int k, int s, int t) {
 3590     Register rscratch3 = r10;
 3591     Register rscratch4 = r11;
 3592 
 3593     reg_cache.extract_u32(rscratch1, k);
 3594     __ movw(rscratch2, t);
 3595     __ addw(rscratch4, r1, rscratch2);
 3596     __ addw(rscratch4, rscratch4, rscratch1);
 3597     __ bicw(rscratch2, r3, r4);
 3598     __ andw(rscratch3, r2, r4);
 3599     __ addw(rscratch2, rscratch2, rscratch4);
 3600     __ addw(rscratch2, rscratch2, rscratch3);
 3601     __ rorw(rscratch2, rscratch2, 32 - s);
 3602     __ addw(r1, rscratch2, r2);
 3603   }
 3604 
 3605   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3606               int k, int s, int t) {
 3607     Register rscratch3 = r10;
 3608     Register rscratch4 = r11;
 3609 
 3610     __ eorw(rscratch3, r3, r4);
 3611     __ movw(rscratch2, t);
 3612     __ addw(rscratch4, r1, rscratch2);
 3613     reg_cache.extract_u32(rscratch1, k);
 3614     __ eorw(rscratch3, rscratch3, r2);
 3615     __ addw(rscratch4, rscratch4, rscratch1);
 3616     __ addw(rscratch3, rscratch3, rscratch4);
 3617     __ rorw(rscratch2, rscratch3, 32 - s);
 3618     __ addw(r1, rscratch2, r2);
 3619   }
 3620 
 3621   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3622               int k, int s, int t) {
 3623     Register rscratch3 = r10;
 3624     Register rscratch4 = r11;
 3625 
 3626     __ movw(rscratch3, t);
 3627     __ ornw(rscratch2, r2, r4);
 3628     __ addw(rscratch4, r1, rscratch3);
 3629     reg_cache.extract_u32(rscratch1, k);
 3630     __ eorw(rscratch3, rscratch2, r3);
 3631     __ addw(rscratch4, rscratch4, rscratch1);
 3632     __ addw(rscratch3, rscratch3, rscratch4);
 3633     __ rorw(rscratch2, rscratch3, 32 - s);
 3634     __ addw(r1, rscratch2, r2);
 3635   }
 3636 
 3637   // Arguments:
 3638   //
 3639   // Inputs:
 3640   //   c_rarg0   - byte[]  source+offset
 3641   //   c_rarg1   - int[]   SHA.state
 3642   //   c_rarg2   - int     offset
 3643   //   c_rarg3   - int     limit
 3644   //
 3645   address generate_md5_implCompress(StubId stub_id) {
 3646     bool multi_block;
 3647     switch (stub_id) {
 3648     case StubId::stubgen_md5_implCompress_id:
 3649       multi_block = false;
 3650       break;
 3651     case StubId::stubgen_md5_implCompressMB_id:
 3652       multi_block = true;
 3653       break;
 3654     default:
 3655       ShouldNotReachHere();
 3656     }
 3657     __ align(CodeEntryAlignment);
 3658 
 3659     StubCodeMark mark(this, stub_id);
 3660     address start = __ pc();
 3661 
 3662     Register buf       = c_rarg0;
 3663     Register state     = c_rarg1;
 3664     Register ofs       = c_rarg2;
 3665     Register limit     = c_rarg3;
 3666     Register a         = r4;
 3667     Register b         = r5;
 3668     Register c         = r6;
 3669     Register d         = r7;
 3670     Register rscratch3 = r10;
 3671     Register rscratch4 = r11;
 3672 
 3673     Register state_regs[2] = { r12, r13 };
 3674     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3675     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3676 
 3677     __ push(saved_regs, sp);
 3678 
 3679     __ ldp(state_regs[0], state_regs[1], Address(state));
 3680     __ ubfx(a, state_regs[0],  0, 32);
 3681     __ ubfx(b, state_regs[0], 32, 32);
 3682     __ ubfx(c, state_regs[1],  0, 32);
 3683     __ ubfx(d, state_regs[1], 32, 32);
 3684 
 3685     Label md5_loop;
 3686     __ BIND(md5_loop);
 3687 
 3688     reg_cache.gen_loads(buf);
 3689 
 3690     // Round 1
 3691     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3692     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3693     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3694     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3695     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3696     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3697     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3698     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3699     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3700     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3701     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3702     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3703     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3704     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3705     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3706     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3707 
 3708     // Round 2
 3709     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3710     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3711     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3712     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3713     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3714     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3715     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3716     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3717     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3718     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3719     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3720     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3721     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3722     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3723     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3724     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3725 
 3726     // Round 3
 3727     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3728     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3729     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3730     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3731     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3732     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3733     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3734     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3735     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3736     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3737     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3738     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3739     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3740     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3741     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3742     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3743 
 3744     // Round 4
 3745     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3746     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3747     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3748     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3749     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3750     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3751     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3752     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3753     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3754     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3755     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3756     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3757     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3758     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3759     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3760     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3761 
 3762     __ addw(a, state_regs[0], a);
 3763     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3764     __ addw(b, rscratch2, b);
 3765     __ addw(c, state_regs[1], c);
 3766     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3767     __ addw(d, rscratch4, d);
 3768 
 3769     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3770     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3771 
 3772     if (multi_block) {
 3773       __ add(buf, buf, 64);
 3774       __ add(ofs, ofs, 64);
 3775       __ cmp(ofs, limit);
 3776       __ br(Assembler::LE, md5_loop);
 3777       __ mov(c_rarg0, ofs); // return ofs
 3778     }
 3779 
 3780     // write hash values back in the correct order
 3781     __ stp(state_regs[0], state_regs[1], Address(state));
 3782 
 3783     __ pop(saved_regs, sp);
 3784 
 3785     __ ret(lr);
 3786 
 3787     return start;
 3788   }
 3789 
 3790   // Arguments:
 3791   //
 3792   // Inputs:
 3793   //   c_rarg0   - byte[]  source+offset
 3794   //   c_rarg1   - int[]   SHA.state
 3795   //   c_rarg2   - int     offset
 3796   //   c_rarg3   - int     limit
 3797   //
 3798   address generate_sha1_implCompress(StubId stub_id) {
 3799     bool multi_block;
 3800     switch (stub_id) {
 3801     case StubId::stubgen_sha1_implCompress_id:
 3802       multi_block = false;
 3803       break;
 3804     case StubId::stubgen_sha1_implCompressMB_id:
 3805       multi_block = true;
 3806       break;
 3807     default:
 3808       ShouldNotReachHere();
 3809     }
 3810 
 3811     __ align(CodeEntryAlignment);
 3812 
 3813     StubCodeMark mark(this, stub_id);
 3814     address start = __ pc();
 3815 
 3816     Register buf   = c_rarg0;
 3817     Register state = c_rarg1;
 3818     Register ofs   = c_rarg2;
 3819     Register limit = c_rarg3;
 3820 
 3821     Label keys;
 3822     Label sha1_loop;
 3823 
  3824     // load the four round constants into v0..v3
 3825     __ adr(rscratch1, keys);
 3826     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
  3827     // load the 5-word state into v6, v7
 3828     __ ldrq(v6, Address(state, 0));
 3829     __ ldrs(v7, Address(state, 16));
 3830 
 3831 
 3832     __ BIND(sha1_loop);
 3833     // load 64 bytes of data into v16..v19
 3834     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3835     __ rev32(v16, __ T16B, v16);
 3836     __ rev32(v17, __ T16B, v17);
 3837     __ rev32(v18, __ T16B, v18);
 3838     __ rev32(v19, __ T16B, v19);
 3839 
 3840     // do the sha1
 3841     __ addv(v4, __ T4S, v16, v0);
 3842     __ orr(v20, __ T16B, v6, v6);
 3843 
 3844     FloatRegister d0 = v16;
 3845     FloatRegister d1 = v17;
 3846     FloatRegister d2 = v18;
 3847     FloatRegister d3 = v19;
 3848 
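           // Each loop iteration processes four of SHA-1's 80 rounds: sha1c
           // (Ch) covers rounds 0-19, sha1m (Maj) rounds 40-59, and sha1p
           // (Parity) rounds 20-39 and 60-79.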
 3849     for (int round = 0; round < 20; round++) {
 3850       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3851       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3852       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3853       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3854       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3855 
 3856       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3857       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3858       __ sha1h(tmp2, __ T4S, v20);
 3859       if (round < 5)
 3860         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3861       else if (round < 10 || round >= 15)
 3862         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3863       else
 3864         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3865       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3866 
 3867       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3868     }
 3869 
 3870     __ addv(v7, __ T2S, v7, v21);
 3871     __ addv(v6, __ T4S, v6, v20);
 3872 
 3873     if (multi_block) {
 3874       __ add(ofs, ofs, 64);
 3875       __ cmp(ofs, limit);
 3876       __ br(Assembler::LE, sha1_loop);
 3877       __ mov(c_rarg0, ofs); // return ofs
 3878     }
 3879 
 3880     __ strq(v6, Address(state, 0));
 3881     __ strs(v7, Address(state, 16));
 3882 
 3883     __ ret(lr);
 3884 
 3885     __ bind(keys);
 3886     __ emit_int32(0x5a827999);
 3887     __ emit_int32(0x6ed9eba1);
 3888     __ emit_int32(0x8f1bbcdc);
 3889     __ emit_int32(0xca62c1d6);
 3890 
 3891     return start;
 3892   }
 3893 
 3894 
 3895   // Arguments:
 3896   //
 3897   // Inputs:
 3898   //   c_rarg0   - byte[]  source+offset
 3899   //   c_rarg1   - int[]   SHA.state
 3900   //   c_rarg2   - int     offset
 3901   //   c_rarg3   - int     limit
 3902   //
 3903   address generate_sha256_implCompress(StubId stub_id) {
 3904     bool multi_block;
 3905     switch (stub_id) {
 3906     case StubId::stubgen_sha256_implCompress_id:
 3907       multi_block = false;
 3908       break;
 3909     case StubId::stubgen_sha256_implCompressMB_id:
 3910       multi_block = true;
 3911       break;
 3912     default:
 3913       ShouldNotReachHere();
 3914     }
 3915 
 3916     static const uint32_t round_consts[64] = {
 3917       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3918       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3919       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3920       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3921       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3922       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3923       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3924       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3925       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3926       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3927       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3928       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3929       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3930       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3931       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3932       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3933     };
 3934 
 3935     __ align(CodeEntryAlignment);
 3936 
 3937     StubCodeMark mark(this, stub_id);
 3938     address start = __ pc();
 3939 
 3940     Register buf   = c_rarg0;
 3941     Register state = c_rarg1;
 3942     Register ofs   = c_rarg2;
 3943     Register limit = c_rarg3;
 3944 
  3945     Label sha256_loop;
 3946 
 3947     __ stpd(v8, v9, __ pre(sp, -32));
 3948     __ stpd(v10, v11, Address(sp, 16));
 3949 
  3950     // dga == v0
  3951     // dgb == v1
  3952     // dg0 == v2
  3953     // dg1 == v3
  3954     // dg2 == v4
  3955     // t0 == v6
  3956     // t1 == v7
 3957 
  3958     // load the 64 round constants into v16..v31
 3959     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3960     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3961     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3962     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3963     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3964 
 3965     // load 8 words (256 bits) state
 3966     __ ldpq(v0, v1, state);
 3967 
  3968     __ BIND(sha256_loop);
 3969     // load 64 bytes of data into v8..v11
 3970     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3971     __ rev32(v8, __ T16B, v8);
 3972     __ rev32(v9, __ T16B, v9);
 3973     __ rev32(v10, __ T16B, v10);
 3974     __ rev32(v11, __ T16B, v11);
 3975 
 3976     __ addv(v6, __ T4S, v8, v16);
 3977     __ orr(v2, __ T16B, v0, v0);
 3978     __ orr(v3, __ T16B, v1, v1);
 3979 
 3980     FloatRegister d0 = v8;
 3981     FloatRegister d1 = v9;
 3982     FloatRegister d2 = v10;
 3983     FloatRegister d3 = v11;
 3984 
 3985 
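           // Each loop iteration handles four of the 64 SHA-256 rounds via
           // sha256h/sha256h2; message-schedule expansion (sha256su0/su1)
           // is only needed for the first 12 iterations.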
 3986     for (int round = 0; round < 16; round++) {
 3987       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3988       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3989       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3990       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3991 
 3992       if (round < 12) __ sha256su0(d0, __ T4S, d1);
  3993       __ orr(v4, __ T16B, v2, v2);
 3994       if (round < 15)
 3995         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3996       __ sha256h(v2, __ T4S, v3, tmp2);
 3997       __ sha256h2(v3, __ T4S, v4, tmp2);
 3998       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3999 
 4000       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4001     }
 4002 
 4003     __ addv(v0, __ T4S, v0, v2);
 4004     __ addv(v1, __ T4S, v1, v3);
 4005 
 4006     if (multi_block) {
 4007       __ add(ofs, ofs, 64);
 4008       __ cmp(ofs, limit);
  4009       __ br(Assembler::LE, sha256_loop);
 4010       __ mov(c_rarg0, ofs); // return ofs
 4011     }
 4012 
 4013     __ ldpd(v10, v11, Address(sp, 16));
 4014     __ ldpd(v8, v9, __ post(sp, 32));
 4015 
 4016     __ stpq(v0, v1, state);
 4017 
 4018     __ ret(lr);
 4019 
 4020     return start;
 4021   }
 4022 
 4023   // Double rounds for sha512.
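         // Each call covers two of SHA-512's 80 rounds; the compression loop
         // below issues 40 of these per 128-byte block. Message-schedule
         // updates are only required for the first 32 double rounds, and a
         // new round constant is only loaded for the first 36 (hence the
         // checks on dr).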
 4024   void sha512_dround(int dr,
 4025                      FloatRegister vi0, FloatRegister vi1,
 4026                      FloatRegister vi2, FloatRegister vi3,
 4027                      FloatRegister vi4, FloatRegister vrc0,
 4028                      FloatRegister vrc1, FloatRegister vin0,
 4029                      FloatRegister vin1, FloatRegister vin2,
 4030                      FloatRegister vin3, FloatRegister vin4) {
 4031       if (dr < 36) {
 4032         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4033       }
 4034       __ addv(v5, __ T2D, vrc0, vin0);
 4035       __ ext(v6, __ T16B, vi2, vi3, 8);
 4036       __ ext(v5, __ T16B, v5, v5, 8);
 4037       __ ext(v7, __ T16B, vi1, vi2, 8);
 4038       __ addv(vi3, __ T2D, vi3, v5);
 4039       if (dr < 32) {
 4040         __ ext(v5, __ T16B, vin3, vin4, 8);
 4041         __ sha512su0(vin0, __ T2D, vin1);
 4042       }
 4043       __ sha512h(vi3, __ T2D, v6, v7);
 4044       if (dr < 32) {
 4045         __ sha512su1(vin0, __ T2D, vin2, v5);
 4046       }
 4047       __ addv(vi4, __ T2D, vi1, vi3);
 4048       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4049   }
 4050 
 4051   // Arguments:
 4052   //
 4053   // Inputs:
 4054   //   c_rarg0   - byte[]  source+offset
 4055   //   c_rarg1   - int[]   SHA.state
 4056   //   c_rarg2   - int     offset
 4057   //   c_rarg3   - int     limit
 4058   //
 4059   address generate_sha512_implCompress(StubId stub_id) {
 4060     bool multi_block;
 4061     switch (stub_id) {
 4062     case StubId::stubgen_sha512_implCompress_id:
 4063       multi_block = false;
 4064       break;
 4065     case StubId::stubgen_sha512_implCompressMB_id:
 4066       multi_block = true;
 4067       break;
 4068     default:
 4069       ShouldNotReachHere();
 4070     }
 4071 
 4072     static const uint64_t round_consts[80] = {
 4073       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4074       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4075       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4076       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4077       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4078       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4079       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4080       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4081       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4082       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4083       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4084       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4085       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4086       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4087       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4088       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4089       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4090       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4091       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4092       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4093       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4094       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4095       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4096       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4097       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4098       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4099       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4100     };
 4101 
 4102     __ align(CodeEntryAlignment);
 4103 
 4104     StubCodeMark mark(this, stub_id);
 4105     address start = __ pc();
 4106 
 4107     Register buf   = c_rarg0;
 4108     Register state = c_rarg1;
 4109     Register ofs   = c_rarg2;
 4110     Register limit = c_rarg3;
 4111 
 4112     __ stpd(v8, v9, __ pre(sp, -64));
 4113     __ stpd(v10, v11, Address(sp, 16));
 4114     __ stpd(v12, v13, Address(sp, 32));
 4115     __ stpd(v14, v15, Address(sp, 48));
 4116 
 4117     Label sha512_loop;
 4118 
 4119     // load state
 4120     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4121 
 4122     // load first 4 round constants
 4123     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4124     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4125 
 4126     __ BIND(sha512_loop);
 4127     // load 128B of data into v12..v19
 4128     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4129     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4130     __ rev64(v12, __ T16B, v12);
 4131     __ rev64(v13, __ T16B, v13);
 4132     __ rev64(v14, __ T16B, v14);
 4133     __ rev64(v15, __ T16B, v15);
 4134     __ rev64(v16, __ T16B, v16);
 4135     __ rev64(v17, __ T16B, v17);
 4136     __ rev64(v18, __ T16B, v18);
 4137     __ rev64(v19, __ T16B, v19);
 4138 
 4139     __ mov(rscratch2, rscratch1);
 4140 
 4141     __ mov(v0, __ T16B, v8);
 4142     __ mov(v1, __ T16B, v9);
 4143     __ mov(v2, __ T16B, v10);
 4144     __ mov(v3, __ T16B, v11);
 4145 
 4146     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4147     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4148     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4149     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4150     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4151     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4152     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4153     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4154     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4155     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4156     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4157     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4158     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4159     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4160     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4161     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4162     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4163     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4164     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4165     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4166     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4167     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4168     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4169     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4170     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4171     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4172     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4173     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4174     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4175     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4176     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4177     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4178     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4179     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4180     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4181     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4182     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4183     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4184     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4185     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
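       // n.b. the last 8 double-rounds no longer need to extend the message
       // schedule (and the final few no longer need to prefetch round constants),
       // so the corresponding arguments are passed as unused v0 placeholders.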
 4186 
 4187     __ addv(v8, __ T2D, v8, v0);
 4188     __ addv(v9, __ T2D, v9, v1);
 4189     __ addv(v10, __ T2D, v10, v2);
 4190     __ addv(v11, __ T2D, v11, v3);
 4191 
 4192     if (multi_block) {
 4193       __ add(ofs, ofs, 128);
 4194       __ cmp(ofs, limit);
 4195       __ br(Assembler::LE, sha512_loop);
 4196       __ mov(c_rarg0, ofs); // return ofs
 4197     }
 4198 
 4199     __ st1(v8, v9, v10, v11, __ T2D, state);
 4200 
 4201     __ ldpd(v14, v15, Address(sp, 48));
 4202     __ ldpd(v12, v13, Address(sp, 32));
 4203     __ ldpd(v10, v11, Address(sp, 16));
 4204     __ ldpd(v8, v9, __ post(sp, 64));
 4205 
 4206     __ ret(lr);
 4207 
 4208     return start;
 4209   }
 4210 
 4211   // Execute one round of keccak of two computations in parallel.
  4212   // One of the states should be loaded into the lower halves of
  4213   // the vector registers v0-v24 and the other into the upper halves
  4214   // of those registers. The ld1r instruction loads the round constant
  4215   // into both halves of register v31.
  4216   // Intermediate results c0...c4 and d0...d4 are computed
  4217   // in registers v25...v30.
  4218   // All vector instructions that are used operate on both register
  4219   // halves in parallel.
  4220   // If only a single computation is needed, it suffices to load the lower halves.
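         // The instruction groups below map onto the usual Keccak-f[1600] phases:
         //   theta:  eor3/rax1 compute the column parities c0..c4 and the
         //           mixing values d0..d4;
         //   rho/pi: the xar instructions rotate each lane and move it to its
         //           permuted position (primed names denote relocated lanes);
         //   chi:    the bcax instructions apply a ^= ~b & c along each row;
         //   iota:   the final eor folds the round constant in v31 into a0.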
 4221   void keccak_round(Register rscratch1) {
 4222   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  4223   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  4224   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4225   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4226   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4227   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4228   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4229   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4230   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4231   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4232 
 4233   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4234   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4235   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4236   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4237   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4238 
 4239   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4240   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  4241   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4242   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4243   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4244   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4245   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4246   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4247   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4248   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4249   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4250   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4251   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4252   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4253   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4254   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4255   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4256   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4257   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4258   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4259   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4260   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4261   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4262   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4263   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4264 
 4265   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4266   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4267   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4268   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4269   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4270 
 4271   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4272 
 4273   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4274   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4275   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4276   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4277   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4278 
 4279   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4280   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4281   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4282   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4283   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4284 
 4285   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4286   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4287   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4288   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4289   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4290 
 4291   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4292   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4293   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4294   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4295   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4296 
 4297   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4298   }
 4299 
 4300   // Arguments:
 4301   //
 4302   // Inputs:
 4303   //   c_rarg0   - byte[]  source+offset
 4304   //   c_rarg1   - byte[]  SHA.state
 4305   //   c_rarg2   - int     block_size
 4306   //   c_rarg3   - int     offset
 4307   //   c_rarg4   - int     limit
 4308   //
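         // The block_size argument selects the variant: 72 -> SHA3-512,
         // 104 -> SHA3-384, 136 -> SHA3-256 or SHAKE256, 144 -> SHA3-224,
         // 168 -> SHAKE128 (see the bit tests on block_size below).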
 4309   address generate_sha3_implCompress(StubId stub_id) {
 4310     bool multi_block;
 4311     switch (stub_id) {
 4312     case StubId::stubgen_sha3_implCompress_id:
 4313       multi_block = false;
 4314       break;
 4315     case StubId::stubgen_sha3_implCompressMB_id:
 4316       multi_block = true;
 4317       break;
 4318     default:
 4319       ShouldNotReachHere();
 4320     }
 4321 
 4322     static const uint64_t round_consts[24] = {
 4323       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4324       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4325       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4326       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4327       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4328       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4329       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4330       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4331     };
 4332 
 4333     __ align(CodeEntryAlignment);
 4334 
 4335     StubCodeMark mark(this, stub_id);
 4336     address start = __ pc();
 4337 
 4338     Register buf           = c_rarg0;
 4339     Register state         = c_rarg1;
 4340     Register block_size    = c_rarg2;
 4341     Register ofs           = c_rarg3;
 4342     Register limit         = c_rarg4;
 4343 
 4344     Label sha3_loop, rounds24_loop;
 4345     Label sha3_512_or_sha3_384, shake128;
 4346 
 4347     __ stpd(v8, v9, __ pre(sp, -64));
 4348     __ stpd(v10, v11, Address(sp, 16));
 4349     __ stpd(v12, v13, Address(sp, 32));
 4350     __ stpd(v14, v15, Address(sp, 48));
 4351 
 4352     // load state
 4353     __ add(rscratch1, state, 32);
 4354     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4355     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4356     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4357     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4358     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4359     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4360     __ ld1(v24, __ T1D, rscratch1);
 4361 
 4362     __ BIND(sha3_loop);
 4363 
 4364     // 24 keccak rounds
 4365     __ movw(rscratch2, 24);
 4366 
 4367     // load round_constants base
 4368     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4369 
 4370     // load input
 4371     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4372     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4373     __ eor(v0, __ T8B, v0, v25);
 4374     __ eor(v1, __ T8B, v1, v26);
 4375     __ eor(v2, __ T8B, v2, v27);
 4376     __ eor(v3, __ T8B, v3, v28);
 4377     __ eor(v4, __ T8B, v4, v29);
 4378     __ eor(v5, __ T8B, v5, v30);
 4379     __ eor(v6, __ T8B, v6, v31);
 4380 
 4381     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4382     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4383 
 4384     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4385     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4386     __ eor(v7, __ T8B, v7, v25);
 4387     __ eor(v8, __ T8B, v8, v26);
 4388     __ eor(v9, __ T8B, v9, v27);
 4389     __ eor(v10, __ T8B, v10, v28);
 4390     __ eor(v11, __ T8B, v11, v29);
 4391     __ eor(v12, __ T8B, v12, v30);
 4392     __ eor(v13, __ T8B, v13, v31);
 4393 
 4394     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4395     __ eor(v14, __ T8B, v14, v25);
 4396     __ eor(v15, __ T8B, v15, v26);
 4397     __ eor(v16, __ T8B, v16, v27);
 4398 
 4399     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4400     __ andw(c_rarg5, block_size, 48);
 4401     __ cbzw(c_rarg5, rounds24_loop);
 4402 
 4403     __ tbnz(block_size, 5, shake128);
 4404     // block_size == 144, bit5 == 0, SHA3-224
 4405     __ ldrd(v28, __ post(buf, 8));
 4406     __ eor(v17, __ T8B, v17, v28);
 4407     __ b(rounds24_loop);
 4408 
 4409     __ BIND(shake128);
 4410     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4411     __ eor(v17, __ T8B, v17, v28);
 4412     __ eor(v18, __ T8B, v18, v29);
 4413     __ eor(v19, __ T8B, v19, v30);
 4414     __ eor(v20, __ T8B, v20, v31);
 4415     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4416 
 4417     __ BIND(sha3_512_or_sha3_384);
 4418     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4419     __ eor(v7, __ T8B, v7, v25);
 4420     __ eor(v8, __ T8B, v8, v26);
 4421     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4422 
 4423     // SHA3-384
 4424     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4425     __ eor(v9,  __ T8B, v9,  v27);
 4426     __ eor(v10, __ T8B, v10, v28);
 4427     __ eor(v11, __ T8B, v11, v29);
 4428     __ eor(v12, __ T8B, v12, v30);
 4429 
 4430     __ BIND(rounds24_loop);
 4431     __ subw(rscratch2, rscratch2, 1);
 4432 
 4433     keccak_round(rscratch1);
 4434 
 4435     __ cbnzw(rscratch2, rounds24_loop);
 4436 
 4437     if (multi_block) {
 4438       __ add(ofs, ofs, block_size);
 4439       __ cmp(ofs, limit);
 4440       __ br(Assembler::LE, sha3_loop);
 4441       __ mov(c_rarg0, ofs); // return ofs
 4442     }
 4443 
 4444     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4445     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4446     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4447     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4448     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4449     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4450     __ st1(v24, __ T1D, state);
 4451 
 4452     // restore callee-saved registers
 4453     __ ldpd(v14, v15, Address(sp, 48));
 4454     __ ldpd(v12, v13, Address(sp, 32));
 4455     __ ldpd(v10, v11, Address(sp, 16));
 4456     __ ldpd(v8, v9, __ post(sp, 64));
 4457 
 4458     __ ret(lr);
 4459 
 4460     return start;
 4461   }
 4462 
 4463   // Inputs:
 4464   //   c_rarg0   - long[]  state0
 4465   //   c_rarg1   - long[]  state1
 4466   address generate_double_keccak() {
 4467     static const uint64_t round_consts[24] = {
 4468       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4469       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4470       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4471       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4472       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4473       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4474       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4475       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4476     };
 4477 
  4478     // Implements the double_keccak() method of the
  4479     // sun.security.provider.SHA3Parallel class
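           // Lane 0 of each of v0-v24 holds a word of state0 and lane 1 the
           // corresponding word of state1, so each keccak_round() call below
           // advances both permutations in lock-step.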
 4480     __ align(CodeEntryAlignment);
 4481     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4482     address start = __ pc();
 4483     __ enter();
 4484 
 4485     Register state0        = c_rarg0;
 4486     Register state1        = c_rarg1;
 4487 
 4488     Label rounds24_loop;
 4489 
 4490     // save callee-saved registers
 4491     __ stpd(v8, v9, __ pre(sp, -64));
 4492     __ stpd(v10, v11, Address(sp, 16));
 4493     __ stpd(v12, v13, Address(sp, 32));
 4494     __ stpd(v14, v15, Address(sp, 48));
 4495 
 4496     // load states
 4497     __ add(rscratch1, state0, 32);
 4498     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4499     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4500     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4501     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4502     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4503     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4504     __ ld1(v24, __ D, 0, rscratch1);
 4505     __ add(rscratch1, state1, 32);
 4506     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4507     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4508     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4509     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4510     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4511     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4512     __ ld1(v24, __ D, 1, rscratch1);
 4513 
 4514     // 24 keccak rounds
 4515     __ movw(rscratch2, 24);
 4516 
 4517     // load round_constants base
 4518     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4519 
 4520     __ BIND(rounds24_loop);
 4521     __ subw(rscratch2, rscratch2, 1);
 4522     keccak_round(rscratch1);
 4523     __ cbnzw(rscratch2, rounds24_loop);
 4524 
 4525     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4526     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4527     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4528     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4529     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4530     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4531     __ st1(v24, __ D, 0, state0);
 4532     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4533     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4534     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4535     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4536     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4537     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4538     __ st1(v24, __ D, 1, state1);
 4539 
 4540     // restore callee-saved vector registers
 4541     __ ldpd(v14, v15, Address(sp, 48));
 4542     __ ldpd(v12, v13, Address(sp, 32));
 4543     __ ldpd(v10, v11, Address(sp, 16));
 4544     __ ldpd(v8, v9, __ post(sp, 64));
 4545 
 4546     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4547     __ mov(r0, zr); // return 0
 4548     __ ret(lr);
 4549 
 4550     return start;
 4551   }
 4552 
 4553   // ChaCha20 block function.  This version parallelizes the 32-bit
 4554   // state elements on each of 16 vectors, producing 4 blocks of
 4555   // keystream at a time.
 4556   //
 4557   // state (int[16]) = c_rarg0
 4558   // keystream (byte[256]) = c_rarg1
 4559   // return - number of bytes of produced keystream (always 256)
 4560   //
 4561   // This implementation takes each 32-bit integer from the state
 4562   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4563   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4564   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4565   // the quarter round schedule is implemented as outlined in RFC 7539 section
  4566   // 2.3.  However, instead of sequentially processing the add/xor/rotate
  4567   // operations of one QUARTERROUND at a time, we stack all
  4568   // the adds, xors and left-rotations from the first 4 quarter rounds together
  4569   // and then do the same for the second set of 4 quarter rounds.  This removes
 4570   // some latency that would otherwise be incurred by waiting for an add to
 4571   // complete before performing an xor (which depends on the result of the
 4572   // add), etc. An adjustment happens between the first and second groups of 4
 4573   // quarter rounds, but this is done only in the inputs to the macro functions
 4574   // that generate the assembly instructions - these adjustments themselves are
 4575   // not part of the resulting assembly.
 4576   // The 4 registers v0-v3 are used during the quarter round operations as
 4577   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4578   // registers become the vectors involved in adding the start state back onto
 4579   // the post-QR working state.  After the adds are complete, each of the 16
 4580   // vectors write their first lane back to the keystream buffer, followed
 4581   // by the second lane from all vectors and so on.
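         // For reference, each QUARTERROUND(a, b, c, d) of RFC 7539 is:
         //   a += b; d ^= a; d <<<= 16;
         //   c += d; b ^= c; b <<<= 12;
         //   a += b; d ^= a; d <<<= 8;
         //   c += d; b ^= c; b <<<= 7;
         // and each cc20_qr_* bundle below performs one of these steps for
         // four quarter rounds at once.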
 4582   address generate_chacha20Block_blockpar() {
 4583     Label L_twoRounds, L_cc20_const;
 4584     // The constant data is broken into two 128-bit segments to be loaded
 4585     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4586     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
  4587     // The second 128 bits are a table constant used for 8-bit left rotations.
 4588     __ BIND(L_cc20_const);
 4589     __ emit_int64(0x0000000100000000UL);
 4590     __ emit_int64(0x0000000300000002UL);
 4591     __ emit_int64(0x0605040702010003UL);
 4592     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4593 
 4594     __ align(CodeEntryAlignment);
 4595     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4596     StubCodeMark mark(this, stub_id);
 4597     address start = __ pc();
 4598     __ enter();
 4599 
 4600     int i, j;
 4601     const Register state = c_rarg0;
 4602     const Register keystream = c_rarg1;
 4603     const Register loopCtr = r10;
 4604     const Register tmpAddr = r11;
 4605     const FloatRegister ctrAddOverlay = v28;
 4606     const FloatRegister lrot8Tbl = v29;
 4607 
 4608     // Organize SIMD registers in an array that facilitates
 4609     // putting repetitive opcodes into loop structures.  It is
 4610     // important that each grouping of 4 registers is monotonically
 4611     // increasing to support the requirements of multi-register
 4612     // instructions (e.g. ld4r, st4, etc.)
 4613     const FloatRegister workSt[16] = {
 4614          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4615         v20, v21, v22, v23, v24, v25, v26, v27
 4616     };
 4617 
 4618     // Pull in constant data.  The first 16 bytes are the add overlay
 4619     // which is applied to the vector holding the counter (state[12]).
 4620     // The second 16 bytes is the index register for the 8-bit left
 4621     // rotation tbl instruction.
 4622     __ adr(tmpAddr, L_cc20_const);
 4623     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4624 
 4625     // Load from memory and interlace across 16 SIMD registers,
 4626     // With each word from memory being broadcast to all lanes of
 4627     // each successive SIMD register.
 4628     //      Addr(0) -> All lanes in workSt[i]
  4629     //      Addr(4) -> All lanes in workSt[i + 1], etc.
 4630     __ mov(tmpAddr, state);
 4631     for (i = 0; i < 16; i += 4) {
 4632       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4633           __ post(tmpAddr, 16));
 4634     }
 4635     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4636 
 4637     // Before entering the loop, create 5 4-register arrays.  These
 4638     // will hold the 4 registers that represent the a/b/c/d fields
 4639     // in the quarter round operation.  For instance the "b" field
 4640     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4641     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4642     // since it is part of a diagonal organization.  The aSet and scratch
 4643     // register sets are defined at declaration time because they do not change
 4644     // organization at any point during the 20-round processing.
 4645     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4646     FloatRegister bSet[4];
 4647     FloatRegister cSet[4];
 4648     FloatRegister dSet[4];
 4649     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4650 
 4651     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4652     __ mov(loopCtr, 10);
 4653     __ BIND(L_twoRounds);
 4654 
 4655     // Set to columnar organization and do the following 4 quarter-rounds:
 4656     // QUARTERROUND(0, 4, 8, 12)
 4657     // QUARTERROUND(1, 5, 9, 13)
 4658     // QUARTERROUND(2, 6, 10, 14)
 4659     // QUARTERROUND(3, 7, 11, 15)
 4660     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4661     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4662     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4663 
 4664     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4665     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4666     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4667 
 4668     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4669     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4670     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4671 
 4672     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4673     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4674     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4675 
 4676     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4677     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4678     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4679 
 4680     // Set to diagonal organization and do the next 4 quarter-rounds:
 4681     // QUARTERROUND(0, 5, 10, 15)
 4682     // QUARTERROUND(1, 6, 11, 12)
 4683     // QUARTERROUND(2, 7, 8, 13)
 4684     // QUARTERROUND(3, 4, 9, 14)
 4685     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4686     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4687     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4688 
 4689     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4690     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4691     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4692 
 4693     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4694     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4695     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4696 
 4697     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4698     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4699     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4700 
 4701     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4702     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4703     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4704 
 4705     // Decrement and iterate
 4706     __ sub(loopCtr, loopCtr, 1);
 4707     __ cbnz(loopCtr, L_twoRounds);
 4708 
 4709     __ mov(tmpAddr, state);
 4710 
 4711     // Add the starting state back to the post-loop keystream
 4712     // state.  We read/interlace the state array from memory into
 4713     // 4 registers similar to what we did in the beginning.  Then
 4714     // add the counter overlay onto workSt[12] at the end.
 4715     for (i = 0; i < 16; i += 4) {
 4716       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4717       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4718       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4719       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4720       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4721     }
 4722     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4723 
 4724     // Write working state into the keystream buffer.  This is accomplished
 4725     // by taking the lane "i" from each of the four vectors and writing
 4726     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4727     // repeating with the next 4 vectors until all 16 vectors have been used.
 4728     // Then move to the next lane and repeat the process until all lanes have
 4729     // been written.
 4730     for (i = 0; i < 4; i++) {
 4731       for (j = 0; j < 16; j += 4) {
 4732         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4733             __ post(keystream, 16));
 4734       }
 4735     }
 4736 
 4737     __ mov(r0, 256);             // Return length of output keystream
 4738     __ leave();
 4739     __ ret(lr);
 4740 
 4741     return start;
 4742   }
 4743 
 4744   // Helpers to schedule parallel operation bundles across vector
 4745   // register sequences of size 2, 4 or 8.
 4746 
 4747   // Implement various primitive computations across vector sequences
 4748 
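         // For example (illustrative only, assuming VSeq<N>(base) denotes the N
         // consecutive registers v<base> .. v<base+N-1>), a call such as
         //   vs_addv(VSeq<4>(0), __ T8H, VSeq<4>(4), VSeq<4>(8));
         // emits addv(v0, __ T8H, v4, v8) through addv(v3, __ T8H, v7, v11),
         // i.e. four independent vector adds that can be issued back to back.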
 4749   template<int N>
 4750   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4751                const VSeq<N>& v1, const VSeq<N>& v2) {
 4752     // output must not be constant
 4753     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4754     // output cannot overwrite pending inputs
 4755     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4756     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4757     for (int i = 0; i < N; i++) {
 4758       __ addv(v[i], T, v1[i], v2[i]);
 4759     }
 4760   }
 4761 
 4762   template<int N>
 4763   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4764                const VSeq<N>& v1, const VSeq<N>& v2) {
 4765     // output must not be constant
 4766     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4767     // output cannot overwrite pending inputs
 4768     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4769     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4770     for (int i = 0; i < N; i++) {
 4771       __ subv(v[i], T, v1[i], v2[i]);
 4772     }
 4773   }
 4774 
 4775   template<int N>
 4776   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4777                const VSeq<N>& v1, const VSeq<N>& v2) {
 4778     // output must not be constant
 4779     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4780     // output cannot overwrite pending inputs
 4781     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4782     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4783     for (int i = 0; i < N; i++) {
 4784       __ mulv(v[i], T, v1[i], v2[i]);
 4785     }
 4786   }
 4787 
 4788   template<int N>
 4789   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4790     // output must not be constant
 4791     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4792     // output cannot overwrite pending inputs
 4793     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4794     for (int i = 0; i < N; i++) {
 4795       __ negr(v[i], T, v1[i]);
 4796     }
 4797   }
 4798 
 4799   template<int N>
 4800   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4801                const VSeq<N>& v1, int shift) {
 4802     // output must not be constant
 4803     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4804     // output cannot overwrite pending inputs
 4805     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4806     for (int i = 0; i < N; i++) {
 4807       __ sshr(v[i], T, v1[i], shift);
 4808     }
 4809   }
 4810 
 4811   template<int N>
 4812   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4813     // output must not be constant
 4814     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4815     // output cannot overwrite pending inputs
 4816     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4817     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4818     for (int i = 0; i < N; i++) {
 4819       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4820     }
 4821   }
 4822 
 4823   template<int N>
 4824   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4825     // output must not be constant
 4826     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4827     // output cannot overwrite pending inputs
 4828     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4829     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4830     for (int i = 0; i < N; i++) {
 4831       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4832     }
 4833   }
 4834 
 4835   template<int N>
 4836   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4837     // output must not be constant
 4838     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4839     // output cannot overwrite pending inputs
 4840     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4841     for (int i = 0; i < N; i++) {
 4842       __ notr(v[i], __ T16B, v1[i]);
 4843     }
 4844   }
 4845 
 4846   template<int N>
 4847   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4848     // output must not be constant
 4849     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4850     // output cannot overwrite pending inputs
 4851     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4852     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4853     for (int i = 0; i < N; i++) {
 4854       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4855     }
 4856   }
 4857 
 4858   template<int N>
 4859   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4860     // output must not be constant
 4861     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4862     // output cannot overwrite pending inputs
 4863     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4864     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4865     for (int i = 0; i < N; i++) {
 4866       __ mlsv(v[i], T, v1[i], v2[i]);
 4867     }
 4868   }
 4869 
 4870   // load N/2 successive pairs of quadword values from memory in order
 4871   // into N successive vector registers of the sequence via the
 4872   // address supplied in base.
 4873   template<int N>
 4874   void vs_ldpq(const VSeq<N>& v, Register base) {
 4875     for (int i = 0; i < N; i += 2) {
 4876       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4877     }
 4878   }
 4879 
 4880   // load N/2 successive pairs of quadword values from memory in order
 4881   // into N vector registers of the sequence via the address supplied
 4882   // in base using post-increment addressing
 4883   template<int N>
 4884   void vs_ldpq_post(const VSeq<N>& v, Register base) {
  4885     static_assert((N & 1) == 0, "sequence length must be even");
 4886     for (int i = 0; i < N; i += 2) {
 4887       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4888     }
 4889   }
 4890 
 4891   // store N successive vector registers of the sequence into N/2
 4892   // successive pairs of quadword memory locations via the address
 4893   // supplied in base using post-increment addressing
 4894   template<int N>
 4895   void vs_stpq_post(const VSeq<N>& v, Register base) {
  4896     static_assert((N & 1) == 0, "sequence length must be even");
 4897     for (int i = 0; i < N; i += 2) {
 4898       __ stpq(v[i], v[i+1], __ post(base, 32));
 4899     }
 4900   }
 4901 
 4902   // load N/2 pairs of quadword values from memory de-interleaved into
 4903   // N vector registers 2 at a time via the address supplied in base
 4904   // using post-increment addressing.
 4905   template<int N>
 4906   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
  4907     static_assert((N & 1) == 0, "sequence length must be even");
 4908     for (int i = 0; i < N; i += 2) {
 4909       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4910     }
 4911   }
 4912 
 4913   // store N vector registers interleaved into N/2 pairs of quadword
 4914   // memory locations via the address supplied in base using
 4915   // post-increment addressing.
 4916   template<int N>
 4917   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
  4918     static_assert((N & 1) == 0, "sequence length must be even");
 4919     for (int i = 0; i < N; i += 2) {
 4920       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4921     }
 4922   }
 4923 
 4924   // load N quadword values from memory de-interleaved into N vector
 4925   // registers 3 elements at a time via the address supplied in base.
 4926   template<int N>
 4927   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4928     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4929     for (int i = 0; i < N; i += 3) {
 4930       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4931     }
 4932   }
 4933 
 4934   // load N quadword values from memory de-interleaved into N vector
 4935   // registers 3 elements at a time via the address supplied in base
 4936   // using post-increment addressing.
 4937   template<int N>
 4938   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4939     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4940     for (int i = 0; i < N; i += 3) {
 4941       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4942     }
 4943   }
 4944 
 4945   // load N/2 pairs of quadword values from memory into N vector
 4946   // registers via the address supplied in base with each pair indexed
  4947   // using the start offset plus the corresponding entry in the
 4948   // offsets array
 4949   template<int N>
 4950   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4951     for (int i = 0; i < N/2; i++) {
 4952       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4953     }
 4954   }
 4955 
 4956   // store N vector registers into N/2 pairs of quadword memory
 4957   // locations via the address supplied in base with each pair indexed
  4958   // using the start offset plus the corresponding entry in the
 4959   // offsets array
 4960   template<int N>
  4961   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4962     for (int i = 0; i < N/2; i++) {
 4963       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4964     }
 4965   }
 4966 
 4967   // load N single quadword values from memory into N vector registers
 4968   // via the address supplied in base with each value indexed using
  4969   // the start offset plus the corresponding entry in the offsets
 4970   // array
 4971   template<int N>
 4972   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4973                       int start, int (&offsets)[N]) {
 4974     for (int i = 0; i < N; i++) {
 4975       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4976     }
 4977   }
 4978 
 4979   // store N vector registers into N single quadword memory locations
 4980   // via the address supplied in base with each value indexed using
  4981   // the start offset plus the corresponding entry in the offsets
 4982   // array
 4983   template<int N>
 4984   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4985                       int start, int (&offsets)[N]) {
 4986     for (int i = 0; i < N; i++) {
 4987       __ str(v[i], T, Address(base, start + offsets[i]));
 4988     }
 4989   }
 4990 
 4991   // load N/2 pairs of quadword values from memory de-interleaved into
 4992   // N vector registers 2 at a time via the address supplied in base
  4993   // with each pair indexed using the start offset plus the
 4994   // corresponding entry in the offsets array
 4995   template<int N>
 4996   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4997                       Register tmp, int start, int (&offsets)[N/2]) {
 4998     for (int i = 0; i < N/2; i++) {
 4999       __ add(tmp, base, start + offsets[i]);
 5000       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5001     }
 5002   }
 5003 
 5004   // store N vector registers 2 at a time interleaved into N/2 pairs
 5005   // of quadword memory locations via the address supplied in base
  5006   // with each pair indexed using the start offset plus the
 5007   // corresponding entry in the offsets array
 5008   template<int N>
 5009   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5010                       Register tmp, int start, int (&offsets)[N/2]) {
 5011     for (int i = 0; i < N/2; i++) {
 5012       __ add(tmp, base, start + offsets[i]);
 5013       __ st2(v[2*i], v[2*i+1], T, tmp);
 5014     }
 5015   }
 5016 
 5017   // Helper routines for various flavours of Montgomery multiply
 5018 
 5019   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5020   // multiplications in parallel
 5021   //
 5022 
 5023   // See the montMul() method of the sun.security.provider.ML_DSA
 5024   // class.
 5025   //
  5026   // Computes 4x4S results or 4x8H results
 5027   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5028   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5029   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5030   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5031   // Outputs: va - 4x4S or 4x8H vector register sequences
 5032   // vb, vc, vtmp and vq must all be disjoint
 5033   // va must be disjoint from all other inputs/temps or must equal vc
 5034   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5035   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
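         // Per lane the helper performs the usual signed Montgomery reduction
         // of the product b * c by R = 2^MONT_R_BITS:
         //   hi = high_half(2 * b * c)              (sqdmulh)
         //   lo = low_half(b * c)                   (mulv)
         //   m  = low_half(lo * MONT_Q_INV_MOD_R)   (mulv)
         //   n  = high_half(2 * m * MONT_Q)         (sqdmulh)
         //   a  = (hi - n) / 2                      (shsubv)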
 5036   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5037                    Assembler::SIMD_Arrangement T,
 5038                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5039     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5040     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5041     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5042     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5043 
 5044     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5045     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5046 
 5047     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5048 
 5049     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5050     assert(vs_disjoint(va, vb), "va and vb overlap");
 5051     assert(vs_disjoint(va, vq), "va and vq overlap");
 5052     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5053     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5054 
 5055     // schedule 4 streams of instructions across the vector sequences
 5056     for (int i = 0; i < 4; i++) {
 5057       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5058       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5059     }
 5060 
 5061     for (int i = 0; i < 4; i++) {
 5062       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5063     }
 5064 
 5065     for (int i = 0; i < 4; i++) {
 5066       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5067     }
 5068 
 5069     for (int i = 0; i < 4; i++) {
 5070       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5071     }
 5072   }
 5073 
  5074   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  5075   // multiplications in parallel
  5076   //
  5077 
  5078   // See the montMul() method of the sun.security.provider.ML_DSA
  5079   // class.
  5080   //
  5081   // Computes 2x4S results or 2x8H results
  5082   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  5083   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  5084   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  5085   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  5086   // Outputs: va - 2x4S or 2x8H vector register sequences
 5087   // vb, vc, vtmp and vq must all be disjoint
 5088   // va must be disjoint from all other inputs/temps or must equal vc
 5089   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5090   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5091   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5092                    Assembler::SIMD_Arrangement T,
 5093                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5094     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5095     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5096     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5097     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5098 
 5099     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5100     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5101 
 5102     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5103 
 5104     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5105     assert(vs_disjoint(va, vb), "va and vb overlap");
 5106     assert(vs_disjoint(va, vq), "va and vq overlap");
 5107     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5108     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5109 
 5110     // schedule 2 streams of instructions across the vector sequences
 5111     for (int i = 0; i < 2; i++) {
 5112       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5113       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5114     }
 5115 
 5116     for (int i = 0; i < 2; i++) {
 5117       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5118     }
 5119 
 5120     for (int i = 0; i < 2; i++) {
 5121       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5122     }
 5123 
 5124     for (int i = 0; i < 2; i++) {
 5125       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5126     }
 5127   }
 5128 
 5129   // Perform 16 16-bit Montgomery multiplications in parallel.
 5130   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5131                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5132     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5133     // It will assert that the register use is valid
 5134     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5135   }
 5136 
 5137   // Perform 32 16-bit Montgomery multiplications in parallel.
 5138   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5139                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5140     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5141     // It will assert that the register use is valid
 5142     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5143   }
 5144 
 5145   // Perform 64 16-bit Montgomery multiplications in parallel.
 5146   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5147                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5148     // Schedule two successive 4x8H multiplies via the montmul helper
 5149     // on the front and back halves of va, vb and vc. The helper will
 5150     // assert that the register use has no overlap conflicts on each
 5151     // individual call but we also need to ensure that the necessary
 5152     // disjoint/equality constraints are met across both calls.
 5153 
 5154     // vb, vc, vtmp and vq must be disjoint. va must either be
 5155     // disjoint from all other registers or equal vc
 5156 
 5157     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5158     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5159     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5160 
 5161     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5162     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5163 
 5164     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5165 
 5166     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5167     assert(vs_disjoint(va, vb), "va and vb overlap");
 5168     assert(vs_disjoint(va, vq), "va and vq overlap");
 5169     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5170 
 5171     // we multiply the front and back halves of each sequence 4 at a
 5172     // time because
 5173     //
 5174     // 1) we are currently only able to get 4-way instruction
 5175     // parallelism at best
 5176     //
 5177     // 2) we need registers for the constants in vq and temporary
 5178     // scratch registers to hold intermediate results so vtmp can only
 5179     // be a VSeq<4> which means we only have 4 scratch slots
 5180 
 5181     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5182     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5183   }
 5184 
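         // Montgomery multiply va1 by vc, then form va1 = va0 - product and
         // va0 = va0 + product: the multiply/sub/add butterfly used by the
         // NTT levels generated below.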
 5185   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5186                                const VSeq<4>& vc,
 5187                                const VSeq<4>& vtmp,
 5188                                const VSeq<2>& vq) {
 5189     // compute a = montmul(a1, c)
 5190     kyber_montmul32(vc, va1, vc, vtmp, vq);
  5191     // output a1 = a0 - a
 5192     vs_subv(va1, __ T8H, va0, vc);
 5193     //    and a0 = a0 + a
 5194     vs_addv(va0, __ T8H, va0, vc);
 5195   }
 5196 
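         // Form the sum and difference of va0 and va1 first and then Montgomery
         // multiply the difference by vb; the difference is staged in vtmp1 so
         // that va0 and va1 can be updated in place.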
 5197   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5198                                const VSeq<4>& vb,
 5199                                const VSeq<4>& vtmp1,
 5200                                const VSeq<4>& vtmp2,
 5201                                const VSeq<2>& vq) {
 5202     // compute c = a0 - a1
 5203     vs_subv(vtmp1, __ T8H, va0, va1);
 5204     // output a0 = a0 + a1
 5205     vs_addv(va0, __ T8H, va0, va1);
 5206     // output a1 = b montmul c
 5207     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5208   }
 5209 
 5210   void load64shorts(const VSeq<8>& v, Register shorts) {
 5211     vs_ldpq_post(v, shorts);
 5212   }
 5213 
 5214   void load32shorts(const VSeq<4>& v, Register shorts) {
 5215     vs_ldpq_post(v, shorts);
 5216   }
 5217 
 5218   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5219     vs_stpq_post(v, tmpAddr);
 5220   }
 5221 
 5222   // Kyber NTT function.
 5223   // Implements
 5224   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5225   //
 5226   // coeffs (short[256]) = c_rarg0
 5227   // ntt_zetas (short[256]) = c_rarg1
 5228   address generate_kyberNtt() {
 5229 
 5230     __ align(CodeEntryAlignment);
 5231     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5232     StubCodeMark mark(this, stub_id);
 5233     address start = __ pc();
 5234     __ enter();
 5235 
 5236     const Register coeffs = c_rarg0;
 5237     const Register zetas = c_rarg1;
 5238 
 5239     const Register kyberConsts = r10;
 5240     const Register tmpAddr = r11;
 5241 
 5242     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5243     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5244     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5245 
 5246     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5247     // load the montmul constants
 5248     vs_ldpq(vq, kyberConsts);
 5249 
 5250     // Each level corresponds to an iteration of the outermost loop of the
 5251     // Java method seilerNTT(int[] coeffs). There are some differences
 5252     // from what is done in the seilerNTT() method, though:
  5253     // 1. The computation uses 16-bit signed values; we do not convert them
  5254     // to ints here.
  5255     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
  5256     // this array for each level, which makes it easier to fill up the vector
  5257     // registers.
  5258     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
  5259     // multiplications (that way there should not be any overflow during the
  5260     // inverse NTT computation); here we use R = 2^16 so that we can use
  5261     // 16-bit arithmetic in the vector unit.
 5262     //
 5263     // On each level, we fill up the vector registers in such a way that the
 5264     // array elements that need to be multiplied by the zetas go into one
 5265     // set of vector registers while the corresponding ones that don't need to
 5266     // be multiplied, go into another set.
 5267     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5268     // registers interleaving the steps of 4 identical computations,
 5269     // each done on 8 16-bit values per register.
 5270 
 5271     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5272     // to the zetas occur in discrete blocks whose size is some multiple
 5273     // of 32.
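           //
           // The step performed at every level is the same butterfly: load the
           // block of coefficients that has to be scaled, Montgomery multiply it
           // by the matching zetas and then write back both sum and difference,
           // i.e. (in scalar terms, with illustrative index names)
           //   t = montmul(coeffs[j + off], zetas[k]);
           //   coeffs[j + off] = coeffs[j] - t;
           //   coeffs[j]       = coeffs[j] + t;
           // Only the block size and hence the addressing pattern differ from
           // level to level.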
 5274 
 5275     // level 0
 5276     __ add(tmpAddr, coeffs, 256);
 5277     load64shorts(vs1, tmpAddr);
 5278     load64shorts(vs2, zetas);
 5279     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5280     __ add(tmpAddr, coeffs, 0);
 5281     load64shorts(vs1, tmpAddr);
 5282     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5283     vs_addv(vs1, __ T8H, vs1, vs2);
 5284     __ add(tmpAddr, coeffs, 0);
 5285     vs_stpq_post(vs1, tmpAddr);
 5286     __ add(tmpAddr, coeffs, 256);
 5287     vs_stpq_post(vs3, tmpAddr);
 5288     // restore montmul constants
 5289     vs_ldpq(vq, kyberConsts);
 5290     load64shorts(vs1, tmpAddr);
 5291     load64shorts(vs2, zetas);
 5292     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5293     __ add(tmpAddr, coeffs, 128);
 5294     load64shorts(vs1, tmpAddr);
 5295     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5296     vs_addv(vs1, __ T8H, vs1, vs2);
 5297     __ add(tmpAddr, coeffs, 128);
 5298     store64shorts(vs1, tmpAddr);
 5299     __ add(tmpAddr, coeffs, 384);
 5300     store64shorts(vs3, tmpAddr);
 5301 
 5302     // level 1
 5303     // restore montmul constants
 5304     vs_ldpq(vq, kyberConsts);
 5305     __ add(tmpAddr, coeffs, 128);
 5306     load64shorts(vs1, tmpAddr);
 5307     load64shorts(vs2, zetas);
 5308     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5309     __ add(tmpAddr, coeffs, 0);
 5310     load64shorts(vs1, tmpAddr);
 5311     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5312     vs_addv(vs1, __ T8H, vs1, vs2);
 5313     __ add(tmpAddr, coeffs, 0);
 5314     store64shorts(vs1, tmpAddr);
 5315     store64shorts(vs3, tmpAddr);
 5316     vs_ldpq(vq, kyberConsts);
 5317     __ add(tmpAddr, coeffs, 384);
 5318     load64shorts(vs1, tmpAddr);
 5319     load64shorts(vs2, zetas);
 5320     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5321     __ add(tmpAddr, coeffs, 256);
 5322     load64shorts(vs1, tmpAddr);
 5323     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5324     vs_addv(vs1, __ T8H, vs1, vs2);
 5325     __ add(tmpAddr, coeffs, 256);
 5326     store64shorts(vs1, tmpAddr);
 5327     store64shorts(vs3, tmpAddr);
 5328 
 5329     // level 2
 5330     vs_ldpq(vq, kyberConsts);
 5331     int offsets1[4] = { 0, 32, 128, 160 };
 5332     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5333     load64shorts(vs2, zetas);
 5334     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5335     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5336     // kyber_subv_addv64();
 5337     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5338     vs_addv(vs1, __ T8H, vs1, vs2);
 5339     __ add(tmpAddr, coeffs, 0);
 5340     vs_stpq_post(vs_front(vs1), tmpAddr);
 5341     vs_stpq_post(vs_front(vs3), tmpAddr);
 5342     vs_stpq_post(vs_back(vs1), tmpAddr);
 5343     vs_stpq_post(vs_back(vs3), tmpAddr);
 5344     vs_ldpq(vq, kyberConsts);
 5345     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5346     load64shorts(vs2, zetas);
 5347     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5348     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5349     // kyber_subv_addv64();
 5350     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5351     vs_addv(vs1, __ T8H, vs1, vs2);
 5352     __ add(tmpAddr, coeffs, 256);
 5353     vs_stpq_post(vs_front(vs1), tmpAddr);
 5354     vs_stpq_post(vs_front(vs3), tmpAddr);
 5355     vs_stpq_post(vs_back(vs1), tmpAddr);
 5356     vs_stpq_post(vs_back(vs3), tmpAddr);
 5357 
 5358     // level 3
 5359     vs_ldpq(vq, kyberConsts);
 5360     int offsets2[4] = { 0, 64, 128, 192 };
 5361     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5362     load64shorts(vs2, zetas);
 5363     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5364     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5365     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5366     vs_addv(vs1, __ T8H, vs1, vs2);
 5367     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5368     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5369 
 5370     vs_ldpq(vq, kyberConsts);
 5371     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5372     load64shorts(vs2, zetas);
 5373     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5374     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5375     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5376     vs_addv(vs1, __ T8H, vs1, vs2);
 5377     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5378     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5379 
 5380     // level 4
 5381     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5382     // so they are loaded using an ldr at 8 distinct offsets.
 5383 
 5384     vs_ldpq(vq, kyberConsts);
 5385     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5386     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5387     load64shorts(vs2, zetas);
 5388     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5389     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5390     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5391     vs_addv(vs1, __ T8H, vs1, vs2);
 5392     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5393     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5394 
 5395     vs_ldpq(vq, kyberConsts);
 5396     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5397     load64shorts(vs2, zetas);
 5398     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5399     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5400     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5401     vs_addv(vs1, __ T8H, vs1, vs2);
 5402     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5403     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5404 
 5405     // level 5
 5406     // At level 5 related coefficients occur in discrete blocks of size 8 so
 5407     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5408 
 5409     vs_ldpq(vq, kyberConsts);
 5410     int offsets4[4] = { 0, 32, 64, 96 };
 5411     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5412     load32shorts(vs_front(vs2), zetas);
 5413     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5414     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5415     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5416     load32shorts(vs_front(vs2), zetas);
 5417     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5418     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5419     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5420     load32shorts(vs_front(vs2), zetas);
 5421     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5422     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5423 
 5424     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5425     load32shorts(vs_front(vs2), zetas);
 5426     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5427     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5428 
 5429     // level 6
 5430     // At level 6 related coefficients occur in discrete blocks of size 4 so
 5431     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5432 
 5433     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5434     load32shorts(vs_front(vs2), zetas);
 5435     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5436     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5437     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5438     // __ ldpq(v18, v19, __ post(zetas, 32));
 5439     load32shorts(vs_front(vs2), zetas);
 5440     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5441     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5442 
 5443     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5444     load32shorts(vs_front(vs2), zetas);
 5445     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5446     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5447 
 5448     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5449     load32shorts(vs_front(vs2), zetas);
 5450     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5451     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5452 
 5453     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5454     __ mov(r0, zr); // return 0
 5455     __ ret(lr);
 5456 
 5457     return start;
 5458   }
 5459 
 5460   // Kyber Inverse NTT function
 5461   // Implements
 5462   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5463   //
 5464   // coeffs (short[256]) = c_rarg0
 5465   // ntt_zetas (short[256]) = c_rarg1
 5466   address generate_kyberInverseNtt() {
 5467 
 5468     __ align(CodeEntryAlignment);
 5469     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5470     StubCodeMark mark(this, stub_id);
 5471     address start = __ pc();
 5472     __ enter();
 5473 
 5474     const Register coeffs = c_rarg0;
 5475     const Register zetas = c_rarg1;
 5476 
 5477     const Register kyberConsts = r10;
 5478     const Register tmpAddr = r11;
 5479     const Register tmpAddr2 = c_rarg2;
 5480 
 5481     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5482     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5483     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5484 
 5485     __ lea(kyberConsts,
 5486              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5487 
 5488     // level 0
 5489     // At level 0 related coefficients occur in discrete blocks of size 4 so
 5490     // need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5491 
 5492     vs_ldpq(vq, kyberConsts);
 5493     int offsets4[4] = { 0, 32, 64, 96 };
 5494     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5495     load32shorts(vs_front(vs2), zetas);
 5496     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5497                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5498     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5499     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5500     load32shorts(vs_front(vs2), zetas);
 5501     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5502                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5503     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5504     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5505     load32shorts(vs_front(vs2), zetas);
 5506     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5507                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5508     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5509     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5510     load32shorts(vs_front(vs2), zetas);
 5511     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5512                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5513     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5514 
 5515     // level 1
 5516     // At level 1 related coefficients occur in discrete blocks of size 8 so
 5517     // need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5518 
 5519     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5520     load32shorts(vs_front(vs2), zetas);
 5521     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5522                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5523     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5524     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5525     load32shorts(vs_front(vs2), zetas);
 5526     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5527                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5528     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5529 
 5530     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5531     load32shorts(vs_front(vs2), zetas);
 5532     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5533                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5534     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5535     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5536     load32shorts(vs_front(vs2), zetas);
 5537     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5538                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5539     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5540 
 5541     // level 2
 5542     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5543     // so they are loaded using an ldr at 8 distinct offsets.
 5544 
 5545     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5546     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5547     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5548     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5549     vs_subv(vs1, __ T8H, vs1, vs2);
 5550     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5551     load64shorts(vs2, zetas);
 5552     vs_ldpq(vq, kyberConsts);
 5553     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5554     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5555 
 5556     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5557     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5558     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5559     vs_subv(vs1, __ T8H, vs1, vs2);
 5560     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5561     load64shorts(vs2, zetas);
 5562     vs_ldpq(vq, kyberConsts);
 5563     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5564     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5565 
 5566     // Barrett reduction at indexes where overflow may happen
 5567 
 5568     // load q and the multiplier for the Barrett reduction
 5569     __ add(tmpAddr, kyberConsts, 16);
 5570     vs_ldpq(vq, tmpAddr);
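
          // A rough scalar sketch of the reduction scheduled below, assuming
          // vq[0] holds kyber_q and vq[1] holds kyberBarrettMultiplier (the
          // layout used by generate_kyberBarrettReduce): for each 16-bit
          // coefficient x
          //
          //   t = (2 * x * kyberBarrettMultiplier) >> 16;  // sqdmulh
          //   t = t >> 11;                                 // sshr, i.e. (x * multiplier) >> 26
          //   x = x - t * kyber_q;                         // mlsv
          //
          // leaving x congruent to the original value mod kyber_q.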
 5571 
 5572     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5573     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5574     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5575     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5576     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5577     vs_sshr(vs2, __ T8H, vs2, 11);
 5578     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5579     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5580     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5581     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5582     vs_sshr(vs2, __ T8H, vs2, 11);
 5583     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5584     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5585 
 5586     // level 3
 5587     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5588     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 5589 
 5590     int offsets2[4] = { 0, 64, 128, 192 };
 5591     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5592     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5593     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5594     vs_subv(vs1, __ T8H, vs1, vs2);
 5595     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5596     load64shorts(vs2, zetas);
 5597     vs_ldpq(vq, kyberConsts);
 5598     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5599     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5600 
 5601     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5602     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5603     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5604     vs_subv(vs1, __ T8H, vs1, vs2);
 5605     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5606     load64shorts(vs2, zetas);
 5607     vs_ldpq(vq, kyberConsts);
 5608     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5609     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5610 
 5611     // level 4
 5612 
 5613     int offsets1[4] = { 0, 32, 128, 160 };
 5614     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5615     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5616     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5617     vs_subv(vs1, __ T8H, vs1, vs2);
 5618     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5619     load64shorts(vs2, zetas);
 5620     vs_ldpq(vq, kyberConsts);
 5621     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5622     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5623 
 5624     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5625     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5626     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5627     vs_subv(vs1, __ T8H, vs1, vs2);
 5628     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5629     load64shorts(vs2, zetas);
 5630     vs_ldpq(vq, kyberConsts);
 5631     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5632     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5633 
 5634     // level 5
 5635 
 5636     __ add(tmpAddr, coeffs, 0);
 5637     load64shorts(vs1, tmpAddr);
 5638     __ add(tmpAddr, coeffs, 128);
 5639     load64shorts(vs2, tmpAddr);
 5640     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5641     vs_subv(vs1, __ T8H, vs1, vs2);
 5642     __ add(tmpAddr, coeffs, 0);
 5643     store64shorts(vs3, tmpAddr);
 5644     load64shorts(vs2, zetas);
 5645     vs_ldpq(vq, kyberConsts);
 5646     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5647     __ add(tmpAddr, coeffs, 128);
 5648     store64shorts(vs2, tmpAddr);
 5649 
 5650     load64shorts(vs1, tmpAddr);
 5651     __ add(tmpAddr, coeffs, 384);
 5652     load64shorts(vs2, tmpAddr);
 5653     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5654     vs_subv(vs1, __ T8H, vs1, vs2);
 5655     __ add(tmpAddr, coeffs, 256);
 5656     store64shorts(vs3, tmpAddr);
 5657     load64shorts(vs2, zetas);
 5658     vs_ldpq(vq, kyberConsts);
 5659     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5660     __ add(tmpAddr, coeffs, 384);
 5661     store64shorts(vs2, tmpAddr);
 5662 
 5663     // Barrett reduction at indexes where overflow may happen
 5664 
 5665     // load q and the multiplier for the Barrett reduction
 5666     __ add(tmpAddr, kyberConsts, 16);
 5667     vs_ldpq(vq, tmpAddr);
 5668 
 5669     int offsets0[2] = { 0, 256 };
 5670     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5671     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5672     vs_sshr(vs2, __ T8H, vs2, 11);
 5673     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5674     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5675 
 5676     // level 6
 5677 
 5678     __ add(tmpAddr, coeffs, 0);
 5679     load64shorts(vs1, tmpAddr);
 5680     __ add(tmpAddr, coeffs, 256);
 5681     load64shorts(vs2, tmpAddr);
 5682     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5683     vs_subv(vs1, __ T8H, vs1, vs2);
 5684     __ add(tmpAddr, coeffs, 0);
 5685     store64shorts(vs3, tmpAddr);
 5686     load64shorts(vs2, zetas);
 5687     vs_ldpq(vq, kyberConsts);
 5688     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5689     __ add(tmpAddr, coeffs, 256);
 5690     store64shorts(vs2, tmpAddr);
 5691 
 5692     __ add(tmpAddr, coeffs, 128);
 5693     load64shorts(vs1, tmpAddr);
 5694     __ add(tmpAddr, coeffs, 384);
 5695     load64shorts(vs2, tmpAddr);
 5696     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5697     vs_subv(vs1, __ T8H, vs1, vs2);
 5698     __ add(tmpAddr, coeffs, 128);
 5699     store64shorts(vs3, tmpAddr);
 5700     load64shorts(vs2, zetas);
 5701     vs_ldpq(vq, kyberConsts);
 5702     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5703     __ add(tmpAddr, coeffs, 384);
 5704     store64shorts(vs2, tmpAddr);
 5705 
 5706     // multiply by 2^-n
 5707 
 5708     // load toMont(2^-n mod q)
 5709     __ add(tmpAddr, kyberConsts, 48);
 5710     __ ldr(v29, __ Q, tmpAddr);
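
          // n.b. since montmul divides by the Montgomery factor R, multiplying
          // each coefficient c by toMont(2^-n mod q) == (2^-n * R) mod q below
          // yields c * 2^-n mod q, i.e. the final scaling of the inverse NTT.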
 5711 
 5712     vs_ldpq(vq, kyberConsts);
 5713     __ add(tmpAddr, coeffs, 0);
 5714     load64shorts(vs1, tmpAddr);
 5715     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5716     __ add(tmpAddr, coeffs, 0);
 5717     store64shorts(vs2, tmpAddr);
 5718 
 5719     // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5720     load64shorts(vs1, tmpAddr);
 5721     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5722     __ add(tmpAddr, coeffs, 128);
 5723     store64shorts(vs2, tmpAddr);
 5724 
 5725     // now tmpAddr contains coeffs + 256
 5726     load64shorts(vs1, tmpAddr);
 5727     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5728     __ add(tmpAddr, coeffs, 256);
 5729     store64shorts(vs2, tmpAddr);
 5730 
 5731     // now tmpAddr contains coeffs + 384
 5732     load64shorts(vs1, tmpAddr);
 5733     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5734     __ add(tmpAddr, coeffs, 384);
 5735     store64shorts(vs2, tmpAddr);
 5736 
 5737     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5738     __ mov(r0, zr); // return 0
 5739     __ ret(lr);
 5740 
 5741     return start;
 5742   }
 5743 
 5744   // Kyber multiply polynomials in the NTT domain.
 5745   // Implements
 5746   // static int implKyberNttMult(
 5747   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5748   //
 5749   // result (short[256]) = c_rarg0
 5750   // ntta (short[256]) = c_rarg1
 5751   // nttb (short[256]) = c_rarg2
 5752   // zetas (short[128]) = c_rarg3
 5753   address generate_kyberNttMult() {
 5754 
 5755     __ align(CodeEntryAlignment);
 5756     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5757     StubCodeMark mark(this, stub_id);
 5758     address start = __ pc();
 5759     __ enter();
 5760 
 5761     const Register result = c_rarg0;
 5762     const Register ntta = c_rarg1;
 5763     const Register nttb = c_rarg2;
 5764     const Register zetas = c_rarg3;
 5765 
 5766     const Register kyberConsts = r10;
 5767     const Register limit = r11;
 5768 
 5769     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5770     VSeq<4> vs3(16), vs4(20);
 5771     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5772     VSeq<2> vz(28);          // pair of zetas
 5773     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5774 
 5775     __ lea(kyberConsts,
 5776              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5777 
 5778     Label kyberNttMult_loop;
 5779 
 5780     __ add(limit, result, 512);
 5781 
 5782     // load q and qinv
 5783     vs_ldpq(vq, kyberConsts);
 5784 
 5785     // load R^2 mod q (to convert back from Montgomery representation)
 5786     __ add(kyberConsts, kyberConsts, 64);
 5787     __ ldr(v27, __ Q, kyberConsts);
 5788 
 5789     __ BIND(kyberNttMult_loop);
 5790 
 5791     // load 16 zetas
 5792     vs_ldpq_post(vz, zetas);
 5793 
 5794     // load 2 sets of 32 coefficients from the two input arrays
 5795     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5796     // are striped across pairs of vector registers
 5797     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5798     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5799     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5800     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5801 
 5802     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5803     // i.e. montmul the first and second halves of vs1 in order and
 5804     // then with one sequence reversed storing the two results in vs3
 5805     //
 5806     // vs3[0] <- montmul(a0, b0)
 5807     // vs3[1] <- montmul(a1, b1)
 5808     // vs3[2] <- montmul(a0, b1)
 5809     // vs3[3] <- montmul(a1, b0)
 5810     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5811     kyber_montmul16(vs_back(vs3),
 5812                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5813 
 5814     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5815     // i.e. montmul the first and second halves of vs4 in order and
 5816     // then with one sequence reversed storing the two results in vs1
 5817     //
 5818     // vs1[0] <- montmul(a2, b2)
 5819     // vs1[1] <- montmul(a3, b3)
 5820     // vs1[2] <- montmul(a2, b3)
 5821     // vs1[3] <- montmul(a3, b2)
 5822     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5823     kyber_montmul16(vs_back(vs1),
 5824                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5825 
 5826     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5827     // We can schedule two montmuls at a time if we use a suitable vector
 5828     // sequence <vs3[1], vs1[1]>.
 5829     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5830     VSeq<2> vs5(vs3[1], delta);
 5831 
 5832     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5833     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5834     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5835 
 5836     // add results in pairs storing in vs3
 5837     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5838     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5839     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5840 
 5841     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5842     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5843     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5844 
 5845     // vs1 <- montmul(vs3, montRSquareModQ)
 5846     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5847 
 5848     // store back the two pairs of result vectors de-interleaved as 8H elements
 5849     // i.e. storing each pair of shorts striped across a register pair adjacent
 5850     // in memory
 5851     vs_st2_post(vs1, __ T8H, result);
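
          // In scalar terms, each iteration of this loop computes, for every
          // coefficient pair (a0, a1) of ntta, pair (b0, b1) of nttb and the
          // corresponding zeta (a sketch of the steps above, not extra
          // generated code):
          //
          //   r0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta);
          //   r1 = montmul(a0, b1) + montmul(a1, b0);
          //   result pair <- (montmul(r0, montRSquareModQ), montmul(r1, montRSquareModQ))
          //
          // i.e. a Kyber base multiplication of degree-1 polynomials modulo
          // (X^2 - zeta), assuming the zetas array supplies the (signed) zeta
          // for each pair, with the final montmul by montRSquareModQ undoing
          // the extra Montgomery factors as noted where v27 is loaded.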
 5852 
 5853     __ cmp(result, limit);
 5854     __ br(Assembler::NE, kyberNttMult_loop);
 5855 
 5856     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5857     __ mov(r0, zr); // return 0
 5858     __ ret(lr);
 5859 
 5860     return start;
 5861   }
 5862 
 5863   // Kyber add 2 polynomials.
 5864   // Implements
 5865   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5866   //
 5867   // result (short[256]) = c_rarg0
 5868   // a (short[256]) = c_rarg1
 5869   // b (short[256]) = c_rarg2
 5870   address generate_kyberAddPoly_2() {
 5871 
 5872     __ align(CodeEntryAlignment);
 5873     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5874     StubCodeMark mark(this, stub_id);
 5875     address start = __ pc();
 5876     __ enter();
 5877 
 5878     const Register result = c_rarg0;
 5879     const Register a = c_rarg1;
 5880     const Register b = c_rarg2;
 5881 
 5882     const Register kyberConsts = r11;
 5883 
 5884     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5885     // So, we can load, add and store the data in 3 groups of 11,
 5886     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5887     // registers. A further constraint is that the mapping needs
 5888     // to skip callee saves. So, we allocate the register
 5889     // sequences using two 8 sequences, two 2 sequences and two
 5890     // single registers.
 5891     VSeq<8> vs1_1(0);
 5892     VSeq<2> vs1_2(16);
 5893     FloatRegister vs1_3 = v28;
 5894     VSeq<8> vs2_1(18);
 5895     VSeq<2> vs2_2(26);
 5896     FloatRegister vs2_3 = v29;
 5897 
 5898     // two constant vector sequences
 5899     VSeq<8> vc_1(31, 0);
 5900     VSeq<2> vc_2(31, 0);
 5901 
 5902     FloatRegister vc_3 = v31;
 5903     __ lea(kyberConsts,
 5904              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5905 
 5906     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
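
          // In effect each loop iteration below computes, element by element
          // and assuming the constant loaded above is kyber_q (per the
          // comment),
          //
          //   result[i] = (short)(a[i] + b[i] + kyber_q);
          //
          // for the next 80 or 88 coefficients.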
 5907     for (int i = 0; i < 3; i++) {
 5908       // load 80 or 88 values from a into vs1_1/2/3
 5909       vs_ldpq_post(vs1_1, a);
 5910       vs_ldpq_post(vs1_2, a);
 5911       if (i < 2) {
 5912         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5913       }
 5914       // load 80 or 88 values from b into vs2_1/2/3
 5915       vs_ldpq_post(vs2_1, b);
 5916       vs_ldpq_post(vs2_2, b);
 5917       if (i < 2) {
 5918         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5919       }
 5920       // sum 80 or 88 values across vs1 and vs2 into vs1
 5921       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5922       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5923       if (i < 2) {
 5924         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5925       }
 5926       // add constant to all 80 or 88 results
 5927       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5928       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5929       if (i < 2) {
 5930         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5931       }
 5932       // store 80 or 88 values
 5933       vs_stpq_post(vs1_1, result);
 5934       vs_stpq_post(vs1_2, result);
 5935       if (i < 2) {
 5936         __ str(vs1_3, __ Q, __ post(result, 16));
 5937       }
 5938     }
 5939 
 5940     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5941     __ mov(r0, zr); // return 0
 5942     __ ret(lr);
 5943 
 5944     return start;
 5945   }
 5946 
 5947   // Kyber add 3 polynomials.
 5948   // Implements
 5949   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5950   //
 5951   // result (short[256]) = c_rarg0
 5952   // a (short[256]) = c_rarg1
 5953   // b (short[256]) = c_rarg2
 5954   // c (short[256]) = c_rarg3
 5955   address generate_kyberAddPoly_3() {
 5956 
 5957     __ align(CodeEntryAlignment);
 5958     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5959     StubCodeMark mark(this, stub_id);
 5960     address start = __ pc();
 5961     __ enter();
 5962 
 5963     const Register result = c_rarg0;
 5964     const Register a = c_rarg1;
 5965     const Register b = c_rarg2;
 5966     const Register c = c_rarg3;
 5967 
 5968     const Register kyberConsts = r11;
 5969 
 5970     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5971     // quadwords.  So, we can load, add and store the data in 3
 5972     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5973     // of 10 or 11 registers. A further constraint is that the
 5974     // mapping needs to skip callee saves. So, we allocate the
 5975     // register sequences using two 8 sequences, two 2 sequences
 5976     // and two single registers.
 5977     VSeq<8> vs1_1(0);
 5978     VSeq<2> vs1_2(16);
 5979     FloatRegister vs1_3 = v28;
 5980     VSeq<8> vs2_1(18);
 5981     VSeq<2> vs2_2(26);
 5982     FloatRegister vs2_3 = v29;
 5983 
 5984     // two constant vector sequences
 5985     VSeq<8> vc_1(31, 0);
 5986     VSeq<2> vc_2(31, 0);
 5987 
 5988     FloatRegister vc_3 = v31;
 5989 
 5990     __ lea(kyberConsts,
 5991              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5992 
 5993     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5994     for (int i = 0; i < 3; i++) {
 5995       // load 80 or 88 values from a into vs1_1/2/3
 5996       vs_ldpq_post(vs1_1, a);
 5997       vs_ldpq_post(vs1_2, a);
 5998       if (i < 2) {
 5999         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6000       }
 6001       // load 80 or 88 values from b into vs2_1/2/3
 6002       vs_ldpq_post(vs2_1, b);
 6003       vs_ldpq_post(vs2_2, b);
 6004       if (i < 2) {
 6005         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6006       }
 6007       // sum 80 or 88 values across vs1 and vs2 into vs1
 6008       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6009       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6010       if (i < 2) {
 6011         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6012       }
 6013       // load 80 or 88 values from c into vs2_1/2/3
 6014       vs_ldpq_post(vs2_1, c);
 6015       vs_ldpq_post(vs2_2, c);
 6016       if (i < 2) {
 6017         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6018       }
 6019       // sum 80 or 88 values across vs1 and vs2 into vs1
 6020       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6021       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6022       if (i < 2) {
 6023         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6024       }
 6025       // add constant to all 80 or 88 results
 6026       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6027       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6028       if (i < 2) {
 6029         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6030       }
 6031       // store 80 or 88 values
 6032       vs_stpq_post(vs1_1, result);
 6033       vs_stpq_post(vs1_2, result);
 6034       if (i < 2) {
 6035         __ str(vs1_3, __ Q, __ post(result, 16));
 6036       }
 6037     }
 6038 
 6039     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6040     __ mov(r0, zr); // return 0
 6041     __ ret(lr);
 6042 
 6043     return start;
 6044   }
 6045 
 6046   // Kyber parse XOF output to polynomial coefficient candidates
 6047   // or decodePoly(12, ...).
 6048   // Implements
 6049   // static int implKyber12To16(
 6050   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6051   //
 6052   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6053   //
 6054   // condensed (byte[]) = c_rarg0
 6055   // condensedIndex = c_rarg1
 6056   // parsed (short[112 or 256]) = c_rarg2
 6057   // parsedLength (112 or 256) = c_rarg3
 6058   address generate_kyber12To16() {
 6059     Label L_F00, L_loop, L_end;
 6060 
 6061     __ BIND(L_F00);
 6062     __ emit_int64(0x0f000f000f000f00);
 6063     __ emit_int64(0x0f000f000f000f00);
 6064 
 6065     __ align(CodeEntryAlignment);
 6066     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6067     StubCodeMark mark(this, stub_id);
 6068     address start = __ pc();
 6069     __ enter();
 6070 
 6071     const Register condensed = c_rarg0;
 6072     const Register condensedOffs = c_rarg1;
 6073     const Register parsed = c_rarg2;
 6074     const Register parsedLength = c_rarg3;
 6075 
 6076     const Register tmpAddr = r11;
 6077 
 6078     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6079     // quadwords so we need a 6 vector sequence for the inputs.
 6080     // Parsing produces 64 shorts, employing two 8 vector
 6081     // sequences to store and combine the intermediate data.
 6082     VSeq<6> vin(24);
 6083     VSeq<8> va(0), vb(16);
 6084 
 6085     __ adr(tmpAddr, L_F00);
 6086     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6087     __ add(condensed, condensed, condensedOffs);
 6088 
 6089     __ BIND(L_loop);
 6090     // load 96 (6 x 16B) byte values
 6091     vs_ld3_post(vin, __ T16B, condensed);
 6092 
 6093     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6094     // holds 48 (16x3) contiguous bytes from memory striped
 6095     // horizontally across each of the 16 byte lanes. Equivalently,
 6096     // that is 16 pairs of 12-bit integers. Likewise the back half
 6097     // holds the next 48 bytes in the same arrangement.
 6098 
 6099     // Each vector in the front half can also be viewed as a vertical
 6100     // strip across the 16 pairs of 12 bit integers. Each byte in
 6101     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6102     // byte in vin[1] stores the high 4 bits of the first int and the
 6103     // low 4 bits of the second int. Each byte in vin[2] stores the
 6104     // high 8 bits of the second int. Likewise the vectors in second
 6105     // half.
 6106 
 6107     // Converting the data to 16-bit shorts requires first of all
 6108     // expanding each of the 6 x 16B vectors into 6 corresponding
 6109     // pairs of 8H vectors. Mask, shift and add operations on the
 6110     // resulting vector pairs can be used to combine 4 and 8 bit
 6111     // parts of related 8H vector elements.
 6112     //
 6113     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6114     // twice, one copy manipulated to provide the lower 4 bits
 6115     // belonging to the first short in a pair and another copy
 6116     // manipulated to provide the higher 4 bits belonging to the
 6117     // second short in a pair. This is why the vector sequences va
 6118     // and vb used to hold the expanded 8H elements are of length 8.
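
          // As a scalar sketch (not generated code), each pair of output
          // shorts (s0, s1) is recovered from three consecutive input bytes
          // (c0, c1, c2) as
          //
          //   s0 = c0 | ((c1 & 0x0f) << 8);
          //   s1 = (c1 >> 4) | (c2 << 4);
          //
          // The vector code below computes exactly this, 16 pairs at a time.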
 6119 
 6120     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6121     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6122     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6123     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6124     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6125     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6126     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6127     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6128 
 6129     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6130     // and vb[4:5]
 6131     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6132     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6133     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6134     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6135     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6136     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6137 
 6138     // shift lo byte of copy 1 of the middle stripe into the high byte
 6139     __ shl(va[2], __ T8H, va[2], 8);
 6140     __ shl(va[3], __ T8H, va[3], 8);
 6141     __ shl(vb[2], __ T8H, vb[2], 8);
 6142     __ shl(vb[3], __ T8H, vb[3], 8);
 6143 
 6144     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6145     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6146     // are in bit positions [4..11].
 6147     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6148     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6149     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6150     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6151 
 6152     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6153     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6154     // copy2
 6155     __ andr(va[2], __ T16B, va[2], v31);
 6156     __ andr(va[3], __ T16B, va[3], v31);
 6157     __ ushr(va[4], __ T8H, va[4], 4);
 6158     __ ushr(va[5], __ T8H, va[5], 4);
 6159     __ andr(vb[2], __ T16B, vb[2], v31);
 6160     __ andr(vb[3], __ T16B, vb[3], v31);
 6161     __ ushr(vb[4], __ T8H, vb[4], 4);
 6162     __ ushr(vb[5], __ T8H, vb[5], 4);
 6163 
 6164     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6165     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6166     // n.b. the ordering ensures: i) inputs are consumed before they
 6167     // are overwritten ii) the order of 16-bit results across successive
 6168     // pairs of vectors in va and then vb reflects the order of the
 6169     // corresponding 12-bit inputs
 6170     __ addv(va[0], __ T8H, va[0], va[2]);
 6171     __ addv(va[2], __ T8H, va[1], va[3]);
 6172     __ addv(va[1], __ T8H, va[4], va[6]);
 6173     __ addv(va[3], __ T8H, va[5], va[7]);
 6174     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6175     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6176     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6177     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6178 
 6179     // store 64 results interleaved as shorts
 6180     vs_st2_post(vs_front(va), __ T8H, parsed);
 6181     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6182 
 6183     __ sub(parsedLength, parsedLength, 64);
 6184     __ cmp(parsedLength, (u1)64);
 6185     __ br(Assembler::GE, L_loop);
 6186     __ cbz(parsedLength, L_end);
 6187 
 6188     // if anything is left it should be a final 72 bytes of input
 6189     // i.e. a final 48 12-bit values. So we handle this by loading
 6190     // 48 bytes into all 16B lanes of front(vin) and only 24
 6191     // bytes into the lower 8B lanes of back(vin)
 6192     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6193     vs_ld3(vs_back(vin), __ T8B, condensed);
 6194 
 6195     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6196     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6197     // 5 and target element 2 of vb duplicates element 4.
 6198     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6199     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6200     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6201     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6202     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6203     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6204 
 6205     // This time expand just the lower 8 lanes
 6206     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6207     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6208     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6209 
 6210     // shift lo byte of copy 1 of the middle stripe into the high byte
 6211     __ shl(va[2], __ T8H, va[2], 8);
 6212     __ shl(va[3], __ T8H, va[3], 8);
 6213     __ shl(vb[2], __ T8H, vb[2], 8);
 6214 
 6215     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6216     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6217     // int are in bit positions [4..11].
 6218     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6219     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6220     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6221 
 6222     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6223     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6224     // copy2
 6225     __ andr(va[2], __ T16B, va[2], v31);
 6226     __ andr(va[3], __ T16B, va[3], v31);
 6227     __ ushr(va[4], __ T8H, va[4], 4);
 6228     __ ushr(va[5], __ T8H, va[5], 4);
 6229     __ andr(vb[2], __ T16B, vb[2], v31);
 6230     __ ushr(vb[4], __ T8H, vb[4], 4);
 6231 
 6232 
 6233 
 6234     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6235     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6236 
 6237     // n.b. ordering ensures: i) inputs are consumed before they are
 6238     // overwritten ii) order of 16-bit results across successive
 6239     // pairs of vectors in va and then lower half of vb reflects order
 6240     // of corresponding 12-bit inputs
 6241     __ addv(va[0], __ T8H, va[0], va[2]);
 6242     __ addv(va[2], __ T8H, va[1], va[3]);
 6243     __ addv(va[1], __ T8H, va[4], va[6]);
 6244     __ addv(va[3], __ T8H, va[5], va[7]);
 6245     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6246     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6247 
 6248     // store 48 results interleaved as shorts
 6249     vs_st2_post(vs_front(va), __ T8H, parsed);
 6250     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6251 
 6252     __ BIND(L_end);
 6253 
 6254     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6255     __ mov(r0, zr); // return 0
 6256     __ ret(lr);
 6257 
 6258     return start;
 6259   }
 6260 
 6261   // Kyber Barrett reduce function.
 6262   // Implements
 6263   // static int implKyberBarrettReduce(short[] coeffs) {}
 6264   //
 6265   // coeffs (short[256]) = c_rarg0
 6266   address generate_kyberBarrettReduce() {
 6267 
 6268     __ align(CodeEntryAlignment);
 6269     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6270     StubCodeMark mark(this, stub_id);
 6271     address start = __ pc();
 6272     __ enter();
 6273 
 6274     const Register coeffs = c_rarg0;
 6275 
 6276     const Register kyberConsts = r10;
 6277     const Register result = r11;
 6278 
 6279     // As above we process 256 sets of values in total i.e. 32 x
 6280     // 8H quadwords. So, we can load, reduce and store the data in 3
 6281     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6282     // of 10 or 11 registers. A further constraint is that the
 6283     // mapping needs to skip callee saves. So, we allocate the
 6284     // register sequences using two 8 sequences, two 2 sequences
 6285     // and two single registers.
 6286     VSeq<8> vs1_1(0);
 6287     VSeq<2> vs1_2(16);
 6288     FloatRegister vs1_3 = v28;
 6289     VSeq<8> vs2_1(18);
 6290     VSeq<2> vs2_2(26);
 6291     FloatRegister vs2_3 = v29;
 6292 
 6293     // we also need a pair of corresponding constant sequences
 6294 
 6295     VSeq<8> vc1_1(30, 0);
 6296     VSeq<2> vc1_2(30, 0);
 6297     FloatRegister vc1_3 = v30; // for kyber_q
 6298 
 6299     VSeq<8> vc2_1(31, 0);
 6300     VSeq<2> vc2_2(31, 0);
 6301     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6302 
 6303     __ add(result, coeffs, 0);
 6304     __ lea(kyberConsts,
 6305              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6306 
 6307     // load q and the multiplier for the Barrett reduction
 6308     __ add(kyberConsts, kyberConsts, 16);
 6309     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6310 
 6311     for (int i = 0; i < 3; i++) {
 6312       // load 80 or 88 coefficients
 6313       vs_ldpq_post(vs1_1, coeffs);
 6314       vs_ldpq_post(vs1_2, coeffs);
 6315       if (i < 2) {
 6316         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6317       }
 6318 
 6319       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6320       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6321       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6322       if (i < 2) {
 6323         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6324       }
 6325 
 6326       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6327       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6328       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6329       if (i < 2) {
 6330         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6331       }
 6332 
 6333       // vs1 <- vs1 - vs2 * kyber_q
 6334       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6335       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6336       if (i < 2) {
 6337         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6338       }
 6339 
 6340       vs_stpq_post(vs1_1, result);
 6341       vs_stpq_post(vs1_2, result);
 6342       if (i < 2) {
 6343         __ str(vs1_3, __ Q, __ post(result, 16));
 6344       }
 6345     }
 6346 
 6347     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6348     __ mov(r0, zr); // return 0
 6349     __ ret(lr);
 6350 
 6351     return start;
 6352   }
 6353 
 6354 
 6355   // Dilithium-specific montmul helper routines that generate parallel
 6356   // code for, respectively, a single 4x4s vector sequence montmul or
 6357   // two such multiplies in a row.
 6358 
 6359   // Perform 16 32-bit Montgomery multiplications in parallel
 6360   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6361                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6362     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6363     // It will assert that the register use is valid
 6364     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6365   }
 6366 
 6367   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6368   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6369                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6370     // Schedule two successive 4x4S multiplies via the montmul helper
 6371     // on the front and back halves of va, vb and vc. The helper will
 6372     // assert that the register use has no overlap conflicts on each
 6373     // individual call but we also need to ensure that the necessary
 6374     // disjoint/equality constraints are met across both calls.
 6375 
 6376     // vb, vc, vtmp and vq must be disjoint. va must either be
 6377     // disjoint from all other registers or equal vc
 6378 
 6379     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6380     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6381     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6382 
 6383     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6384     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6385 
 6386     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6387 
 6388     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6389     assert(vs_disjoint(va, vb), "va and vb overlap");
 6390     assert(vs_disjoint(va, vq), "va and vq overlap");
 6391     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6392 
 6393     // We multiply the front and back halves of each sequence 4 at a
 6394     // time because
 6395     //
 6396     // 1) we are currently only able to get 4-way instruction
 6397     // parallelism at best
 6398     //
 6399     // 2) we need registers for the constants in vq and temporary
 6400     // scratch registers to hold intermediate results so vtmp can only
 6401     // be a VSeq<4> which means we only have 4 scratch slots.
 6402 
 6403     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6404     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6405   }
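
        // For reference, a rough scalar model of the montmul being scheduled
        // here, assuming the usual signed Montgomery reduction with R = 2^32
        // and q, qinv the constants held in vq (a sketch only, not the exact
        // instruction sequence vs_montmul4 emits):
        //
        //   int64_t t = (int64_t)a * b;
        //   int32_t m = (int32_t)t * qinv;                     // cancels the low 32 bits of t
        //   int32_t r = (int32_t)((t - (int64_t)m * q) >> 32); // r == a * b * R^-1 (mod q)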
 6406 
 6407   // Perform combined montmul then add/sub on 4x4S vectors.
 6408   void dilithium_montmul16_sub_add(
 6409           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6410           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6411     // compute a = montmul(a1, c)
 6412     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6413     // output a1 = a0 - a
 6414     vs_subv(va1, __ T4S, va0, vc);
 6415     //    and a0 = a0 + a
 6416     vs_addv(va0, __ T4S, va0, vc);
 6417   }
 6418 
 6419   // Perform combined add/sub then montmul on 4x4S vectors.
 6420   void dilithium_sub_add_montmul16(
 6421           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6422           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6423     // compute c = a0 - a1
 6424     vs_subv(vtmp1, __ T4S, va0, va1);
 6425     // output a0 = a0 + a1
 6426     vs_addv(va0, __ T4S, va0, va1);
 6427     // output a1 = b montmul c
 6428     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6429   }
 6430 
 6431   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6432   // in the Java implementation come in sequences of at least 8, so we
 6433   // can use ldpq to collect the corresponding data into pairs of vector
 6434   // registers.
 6435   // We collect the coefficients corresponding to the 'j+l' indexes into
 6436   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6437   // then we do the (Montgomery) multiplications by the zetas in parallel
 6438   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6439   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6440   // v0-v7 and finally save the results back to the coeffs array.
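        //
        // In scalar terms that is the standard forward NTT butterfly of the
        // Java loop (a sketch, with montMul as modelled above):
        //
        //   int t = montMul(zetas[k], coeffs[j + l]);
        //   coeffs[j + l] = coeffs[j] - t;
        //   coeffs[j]     = coeffs[j] + t;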
 6441   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6442     const Register coeffs, const Register zetas) {
 6443     int c1 = 0;
 6444     int c2 = 512;
 6445     int startIncr;
 6446     // don't use callee save registers v8 - v15
 6447     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6448     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6449     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6450     int offsets[4] = { 0, 32, 64, 96 };
 6451 
 6452     for (int level = 0; level < 5; level++) {
 6453       int c1Start = c1;
 6454       int c2Start = c2;
 6455       if (level == 3) {
 6456         offsets[1] = 32;
 6457         offsets[2] = 128;
 6458         offsets[3] = 160;
 6459       } else if (level == 4) {
 6460         offsets[1] = 64;
 6461         offsets[2] = 128;
 6462         offsets[3] = 192;
 6463       }
 6464 
 6465       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6466       // time at 4 different offsets and multiply them in order by the
 6467       // next set of input values. So we employ indexed load and store
 6468       // pair instructions with arrangement 4S.
 6469       for (int i = 0; i < 4; i++) {
 6470         // reload q and qinv
 6471         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6472         // load 8x4S coefficients via second start pos == c2
 6473         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6474         // load next 8x4S inputs == b
 6475         vs_ldpq_post(vs2, zetas);
 6476         // compute a == c2 * b mod MONT_Q
 6477         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6478         // load 8x4s coefficients via first start pos == c1
 6479         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6480         // compute a1 =  c1 + a
 6481         vs_addv(vs3, __ T4S, vs1, vs2);
 6482         // compute a2 =  c1 - a
 6483         vs_subv(vs1, __ T4S, vs1, vs2);
 6484         // output a1 and a2
 6485         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6486         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6487 
 6488         int k = 4 * level + i;
 6489 
 6490         if (k > 7) {
 6491           startIncr = 256;
 6492         } else if (k == 5) {
 6493           startIncr = 384;
 6494         } else {
 6495           startIncr = 128;
 6496         }
 6497 
 6498         c1Start += startIncr;
 6499         c2Start += startIncr;
 6500       }
 6501 
 6502       c2 /= 2;
 6503     }
 6504   }
 6505 
 6506   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6507   // Implements the method
 6508   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6509   // of the Java class sun.security.provider.ML_DSA.
 6510   //
 6511   // coeffs (int[256]) = c_rarg0
 6512   // zetas (int[256]) = c_rarg1
 6513   address generate_dilithiumAlmostNtt() {
 6514 
 6515     __ align(CodeEntryAlignment);
 6516     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6517     StubCodeMark mark(this, stub_id);
 6518     address start = __ pc();
 6519     __ enter();
 6520 
 6521     const Register coeffs = c_rarg0;
 6522     const Register zetas = c_rarg1;
 6523 
 6524     const Register tmpAddr = r9;
 6525     const Register dilithiumConsts = r10;
 6526     const Register result = r11;
 6527     // don't use callee save registers v8 - v15
 6528     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6529     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6530     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6531     int offsets[4] = { 0, 32, 64, 96};
 6532     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6533     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6534     __ add(result, coeffs, 0);
 6535     __ lea(dilithiumConsts,
 6536              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6537 
 6538     // Each level represents one iteration of the outer for loop of the Java version.
 6539 
 6540     // level 0-4
 6541     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6542 
 6543     // level 5
 6544 
 6545     // At level 5 the coefficients we need to combine with the zetas
 6546     // are grouped in memory in blocks of size 4. So, for both sets of
 6547     // coefficients we load 4 adjacent values at 8 different offsets
 6548     // using an indexed ldr with register variant Q and multiply them
 6549     // in sequence order by the next set of inputs. Likewise we store
 6550     // the results using an indexed str with register variant Q.
 6551     for (int i = 0; i < 1024; i += 256) {
 6552       // reload constants q, qinv each iteration as they get clobbered later
 6553       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6554       // load 32 (8x4S) coefficients via first offsets = c1
 6555       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6556       // load next 32 (8x4S) inputs = b
 6557       vs_ldpq_post(vs2, zetas);
 6558       // a = b montmul c1
 6559       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6560       // load 32 (8x4S) coefficients via second offsets = c2
 6561       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6562       // add/sub with result of multiply
 6563       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6564       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6565       // write back new coefficients using same offsets
 6566       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6567       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6568     }
 6569 
 6570     // level 6
 6571     // At level 6 the coefficients we need to combine with the zetas
 6572     // are grouped in memory in pairs, the first two being montmul
 6573     // inputs and the second add/sub inputs. We can still implement
 6574     // the montmul+sub+add using 4-way parallelism but only if we
 6575     // combine the coefficients with the zetas 16 at a time. We load 8
 6576     // adjacent values at 4 different offsets using an ld2 load with
 6577     // arrangement 2D. That interleaves the lower and upper halves of
 6578     // each pair of quadwords into successive vector registers. We
 6579     // then need to montmul the 4 even elements of the coefficients
 6580     // register sequence by the zetas in order and then add/sub the 4
 6581     // odd elements of the coefficients register sequence. We use an
 6582     // equivalent st2 operation to store the results back into memory
 6583     // de-interleaved.
 6584     for (int i = 0; i < 1024; i += 128) {
 6585       // reload constants q, qinv each iteration as they get clobbered later
 6586       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6587       // load interleaved 16 (4x2D) coefficients via offsets
 6588       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6589       // load next 16 (4x4S) inputs
 6590       vs_ldpq_post(vs_front(vs2), zetas);
 6591       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6592       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6593                                   vs_front(vs2), vtmp, vq);
 6594       // store interleaved 16 (4x2D) coefficients via offsets
 6595       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6596     }
 6597 
 6598     // level 7
 6599     // At level 7 the coefficients we need to combine with the zetas
 6600     // occur singly with montmul inputs alternating with add/sub
 6601     // inputs. Once again we can use 4-way parallelism to combine 16
 6602     // zetas at a time. However, we have to load 8 adjacent values at
 6603     // 4 different offsets using an ld2 load with arrangement 4S. That
 6604     // interleaves the odd words of each pair into one
 6605     // coefficients vector register and the even words of the pair
 6606     // into the next register. We then need to montmul the 4 even
 6607     // elements of the coefficients register sequence by the zetas in
 6608     // order and then add/sub the 4 odd elements of the coefficients
 6609     // register sequence. We use an equivalent st2 operation to store
 6610     // the results back into memory de-interleaved.
 6611 
 6612     for (int i = 0; i < 1024; i += 128) {
 6613       // reload constants q, qinv each iteration as they get clobbered later
 6614       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6615       // load interleaved 16 (4x4S) coefficients via offsets
 6616       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6617       // load next 16 (4x4S) inputs
 6618       vs_ldpq_post(vs_front(vs2), zetas);
 6619       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6620       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6621                                   vs_front(vs2), vtmp, vq);
 6622       // store interleaved 16 (4x4S) coefficients via offsets
 6623       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6624     }
 6625     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6626     __ mov(r0, zr); // return 0
 6627     __ ret(lr);
 6628 
 6629     return start;
 6630   }
 6631 
 6632   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6633   // in the Java implementation come in sequences of at least 8, so we
 6634   // can use ldpq to collect the corresponding data into pairs of vector
 6635   // registers
 6636   // We collect the coefficients that correspond to the 'j's into vs1
 6637   // the coefficients that correspond to the 'j+l's into vs2 then
 6638   // do the additions into vs3 and the subtractions into vs1 then
 6639   // save the result of the additions, load the zetas into vs2
 6640   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6641   // finally save the results back to the coeffs array
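        //
        // In scalar terms that is the inverse NTT butterfly of the Java loop
        // (again just a sketch):
        //
        //   int t = coeffs[j];
        //   coeffs[j]     = t + coeffs[j + l];
        //   coeffs[j + l] = montMul(t - coeffs[j + l], zetas[k]);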
 6642   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6643     const Register coeffs, const Register zetas) {
 6644     int c1 = 0;
 6645     int c2 = 32;
 6646     int startIncr;
 6647     int offsets[4];
 6648     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6649     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6650     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6651 
 6652     offsets[0] = 0;
 6653 
 6654     for (int level = 3; level < 8; level++) {
 6655       int c1Start = c1;
 6656       int c2Start = c2;
 6657       if (level == 3) {
 6658         offsets[1] = 64;
 6659         offsets[2] = 128;
 6660         offsets[3] = 192;
 6661       } else if (level == 4) {
 6662         offsets[1] = 32;
 6663         offsets[2] = 128;
 6664         offsets[3] = 160;
 6665       } else {
 6666         offsets[1] = 32;
 6667         offsets[2] = 64;
 6668         offsets[3] = 96;
 6669       }
 6670 
 6671       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6672       // time at 4 different offsets and multiply them in order by the
 6673       // next set of input values. So we employ indexed load and store
 6674       // pair instructions with arrangement 4S.
 6675       for (int i = 0; i < 4; i++) {
 6676         // load v1 32 (8x4S) coefficients relative to first start index
 6677         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6678         // load v2 32 (8x4S) coefficients relative to second start index
 6679         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
  6680         // a0 = v1 + v2 -- n.b. clobbers vq (which overlaps vs3)
 6681         vs_addv(vs3, __ T4S, vs1, vs2);
 6682         // a1 = v1 - v2
 6683         vs_subv(vs1, __ T4S, vs1, vs2);
  6684         // save a0 relative to first start index
 6685         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6686         // load constants q, qinv each iteration as they get clobbered above
 6687         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6688         // load b next 32 (8x4S) inputs
 6689         vs_ldpq_post(vs2, zetas);
 6690         // a = a1 montmul b
 6691         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6692         // save a relative to second start index
 6693         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6694 
 6695         int k = 4 * level + i;
 6696 
 6697         if (k < 24) {
 6698           startIncr = 256;
 6699         } else if (k == 25) {
 6700           startIncr = 384;
 6701         } else {
 6702           startIncr = 128;
 6703         }
 6704 
 6705         c1Start += startIncr;
 6706         c2Start += startIncr;
 6707       }
 6708 
 6709       c2 *= 2;
 6710     }
 6711   }
 6712 
 6713   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6714   // Implements the method
 6715   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6716   // the sun.security.provider.ML_DSA class.
 6717   //
 6718   // coeffs (int[256]) = c_rarg0
 6719   // zetas (int[256]) = c_rarg1
 6720   address generate_dilithiumAlmostInverseNtt() {
 6721 
 6722     __ align(CodeEntryAlignment);
 6723     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6724     StubCodeMark mark(this, stub_id);
 6725     address start = __ pc();
 6726     __ enter();
 6727 
 6728     const Register coeffs = c_rarg0;
 6729     const Register zetas = c_rarg1;
 6730 
 6731     const Register tmpAddr = r9;
 6732     const Register dilithiumConsts = r10;
 6733     const Register result = r11;
 6734     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6735     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6736     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6737     int offsets[4] = { 0, 32, 64, 96 };
 6738     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6739     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6740 
 6741     __ add(result, coeffs, 0);
 6742     __ lea(dilithiumConsts,
 6743              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6744 
 6745     // Each level represents one iteration of the outer for loop of the Java version
 6746 
 6747     // level 0
 6748     // At level 0 we need to interleave adjacent quartets of
 6749     // coefficients before we multiply and add/sub by the next 16
 6750     // zetas just as we did for level 7 in the multiply code. So we
 6751     // load and store the values using an ld2/st2 with arrangement 4S.
 6752     for (int i = 0; i < 1024; i += 128) {
 6753       // load constants q, qinv
 6754       // n.b. this can be moved out of the loop as they do not get
  6755     // clobbered by the first two loops
 6756       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6757       // a0/a1 load interleaved 32 (8x4S) coefficients
 6758       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6759       // b load next 32 (8x4S) inputs
 6760       vs_ldpq_post(vs_front(vs2), zetas);
 6761       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6762       // n.b. second half of vs2 provides temporary register storage
 6763       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6764                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6765       // a0/a1 store interleaved 32 (8x4S) coefficients
 6766       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6767     }
 6768 
 6769     // level 1
 6770     // At level 1 we need to interleave pairs of adjacent pairs of
 6771     // coefficients before we multiply by the next 16 zetas just as we
 6772     // did for level 6 in the multiply code. So we load and store the
  6773     // values using an ld2/st2 with arrangement 2D.
 6774     for (int i = 0; i < 1024; i += 128) {
 6775       // a0/a1 load interleaved 32 (8x2D) coefficients
 6776       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6777       // b load next 16 (4x4S) inputs
 6778       vs_ldpq_post(vs_front(vs2), zetas);
 6779       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6780       // n.b. second half of vs2 provides temporary register storage
 6781       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6782                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6783       // a0/a1 store interleaved 32 (8x2D) coefficients
 6784       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6785     }
 6786 
 6787     // level 2
 6788     // At level 2 coefficients come in blocks of 4. So, we load 4
 6789     // adjacent coefficients at 8 distinct offsets for both the first
 6790     // and second coefficient sequences, using an ldr with register
 6791     // variant Q then combine them with next set of 32 zetas. Likewise
 6792     // we store the results using an str with register variant Q.
 6793     for (int i = 0; i < 1024; i += 256) {
 6794       // c0 load 32 (8x4S) coefficients via first offsets
 6795       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6796       // c1 load 32 (8x4S) coefficients via second offsets
  6797       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6798       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6799       vs_addv(vs3, __ T4S, vs1, vs2);
 6800       // c = c0 - c1
 6801       vs_subv(vs1, __ T4S, vs1, vs2);
 6802       // store a0 32 (8x4S) coefficients via first offsets
 6803       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6804       // b load 32 (8x4S) next inputs
 6805       vs_ldpq_post(vs2, zetas);
 6806       // reload constants q, qinv -- they were clobbered earlier
 6807       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6808       // compute a1 = b montmul c
 6809       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6810       // store a1 32 (8x4S) coefficients via second offsets
 6811       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6812     }
 6813 
  6814     // levels 3 - 7
 6815     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6816 
 6817     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6818     __ mov(r0, zr); // return 0
 6819     __ ret(lr);
 6820 
 6821     return start;
 6822   }
 6823 
 6824   // Dilithium multiply polynomials in the NTT domain.
 6825   // Straightforward implementation of the method
 6826   // static int implDilithiumNttMult(
  6827   //              int[] result, int[] ntta, int[] nttb) {} of
 6828   // the sun.security.provider.ML_DSA class.
 6829   //
 6830   // result (int[256]) = c_rarg0
 6831   // poly1 (int[256]) = c_rarg1
 6832   // poly2 (int[256]) = c_rarg2
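         //
         // Illustrative sketch of the loop below, writing montMul(x, y) for the
         // Montgomery product x * y * R^-1 mod q and rSquare for R^2 mod q:
         //
         //   result[i] = montMul(rSquare, montMul(poly1[i], poly2[i]));
         //
         // the extra montMul by rSquare cancels the two R^-1 factors, so the
         // stored value is congruent to poly1[i] * poly2[i] mod q.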
 6833   address generate_dilithiumNttMult() {
 6834 
  6835     __ align(CodeEntryAlignment);
 6836     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6837     StubCodeMark mark(this, stub_id);
 6838     address start = __ pc();
 6839     __ enter();
 6840 
 6841     Label L_loop;
 6842 
 6843     const Register result = c_rarg0;
 6844     const Register poly1 = c_rarg1;
 6845     const Register poly2 = c_rarg2;
 6846 
 6847     const Register dilithiumConsts = r10;
 6848     const Register len = r11;
 6849 
 6850     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
  6851     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6852     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6853     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6854 
 6855     __ lea(dilithiumConsts,
 6856              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6857 
 6858     // load constants q, qinv
 6859     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6860     // load constant rSquare into v29
 6861     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6862 
 6863     __ mov(len, zr);
 6864     __ add(len, len, 1024);
 6865 
 6866     __ BIND(L_loop);
 6867 
 6868     // b load 32 (8x4S) next inputs from poly1
 6869     vs_ldpq_post(vs1, poly1);
 6870     // c load 32 (8x4S) next inputs from poly2
 6871     vs_ldpq_post(vs2, poly2);
 6872     // compute a = b montmul c
 6873     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6874     // compute a = rsquare montmul a
 6875     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6876     // save a 32 (8x4S) results
 6877     vs_stpq_post(vs2, result);
 6878 
 6879     __ sub(len, len, 128);
 6880     __ cmp(len, (u1)128);
 6881     __ br(Assembler::GE, L_loop);
 6882 
 6883     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6884     __ mov(r0, zr); // return 0
 6885     __ ret(lr);
 6886 
 6887     return start;
 6888   }
 6889 
  6890   // Dilithium Montgomery multiply an array by a constant.
 6891   // A straightforward implementation of the method
 6892   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  6893   // of the sun.security.provider.ML_DSA class
 6894   //
 6895   // coeffs (int[256]) = c_rarg0
 6896   // constant (int) = c_rarg1
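         //
         // Scalar equivalent of the loop below (illustrative sketch):
         //
         //   for (int i = 0; i < 256; i++) {
         //     coeffs[i] = montMul(coeffs[i], constant);
         //   }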
 6897   address generate_dilithiumMontMulByConstant() {
 6898 
 6899     __ align(CodeEntryAlignment);
 6900     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6901     StubCodeMark mark(this, stub_id);
 6902     address start = __ pc();
 6903     __ enter();
 6904 
 6905     Label L_loop;
 6906 
 6907     const Register coeffs = c_rarg0;
 6908     const Register constant = c_rarg1;
 6909 
 6910     const Register dilithiumConsts = r10;
 6911     const Register result = r11;
 6912     const Register len = r12;
 6913 
 6914     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6915     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6916     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6917     VSeq<8> vconst(29, 0);             // for montmul by constant
 6918 
 6919     // results track inputs
 6920     __ add(result, coeffs, 0);
 6921     __ lea(dilithiumConsts,
 6922              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6923 
  6924     // load constants q, qinv once -- they are not clobbered by the loop below
 6925     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6926     // copy caller supplied constant across vconst
 6927     __ dup(vconst[0], __ T4S, constant);
 6928     __ mov(len, zr);
 6929     __ add(len, len, 1024);
 6930 
 6931     __ BIND(L_loop);
 6932 
 6933     // load next 32 inputs
 6934     vs_ldpq_post(vs2, coeffs);
 6935     // mont mul by constant
 6936     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6937     // write next 32 results
 6938     vs_stpq_post(vs2, result);
 6939 
 6940     __ sub(len, len, 128);
 6941     __ cmp(len, (u1)128);
 6942     __ br(Assembler::GE, L_loop);
 6943 
 6944     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6945     __ mov(r0, zr); // return 0
 6946     __ ret(lr);
 6947 
 6948     return start;
 6949   }
 6950 
 6951   // Dilithium decompose poly.
 6952   // Implements the method
  6953   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 6954   // of the sun.security.provider.ML_DSA class
 6955   //
 6956   // input (int[256]) = c_rarg0
 6957   // lowPart (int[256]) = c_rarg1
 6958   // highPart (int[256]) = c_rarg2
 6959   // twoGamma2  (int) = c_rarg3
 6960   // multiplier (int) = c_rarg4
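         //
         // The loop below mirrors the per-coefficient ML-DSA decompose: each input
         // r is reduced into [0, q), then split as r = r1 * twoGamma2 + r0 with r0
         // centred around zero, with the usual special case when r - r0 == q - 1
         // (r1 is then forced to 0 and r0 decremented). The step comments in the
         // loop body follow the corresponding Java statements.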
 6961   address generate_dilithiumDecomposePoly() {
 6962 
 6963     __ align(CodeEntryAlignment);
 6964     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6965     StubCodeMark mark(this, stub_id);
 6966     address start = __ pc();
 6967     Label L_loop;
 6968 
 6969     const Register input = c_rarg0;
 6970     const Register lowPart = c_rarg1;
 6971     const Register highPart = c_rarg2;
 6972     const Register twoGamma2 = c_rarg3;
 6973     const Register multiplier = c_rarg4;
 6974 
 6975     const Register len = r9;
 6976     const Register dilithiumConsts = r10;
 6977     const Register tmp = r11;
 6978 
 6979     // 6 independent sets of 4x4s values
 6980     VSeq<4> vs1(0), vs2(4), vs3(8);
 6981     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6982 
 6983     // 7 constants for cross-multiplying
 6984     VSeq<4> one(25, 0);
 6985     VSeq<4> qminus1(26, 0);
 6986     VSeq<4> g2(27, 0);
 6987     VSeq<4> twog2(28, 0);
 6988     VSeq<4> mult(29, 0);
 6989     VSeq<4> q(30, 0);
 6990     VSeq<4> qadd(31, 0);
 6991 
 6992     __ enter();
 6993 
 6994     __ lea(dilithiumConsts,
 6995              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6996 
 6997     // save callee-saved registers
 6998     __ stpd(v8, v9, __ pre(sp, -64));
 6999     __ stpd(v10, v11, Address(sp, 16));
 7000     __ stpd(v12, v13, Address(sp, 32));
 7001     __ stpd(v14, v15, Address(sp, 48));
 7002 
 7003     // populate constant registers
 7004     __ mov(tmp, zr);
 7005     __ add(tmp, tmp, 1);
 7006     __ dup(one[0], __ T4S, tmp); // 1
 7007     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7008     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7009     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7010     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7011     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7012     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7013 
 7014     __ mov(len, zr);
 7015     __ add(len, len, 1024);
 7016 
 7017     __ BIND(L_loop);
 7018 
 7019     // load next 4x4S inputs interleaved: rplus --> vs1
 7020     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7021 
 7022     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7023     vs_addv(vtmp, __ T4S, vs1, qadd);
 7024     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7025     vs_mulv(vtmp, __ T4S, vtmp, q);
 7026     vs_subv(vs1, __ T4S, vs1, vtmp);
 7027 
 7028     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7029     vs_sshr(vtmp, __ T4S, vs1, 31);
 7030     vs_andr(vtmp, vtmp, q);
 7031     vs_addv(vs1, __ T4S, vs1, vtmp);
 7032 
 7033     // quotient --> vs2
 7034     // int quotient = (rplus * multiplier) >> 22;
 7035     vs_mulv(vtmp, __ T4S, vs1, mult);
 7036     vs_sshr(vs2, __ T4S, vtmp, 22);
 7037 
 7038     // r0 --> vs3
 7039     // int r0 = rplus - quotient * twoGamma2;
 7040     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7041     vs_subv(vs3, __ T4S, vs1, vtmp);
 7042 
 7043     // mask --> vs4
 7044     // int mask = (twoGamma2 - r0) >> 22;
 7045     vs_subv(vtmp, __ T4S, twog2, vs3);
 7046     vs_sshr(vs4, __ T4S, vtmp, 22);
 7047 
 7048     // r0 -= (mask & twoGamma2);
 7049     vs_andr(vtmp, vs4, twog2);
 7050     vs_subv(vs3, __ T4S, vs3, vtmp);
 7051 
 7052     //  quotient += (mask & 1);
 7053     vs_andr(vtmp, vs4, one);
 7054     vs_addv(vs2, __ T4S, vs2, vtmp);
 7055 
 7056     // mask = (twoGamma2 / 2 - r0) >> 31;
 7057     vs_subv(vtmp, __ T4S, g2, vs3);
 7058     vs_sshr(vs4, __ T4S, vtmp, 31);
 7059 
 7060     // r0 -= (mask & twoGamma2);
 7061     vs_andr(vtmp, vs4, twog2);
 7062     vs_subv(vs3, __ T4S, vs3, vtmp);
 7063 
 7064     // quotient += (mask & 1);
 7065     vs_andr(vtmp, vs4, one);
 7066     vs_addv(vs2, __ T4S, vs2, vtmp);
 7067 
 7068     // r1 --> vs5
 7069     // int r1 = rplus - r0 - (dilithium_q - 1);
 7070     vs_subv(vtmp, __ T4S, vs1, vs3);
 7071     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7072 
 7073     // r1 --> vs1 (overwriting rplus)
 7074     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7075     vs_negr(vtmp, __ T4S, vs5);
 7076     vs_orr(vtmp, vs5, vtmp);
 7077     vs_sshr(vs1, __ T4S, vtmp, 31);
 7078 
 7079     // r0 += ~r1;
 7080     vs_notr(vtmp, vs1);
 7081     vs_addv(vs3, __ T4S, vs3, vtmp);
 7082 
 7083     // r1 = r1 & quotient;
 7084     vs_andr(vs1, vs2, vs1);
 7085 
  7086     // store results interleaved
 7087     // lowPart[m] = r0;
 7088     // highPart[m] = r1;
 7089     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7090     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7091 
 7092     __ sub(len, len, 64);
 7093     __ cmp(len, (u1)64);
 7094     __ br(Assembler::GE, L_loop);
 7095 
 7096     // restore callee-saved vector registers
 7097     __ ldpd(v14, v15, Address(sp, 48));
 7098     __ ldpd(v12, v13, Address(sp, 32));
 7099     __ ldpd(v10, v11, Address(sp, 16));
 7100     __ ldpd(v8, v9, __ post(sp, 64));
 7101 
 7102     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7103     __ mov(r0, zr); // return 0
 7104     __ ret(lr);
 7105 
 7106     return start;
 7107   }
 7108 
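         // Keccak chi step applied to one row of five lanes: each a[i] becomes
         // a[i] ^ (~a[i+1] & a[i+2]) (indices mod 5), with the and-not computed
         // via bic. Three temporaries suffice because each lane is overwritten
         // only after every term that reads it has been computed.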
 7109   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7110              Register tmp0, Register tmp1, Register tmp2) {
 7111     __ bic(tmp0, a2, a1); // for a0
 7112     __ bic(tmp1, a3, a2); // for a1
 7113     __ bic(tmp2, a4, a3); // for a2
 7114     __ eor(a2, a2, tmp2);
 7115     __ bic(tmp2, a0, a4); // for a3
 7116     __ eor(a3, a3, tmp2);
 7117     __ bic(tmp2, a1, a0); // for a4
 7118     __ eor(a0, a0, tmp0);
 7119     __ eor(a1, a1, tmp1);
 7120     __ eor(a4, a4, tmp2);
 7121   }
 7122 
 7123   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7124                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7125                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7126                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7127                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7128                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7129                         Register tmp0, Register tmp1, Register tmp2) {
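           // theta step: compute the five column parities c0..c4, derive
           // d[i] = c[i-1] ^ rol(c[i+1], 1), and xor d[i] into every lane of
           // column i. With all 25 lanes held in general purpose registers we
           // are short on temporaries, hence the optional borrowing of rfp and
           // r18_tls below.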
 7130     __ eor3(tmp1, a4, a9, a14);
 7131     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7132     __ eor3(tmp2, a1, a6, a11);
 7133     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7134     __ rax1(tmp2, tmp0, tmp1); // d0
 7135     {
 7136 
 7137       Register tmp3, tmp4;
 7138       if (can_use_fp && can_use_r18) {
 7139         tmp3 = rfp;
 7140         tmp4 = r18_tls;
 7141       } else {
 7142         tmp3 = a4;
 7143         tmp4 = a9;
 7144         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7145       }
 7146 
 7147       __ eor3(tmp3, a0, a5, a10);
 7148       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7149       __ eor(a0, a0, tmp2);
 7150       __ eor(a5, a5, tmp2);
 7151       __ eor(a10, a10, tmp2);
 7152       __ eor(a15, a15, tmp2);
 7153       __ eor(a20, a20, tmp2); // d0(tmp2)
 7154       __ eor3(tmp3, a2, a7, a12);
 7155       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7156       __ rax1(tmp3, tmp4, tmp2); // d1
 7157       __ eor(a1, a1, tmp3);
 7158       __ eor(a6, a6, tmp3);
 7159       __ eor(a11, a11, tmp3);
 7160       __ eor(a16, a16, tmp3);
 7161       __ eor(a21, a21, tmp3); // d1(tmp3)
 7162       __ rax1(tmp3, tmp2, tmp0); // d3
 7163       __ eor3(tmp2, a3, a8, a13);
 7164       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7165       __ eor(a3, a3, tmp3);
 7166       __ eor(a8, a8, tmp3);
 7167       __ eor(a13, a13, tmp3);
 7168       __ eor(a18, a18, tmp3);
 7169       __ eor(a23, a23, tmp3);
 7170       __ rax1(tmp2, tmp1, tmp0); // d2
 7171       __ eor(a2, a2, tmp2);
 7172       __ eor(a7, a7, tmp2);
 7173       __ eor(a12, a12, tmp2);
 7174       __ rax1(tmp0, tmp0, tmp4); // d4
 7175       if (!can_use_fp || !can_use_r18) {
 7176         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7177       }
 7178       __ eor(a17, a17, tmp2);
 7179       __ eor(a22, a22, tmp2);
 7180       __ eor(a4, a4, tmp0);
 7181       __ eor(a9, a9, tmp0);
 7182       __ eor(a14, a14, tmp0);
 7183       __ eor(a19, a19, tmp0);
 7184       __ eor(a24, a24, tmp0);
 7185     }
 7186 
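           // Combined rho and pi steps: every lane except a0 is rotated left by
           // its rho offset and written to the register holding its pi
           // destination. The moves form a single 24-element cycle, so one
           // temporary (seeded from a10) is enough, with a7 picking it up at the
           // end of the chain.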
 7187     __ rol(tmp0, a10, 3);
 7188     __ rol(a10, a1, 1);
 7189     __ rol(a1, a6, 44);
 7190     __ rol(a6, a9, 20);
 7191     __ rol(a9, a22, 61);
 7192     __ rol(a22, a14, 39);
 7193     __ rol(a14, a20, 18);
 7194     __ rol(a20, a2, 62);
 7195     __ rol(a2, a12, 43);
 7196     __ rol(a12, a13, 25);
 7197     __ rol(a13, a19, 8) ;
 7198     __ rol(a19, a23, 56);
 7199     __ rol(a23, a15, 41);
 7200     __ rol(a15, a4, 27);
 7201     __ rol(a4, a24, 14);
 7202     __ rol(a24, a21, 2);
 7203     __ rol(a21, a8, 55);
 7204     __ rol(a8, a16, 45);
 7205     __ rol(a16, a5, 36);
 7206     __ rol(a5, a3, 28);
 7207     __ rol(a3, a18, 21);
 7208     __ rol(a18, a17, 15);
 7209     __ rol(a17, a11, 10);
 7210     __ rol(a11, a7, 6);
 7211     __ mov(a7, tmp0);
 7212 
 7213     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7214     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7215     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7216     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7217     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7218 
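           // iota step: xor the round constant for this round into lane (0, 0);
           // rc is post-incremented so it advances through round_consts over the
           // 24 rounds.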
 7219     __ ldr(tmp1, __ post(rc, 8));
 7220     __ eor(a0, a0, tmp1);
 7221 
 7222   }
 7223 
 7224   // Arguments:
 7225   //
 7226   // Inputs:
 7227   //   c_rarg0   - byte[]  source+offset
 7228   //   c_rarg1   - byte[]  SHA.state
 7229   //   c_rarg2   - int     block_size
 7230   //   c_rarg3   - int     offset
 7231   //   c_rarg4   - int     limit
 7232   //
 7233   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7234     bool multi_block;
 7235     switch (stub_id) {
 7236     case StubId::stubgen_sha3_implCompress_id:
 7237       multi_block = false;
 7238       break;
 7239     case StubId::stubgen_sha3_implCompressMB_id:
 7240       multi_block = true;
 7241       break;
 7242     default:
 7243       ShouldNotReachHere();
 7244     }
 7245 
 7246     static const uint64_t round_consts[24] = {
 7247       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7248       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7249       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7250       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7251       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7252       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7253       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7254       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7255     };
 7256 
 7257     __ align(CodeEntryAlignment);
 7258     StubCodeMark mark(this, stub_id);
 7259     address start = __ pc();
 7260 
 7261     Register buf           = c_rarg0;
 7262     Register state         = c_rarg1;
 7263     Register block_size    = c_rarg2;
 7264     Register ofs           = c_rarg3;
 7265     Register limit         = c_rarg4;
 7266 
  7267     // use r3..r17, r19..r28 to keep a0..a24.
 7268     // a0..a24 are respective locals from SHA3.java
 7269     Register a0 = r25,
 7270              a1 = r26,
 7271              a2 = r27,
 7272              a3 = r3,
 7273              a4 = r4,
 7274              a5 = r5,
 7275              a6 = r6,
 7276              a7 = r7,
 7277              a8 = rscratch1, // r8
 7278              a9 = rscratch2, // r9
 7279              a10 = r10,
 7280              a11 = r11,
 7281              a12 = r12,
 7282              a13 = r13,
 7283              a14 = r14,
 7284              a15 = r15,
 7285              a16 = r16,
 7286              a17 = r17,
 7287              a18 = r28,
 7288              a19 = r19,
 7289              a20 = r20,
 7290              a21 = r21,
 7291              a22 = r22,
 7292              a23 = r23,
 7293              a24 = r24;
 7294 
 7295     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7296 
 7297     Label sha3_loop, rounds24_preloop, loop_body;
 7298     Label sha3_512_or_sha3_384, shake128;
 7299 
 7300     bool can_use_r18 = false;
 7301 #ifndef R18_RESERVED
 7302     can_use_r18 = true;
 7303 #endif
 7304     bool can_use_fp = !PreserveFramePointer;
 7305 
 7306     __ enter();
 7307 
 7308     // save almost all yet unsaved gpr registers on stack
 7309     __ str(block_size, __ pre(sp, -128));
 7310     if (multi_block) {
 7311       __ stpw(ofs, limit, Address(sp, 8));
 7312     }
 7313     // 8 bytes at sp+16 will be used to keep buf
 7314     __ stp(r19, r20, Address(sp, 32));
 7315     __ stp(r21, r22, Address(sp, 48));
 7316     __ stp(r23, r24, Address(sp, 64));
 7317     __ stp(r25, r26, Address(sp, 80));
 7318     __ stp(r27, r28, Address(sp, 96));
 7319     if (can_use_r18 && can_use_fp) {
 7320       __ stp(r18_tls, state, Address(sp, 112));
 7321     } else {
 7322       __ str(state, Address(sp, 112));
 7323     }
 7324 
  7325     // begin sha3 calculations: loading a0..a24 from the state array
 7326     __ ldp(a0, a1, state);
 7327     __ ldp(a2, a3, Address(state, 16));
 7328     __ ldp(a4, a5, Address(state, 32));
 7329     __ ldp(a6, a7, Address(state, 48));
 7330     __ ldp(a8, a9, Address(state, 64));
 7331     __ ldp(a10, a11, Address(state, 80));
 7332     __ ldp(a12, a13, Address(state, 96));
 7333     __ ldp(a14, a15, Address(state, 112));
 7334     __ ldp(a16, a17, Address(state, 128));
 7335     __ ldp(a18, a19, Address(state, 144));
 7336     __ ldp(a20, a21, Address(state, 160));
 7337     __ ldp(a22, a23, Address(state, 176));
 7338     __ ldr(a24, Address(state, 192));
 7339 
 7340     __ BIND(sha3_loop);
 7341 
 7342     // load input
 7343     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7344     __ eor(a0, a0, tmp3);
 7345     __ eor(a1, a1, tmp2);
 7346     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7347     __ eor(a2, a2, tmp3);
 7348     __ eor(a3, a3, tmp2);
 7349     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7350     __ eor(a4, a4, tmp3);
 7351     __ eor(a5, a5, tmp2);
 7352     __ ldr(tmp3, __ post(buf, 8));
 7353     __ eor(a6, a6, tmp3);
 7354 
 7355     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7356     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7357 
 7358     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7359     __ eor(a7, a7, tmp3);
 7360     __ eor(a8, a8, tmp2);
 7361     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7362     __ eor(a9, a9, tmp3);
 7363     __ eor(a10, a10, tmp2);
 7364     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7365     __ eor(a11, a11, tmp3);
 7366     __ eor(a12, a12, tmp2);
 7367     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7368     __ eor(a13, a13, tmp3);
 7369     __ eor(a14, a14, tmp2);
 7370     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7371     __ eor(a15, a15, tmp3);
 7372     __ eor(a16, a16, tmp2);
 7373 
 7374     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7375     __ andw(tmp2, block_size, 48);
 7376     __ cbzw(tmp2, rounds24_preloop);
 7377     __ tbnz(block_size, 5, shake128);
  7378     // block_size == 144, bit5 == 0, SHA3-224
 7379     __ ldr(tmp3, __ post(buf, 8));
 7380     __ eor(a17, a17, tmp3);
 7381     __ b(rounds24_preloop);
 7382 
 7383     __ BIND(shake128);
 7384     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7385     __ eor(a17, a17, tmp3);
 7386     __ eor(a18, a18, tmp2);
 7387     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7388     __ eor(a19, a19, tmp3);
 7389     __ eor(a20, a20, tmp2);
 7390     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7391 
 7392     __ BIND(sha3_512_or_sha3_384);
 7393     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7394     __ eor(a7, a7, tmp3);
 7395     __ eor(a8, a8, tmp2);
 7396     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7397 
 7398     // SHA3-384
 7399     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7400     __ eor(a9, a9, tmp3);
 7401     __ eor(a10, a10, tmp2);
 7402     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7403     __ eor(a11, a11, tmp3);
 7404     __ eor(a12, a12, tmp2);
 7405 
 7406     __ BIND(rounds24_preloop);
 7407     __ fmovs(v0, 24.0); // float loop counter,
 7408     __ fmovs(v1, 1.0);  // exact representation
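           // n.b. the round counter lives in an FP register because the general
           // purpose registers are fully occupied by the Keccak state and
           // temporaries; 24.0, 1.0 and all their differences are exactly
           // representable, so the fcmps against 0.0 below is exact.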
 7409 
 7410     __ str(buf, Address(sp, 16));
 7411     __ lea(tmp3, ExternalAddress((address) round_consts));
 7412 
 7413     __ BIND(loop_body);
 7414     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7415                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7416                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7417                      tmp0, tmp1, tmp2);
 7418     __ fsubs(v0, v0, v1);
 7419     __ fcmps(v0, 0.0);
 7420     __ br(__ NE, loop_body);
 7421 
 7422     if (multi_block) {
 7423       __ ldrw(block_size, sp); // block_size
 7424       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7425       __ addw(tmp2, tmp2, block_size);
 7426       __ cmpw(tmp2, tmp1);
 7427       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7428       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7429       __ br(Assembler::LE, sha3_loop);
 7430       __ movw(c_rarg0, tmp2); // return offset
 7431     }
 7432     if (can_use_fp && can_use_r18) {
 7433       __ ldp(r18_tls, state, Address(sp, 112));
 7434     } else {
 7435       __ ldr(state, Address(sp, 112));
 7436     }
 7437     // save calculated sha3 state
 7438     __ stp(a0, a1, Address(state));
 7439     __ stp(a2, a3, Address(state, 16));
 7440     __ stp(a4, a5, Address(state, 32));
 7441     __ stp(a6, a7, Address(state, 48));
 7442     __ stp(a8, a9, Address(state, 64));
 7443     __ stp(a10, a11, Address(state, 80));
 7444     __ stp(a12, a13, Address(state, 96));
 7445     __ stp(a14, a15, Address(state, 112));
 7446     __ stp(a16, a17, Address(state, 128));
 7447     __ stp(a18, a19, Address(state, 144));
 7448     __ stp(a20, a21, Address(state, 160));
 7449     __ stp(a22, a23, Address(state, 176));
 7450     __ str(a24, Address(state, 192));
 7451 
 7452     // restore required registers from stack
 7453     __ ldp(r19, r20, Address(sp, 32));
 7454     __ ldp(r21, r22, Address(sp, 48));
 7455     __ ldp(r23, r24, Address(sp, 64));
 7456     __ ldp(r25, r26, Address(sp, 80));
 7457     __ ldp(r27, r28, Address(sp, 96));
 7458     if (can_use_fp && can_use_r18) {
 7459       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7460     } // else no need to recalculate rfp, since it wasn't changed
 7461 
 7462     __ leave();
 7463 
 7464     __ ret(lr);
 7465 
 7466     return start;
 7467   }
 7468 
 7469   /**
 7470    *  Arguments:
 7471    *
 7472    * Inputs:
 7473    *   c_rarg0   - int crc
 7474    *   c_rarg1   - byte* buf
 7475    *   c_rarg2   - int length
 7476    *
 7477    * Output:
  7478    *       r0    - int crc result
 7479    */
 7480   address generate_updateBytesCRC32() {
 7481     assert(UseCRC32Intrinsics, "what are we doing here?");
 7482 
 7483     __ align(CodeEntryAlignment);
 7484     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7485     StubCodeMark mark(this, stub_id);
 7486 
 7487     address start = __ pc();
 7488 
 7489     const Register crc   = c_rarg0;  // crc
 7490     const Register buf   = c_rarg1;  // source java byte array address
 7491     const Register len   = c_rarg2;  // length
 7492     const Register table0 = c_rarg3; // crc_table address
 7493     const Register table1 = c_rarg4;
 7494     const Register table2 = c_rarg5;
 7495     const Register table3 = c_rarg6;
 7496     const Register tmp3 = c_rarg7;
 7497 
 7498     BLOCK_COMMENT("Entry:");
 7499     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7500 
 7501     __ kernel_crc32(crc, buf, len,
 7502               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7503 
 7504     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7505     __ ret(lr);
 7506 
 7507     return start;
 7508   }
 7509 
 7510   /**
 7511    *  Arguments:
 7512    *
 7513    * Inputs:
 7514    *   c_rarg0   - int crc
 7515    *   c_rarg1   - byte* buf
 7516    *   c_rarg2   - int length
 7517    *   c_rarg3   - int* table
 7518    *
 7519    * Output:
 7520    *       r0   - int crc result
 7521    */
 7522   address generate_updateBytesCRC32C() {
 7523     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7524 
 7525     __ align(CodeEntryAlignment);
 7526     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7527     StubCodeMark mark(this, stub_id);
 7528 
 7529     address start = __ pc();
 7530 
 7531     const Register crc   = c_rarg0;  // crc
 7532     const Register buf   = c_rarg1;  // source java byte array address
 7533     const Register len   = c_rarg2;  // length
 7534     const Register table0 = c_rarg3; // crc_table address
 7535     const Register table1 = c_rarg4;
 7536     const Register table2 = c_rarg5;
 7537     const Register table3 = c_rarg6;
 7538     const Register tmp3 = c_rarg7;
 7539 
 7540     BLOCK_COMMENT("Entry:");
 7541     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7542 
 7543     __ kernel_crc32c(crc, buf, len,
 7544               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7545 
 7546     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7547     __ ret(lr);
 7548 
 7549     return start;
 7550   }
 7551 
  7552   /**
 7553    *  Arguments:
 7554    *
 7555    *  Inputs:
 7556    *   c_rarg0   - int   adler
 7557    *   c_rarg1   - byte* buff
 7558    *   c_rarg2   - int   len
 7559    *
 7560    * Output:
 7561    *   c_rarg0   - int adler result
 7562    */
 7563   address generate_updateBytesAdler32() {
 7564     __ align(CodeEntryAlignment);
 7565     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7566     StubCodeMark mark(this, stub_id);
 7567     address start = __ pc();
 7568 
 7569     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7570 
 7571     // Aliases
 7572     Register adler  = c_rarg0;
 7573     Register s1     = c_rarg0;
 7574     Register s2     = c_rarg3;
 7575     Register buff   = c_rarg1;
 7576     Register len    = c_rarg2;
 7577     Register nmax  = r4;
 7578     Register base  = r5;
 7579     Register count = r6;
 7580     Register temp0 = rscratch1;
 7581     Register temp1 = rscratch2;
 7582     FloatRegister vbytes = v0;
 7583     FloatRegister vs1acc = v1;
 7584     FloatRegister vs2acc = v2;
 7585     FloatRegister vtable = v3;
 7586 
 7587     // Max number of bytes we can process before having to take the mod
 7588     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7589     uint64_t BASE = 0xfff1;
 7590     uint64_t NMAX = 0x15B0;
 7591 
 7592     __ mov(base, BASE);
 7593     __ mov(nmax, NMAX);
 7594 
 7595     // Load accumulation coefficients for the upper 16 bits
 7596     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7597     __ ld1(vtable, __ T16B, Address(temp0));
 7598 
 7599     // s1 is initialized to the lower 16 bits of adler
 7600     // s2 is initialized to the upper 16 bits of adler
 7601     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7602     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7603 
  7604     // The pipelined loop needs at least 16 elements for 1 iteration.
  7605     // It does check this, but it is more efficient to skip straight to the cleanup loop here
 7606     __ cmp(len, (u1)16);
 7607     __ br(Assembler::HS, L_nmax);
 7608     __ cbz(len, L_combine);
 7609 
 7610     __ bind(L_simple_by1_loop);
 7611     __ ldrb(temp0, Address(__ post(buff, 1)));
 7612     __ add(s1, s1, temp0);
 7613     __ add(s2, s2, s1);
 7614     __ subs(len, len, 1);
 7615     __ br(Assembler::HI, L_simple_by1_loop);
 7616 
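           // The mod-BASE reductions in this stub rely on 2^16 == 15 (mod BASE):
           // a value s is folded as s = (s >> 16) * 15 + (s & 0xffff), computed
           // below as (hi << 4) - hi + lo; after at most two folds s is below
           // 2 * BASE and a single conditional subtraction of BASE finishes the
           // reduction. Here s1 is already small enough that the conditional
           // subtraction alone suffices.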
 7617     // s1 = s1 % BASE
 7618     __ subs(temp0, s1, base);
 7619     __ csel(s1, temp0, s1, Assembler::HS);
 7620 
 7621     // s2 = s2 % BASE
 7622     __ lsr(temp0, s2, 16);
 7623     __ lsl(temp1, temp0, 4);
 7624     __ sub(temp1, temp1, temp0);
 7625     __ add(s2, temp1, s2, ext::uxth);
 7626 
 7627     __ subs(temp0, s2, base);
 7628     __ csel(s2, temp0, s2, Assembler::HS);
 7629 
 7630     __ b(L_combine);
 7631 
 7632     __ bind(L_nmax);
 7633     __ subs(len, len, nmax);
 7634     __ sub(count, nmax, 16);
 7635     __ br(Assembler::LO, L_by16);
 7636 
 7637     __ bind(L_nmax_loop);
 7638 
 7639     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7640                                       vbytes, vs1acc, vs2acc, vtable);
 7641 
 7642     __ subs(count, count, 16);
 7643     __ br(Assembler::HS, L_nmax_loop);
 7644 
 7645     // s1 = s1 % BASE
 7646     __ lsr(temp0, s1, 16);
 7647     __ lsl(temp1, temp0, 4);
 7648     __ sub(temp1, temp1, temp0);
 7649     __ add(temp1, temp1, s1, ext::uxth);
 7650 
 7651     __ lsr(temp0, temp1, 16);
 7652     __ lsl(s1, temp0, 4);
 7653     __ sub(s1, s1, temp0);
  7654     __ add(s1, s1, temp1, ext::uxth);
 7655 
 7656     __ subs(temp0, s1, base);
 7657     __ csel(s1, temp0, s1, Assembler::HS);
 7658 
 7659     // s2 = s2 % BASE
 7660     __ lsr(temp0, s2, 16);
 7661     __ lsl(temp1, temp0, 4);
 7662     __ sub(temp1, temp1, temp0);
 7663     __ add(temp1, temp1, s2, ext::uxth);
 7664 
 7665     __ lsr(temp0, temp1, 16);
 7666     __ lsl(s2, temp0, 4);
 7667     __ sub(s2, s2, temp0);
  7668     __ add(s2, s2, temp1, ext::uxth);
 7669 
 7670     __ subs(temp0, s2, base);
 7671     __ csel(s2, temp0, s2, Assembler::HS);
 7672 
 7673     __ subs(len, len, nmax);
 7674     __ sub(count, nmax, 16);
 7675     __ br(Assembler::HS, L_nmax_loop);
 7676 
 7677     __ bind(L_by16);
 7678     __ adds(len, len, count);
 7679     __ br(Assembler::LO, L_by1);
 7680 
 7681     __ bind(L_by16_loop);
 7682 
 7683     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7684                                       vbytes, vs1acc, vs2acc, vtable);
 7685 
 7686     __ subs(len, len, 16);
 7687     __ br(Assembler::HS, L_by16_loop);
 7688 
 7689     __ bind(L_by1);
 7690     __ adds(len, len, 15);
 7691     __ br(Assembler::LO, L_do_mod);
 7692 
 7693     __ bind(L_by1_loop);
 7694     __ ldrb(temp0, Address(__ post(buff, 1)));
 7695     __ add(s1, temp0, s1);
 7696     __ add(s2, s2, s1);
 7697     __ subs(len, len, 1);
 7698     __ br(Assembler::HS, L_by1_loop);
 7699 
 7700     __ bind(L_do_mod);
 7701     // s1 = s1 % BASE
 7702     __ lsr(temp0, s1, 16);
 7703     __ lsl(temp1, temp0, 4);
 7704     __ sub(temp1, temp1, temp0);
 7705     __ add(temp1, temp1, s1, ext::uxth);
 7706 
 7707     __ lsr(temp0, temp1, 16);
 7708     __ lsl(s1, temp0, 4);
 7709     __ sub(s1, s1, temp0);
  7710     __ add(s1, s1, temp1, ext::uxth);
 7711 
 7712     __ subs(temp0, s1, base);
 7713     __ csel(s1, temp0, s1, Assembler::HS);
 7714 
 7715     // s2 = s2 % BASE
 7716     __ lsr(temp0, s2, 16);
 7717     __ lsl(temp1, temp0, 4);
 7718     __ sub(temp1, temp1, temp0);
 7719     __ add(temp1, temp1, s2, ext::uxth);
 7720 
 7721     __ lsr(temp0, temp1, 16);
 7722     __ lsl(s2, temp0, 4);
 7723     __ sub(s2, s2, temp0);
  7724     __ add(s2, s2, temp1, ext::uxth);
 7725 
 7726     __ subs(temp0, s2, base);
 7727     __ csel(s2, temp0, s2, Assembler::HS);
 7728 
 7729     // Combine lower bits and higher bits
 7730     __ bind(L_combine);
 7731     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7732 
 7733     __ ret(lr);
 7734 
 7735     return start;
 7736   }
 7737 
 7738   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7739           Register temp0, Register temp1, FloatRegister vbytes,
 7740           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7741     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7742     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7743     // In non-vectorized code, we update s1 and s2 as:
 7744     //   s1 <- s1 + b1
 7745     //   s2 <- s2 + s1
 7746     //   s1 <- s1 + b2
  7747     //   s2 <- s2 + s1
 7748     //   ...
 7749     //   s1 <- s1 + b16
 7750     //   s2 <- s2 + s1
 7751     // Putting above assignments together, we have:
 7752     //   s1_new = s1 + b1 + b2 + ... + b16
 7753     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7754     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7755     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7756     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7757 
 7758     // s2 = s2 + s1 * 16
 7759     __ add(s2, s2, s1, Assembler::LSL, 4);
 7760 
 7761     // vs1acc = b1 + b2 + b3 + ... + b16
 7762     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7763     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7764     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7765     __ uaddlv(vs1acc, __ T16B, vbytes);
 7766     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7767 
 7768     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7769     __ fmovd(temp0, vs1acc);
 7770     __ fmovd(temp1, vs2acc);
 7771     __ add(s1, s1, temp0);
 7772     __ add(s2, s2, temp1);
 7773   }
 7774 
 7775   /**
 7776    *  Arguments:
 7777    *
 7778    *  Input:
 7779    *    c_rarg0   - x address
 7780    *    c_rarg1   - x length
 7781    *    c_rarg2   - y address
 7782    *    c_rarg3   - y length
 7783    *    c_rarg4   - z address
 7784    */
 7785   address generate_multiplyToLen() {
 7786     __ align(CodeEntryAlignment);
 7787     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7788     StubCodeMark mark(this, stub_id);
 7789 
 7790     address start = __ pc();
 7791     const Register x     = r0;
 7792     const Register xlen  = r1;
 7793     const Register y     = r2;
 7794     const Register ylen  = r3;
 7795     const Register z     = r4;
 7796 
 7797     const Register tmp0  = r5;
 7798     const Register tmp1  = r10;
 7799     const Register tmp2  = r11;
 7800     const Register tmp3  = r12;
 7801     const Register tmp4  = r13;
 7802     const Register tmp5  = r14;
 7803     const Register tmp6  = r15;
 7804     const Register tmp7  = r16;
 7805 
 7806     BLOCK_COMMENT("Entry:");
 7807     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7808     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7809     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7810     __ ret(lr);
 7811 
 7812     return start;
 7813   }
 7814 
 7815   address generate_squareToLen() {
  7816     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
  7817     // faster than multiply_to_len on some CPUs and slower on others, but
  7818     // multiply_to_len shows slightly better results overall
 7819     __ align(CodeEntryAlignment);
 7820     StubId stub_id = StubId::stubgen_squareToLen_id;
 7821     StubCodeMark mark(this, stub_id);
 7822     address start = __ pc();
 7823 
 7824     const Register x     = r0;
 7825     const Register xlen  = r1;
 7826     const Register z     = r2;
 7827     const Register y     = r4; // == x
 7828     const Register ylen  = r5; // == xlen
 7829 
 7830     const Register tmp0  = r3;
 7831     const Register tmp1  = r10;
 7832     const Register tmp2  = r11;
 7833     const Register tmp3  = r12;
 7834     const Register tmp4  = r13;
 7835     const Register tmp5  = r14;
 7836     const Register tmp6  = r15;
 7837     const Register tmp7  = r16;
 7838 
 7839     RegSet spilled_regs = RegSet::of(y, ylen);
 7840     BLOCK_COMMENT("Entry:");
 7841     __ enter();
 7842     __ push(spilled_regs, sp);
 7843     __ mov(y, x);
 7844     __ mov(ylen, xlen);
 7845     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7846     __ pop(spilled_regs, sp);
 7847     __ leave();
 7848     __ ret(lr);
 7849     return start;
 7850   }
 7851 
 7852   address generate_mulAdd() {
 7853     __ align(CodeEntryAlignment);
 7854     StubId stub_id = StubId::stubgen_mulAdd_id;
 7855     StubCodeMark mark(this, stub_id);
 7856 
 7857     address start = __ pc();
 7858 
 7859     const Register out     = r0;
 7860     const Register in      = r1;
 7861     const Register offset  = r2;
 7862     const Register len     = r3;
 7863     const Register k       = r4;
 7864 
 7865     BLOCK_COMMENT("Entry:");
 7866     __ enter();
 7867     __ mul_add(out, in, offset, len, k);
 7868     __ leave();
 7869     __ ret(lr);
 7870 
 7871     return start;
 7872   }
 7873 
 7874   // Arguments:
 7875   //
 7876   // Input:
 7877   //   c_rarg0   - newArr address
 7878   //   c_rarg1   - oldArr address
 7879   //   c_rarg2   - newIdx
 7880   //   c_rarg3   - shiftCount
 7881   //   c_rarg4   - numIter
 7882   //
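         // Roughly equivalent Java-style scalar code (illustrative sketch, not
         // the BigInteger worker verbatim):
         //
         //   for (int i = numIter - 1; i >= 0; i--) {
         //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
         //                        | (oldArr[i] << (32 - shiftCount));
         //   }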
 7883   address generate_bigIntegerRightShift() {
 7884     __ align(CodeEntryAlignment);
 7885     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7886     StubCodeMark mark(this, stub_id);
 7887     address start = __ pc();
 7888 
 7889     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7890 
 7891     Register newArr        = c_rarg0;
 7892     Register oldArr        = c_rarg1;
 7893     Register newIdx        = c_rarg2;
 7894     Register shiftCount    = c_rarg3;
 7895     Register numIter       = c_rarg4;
 7896     Register idx           = numIter;
 7897 
 7898     Register newArrCur     = rscratch1;
 7899     Register shiftRevCount = rscratch2;
 7900     Register oldArrCur     = r13;
 7901     Register oldArrNext    = r14;
 7902 
 7903     FloatRegister oldElem0        = v0;
 7904     FloatRegister oldElem1        = v1;
 7905     FloatRegister newElem         = v2;
 7906     FloatRegister shiftVCount     = v3;
 7907     FloatRegister shiftVRevCount  = v4;
 7908 
 7909     __ cbz(idx, Exit);
 7910 
 7911     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7912 
 7913     // left shift count
 7914     __ movw(shiftRevCount, 32);
 7915     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7916 
  7917     // numIter too small to allow a 4-word SIMD loop, fall back to the scalar tail
 7918     __ cmp(numIter, (u1)4);
 7919     __ br(Assembler::LT, ShiftThree);
 7920 
 7921     __ dup(shiftVCount,    __ T4S, shiftCount);
 7922     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7923     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7924 
 7925     __ BIND(ShiftSIMDLoop);
 7926 
 7927     // Calculate the load addresses
 7928     __ sub(idx, idx, 4);
 7929     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7930     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7931     __ add(oldArrCur,  oldArrNext, 4);
 7932 
 7933     // Load 4 words and process
 7934     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7935     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7936     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7937     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7938     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7939     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7940 
 7941     __ cmp(idx, (u1)4);
 7942     __ br(Assembler::LT, ShiftTwoLoop);
 7943     __ b(ShiftSIMDLoop);
 7944 
 7945     __ BIND(ShiftTwoLoop);
 7946     __ cbz(idx, Exit);
 7947     __ cmp(idx, (u1)1);
 7948     __ br(Assembler::EQ, ShiftOne);
 7949 
 7950     // Calculate the load addresses
 7951     __ sub(idx, idx, 2);
 7952     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7953     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7954     __ add(oldArrCur,  oldArrNext, 4);
 7955 
 7956     // Load 2 words and process
 7957     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7958     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7959     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7960     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7961     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7962     __ st1(newElem,   __ T2S, Address(newArrCur));
 7963     __ b(ShiftTwoLoop);
 7964 
 7965     __ BIND(ShiftThree);
 7966     __ tbz(idx, 1, ShiftOne);
 7967     __ tbz(idx, 0, ShiftTwo);
 7968     __ ldrw(r10,  Address(oldArr, 12));
 7969     __ ldrw(r11,  Address(oldArr, 8));
 7970     __ lsrvw(r10, r10, shiftCount);
 7971     __ lslvw(r11, r11, shiftRevCount);
 7972     __ orrw(r12,  r10, r11);
 7973     __ strw(r12,  Address(newArr, 8));
 7974 
 7975     __ BIND(ShiftTwo);
 7976     __ ldrw(r10,  Address(oldArr, 8));
 7977     __ ldrw(r11,  Address(oldArr, 4));
 7978     __ lsrvw(r10, r10, shiftCount);
 7979     __ lslvw(r11, r11, shiftRevCount);
 7980     __ orrw(r12,  r10, r11);
 7981     __ strw(r12,  Address(newArr, 4));
 7982 
 7983     __ BIND(ShiftOne);
 7984     __ ldrw(r10,  Address(oldArr, 4));
 7985     __ ldrw(r11,  Address(oldArr));
 7986     __ lsrvw(r10, r10, shiftCount);
 7987     __ lslvw(r11, r11, shiftRevCount);
 7988     __ orrw(r12,  r10, r11);
 7989     __ strw(r12,  Address(newArr));
 7990 
 7991     __ BIND(Exit);
 7992     __ ret(lr);
 7993 
 7994     return start;
 7995   }
 7996 
 7997   // Arguments:
 7998   //
 7999   // Input:
 8000   //   c_rarg0   - newArr address
 8001   //   c_rarg1   - oldArr address
 8002   //   c_rarg2   - newIdx
 8003   //   c_rarg3   - shiftCount
 8004   //   c_rarg4   - numIter
 8005   //
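         // Roughly equivalent Java-style scalar code (illustrative sketch, not
         // the BigInteger worker verbatim):
         //
         //   for (int i = 0; i < numIter; i++) {
         //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
         //                        | (oldArr[i + 1] >>> (32 - shiftCount));
         //   }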
 8006   address generate_bigIntegerLeftShift() {
 8007     __ align(CodeEntryAlignment);
 8008     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8009     StubCodeMark mark(this, stub_id);
 8010     address start = __ pc();
 8011 
 8012     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8013 
 8014     Register newArr        = c_rarg0;
 8015     Register oldArr        = c_rarg1;
 8016     Register newIdx        = c_rarg2;
 8017     Register shiftCount    = c_rarg3;
 8018     Register numIter       = c_rarg4;
 8019 
 8020     Register shiftRevCount = rscratch1;
 8021     Register oldArrNext    = rscratch2;
 8022 
 8023     FloatRegister oldElem0        = v0;
 8024     FloatRegister oldElem1        = v1;
 8025     FloatRegister newElem         = v2;
 8026     FloatRegister shiftVCount     = v3;
 8027     FloatRegister shiftVRevCount  = v4;
 8028 
 8029     __ cbz(numIter, Exit);
 8030 
 8031     __ add(oldArrNext, oldArr, 4);
 8032     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8033 
 8034     // right shift count
 8035     __ movw(shiftRevCount, 32);
 8036     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8037 
  8038     // numIter too small to allow a 4-word SIMD loop, fall back to the scalar tail
 8039     __ cmp(numIter, (u1)4);
 8040     __ br(Assembler::LT, ShiftThree);
 8041 
 8042     __ dup(shiftVCount,     __ T4S, shiftCount);
 8043     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8044     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8045 
 8046     __ BIND(ShiftSIMDLoop);
 8047 
 8048     // load 4 words and process
 8049     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8050     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8051     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8052     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8053     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8054     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8055     __ sub(numIter,   numIter, 4);
 8056 
 8057     __ cmp(numIter, (u1)4);
 8058     __ br(Assembler::LT, ShiftTwoLoop);
 8059     __ b(ShiftSIMDLoop);
 8060 
 8061     __ BIND(ShiftTwoLoop);
 8062     __ cbz(numIter, Exit);
 8063     __ cmp(numIter, (u1)1);
 8064     __ br(Assembler::EQ, ShiftOne);
 8065 
 8066     // load 2 words and process
 8067     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8068     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8069     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8070     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8071     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8072     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8073     __ sub(numIter,   numIter, 2);
 8074     __ b(ShiftTwoLoop);
 8075 
 8076     __ BIND(ShiftThree);
 8077     __ ldrw(r10,  __ post(oldArr, 4));
 8078     __ ldrw(r11,  __ post(oldArrNext, 4));
 8079     __ lslvw(r10, r10, shiftCount);
 8080     __ lsrvw(r11, r11, shiftRevCount);
 8081     __ orrw(r12,  r10, r11);
 8082     __ strw(r12,  __ post(newArr, 4));
 8083     __ tbz(numIter, 1, Exit);
 8084     __ tbz(numIter, 0, ShiftOne);
 8085 
 8086     __ BIND(ShiftTwo);
 8087     __ ldrw(r10,  __ post(oldArr, 4));
 8088     __ ldrw(r11,  __ post(oldArrNext, 4));
 8089     __ lslvw(r10, r10, shiftCount);
 8090     __ lsrvw(r11, r11, shiftRevCount);
 8091     __ orrw(r12,  r10, r11);
 8092     __ strw(r12,  __ post(newArr, 4));
 8093 
 8094     __ BIND(ShiftOne);
 8095     __ ldrw(r10,  Address(oldArr));
 8096     __ ldrw(r11,  Address(oldArrNext));
 8097     __ lslvw(r10, r10, shiftCount);
 8098     __ lsrvw(r11, r11, shiftRevCount);
 8099     __ orrw(r12,  r10, r11);
 8100     __ strw(r12,  Address(newArr));
 8101 
 8102     __ BIND(Exit);
 8103     __ ret(lr);
 8104 
 8105     return start;
 8106   }
 8107 
 8108   address generate_count_positives(address &count_positives_long) {
 8109     const u1 large_loop_size = 64;
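           // UPPER_BIT_MASK has the sign bit of every byte set, so tst-ing a
           // 64-bit word against it tells us whether any of its 8 bytes is
           // negative.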
  8110     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
 8111     int dcache_line = VM_Version::dcache_line_size();
 8112 
 8113     Register ary1 = r1, len = r2, result = r0;
 8114 
 8115     __ align(CodeEntryAlignment);
 8116 
 8117     StubId stub_id = StubId::stubgen_count_positives_id;
 8118     StubCodeMark mark(this, stub_id);
 8119 
 8120     address entry = __ pc();
 8121 
 8122     __ enter();
 8123     // precondition: a copy of len is already in result
 8124     // __ mov(result, len);
 8125 
 8126   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8127         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8128 
 8129   __ cmp(len, (u1)15);
 8130   __ br(Assembler::GT, LEN_OVER_15);
  8131   // The only case when execution falls into this code is when the pointer is near
  8132   // the end of a memory page and we have to avoid reading the next page
 8133   __ add(ary1, ary1, len);
 8134   __ subs(len, len, 8);
 8135   __ br(Assembler::GT, LEN_OVER_8);
 8136   __ ldr(rscratch2, Address(ary1, -8));
 8137   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8138   __ lsrv(rscratch2, rscratch2, rscratch1);
 8139   __ tst(rscratch2, UPPER_BIT_MASK);
 8140   __ csel(result, zr, result, Assembler::NE);
 8141   __ leave();
 8142   __ ret(lr);
 8143   __ bind(LEN_OVER_8);
 8144   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8145   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8146   __ tst(rscratch2, UPPER_BIT_MASK);
 8147   __ br(Assembler::NE, RET_NO_POP);
 8148   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8149   __ lsrv(rscratch1, rscratch1, rscratch2);
 8150   __ tst(rscratch1, UPPER_BIT_MASK);
 8151   __ bind(RET_NO_POP);
 8152   __ csel(result, zr, result, Assembler::NE);
 8153   __ leave();
 8154   __ ret(lr);
 8155 
 8156   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8157   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8158 
 8159   count_positives_long = __ pc(); // 2nd entry point
 8160 
 8161   __ enter();
 8162 
 8163   __ bind(LEN_OVER_15);
 8164     __ push(spilled_regs, sp);
 8165     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8166     __ cbz(rscratch2, ALIGNED);
 8167     __ ldp(tmp6, tmp1, Address(ary1));
 8168     __ mov(tmp5, 16);
 8169     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8170     __ add(ary1, ary1, rscratch1);
 8171     __ orr(tmp6, tmp6, tmp1);
 8172     __ tst(tmp6, UPPER_BIT_MASK);
 8173     __ br(Assembler::NE, RET_ADJUST);
 8174     __ sub(len, len, rscratch1);
 8175 
 8176   __ bind(ALIGNED);
 8177     __ cmp(len, large_loop_size);
 8178     __ br(Assembler::LT, CHECK_16);
 8179     // Perform a 16-byte load as an early-return check in the pre-loop to handle
 8180     // the situation where an initially aligned large array has negative values
 8181     // in its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
 8182     // (in the worst case), which is slower. Cases with negative bytes further
 8183     // ahead are barely affected; in fact they get faster due to the early loads,
 8184     // fewer instructions and fewer branches in LARGE_LOOP.
 8185     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8186     __ sub(len, len, 16);
 8187     __ orr(tmp6, tmp6, tmp1);
 8188     __ tst(tmp6, UPPER_BIT_MASK);
 8189     __ br(Assembler::NE, RET_ADJUST_16);
 8190     __ cmp(len, large_loop_size);
 8191     __ br(Assembler::LT, CHECK_16);
 8192 
 8193     if (SoftwarePrefetchHintDistance >= 0
 8194         && SoftwarePrefetchHintDistance >= dcache_line) {
 8195       // initial prefetch
 8196       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8197     }
 8198   __ bind(LARGE_LOOP);
 8199     if (SoftwarePrefetchHintDistance >= 0) {
 8200       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8201     }
 8202     // Issue the load instructions first, since that can save a few CPU/memory
 8203     // cycles. Also, instead of 4 triples of "orr(...); tst(...); cbnz(...)"
 8204     // (one per ldp), it is better to generate 7 * orr(...) + 1 tst(...) +
 8205     // 1 cbnz(...), which uses fewer instructions and fewer branches; the downside
 8206     // is that early return is impossible, so all 64 bytes are loaded and checked every time.
 8207     __ ldp(tmp2, tmp3, Address(ary1));
 8208     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8209     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8210     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8211     __ add(ary1, ary1, large_loop_size);
 8212     __ sub(len, len, large_loop_size);
 8213     __ orr(tmp2, tmp2, tmp3);
 8214     __ orr(tmp4, tmp4, tmp5);
 8215     __ orr(rscratch1, rscratch1, rscratch2);
 8216     __ orr(tmp6, tmp6, tmp1);
 8217     __ orr(tmp2, tmp2, tmp4);
 8218     __ orr(rscratch1, rscratch1, tmp6);
 8219     __ orr(tmp2, tmp2, rscratch1);
 8220     __ tst(tmp2, UPPER_BIT_MASK);
 8221     __ br(Assembler::NE, RET_ADJUST_LONG);
 8222     __ cmp(len, large_loop_size);
 8223     __ br(Assembler::GE, LARGE_LOOP);
 8224 
 8225   __ bind(CHECK_16); // small 16-byte load pre-loop
 8226     __ cmp(len, (u1)16);
 8227     __ br(Assembler::LT, POST_LOOP16);
 8228 
 8229   __ bind(LOOP16); // small 16-byte load loop
 8230     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8231     __ sub(len, len, 16);
 8232     __ orr(tmp2, tmp2, tmp3);
 8233     __ tst(tmp2, UPPER_BIT_MASK);
 8234     __ br(Assembler::NE, RET_ADJUST_16);
 8235     __ cmp(len, (u1)16);
 8236     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8237 
 8238   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8239     __ cmp(len, (u1)8);
 8240     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8241     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8242     __ tst(tmp3, UPPER_BIT_MASK);
 8243     __ br(Assembler::NE, RET_ADJUST);
 8244     __ sub(len, len, 8);
 8245 
 8246   __ bind(POST_LOOP16_LOAD_TAIL);
 8247     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8248     __ ldr(tmp1, Address(ary1));
 8249     __ mov(tmp2, 64);
 8250     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8251     __ lslv(tmp1, tmp1, tmp4);
 8252     __ tst(tmp1, UPPER_BIT_MASK);
 8253     __ br(Assembler::NE, RET_ADJUST);
 8254     // Fallthrough
 8255 
 8256   __ bind(RET_LEN);
 8257     __ pop(spilled_regs, sp);
 8258     __ leave();
 8259     __ ret(lr);
 8260 
 8261     // The difference result - len is the count of bytes that are
 8262     // guaranteed to be positive.
 8263 
 8264   __ bind(RET_ADJUST_LONG);
 8265     __ add(len, len, (u1)(large_loop_size - 16));
 8266   __ bind(RET_ADJUST_16);
 8267     __ add(len, len, 16);
 8268   __ bind(RET_ADJUST);
 8269     __ pop(spilled_regs, sp);
 8270     __ leave();
 8271     __ sub(result, result, len);
 8272     __ ret(lr);
 8273 
 8274     return entry;
 8275   }
 8276 
 8277   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8278         bool usePrefetch, Label &NOT_EQUAL) {
 8279     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8280         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8281         tmp7 = r12, tmp8 = r13;
 8282     Label LOOP;
 8283 
 8284     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8285     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8286     __ bind(LOOP);
 8287     if (usePrefetch) {
 8288       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8289       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8290     }
 8291     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8292     __ eor(tmp1, tmp1, tmp2);
 8293     __ eor(tmp3, tmp3, tmp4);
 8294     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8295     __ orr(tmp1, tmp1, tmp3);
 8296     __ cbnz(tmp1, NOT_EQUAL);
 8297     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8298     __ eor(tmp5, tmp5, tmp6);
 8299     __ eor(tmp7, tmp7, tmp8);
 8300     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8301     __ orr(tmp5, tmp5, tmp7);
 8302     __ cbnz(tmp5, NOT_EQUAL);
 8303     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8304     __ eor(tmp1, tmp1, tmp2);
 8305     __ eor(tmp3, tmp3, tmp4);
 8306     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8307     __ orr(tmp1, tmp1, tmp3);
 8308     __ cbnz(tmp1, NOT_EQUAL);
 8309     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8310     __ eor(tmp5, tmp5, tmp6);
 8311     __ sub(cnt1, cnt1, 8 * wordSize);
 8312     __ eor(tmp7, tmp7, tmp8);
 8313     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8314     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8315     // cmp) because subs accepts an arbitrarily large immediate operand.
 8316     __ subs(tmp6, cnt1, loopThreshold);
 8317     __ orr(tmp5, tmp5, tmp7);
 8318     __ cbnz(tmp5, NOT_EQUAL);
 8319     __ br(__ GE, LOOP);
 8320     // post-loop
 8321     __ eor(tmp1, tmp1, tmp2);
 8322     __ eor(tmp3, tmp3, tmp4);
 8323     __ orr(tmp1, tmp1, tmp3);
 8324     __ sub(cnt1, cnt1, 2 * wordSize);
 8325     __ cbnz(tmp1, NOT_EQUAL);
 8326   }
 8327 
 8328   void generate_large_array_equals_loop_simd(int loopThreshold,
 8329         bool usePrefetch, Label &NOT_EQUAL) {
 8330     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8331         tmp2 = rscratch2;
 8332     Label LOOP;
 8333 
 8334     __ bind(LOOP);
 8335     if (usePrefetch) {
 8336       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8337       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8338     }
 8339     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8340     __ sub(cnt1, cnt1, 8 * wordSize);
 8341     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8342     __ subs(tmp1, cnt1, loopThreshold);
 8343     __ eor(v0, __ T16B, v0, v4);
 8344     __ eor(v1, __ T16B, v1, v5);
 8345     __ eor(v2, __ T16B, v2, v6);
 8346     __ eor(v3, __ T16B, v3, v7);
 8347     __ orr(v0, __ T16B, v0, v1);
 8348     __ orr(v1, __ T16B, v2, v3);
 8349     __ orr(v0, __ T16B, v0, v1);
 8350     __ umov(tmp1, v0, __ D, 0);
 8351     __ umov(tmp2, v0, __ D, 1);
 8352     __ orr(tmp1, tmp1, tmp2);
 8353     __ cbnz(tmp1, NOT_EQUAL);
 8354     __ br(__ GE, LOOP);
 8355   }
 8356 
 8357   // a1 = r1 - array1 address
 8358   // a2 = r2 - array2 address
 8359   // result = r0 - return value. Already contains "false"
 8360   // cnt1 = r10 - number of elements left to check, reduced by wordSize
 8361   // r3-r5 are reserved temporary registers
 8362   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 8363   address generate_large_array_equals() {
 8364     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8365         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8366         tmp7 = r12, tmp8 = r13;
 8367     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8368         SMALL_LOOP, POST_LOOP;
 8369     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8370     // calculate if at least 32 prefetched bytes are used
 8371     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8372     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8373     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8374     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8375         tmp5, tmp6, tmp7, tmp8);
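          // A hedged sketch of the check the loops below perform (illustrative only,
          // not the emitted schedule): the arrays are compared 8 bytes at a time,
          // XOR-ing corresponding words and looking for any non-zero difference:
          //
          //   for (size_t i = 0; i < byte_count; i += 8) {
          //     if ((load8(a1 + i) ^ load8(a2 + i)) != 0) return false; // some bit differs
          //   }
          //   return true;
          //
          // load8 is a hypothetical 8-byte load helper; the real loops unroll this and
          // overlap the loads with the XOR/OR reduction.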
 8376 
 8377     __ align(CodeEntryAlignment);
 8378 
 8379     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8380     StubCodeMark mark(this, stub_id);
 8381 
 8382     address entry = __ pc();
 8383     __ enter();
 8384     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8385     // also advance pointers to use post-increment instead of pre-increment
 8386     __ add(a1, a1, wordSize);
 8387     __ add(a2, a2, wordSize);
 8388     if (AvoidUnalignedAccesses) {
 8389       // Both implementations (SIMD/non-SIMD) use relatively large load
 8390       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
 8391       // time) on some CPUs when the address is not at least 16-byte aligned.
 8392       // Arrays are currently 8-byte aligned, so if needed we do an additional
 8393       // 8-byte load for the first array to make its address 16-byte aligned.
 8394       Label ALIGNED16;
 8395       __ tbz(a1, 3, ALIGNED16);
 8396       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8397       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8398       __ sub(cnt1, cnt1, wordSize);
 8399       __ eor(tmp1, tmp1, tmp2);
 8400       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8401       __ bind(ALIGNED16);
 8402     }
 8403     if (UseSIMDForArrayEquals) {
 8404       if (SoftwarePrefetchHintDistance >= 0) {
 8405         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8406         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8407         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8408             /* prfm = */ true, NOT_EQUAL);
 8409         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8410         __ br(__ LT, TAIL);
 8411       }
 8412       __ bind(NO_PREFETCH_LARGE_LOOP);
 8413       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8414           /* prfm = */ false, NOT_EQUAL);
 8415     } else {
 8416       __ push(spilled_regs, sp);
 8417       if (SoftwarePrefetchHintDistance >= 0) {
 8418         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8419         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8420         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8421             /* prfm = */ true, NOT_EQUAL);
 8422         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8423         __ br(__ LT, TAIL);
 8424       }
 8425       __ bind(NO_PREFETCH_LARGE_LOOP);
 8426       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8427           /* prfm = */ false, NOT_EQUAL);
 8428     }
 8429     __ bind(TAIL);
 8430       __ cbz(cnt1, EQUAL);
 8431       __ subs(cnt1, cnt1, wordSize);
 8432       __ br(__ LE, POST_LOOP);
 8433     __ bind(SMALL_LOOP);
 8434       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8435       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8436       __ subs(cnt1, cnt1, wordSize);
 8437       __ eor(tmp1, tmp1, tmp2);
 8438       __ cbnz(tmp1, NOT_EQUAL);
 8439       __ br(__ GT, SMALL_LOOP);
 8440     __ bind(POST_LOOP);
 8441       __ ldr(tmp1, Address(a1, cnt1));
 8442       __ ldr(tmp2, Address(a2, cnt1));
 8443       __ eor(tmp1, tmp1, tmp2);
 8444       __ cbnz(tmp1, NOT_EQUAL);
 8445     __ bind(EQUAL);
 8446       __ mov(result, true);
 8447     __ bind(NOT_EQUAL);
 8448       if (!UseSIMDForArrayEquals) {
 8449         __ pop(spilled_regs, sp);
 8450       }
 8451     __ bind(NOT_EQUAL_NO_POP);
 8452     __ leave();
 8453     __ ret(lr);
 8454     return entry;
 8455   }
 8456 
 8457   // result = r0 - return value. Contains initial hashcode value on entry.
 8458   // ary = r1 - array address
 8459   // cnt = r2 - elements count
 8460   // Clobbers: v0-v13, rscratch1, rscratch2
 8461   address generate_large_arrays_hashcode(BasicType eltype) {
 8462     const Register result = r0, ary = r1, cnt = r2;
 8463     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8464     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8465     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8466     const FloatRegister vpowm = v13;
 8467 
 8468     ARRAYS_HASHCODE_REGISTERS;
 8469 
 8470     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8471 
 8472     unsigned int vf; // vectorization factor
 8473     bool multiply_by_halves;
 8474     Assembler::SIMD_Arrangement load_arrangement;
 8475     switch (eltype) {
 8476     case T_BOOLEAN:
 8477     case T_BYTE:
 8478       load_arrangement = Assembler::T8B;
 8479       multiply_by_halves = true;
 8480       vf = 8;
 8481       break;
 8482     case T_CHAR:
 8483     case T_SHORT:
 8484       load_arrangement = Assembler::T8H;
 8485       multiply_by_halves = true;
 8486       vf = 8;
 8487       break;
 8488     case T_INT:
 8489       load_arrangement = Assembler::T4S;
 8490       multiply_by_halves = false;
 8491       vf = 4;
 8492       break;
 8493     default:
 8494       ShouldNotReachHere();
 8495     }
 8496 
 8497     // Unroll factor
 8498     const unsigned uf = 4;
 8499 
 8500     // Effective vectorization factor
 8501     const unsigned evf = vf * uf;
 8502 
 8503     __ align(CodeEntryAlignment);
 8504 
 8505     StubId stub_id;
 8506     switch (eltype) {
 8507     case T_BOOLEAN:
 8508       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8509       break;
 8510     case T_BYTE:
 8511       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8512       break;
 8513     case T_CHAR:
 8514       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8515       break;
 8516     case T_SHORT:
 8517       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8518       break;
 8519     case T_INT:
 8520       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8521       break;
 8522     default:
 8523       stub_id = StubId::NO_STUBID;
 8524       ShouldNotReachHere();
 8525     };
 8526 
 8527     StubCodeMark mark(this, stub_id);
 8528 
 8529     address entry = __ pc();
 8530     __ enter();
 8531 
 8532     // Pack the 0th to 3rd powers of 31 into a single SIMD register. The register is used in
 8533     // the SMALL and LARGE loop epilogues. The initialization is hoisted here and the register's
 8534     // value doesn't change throughout either loop.
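          // Hedged scalar sketch of the hash this stub computes (illustrative only):
          //
          //   int h = result;                  // incoming hash value in r0
          //   for (int i = 0; i < cnt; i++) {
          //     h = 31 * h + a[i];             // standard polynomial hash
          //   }
          //
          // The vector code keeps several partial sums per register; weighting the
          // lanes by <31^3, 31^2, 31^1, 31^0> (vpow) and summing them in the loop
          // epilogues folds the lanes back into this scalar result.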
 8535     __ movw(rscratch1, intpow(31U, 3));
 8536     __ mov(vpow, Assembler::S, 0, rscratch1);
 8537     __ movw(rscratch1, intpow(31U, 2));
 8538     __ mov(vpow, Assembler::S, 1, rscratch1);
 8539     __ movw(rscratch1, intpow(31U, 1));
 8540     __ mov(vpow, Assembler::S, 2, rscratch1);
 8541     __ movw(rscratch1, intpow(31U, 0));
 8542     __ mov(vpow, Assembler::S, 3, rscratch1);
 8543 
 8544     __ mov(vmul0, Assembler::T16B, 0);
 8545     __ mov(vmul0, Assembler::S, 3, result);
 8546 
 8547     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8548     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8549 
 8550     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8551     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8552 
 8553     // SMALL LOOP
 8554     __ bind(SMALL_LOOP);
 8555 
 8556     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8557     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8558     __ subsw(rscratch2, rscratch2, vf);
 8559 
 8560     if (load_arrangement == Assembler::T8B) {
 8561       // Extend 8B to 8H to be able to use vector multiply
 8562       // instructions
 8563       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8564       if (is_signed_subword_type(eltype)) {
 8565         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8566       } else {
 8567         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8568       }
 8569     }
 8570 
 8571     switch (load_arrangement) {
 8572     case Assembler::T4S:
 8573       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8574       break;
 8575     case Assembler::T8B:
 8576     case Assembler::T8H:
 8577       assert(is_subword_type(eltype), "subword type expected");
 8578       if (is_signed_subword_type(eltype)) {
 8579         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8580       } else {
 8581         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8582       }
 8583       break;
 8584     default:
 8585       __ should_not_reach_here();
 8586     }
 8587 
 8588     // Process the upper half of a vector
 8589     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8590       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8591       if (is_signed_subword_type(eltype)) {
 8592         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8593       } else {
 8594         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8595       }
 8596     }
 8597 
 8598     __ br(Assembler::HI, SMALL_LOOP);
 8599 
 8600     // SMALL LOOP'S EPILOGUE
 8601     __ lsr(rscratch2, cnt, exact_log2(evf));
 8602     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8603 
 8604     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8605     __ addv(vmul0, Assembler::T4S, vmul0);
 8606     __ umov(result, vmul0, Assembler::S, 0);
 8607 
 8608     // TAIL
 8609     __ bind(TAIL);
 8610 
 8611     // The andr computes cnt % vf. The subtract, scaled by the per-iteration code size,
 8612     // skips vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf load + madd pairs execute.
 8613     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
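          // Conceptually the computed branch below works like a Duff's device over the
          // unrolled tail; a hedged sketch assuming vf == 8 (k and a are illustrative):
          //
          //   switch (cnt % 8) {                  // fall through on purpose
          //     case 7: h = 31 * h + a[k++];
          //     case 6: h = 31 * h + a[k++];
          //     // ... cases 5 down to 2 ...
          //     case 1: h = 31 * h + a[k++];
          //     case 0: break;                    // BR_BASE
          //   }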
 8614     __ andr(rscratch2, cnt, vf - 1);
 8615     __ bind(TAIL_SHORTCUT);
 8616     __ adr(rscratch1, BR_BASE);
 8617     // For Cortex-A53 the shift is 4 because 2 nops are generated (4 instructions per iteration).
 8618     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8619     __ movw(rscratch2, 0x1f);
 8620     __ br(rscratch1);
 8621 
 8622     for (size_t i = 0; i < vf - 1; ++i) {
 8623       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8624                                    eltype);
 8625       __ maddw(result, result, rscratch2, rscratch1);
 8626       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8627       // Generate 2nd nop to have 4 instructions per iteration.
 8628       if (VM_Version::supports_a53mac()) {
 8629         __ nop();
 8630       }
 8631     }
 8632     __ bind(BR_BASE);
 8633 
 8634     __ leave();
 8635     __ ret(lr);
 8636 
 8637     // LARGE LOOP
 8638     __ bind(LARGE_LOOP_PREHEADER);
 8639 
 8640     __ lsr(rscratch2, cnt, exact_log2(evf));
 8641 
 8642     if (multiply_by_halves) {
 8643       // 31^4 - multiplier between lower and upper parts of a register
 8644       __ movw(rscratch1, intpow(31U, vf / 2));
 8645       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8646       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8647       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8648       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8649     } else {
 8650       // 31^16
 8651       __ movw(rscratch1, intpow(31U, evf));
 8652       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8653     }
 8654 
 8655     __ mov(vmul3, Assembler::T16B, 0);
 8656     __ mov(vmul2, Assembler::T16B, 0);
 8657     __ mov(vmul1, Assembler::T16B, 0);
 8658 
 8659     __ bind(LARGE_LOOP);
 8660 
 8661     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8662     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8663     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8664     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8665 
 8666     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8667            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8668 
 8669     if (load_arrangement == Assembler::T8B) {
 8670       // Extend 8B to 8H to be able to use vector multiply
 8671       // instructions
 8672       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8673       if (is_signed_subword_type(eltype)) {
 8674         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8675         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8676         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8677         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8678       } else {
 8679         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8680         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8681         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8682         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8683       }
 8684     }
 8685 
 8686     switch (load_arrangement) {
 8687     case Assembler::T4S:
 8688       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8689       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8690       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8691       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8692       break;
 8693     case Assembler::T8B:
 8694     case Assembler::T8H:
 8695       assert(is_subword_type(eltype), "subword type expected");
 8696       if (is_signed_subword_type(eltype)) {
 8697         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8698         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8699         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8700         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8701       } else {
 8702         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8703         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8704         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8705         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8706       }
 8707       break;
 8708     default:
 8709       __ should_not_reach_here();
 8710     }
 8711 
 8712     // Process the upper half of a vector
 8713     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8714       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8715       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8716       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8717       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8718       if (is_signed_subword_type(eltype)) {
 8719         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8720         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8721         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8722         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8723       } else {
 8724         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8725         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8726         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8727         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8728       }
 8729     }
 8730 
 8731     __ subsw(rscratch2, rscratch2, 1);
 8732     __ br(Assembler::HI, LARGE_LOOP);
 8733 
 8734     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8735     __ addv(vmul3, Assembler::T4S, vmul3);
 8736     __ umov(result, vmul3, Assembler::S, 0);
 8737 
 8738     __ mov(rscratch2, intpow(31U, vf));
 8739 
 8740     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8741     __ addv(vmul2, Assembler::T4S, vmul2);
 8742     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8743     __ maddw(result, result, rscratch2, rscratch1);
 8744 
 8745     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8746     __ addv(vmul1, Assembler::T4S, vmul1);
 8747     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8748     __ maddw(result, result, rscratch2, rscratch1);
 8749 
 8750     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8751     __ addv(vmul0, Assembler::T4S, vmul0);
 8752     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8753     __ maddw(result, result, rscratch2, rscratch1);
 8754 
 8755     __ andr(rscratch2, cnt, vf - 1);
 8756     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8757 
 8758     __ leave();
 8759     __ ret(lr);
 8760 
 8761     return entry;
 8762   }
 8763 
 8764   address generate_dsin_dcos(bool isCos) {
 8765     __ align(CodeEntryAlignment);
 8766     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8767     StubCodeMark mark(this, stub_id);
 8768     address start = __ pc();
 8769     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8770         (address)StubRoutines::aarch64::_two_over_pi,
 8771         (address)StubRoutines::aarch64::_pio2,
 8772         (address)StubRoutines::aarch64::_dsin_coef,
 8773         (address)StubRoutines::aarch64::_dcos_coef);
 8774     return start;
 8775   }
 8776 
 8777   // code for comparing 16 characters of strings with Latin1 and UTF-16 encoding
 8778   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8779       Label &DIFF2) {
 8780     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8781     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8782 
 8783     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8784     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8785     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8786     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
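          // zip1 with the zero register widens Latin1 bytes to UTF-16 halfwords by
          // interleaving each byte with 0x00 (little endian), e.g. (illustrative):
          //   bytes [c0, c1, c2, ...]  ->  bytes [c0, 0, c1, 0, c2, 0, ...]  ==  chars [c0, c1, c2, ...]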
 8787 
 8788     __ fmovd(tmpL, vtmp3);
 8789     __ eor(rscratch2, tmp3, tmpL);
 8790     __ cbnz(rscratch2, DIFF2);
 8791 
 8792     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8793     __ umov(tmpL, vtmp3, __ D, 1);
 8794     __ eor(rscratch2, tmpU, tmpL);
 8795     __ cbnz(rscratch2, DIFF1);
 8796 
 8797     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8798     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8799     __ fmovd(tmpL, vtmp);
 8800     __ eor(rscratch2, tmp3, tmpL);
 8801     __ cbnz(rscratch2, DIFF2);
 8802 
 8803     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8804     __ umov(tmpL, vtmp, __ D, 1);
 8805     __ eor(rscratch2, tmpU, tmpL);
 8806     __ cbnz(rscratch2, DIFF1);
 8807   }
 8808 
 8809   // r0  = result
 8810   // r1  = str1
 8811   // r2  = cnt1
 8812   // r3  = str2
 8813   // r4  = cnt2
 8814   // r10 = tmp1
 8815   // r11 = tmp2
 8816   address generate_compare_long_string_different_encoding(bool isLU) {
 8817     __ align(CodeEntryAlignment);
 8818     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8819     StubCodeMark mark(this, stub_id);
 8820     address entry = __ pc();
 8821     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8822         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8823         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8824     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8825         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8826     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8827     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8828 
 8829     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8830 
 8831     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8832     // cnt2 == number of characters left to compare
 8833     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8834     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8835     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8836     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8837     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8838     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
 8839     __ eor(rscratch2, tmp1, tmp2);
 8840     __ mov(rscratch1, tmp2);
 8841     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8842     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8843              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8844     __ push(spilled_regs, sp);
 8845     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8846     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8847 
 8848     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8849 
 8850     if (SoftwarePrefetchHintDistance >= 0) {
 8851       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8852       __ br(__ LT, NO_PREFETCH);
 8853       __ bind(LARGE_LOOP_PREFETCH);
 8854         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8855         __ mov(tmp4, 2);
 8856         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8857         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8858           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8859           __ subs(tmp4, tmp4, 1);
 8860           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8861           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8862           __ mov(tmp4, 2);
 8863         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8864           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8865           __ subs(tmp4, tmp4, 1);
 8866           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8867           __ sub(cnt2, cnt2, 64);
 8868           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8869           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8870     }
 8871     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8872     __ bind(NO_PREFETCH);
 8873     __ subs(cnt2, cnt2, 16);
 8874     __ br(__ LT, TAIL);
 8875     __ align(OptoLoopAlignment);
 8876     __ bind(SMALL_LOOP); // smaller loop
 8877       __ subs(cnt2, cnt2, 16);
 8878       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8879       __ br(__ GE, SMALL_LOOP);
 8880       __ cmn(cnt2, (u1)16);
 8881       __ br(__ EQ, LOAD_LAST);
 8882     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8883       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8884       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8885       __ ldr(tmp3, Address(cnt1, -8));
 8886       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8887       __ b(LOAD_LAST);
 8888     __ bind(DIFF2);
 8889       __ mov(tmpU, tmp3);
 8890     __ bind(DIFF1);
 8891       __ pop(spilled_regs, sp);
 8892       __ b(CALCULATE_DIFFERENCE);
 8893     __ bind(LOAD_LAST);
 8894       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8895       // No need to load them again
 8896       __ mov(tmpU, tmp3);
 8897       __ pop(spilled_regs, sp);
 8898 
 8899       // tmp2 points to the address of the last 4 Latin1 characters right now
 8900       __ ldrs(vtmp, Address(tmp2));
 8901       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8902       __ fmovd(tmpL, vtmp);
 8903 
 8904       __ eor(rscratch2, tmpU, tmpL);
 8905       __ cbz(rscratch2, DONE);
 8906 
 8907     // Find the first different characters in the longwords and
 8908     // compute their difference.
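          // A hedged sketch of this step (illustrative C; byterev stands for the rev
          // instruction, clz64 for clz; x and y are the two words whose XOR is in
          // rscratch2):
          //
          //   uint64_t d = x ^ y;                 // non-zero here
          //   int bit = clz64(byterev(d)) & ~15;  // bit offset of the first differing char
          //   result = (int)(uint16_t)(x >> bit) - (int)(uint16_t)(y >> bit);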
 8909     __ bind(CALCULATE_DIFFERENCE);
 8910       __ rev(rscratch2, rscratch2);
 8911       __ clz(rscratch2, rscratch2);
 8912       __ andr(rscratch2, rscratch2, -16);
 8913       __ lsrv(tmp1, tmp1, rscratch2);
 8914       __ uxthw(tmp1, tmp1);
 8915       __ lsrv(rscratch1, rscratch1, rscratch2);
 8916       __ uxthw(rscratch1, rscratch1);
 8917       __ subw(result, tmp1, rscratch1);
 8918     __ bind(DONE);
 8919       __ ret(lr);
 8920     return entry;
 8921   }
 8922 
 8923   // r0 = input (float16)
 8924   // v0 = result (float)
 8925   // v1 = temporary float register
 8926   address generate_float16ToFloat() {
 8927     __ align(CodeEntryAlignment);
 8928     StubId stub_id = StubId::stubgen_hf2f_id;
 8929     StubCodeMark mark(this, stub_id);
 8930     address entry = __ pc();
 8931     BLOCK_COMMENT("Entry:");
 8932     __ flt16_to_flt(v0, r0, v1);
 8933     __ ret(lr);
 8934     return entry;
 8935   }
 8936 
 8937   // v0 = input (float)
 8938   // r0 = result (float16)
 8939   // v1 = temporary float register
 8940   address generate_floatToFloat16() {
 8941     __ align(CodeEntryAlignment);
 8942     StubId stub_id = StubId::stubgen_f2hf_id;
 8943     StubCodeMark mark(this, stub_id);
 8944     address entry = __ pc();
 8945     BLOCK_COMMENT("Entry:");
 8946     __ flt_to_flt16(r0, v0, v1);
 8947     __ ret(lr);
 8948     return entry;
 8949   }
 8950 
 8951   address generate_method_entry_barrier() {
 8952     __ align(CodeEntryAlignment);
 8953     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8954     StubCodeMark mark(this, stub_id);
 8955 
 8956     Label deoptimize_label;
 8957 
 8958     address start = __ pc();
 8959 
 8960     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8961 
 8962     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8963       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8964       // We can get here despite the nmethod being good, if we have not
 8965       // yet applied our cross modification fence (or data fence).
 8966       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8967       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8968       __ ldrw(rscratch2, rscratch2);
 8969       __ strw(rscratch2, thread_epoch_addr);
 8970       __ isb();
 8971       __ membar(__ LoadLoad);
 8972     }
 8973 
 8974     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8975 
 8976     __ enter();
 8977     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8978 
 8979     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8980 
 8981     __ push_call_clobbered_registers();
 8982 
 8983     __ mov(c_rarg0, rscratch2);
 8984     __ call_VM_leaf
 8985          (CAST_FROM_FN_PTR
 8986           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8987 
 8988     __ reset_last_Java_frame(true);
 8989 
 8990     __ mov(rscratch1, r0);
 8991 
 8992     __ pop_call_clobbered_registers();
 8993 
 8994     __ cbnz(rscratch1, deoptimize_label);
 8995 
 8996     __ leave();
 8997     __ ret(lr);
 8998 
 8999     __ BIND(deoptimize_label);
 9000 
 9001     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9002     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9003 
 9004     __ mov(sp, rscratch1);
 9005     __ br(rscratch2);
 9006 
 9007     return start;
 9008   }
 9009 
 9010   // r0  = result
 9011   // r1  = str1
 9012   // r2  = cnt1
 9013   // r3  = str2
 9014   // r4  = cnt2
 9015   // r10 = tmp1
 9016   // r11 = tmp2
 9017   address generate_compare_long_string_same_encoding(bool isLL) {
 9018     __ align(CodeEntryAlignment);
 9019     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9020     StubCodeMark mark(this, stub_id);
 9021     address entry = __ pc();
 9022     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9023         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9024 
 9025     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9026 
 9027     // exit from the large loop when fewer than 64 bytes are left to read or we're about
 9028     // to prefetch memory beyond the array boundary
 9029     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9030 
 9031     // the first 8 bytes are already pre-loaded before jumping to the stub, so compare them directly
 9032     __ eor(rscratch2, tmp1, tmp2);
 9033     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9034 
 9035     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9036     // update pointers, because of previous read
 9037     __ add(str1, str1, wordSize);
 9038     __ add(str2, str2, wordSize);
 9039     if (SoftwarePrefetchHintDistance >= 0) {
 9040       __ align(OptoLoopAlignment);
 9041       __ bind(LARGE_LOOP_PREFETCH);
 9042         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9043         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9044 
 9045         for (int i = 0; i < 4; i++) {
 9046           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9047           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9048           __ cmp(tmp1, tmp2);
 9049           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9050           __ br(Assembler::NE, DIFF);
 9051         }
 9052         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9053         __ add(str1, str1, 64);
 9054         __ add(str2, str2, 64);
 9055         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9056         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9057         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9058     }
 9059 
 9060     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9061     __ br(Assembler::LE, LESS16);
 9062     __ align(OptoLoopAlignment);
 9063     __ bind(LOOP_COMPARE16);
 9064       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9065       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9066       __ cmp(tmp1, tmp2);
 9067       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9068       __ br(Assembler::NE, DIFF);
 9069       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9070       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9071       __ br(Assembler::LT, LESS16);
 9072 
 9073       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9074       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9075       __ cmp(tmp1, tmp2);
 9076       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9077       __ br(Assembler::NE, DIFF);
 9078       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9079       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9080       __ br(Assembler::GE, LOOP_COMPARE16);
 9081       __ cbz(cnt2, LENGTH_DIFF);
 9082 
 9083     __ bind(LESS16);
 9084       // compare one more 8-byte chunk if more than 8 bytes remain
 9085       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9086       __ br(Assembler::LE, LESS8);
 9087       __ ldr(tmp1, Address(__ post(str1, 8)));
 9088       __ ldr(tmp2, Address(__ post(str2, 8)));
 9089       __ eor(rscratch2, tmp1, tmp2);
 9090       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9091       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9092 
 9093     __ bind(LESS8); // directly load last 8 bytes
 9094       if (!isLL) {
 9095         __ add(cnt2, cnt2, cnt2);
 9096       }
 9097       __ ldr(tmp1, Address(str1, cnt2));
 9098       __ ldr(tmp2, Address(str2, cnt2));
 9099       __ eor(rscratch2, tmp1, tmp2);
 9100       __ cbz(rscratch2, LENGTH_DIFF);
 9101       __ b(CAL_DIFFERENCE);
 9102 
 9103     __ bind(DIFF);
 9104       __ cmp(tmp1, tmp2);
 9105       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9106       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9107       // reuse rscratch2 register for the result of eor instruction
 9108       __ eor(rscratch2, tmp1, tmp2);
 9109 
 9110     __ bind(CAL_DIFFERENCE);
 9111       __ rev(rscratch2, rscratch2);
 9112       __ clz(rscratch2, rscratch2);
 9113       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9114       __ lsrv(tmp1, tmp1, rscratch2);
 9115       __ lsrv(tmp2, tmp2, rscratch2);
 9116       if (isLL) {
 9117         __ uxtbw(tmp1, tmp1);
 9118         __ uxtbw(tmp2, tmp2);
 9119       } else {
 9120         __ uxthw(tmp1, tmp1);
 9121         __ uxthw(tmp2, tmp2);
 9122       }
 9123       __ subw(result, tmp1, tmp2);
 9124 
 9125     __ bind(LENGTH_DIFF);
 9126       __ ret(lr);
 9127     return entry;
 9128   }
 9129 
 9130   enum string_compare_mode {
 9131     LL,
 9132     LU,
 9133     UL,
 9134     UU,
 9135   };
 9136 
 9137   // The following registers are declared in aarch64.ad
 9138   // r0  = result
 9139   // r1  = str1
 9140   // r2  = cnt1
 9141   // r3  = str2
 9142   // r4  = cnt2
 9143   // r10 = tmp1
 9144   // r11 = tmp2
 9145   // z0  = ztmp1
 9146   // z1  = ztmp2
 9147   // p0  = pgtmp1
 9148   // p1  = pgtmp2
 9149   address generate_compare_long_string_sve(string_compare_mode mode) {
 9150     StubId stub_id;
 9151     switch (mode) {
 9152       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9153       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9154       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9155       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9156       default: ShouldNotReachHere();
 9157     }
 9158 
 9159     __ align(CodeEntryAlignment);
 9160     address entry = __ pc();
 9161     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9162              tmp1 = r10, tmp2 = r11;
 9163 
 9164     Label LOOP, DONE, MISMATCH;
 9165     Register vec_len = tmp1;
 9166     Register idx = tmp2;
 9167     // The minimum of the string lengths has been stored in cnt2.
 9168     Register cnt = cnt2;
 9169     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9170     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9171 
 9172 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9173     switch (mode) {                                                            \
 9174       case LL:                                                                 \
 9175         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9176         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9177         break;                                                                 \
 9178       case LU:                                                                 \
 9179         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9180         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9181         break;                                                                 \
 9182       case UL:                                                                 \
 9183         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9184         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9185         break;                                                                 \
 9186       case UU:                                                                 \
 9187         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9188         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9189         break;                                                                 \
 9190       default:                                                                 \
 9191         ShouldNotReachHere();                                                  \
 9192     }
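          // Hedged pseudocode sketch of the predicated loop structure driven below:
          //
          //   idx = 0; pg = whilelt(idx, cnt);     // all-true while a full vector fits
          //   do {                                 // main loop over full vectors
          //     load str1/str2 under pg; idx += vec_len;
          //     if (any active lane differs) goto MISMATCH;
          //   } while (idx < cnt - vec_len);
          //   pg = whilelt(idx, cnt);              // partially-true tail predicate
          //   load and compare the remaining lanes under pg, then fall into MISMATCH/DONE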
 9193 
 9194     StubCodeMark mark(this, stub_id);
 9195 
 9196     __ mov(idx, 0);
 9197     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9198 
 9199     if (mode == LL) {
 9200       __ sve_cntb(vec_len);
 9201     } else {
 9202       __ sve_cnth(vec_len);
 9203     }
 9204 
 9205     __ sub(rscratch1, cnt, vec_len);
 9206 
 9207     __ bind(LOOP);
 9208 
 9209       // main loop
 9210       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9211       __ add(idx, idx, vec_len);
 9212       // Compare strings.
 9213       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9214       __ br(__ NE, MISMATCH);
 9215       __ cmp(idx, rscratch1);
 9216       __ br(__ LT, LOOP);
 9217 
 9218     // post loop, last iteration
 9219     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9220 
 9221     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9222     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9223     __ br(__ EQ, DONE);
 9224 
 9225     __ bind(MISMATCH);
 9226 
 9227     // Crop the vector to find its location.
 9228     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9229     // Extract the first different characters of each string.
 9230     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9231     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9232 
 9233     // Compute the difference of the first different characters.
 9234     __ sub(result, rscratch1, rscratch2);
 9235 
 9236     __ bind(DONE);
 9237     __ ret(lr);
 9238 #undef LOAD_PAIR
 9239     return entry;
 9240   }
 9241 
 9242   void generate_compare_long_strings() {
 9243     if (UseSVE == 0) {
 9244       StubRoutines::aarch64::_compare_long_string_LL
 9245           = generate_compare_long_string_same_encoding(true);
 9246       StubRoutines::aarch64::_compare_long_string_UU
 9247           = generate_compare_long_string_same_encoding(false);
 9248       StubRoutines::aarch64::_compare_long_string_LU
 9249           = generate_compare_long_string_different_encoding(true);
 9250       StubRoutines::aarch64::_compare_long_string_UL
 9251           = generate_compare_long_string_different_encoding(false);
 9252     } else {
 9253       StubRoutines::aarch64::_compare_long_string_LL
 9254           = generate_compare_long_string_sve(LL);
 9255       StubRoutines::aarch64::_compare_long_string_UU
 9256           = generate_compare_long_string_sve(UU);
 9257       StubRoutines::aarch64::_compare_long_string_LU
 9258           = generate_compare_long_string_sve(LU);
 9259       StubRoutines::aarch64::_compare_long_string_UL
 9260           = generate_compare_long_string_sve(UL);
 9261     }
 9262   }
 9263 
 9264   // R0 = result
 9265   // R1 = str2
 9266   // R2 = cnt1
 9267   // R3 = str1
 9268   // R4 = cnt2
 9269   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9270   //
 9271   // This generic linear code uses a few additional ideas that make it faster:
 9272   // 1) we can safely keep at least the 1st register of the pattern (since its
 9273   // length >= 8) to skip the initial load (helps on systems with 1 ld pipeline)
 9274   // 2) we can use a "fast" algorithm for finding the first character to search
 9275   // for, with fewer branches (1 branch per loaded register instead of a branch
 9276   // per character); this is where constants like
 9277   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
 9278   // 3) after the 1st register of the source string is loaded and analyzed, it
 9279   // can be reused to search for every occurrence of the 1st character, saving a
 9280   // few loads compared with a simpler-but-slower implementation
 9281   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 9282   // re-initializes and compresses register values, which makes the code larger
 9283   // and a bit less readable; however, most of the extra operations are issued
 9284   // during loads or branches, so the penalty is minimal
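        // The constants above implement a SWAR search for the first pattern character.
        // A hedged sketch for the Latin1 case (illustrative C; the UTF-16 case uses the
        // 16-bit analogues; load8 is a hypothetical 8-byte load helper):
        //
        //   uint64_t pat = first * 0x0101010101010101ULL; // broadcast 1st char to all bytes
        //   uint64_t v   = load8(str2) ^ pat;             // matching bytes become 0x00
        //   uint64_t hit = (v - 0x0101010101010101ULL) & ~(v | 0x7f7f7f7f7f7f7f7fULL);
        //
        // The lowest 0x80 bit set in hit marks the first byte of str2 that equals the
        // first pattern character; bits above a match may be spurious, which is fine
        // because matches are consumed lowest-first (rbit + clz).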
 9285   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9286     StubId stub_id;
 9287     if (str1_isL) {
 9288       if (str2_isL) {
 9289         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9290       } else {
 9291         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9292       }
 9293     } else {
 9294       if (str2_isL) {
 9295         ShouldNotReachHere();
 9296       } else {
 9297         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9298       }
 9299     }
 9300     __ align(CodeEntryAlignment);
 9301     StubCodeMark mark(this, stub_id);
 9302     address entry = __ pc();
 9303 
 9304     int str1_chr_size = str1_isL ? 1 : 2;
 9305     int str2_chr_size = str2_isL ? 1 : 2;
 9306     int str1_chr_shift = str1_isL ? 0 : 1;
 9307     int str2_chr_shift = str2_isL ? 0 : 1;
 9308     bool isL = str1_isL && str2_isL;
 9309    // parameters
 9310     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9311     // temporary registers
 9312     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9313     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9314     // redefinitions
 9315     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9316 
 9317     __ push(spilled_regs, sp);
 9318     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9319         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9320         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9321         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9322         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9323         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9324     // Read whole register from str1. It is safe, because length >=8 here
 9325     __ ldr(ch1, Address(str1));
 9326     // Read whole register from str2. It is safe, because length >=8 here
 9327     __ ldr(ch2, Address(str2));
 9328     __ sub(cnt2, cnt2, cnt1);
 9329     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9330     if (str1_isL != str2_isL) {
 9331       __ eor(v0, __ T16B, v0, v0);
 9332     }
 9333     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9334     __ mul(first, first, tmp1);
 9335     // check if we have less than 1 register to check
 9336     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9337     if (str1_isL != str2_isL) {
 9338       __ fmovd(v1, ch1);
 9339     }
 9340     __ br(__ LE, L_SMALL);
 9341     __ eor(ch2, first, ch2);
 9342     if (str1_isL != str2_isL) {
 9343       __ zip1(v1, __ T16B, v1, v0);
 9344     }
 9345     __ sub(tmp2, ch2, tmp1);
 9346     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9347     __ bics(tmp2, tmp2, ch2);
 9348     if (str1_isL != str2_isL) {
 9349       __ fmovd(ch1, v1);
 9350     }
 9351     __ br(__ NE, L_HAS_ZERO);
 9352     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9353     __ add(result, result, wordSize/str2_chr_size);
 9354     __ add(str2, str2, wordSize);
 9355     __ br(__ LT, L_POST_LOOP);
 9356     __ BIND(L_LOOP);
 9357       __ ldr(ch2, Address(str2));
 9358       __ eor(ch2, first, ch2);
 9359       __ sub(tmp2, ch2, tmp1);
 9360       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9361       __ bics(tmp2, tmp2, ch2);
 9362       __ br(__ NE, L_HAS_ZERO);
 9363     __ BIND(L_LOOP_PROCEED);
 9364       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9365       __ add(str2, str2, wordSize);
 9366       __ add(result, result, wordSize/str2_chr_size);
 9367       __ br(__ GE, L_LOOP);
 9368     __ BIND(L_POST_LOOP);
 9369       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9370       __ br(__ LE, NOMATCH);
 9371       __ ldr(ch2, Address(str2));
 9372       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9373       __ eor(ch2, first, ch2);
 9374       __ sub(tmp2, ch2, tmp1);
 9375       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9376       __ mov(tmp4, -1); // all bits set
 9377       __ b(L_SMALL_PROCEED);
 9378     __ align(OptoLoopAlignment);
 9379     __ BIND(L_SMALL);
 9380       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9381       __ eor(ch2, first, ch2);
 9382       if (str1_isL != str2_isL) {
 9383         __ zip1(v1, __ T16B, v1, v0);
 9384       }
 9385       __ sub(tmp2, ch2, tmp1);
 9386       __ mov(tmp4, -1); // all bits set
 9387       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9388       if (str1_isL != str2_isL) {
 9389         __ fmovd(ch1, v1); // move converted 4 symbols
 9390       }
 9391     __ BIND(L_SMALL_PROCEED);
 9392       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9393       __ bic(tmp2, tmp2, ch2);
 9394       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9395       __ rbit(tmp2, tmp2);
 9396       __ br(__ EQ, NOMATCH);
 9397     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9398       __ clz(tmp4, tmp2); // potentially slow; up to 4 cycles on some CPUs
 9399       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9400       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9401       if (str2_isL) { // LL
 9402         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9403         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9404         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9405         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9406         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9407       } else {
 9408         __ mov(ch2, 0xE); // all bits in byte set except last one
 9409         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9410         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9411         __ lslv(tmp2, tmp2, tmp4);
 9412         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9413         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9414         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9415         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9416       }
 9417       __ cmp(ch1, ch2);
 9418       __ mov(tmp4, wordSize/str2_chr_size);
 9419       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9420     __ BIND(L_SMALL_CMP_LOOP);
 9421       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9422                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9423       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9424                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9425       __ add(tmp4, tmp4, 1);
 9426       __ cmp(tmp4, cnt1);
 9427       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9428       __ cmp(first, ch2);
 9429       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9430     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9431       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9432       __ clz(tmp4, tmp2);
 9433       __ add(result, result, 1); // advance index
 9434       __ add(str2, str2, str2_chr_size); // advance pointer
 9435       __ b(L_SMALL_HAS_ZERO_LOOP);
 9436     __ align(OptoLoopAlignment);
 9437     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9438       __ cmp(first, ch2);
 9439       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9440       __ b(DONE);
 9441     __ align(OptoLoopAlignment);
 9442     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9443       if (str2_isL) { // LL
 9444         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9445         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9446         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9447         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9448         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9449       } else {
 9450         __ mov(ch2, 0xE); // all bits in byte set except last one
 9451         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9452         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9453         __ lslv(tmp2, tmp2, tmp4);
 9454         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9455         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9456         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9457         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9458       }
 9459       __ cmp(ch1, ch2);
 9460       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9461       __ b(DONE);
 9462     __ align(OptoLoopAlignment);
 9463     __ BIND(L_HAS_ZERO);
 9464       __ rbit(tmp2, tmp2);
 9465       __ clz(tmp4, tmp2); // potentially slow; up to 4 cycles on some CPUs
 9466       // Now, perform compression of counters(cnt2 and cnt1) into one register.
 9467       // It's fine because both counters are 32bit and are not changed in this
 9468       // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
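             // That is, cnt2 := (cnt1 << 32) | cnt2 (BitsPerByte * wordSize / 2 == 32);
             // the original values are unpacked again in L_HAS_ZERO_LOOP_NOMATCH.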
 9469       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9470       __ sub(result, result, 1);
 9471     __ BIND(L_HAS_ZERO_LOOP);
 9472       __ mov(cnt1, wordSize/str2_chr_size);
 9473       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9474       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9475       if (str2_isL) {
 9476         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9477         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9478         __ lslv(tmp2, tmp2, tmp4);
 9479         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9480         __ add(tmp4, tmp4, 1);
 9481         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9482         __ lsl(tmp2, tmp2, 1);
 9483         __ mov(tmp4, wordSize/str2_chr_size);
 9484       } else {
 9485         __ mov(ch2, 0xE);
 9486         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9487         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9488         __ lslv(tmp2, tmp2, tmp4);
 9489         __ add(tmp4, tmp4, 1);
 9490         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9491         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9492         __ lsl(tmp2, tmp2, 1);
 9493         __ mov(tmp4, wordSize/str2_chr_size);
 9494         __ sub(str2, str2, str2_chr_size);
 9495       }
 9496       __ cmp(ch1, ch2);
 9497       __ mov(tmp4, wordSize/str2_chr_size);
 9498       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9499     __ BIND(L_CMP_LOOP);
 9500       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9501                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9502       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9503                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9504       __ add(tmp4, tmp4, 1);
 9505       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9506       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9507       __ cmp(cnt1, ch2);
 9508       __ br(__ EQ, L_CMP_LOOP);
 9509     __ BIND(L_CMP_LOOP_NOMATCH);
  9510       // no match at the current position
 9511       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9512       __ clz(tmp4, tmp2);
 9513       __ add(str2, str2, str2_chr_size); // advance pointer
 9514       __ b(L_HAS_ZERO_LOOP);
 9515     __ align(OptoLoopAlignment);
 9516     __ BIND(L_CMP_LOOP_LAST_CMP);
 9517       __ cmp(cnt1, ch2);
 9518       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9519       __ b(DONE);
 9520     __ align(OptoLoopAlignment);
 9521     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9522       if (str2_isL) {
 9523         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9524         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9525         __ lslv(tmp2, tmp2, tmp4);
 9526         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9527         __ add(tmp4, tmp4, 1);
 9528         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9529         __ lsl(tmp2, tmp2, 1);
 9530       } else {
 9531         __ mov(ch2, 0xE);
 9532         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9533         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9534         __ lslv(tmp2, tmp2, tmp4);
 9535         __ add(tmp4, tmp4, 1);
 9536         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9537         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9538         __ lsl(tmp2, tmp2, 1);
 9539         __ sub(str2, str2, str2_chr_size);
 9540       }
 9541       __ cmp(ch1, ch2);
 9542       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9543       __ b(DONE);
 9544     __ align(OptoLoopAlignment);
 9545     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
  9546       // 1) Restore the "result" index. The index was a multiple of
  9547       // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
  9548       // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
  9549       // wordSize/str2_chr_size - 1, which leaves the higher bits unchanged.
  9550       // L_LOOP_PROCEED will increase result by the number of analyzed
  9551       // characters, so we can simply reset the lower bits of result here:
  9552       // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
  9553       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
  9554       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
  9555       // (UU/UL) is the index of the last analyzed substring inside the current octet, so str2 is at its start address and must be advanced to the next octet.
 9556       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9557       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9558       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9559       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9560       __ movw(cnt2, cnt2);
 9561       __ b(L_LOOP_PROCEED);
 9562     __ align(OptoLoopAlignment);
 9563     __ BIND(NOMATCH);
 9564       __ mov(result, -1);
 9565     __ BIND(DONE);
 9566       __ pop(spilled_regs, sp);
 9567       __ ret(lr);
 9568     return entry;
 9569   }
 9570 
 9571   void generate_string_indexof_stubs() {
 9572     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9573     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9574     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9575   }
 9576 
 9577   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9578       FloatRegister src1, FloatRegister src2) {
 9579     Register dst = r1;
 9580     __ zip1(v1, __ T16B, src1, v0);
 9581     __ zip2(v2, __ T16B, src1, v0);
 9582     if (generatePrfm) {
 9583       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9584     }
 9585     __ zip1(v3, __ T16B, src2, v0);
 9586     __ zip2(v4, __ T16B, src2, v0);
 9587     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9588   }
 9589 
 9590   // R0 = src
 9591   // R1 = dst
 9592   // R2 = len
 9593   // R3 = len >> 3
 9594   // V0 = 0
 9595   // v1 = loaded 8 bytes
 9596   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
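         //
         // For orientation, the scalar equivalent of the "inflate" operation performed
         // below is just zero-extending each Latin-1 byte to a UTF-16 char. A minimal
         // reference sketch (illustrative only, not used by the stub):
         //
         //   static void inflate_ref(const uint8_t* src, uint16_t* dst, size_t len) {
         //     for (size_t i = 0; i < len; i++) {
         //       dst[i] = src[i];   // zero-extend byte to 16-bit char
         //     }
         //   }
         //
         // The zip1/zip2-with-zero sequences below perform this widening 16 bytes at a time.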
 9597   address generate_large_byte_array_inflate() {
 9598     __ align(CodeEntryAlignment);
 9599     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9600     StubCodeMark mark(this, stub_id);
 9601     address entry = __ pc();
 9602     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9603     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9604     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9605 
  9606     // Do one more 8-byte read so that the address is 16-byte aligned in most
  9607     // cases; this also lets us use a single store instruction.
 9608     __ ldrd(v2, __ post(src, 8));
 9609     __ sub(octetCounter, octetCounter, 2);
 9610     __ zip1(v1, __ T16B, v1, v0);
 9611     __ zip1(v2, __ T16B, v2, v0);
 9612     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9613     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9614     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9615     __ br(__ LE, LOOP_START);
 9616     __ b(LOOP_PRFM_START);
 9617     __ bind(LOOP_PRFM);
 9618       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9619     __ bind(LOOP_PRFM_START);
 9620       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9621       __ sub(octetCounter, octetCounter, 8);
 9622       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9623       inflate_and_store_2_fp_registers(true, v3, v4);
 9624       inflate_and_store_2_fp_registers(true, v5, v6);
 9625       __ br(__ GT, LOOP_PRFM);
 9626       __ cmp(octetCounter, (u1)8);
 9627       __ br(__ LT, DONE);
 9628     __ bind(LOOP);
 9629       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9630       __ bind(LOOP_START);
 9631       __ sub(octetCounter, octetCounter, 8);
 9632       __ cmp(octetCounter, (u1)8);
 9633       inflate_and_store_2_fp_registers(false, v3, v4);
 9634       inflate_and_store_2_fp_registers(false, v5, v6);
 9635       __ br(__ GE, LOOP);
 9636     __ bind(DONE);
 9637       __ ret(lr);
 9638     return entry;
 9639   }
 9640 
 9641   /**
 9642    *  Arguments:
 9643    *
 9644    *  Input:
 9645    *  c_rarg0   - current state address
 9646    *  c_rarg1   - H key address
 9647    *  c_rarg2   - data address
 9648    *  c_rarg3   - number of blocks
 9649    *
 9650    *  Output:
 9651    *  Updated state at c_rarg0
 9652    */
 9653   address generate_ghash_processBlocks() {
 9654     // Bafflingly, GCM uses little-endian for the byte order, but
 9655     // big-endian for the bit order.  For example, the polynomial 1 is
 9656     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9657     //
 9658     // So, we must either reverse the bytes in each word and do
 9659     // everything big-endian or reverse the bits in each byte and do
 9660     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9661     // the bits in each byte (we have an instruction, RBIT, to do
 9662     // that) and keep the data in little-endian bit order through the
 9663     // calculation, bit-reversing the inputs and outputs.
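           //
           // For reference, reversing the bits within each byte (what RBIT does per
           // lane) corresponds to the following scalar sketch (illustrative only):
           //
           //   static uint8_t rbit8(uint8_t b) {
           //     b = (uint8_t)(((b & 0xF0) >> 4) | ((b & 0x0F) << 4)); // swap nibbles
           //     b = (uint8_t)(((b & 0xCC) >> 2) | ((b & 0x33) << 2)); // swap bit pairs
           //     b = (uint8_t)(((b & 0xAA) >> 1) | ((b & 0x55) << 1)); // swap adjacent bits
           //     return b;
           //   }
           //
           // Applying this to every byte of the inputs (and once more to the output)
           // lets the whole multiplication be done in little-endian bit order.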
 9664 
 9665     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9666     StubCodeMark mark(this, stub_id);
 9667     __ align(wordSize * 2);
 9668     address p = __ pc();
 9669     __ emit_int64(0x87);  // The low-order bits of the field
 9670                           // polynomial (i.e. p = z^7+z^2+z+1)
 9671                           // repeated in the low and high parts of a
 9672                           // 128-bit vector
 9673     __ emit_int64(0x87);
 9674 
 9675     __ align(CodeEntryAlignment);
 9676     address start = __ pc();
 9677 
 9678     Register state   = c_rarg0;
 9679     Register subkeyH = c_rarg1;
 9680     Register data    = c_rarg2;
 9681     Register blocks  = c_rarg3;
 9682 
 9683     FloatRegister vzr = v30;
 9684     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9685 
 9686     __ ldrq(v24, p);    // The field polynomial
 9687 
 9688     __ ldrq(v0, Address(state));
 9689     __ ldrq(v1, Address(subkeyH));
 9690 
 9691     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9692     __ rbit(v0, __ T16B, v0);
 9693     __ rev64(v1, __ T16B, v1);
 9694     __ rbit(v1, __ T16B, v1);
 9695 
 9696     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9697     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
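           // (Karatsuba: in carry-less arithmetic addition is XOR, so this (A1 ^ A0)
           //  term lets ghash_multiply form the middle partial product and build the
           //  256-bit product from three 64x64 multiplies instead of four.)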
 9698 
 9699     {
 9700       Label L_ghash_loop;
 9701       __ bind(L_ghash_loop);
 9702 
 9703       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9704                                                  // reversing each byte
 9705       __ rbit(v2, __ T16B, v2);
 9706       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9707 
 9708       // Multiply state in v2 by subkey in v1
 9709       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9710                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9711                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9712       // Reduce v7:v5 by the field polynomial
 9713       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9714 
 9715       __ sub(blocks, blocks, 1);
 9716       __ cbnz(blocks, L_ghash_loop);
 9717     }
 9718 
 9719     // The bit-reversed result is at this point in v0
 9720     __ rev64(v0, __ T16B, v0);
 9721     __ rbit(v0, __ T16B, v0);
 9722 
 9723     __ st1(v0, __ T16B, state);
 9724     __ ret(lr);
 9725 
 9726     return start;
 9727   }
 9728 
 9729   address generate_ghash_processBlocks_wide() {
 9730     address small = generate_ghash_processBlocks();
 9731 
 9732     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9733     StubCodeMark mark(this, stub_id);
 9734     __ align(wordSize * 2);
 9735     address p = __ pc();
 9736     __ emit_int64(0x87);  // The low-order bits of the field
 9737                           // polynomial (i.e. p = z^7+z^2+z+1)
 9738                           // repeated in the low and high parts of a
 9739                           // 128-bit vector
 9740     __ emit_int64(0x87);
 9741 
 9742     __ align(CodeEntryAlignment);
 9743     address start = __ pc();
 9744 
 9745     Register state   = c_rarg0;
 9746     Register subkeyH = c_rarg1;
 9747     Register data    = c_rarg2;
 9748     Register blocks  = c_rarg3;
 9749 
 9750     const int unroll = 4;
 9751 
 9752     __ cmp(blocks, (unsigned char)(unroll * 2));
 9753     __ br(__ LT, small);
 9754 
 9755     if (unroll > 1) {
  9756       // Save the callee-saved SIMD registers (v8-v15) before entering the routine
 9757       __ sub(sp, sp, 4 * 16);
 9758       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9759       __ sub(sp, sp, 4 * 16);
 9760       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9761     }
 9762 
 9763     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9764 
 9765     if (unroll > 1) {
 9766       // And restore state
 9767       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9768       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9769     }
 9770 
 9771     __ cmp(blocks, (unsigned char)0);
 9772     __ br(__ GT, small);
 9773 
 9774     __ ret(lr);
 9775 
 9776     return start;
 9777   }
 9778 
 9779   void generate_base64_encode_simdround(Register src, Register dst,
 9780         FloatRegister codec, u8 size) {
 9781 
 9782     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9783     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9784     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9785 
 9786     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9787 
 9788     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9789 
 9790     __ ushr(ind0, arrangement, in0,  2);
 9791 
 9792     __ ushr(ind1, arrangement, in1,  2);
 9793     __ shl(in0,   arrangement, in0,  6);
 9794     __ orr(ind1,  arrangement, ind1, in0);
 9795     __ ushr(ind1, arrangement, ind1, 2);
 9796 
 9797     __ ushr(ind2, arrangement, in2,  4);
 9798     __ shl(in1,   arrangement, in1,  4);
 9799     __ orr(ind2,  arrangement, in1,  ind2);
 9800     __ ushr(ind2, arrangement, ind2, 2);
 9801 
 9802     __ shl(ind3,  arrangement, in2,  2);
 9803     __ ushr(ind3, arrangement, ind3, 2);
 9804 
 9805     __ tbl(out0,  arrangement, codec,  4, ind0);
 9806     __ tbl(out1,  arrangement, codec,  4, ind1);
 9807     __ tbl(out2,  arrangement, codec,  4, ind2);
 9808     __ tbl(out3,  arrangement, codec,  4, ind3);
 9809 
 9810     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9811   }
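
         // Each SIMD round above turns 3 input bytes into 4 six-bit Base64 indices,
         // which tbl then translates through the codec table. A scalar sketch of the
         // index computation (illustrative only, not used by the stub):
         //
         //   static void encode3(const uint8_t in[3], uint8_t idx[4]) {
         //     uint32_t bits = ((uint32_t)in[0] << 16) | ((uint32_t)in[1] << 8) | in[2];
         //     idx[0] = (bits >> 18) & 0x3F;
         //     idx[1] = (bits >> 12) & 0x3F;
         //     idx[2] = (bits >>  6) & 0x3F;
         //     idx[3] =  bits        & 0x3F;
         //   }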
 9812 
 9813    /**
 9814    *  Arguments:
 9815    *
 9816    *  Input:
 9817    *  c_rarg0   - src_start
 9818    *  c_rarg1   - src_offset
 9819    *  c_rarg2   - src_length
 9820    *  c_rarg3   - dest_start
 9821    *  c_rarg4   - dest_offset
 9822    *  c_rarg5   - isURL
 9823    *
 9824    */
 9825   address generate_base64_encodeBlock() {
 9826 
 9827     static const char toBase64[64] = {
 9828       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9829       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9830       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9831       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9832       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9833     };
 9834 
 9835     static const char toBase64URL[64] = {
 9836       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9837       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9838       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9839       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9840       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9841     };
 9842 
 9843     __ align(CodeEntryAlignment);
 9844     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9845     StubCodeMark mark(this, stub_id);
 9846     address start = __ pc();
 9847 
 9848     Register src   = c_rarg0;  // source array
 9849     Register soff  = c_rarg1;  // source start offset
 9850     Register send  = c_rarg2;  // source end offset
 9851     Register dst   = c_rarg3;  // dest array
 9852     Register doff  = c_rarg4;  // position for writing to dest array
 9853     Register isURL = c_rarg5;  // Base64 or URL character set
 9854 
 9855     // c_rarg6 and c_rarg7 are free to use as temps
 9856     Register codec  = c_rarg6;
 9857     Register length = c_rarg7;
 9858 
 9859     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9860 
 9861     __ add(src, src, soff);
 9862     __ add(dst, dst, doff);
 9863     __ sub(length, send, soff);
 9864 
 9865     // load the codec base address
 9866     __ lea(codec, ExternalAddress((address) toBase64));
 9867     __ cbz(isURL, ProcessData);
 9868     __ lea(codec, ExternalAddress((address) toBase64URL));
 9869 
 9870     __ BIND(ProcessData);
 9871 
  9872     // too short to form a SIMD loop; fall back to the 3-byte scalar loop
 9873     __ cmp(length, (u1)24);
 9874     __ br(Assembler::LT, Process3B);
 9875 
 9876     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9877 
 9878     __ BIND(Process48B);
 9879     __ cmp(length, (u1)48);
 9880     __ br(Assembler::LT, Process24B);
 9881     generate_base64_encode_simdround(src, dst, v0, 16);
 9882     __ sub(length, length, 48);
 9883     __ b(Process48B);
 9884 
 9885     __ BIND(Process24B);
 9886     __ cmp(length, (u1)24);
 9887     __ br(Assembler::LT, SIMDExit);
 9888     generate_base64_encode_simdround(src, dst, v0, 8);
 9889     __ sub(length, length, 24);
 9890 
 9891     __ BIND(SIMDExit);
 9892     __ cbz(length, Exit);
 9893 
 9894     __ BIND(Process3B);
 9895     //  3 src bytes, 24 bits
 9896     __ ldrb(r10, __ post(src, 1));
 9897     __ ldrb(r11, __ post(src, 1));
 9898     __ ldrb(r12, __ post(src, 1));
 9899     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9900     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9901     // codec index
 9902     __ ubfmw(r15, r12, 18, 23);
 9903     __ ubfmw(r14, r12, 12, 17);
 9904     __ ubfmw(r13, r12, 6,  11);
 9905     __ andw(r12,  r12, 63);
 9906     // get the code based on the codec
 9907     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9908     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9909     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9910     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9911     __ strb(r15, __ post(dst, 1));
 9912     __ strb(r14, __ post(dst, 1));
 9913     __ strb(r13, __ post(dst, 1));
 9914     __ strb(r12, __ post(dst, 1));
 9915     __ sub(length, length, 3);
 9916     __ cbnz(length, Process3B);
 9917 
 9918     __ BIND(Exit);
 9919     __ ret(lr);
 9920 
 9921     return start;
 9922   }
 9923 
 9924   void generate_base64_decode_simdround(Register src, Register dst,
 9925         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9926 
 9927     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9928     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9929 
 9930     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9931     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9932 
 9933     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9934 
 9935     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9936 
 9937     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9938 
  9939     // We need an unsigned saturating subtract to make sure all input values in
  9940     // the range [0, 63] produce a zero index for the higher-half lookup.
 9941     __ uqsubv(decH0, __ T16B, in0, v27);
 9942     __ uqsubv(decH1, __ T16B, in1, v27);
 9943     __ uqsubv(decH2, __ T16B, in2, v27);
 9944     __ uqsubv(decH3, __ T16B, in3, v27);
 9945 
 9946     // lower half lookup
 9947     __ tbl(decL0, arrangement, codecL, 4, in0);
 9948     __ tbl(decL1, arrangement, codecL, 4, in1);
 9949     __ tbl(decL2, arrangement, codecL, 4, in2);
 9950     __ tbl(decL3, arrangement, codecL, 4, in3);
 9951 
 9952     // higher half lookup
 9953     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9954     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9955     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9956     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9957 
 9958     // combine lower and higher
 9959     __ orr(decL0, arrangement, decL0, decH0);
 9960     __ orr(decL1, arrangement, decL1, decH1);
 9961     __ orr(decL2, arrangement, decL2, decH2);
 9962     __ orr(decL3, arrangement, decL3, decH3);
 9963 
  9964     // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
 9965     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9966     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9967     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9968     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9969     __ orr(in0, arrangement, decH0, decH1);
 9970     __ orr(in1, arrangement, decH2, decH3);
 9971     __ orr(in2, arrangement, in0,   in1);
 9972     __ umaxv(in3, arrangement, in2);
 9973     __ umov(rscratch2, in3, __ B, 0);
 9974 
 9975     // get the data to output
 9976     __ shl(out0,  arrangement, decL0, 2);
 9977     __ ushr(out1, arrangement, decL1, 4);
 9978     __ orr(out0,  arrangement, out0,  out1);
 9979     __ shl(out1,  arrangement, decL1, 4);
 9980     __ ushr(out2, arrangement, decL2, 2);
 9981     __ orr(out1,  arrangement, out1,  out2);
 9982     __ shl(out2,  arrangement, decL2, 6);
 9983     __ orr(out2,  arrangement, out2,  decL3);
 9984 
 9985     __ cbz(rscratch2, NoIllegalData);
 9986 
 9987     // handle illegal input
 9988     __ umov(r10, in2, __ D, 0);
 9989     if (size == 16) {
 9990       __ cbnz(r10, ErrorInLowerHalf);
 9991 
 9992       // illegal input is in higher half, store the lower half now.
 9993       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9994 
 9995       __ umov(r10, in2,  __ D, 1);
 9996       __ umov(r11, out0, __ D, 1);
 9997       __ umov(r12, out1, __ D, 1);
 9998       __ umov(r13, out2, __ D, 1);
 9999       __ b(StoreLegalData);
10000 
10001       __ BIND(ErrorInLowerHalf);
10002     }
10003     __ umov(r11, out0, __ D, 0);
10004     __ umov(r12, out1, __ D, 0);
10005     __ umov(r13, out2, __ D, 0);
10006 
10007     __ BIND(StoreLegalData);
10008     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10009     __ strb(r11, __ post(dst, 1));
10010     __ strb(r12, __ post(dst, 1));
10011     __ strb(r13, __ post(dst, 1));
10012     __ lsr(r10, r10, 8);
10013     __ lsr(r11, r11, 8);
10014     __ lsr(r12, r12, 8);
10015     __ lsr(r13, r13, 8);
10016     __ b(StoreLegalData);
10017 
10018     __ BIND(NoIllegalData);
10019     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10020   }
10021 
10022 
10023    /**
10024    *  Arguments:
10025    *
10026    *  Input:
10027    *  c_rarg0   - src_start
10028    *  c_rarg1   - src_offset
10029    *  c_rarg2   - src_length
10030    *  c_rarg3   - dest_start
10031    *  c_rarg4   - dest_offset
10032    *  c_rarg5   - isURL
10033    *  c_rarg6   - isMIME
10034    *
10035    */
10036   address generate_base64_decodeBlock() {
10037 
10038     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10039     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10040     // titled "Base64 decoding".
10041 
 10042     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
 10043     // java.util.Base64, except that the trailing character '=' is also treated as an
 10044     // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10045     static const uint8_t fromBase64ForNoSIMD[256] = {
10046       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10047       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10048       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10049        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10050       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10051        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10052       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10053        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10054       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10056       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10059       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10060       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10061       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10062     };
10063 
10064     static const uint8_t fromBase64URLForNoSIMD[256] = {
10065       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10066       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10067       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10068        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10069       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10070        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10071       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10072        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10073       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10074       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10075       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10076       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10077       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10078       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10079       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10080       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10081     };
10082 
 10083     // A legal Base64 code value is in the range [0, 127]. We need two table
 10084     // lookups with tbl/tbx and combine them to get the decoded data. The 1st
 10085     // table-vector lookup uses tbl: out-of-range indices are set to 0 in the
 10086     // destination. The 2nd table-vector lookup uses tbx: out-of-range indices are
 10087     // left unchanged in the destination. Inputs [64, 126] are mapped to table
 10088     // entries [65, 127] by the second lookup. The entry at index 64 is set to 0,
 10089     // so that we know the decoded data was already obtained by the 1st lookup.
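           //
           // A scalar sketch of this combined lookup (illustrative only; the stub does
           // it lane-wise, with loTab/hiTab being the two 64-byte halves of the table):
           //
           //   static uint8_t decode1(uint8_t c, const uint8_t* loTab, const uint8_t* hiTab) {
           //     uint8_t lo  = (c < 64) ? loTab[c] : 0;           // tbl: out-of-range -> 0
           //     uint8_t idx = (c > 63) ? (uint8_t)(c - 63) : 0;  // uqsub against 63
           //     uint8_t hi  = (idx < 64) ? hiTab[idx] : idx;     // tbx: out-of-range -> unchanged
           //     return (uint8_t)(lo | hi);                       // a result > 63 flags illegal input
           //   }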
10090     static const uint8_t fromBase64ForSIMD[128] = {
10091       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10092       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10093       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10094        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10095         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10096        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10097       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10098        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10099     };
10100 
10101     static const uint8_t fromBase64URLForSIMD[128] = {
10102       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10103       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10105        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10106         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10107        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10108        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10109        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10110     };
10111 
10112     __ align(CodeEntryAlignment);
10113     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10114     StubCodeMark mark(this, stub_id);
10115     address start = __ pc();
10116 
10117     Register src    = c_rarg0;  // source array
10118     Register soff   = c_rarg1;  // source start offset
10119     Register send   = c_rarg2;  // source end offset
10120     Register dst    = c_rarg3;  // dest array
10121     Register doff   = c_rarg4;  // position for writing to dest array
10122     Register isURL  = c_rarg5;  // Base64 or URL character set
10123     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10124 
10125     Register length = send;    // reuse send as length of source data to process
10126 
10127     Register simd_codec   = c_rarg6;
10128     Register nosimd_codec = c_rarg7;
10129 
10130     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10131 
10132     __ enter();
10133 
10134     __ add(src, src, soff);
10135     __ add(dst, dst, doff);
10136 
10137     __ mov(doff, dst);
10138 
10139     __ sub(length, send, soff);
10140     __ bfm(length, zr, 0, 1);
10141 
10142     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10143     __ cbz(isURL, ProcessData);
10144     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10145 
10146     __ BIND(ProcessData);
10147     __ mov(rscratch1, length);
10148     __ cmp(length, (u1)144); // 144 = 80 + 64
10149     __ br(Assembler::LT, Process4B);
10150 
10151     // In the MIME case, the line length cannot be more than 76
10152     // bytes (see RFC 2045). This is too short a block for SIMD
10153     // to be worthwhile, so we use non-SIMD here.
10154     __ movw(rscratch1, 79);
10155 
10156     __ BIND(Process4B);
10157     __ ldrw(r14, __ post(src, 4));
10158     __ ubfxw(r10, r14, 0,  8);
10159     __ ubfxw(r11, r14, 8,  8);
10160     __ ubfxw(r12, r14, 16, 8);
10161     __ ubfxw(r13, r14, 24, 8);
 10162     // look up the decoded values from the codec table
10163     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10164     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10165     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10166     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10167     // error detection, 255u indicates an illegal input
10168     __ orrw(r14, r10, r11);
10169     __ orrw(r15, r12, r13);
10170     __ orrw(r14, r14, r15);
10171     __ tbnz(r14, 7, Exit);
10172     // recover the data
10173     __ lslw(r14, r10, 10);
10174     __ bfiw(r14, r11, 4, 6);
10175     __ bfmw(r14, r12, 2, 5);
10176     __ rev16w(r14, r14);
10177     __ bfiw(r13, r12, 6, 2);
10178     __ strh(r14, __ post(dst, 2));
10179     __ strb(r13, __ post(dst, 1));
10180     // non-simd loop
10181     __ subsw(rscratch1, rscratch1, 4);
10182     __ br(Assembler::GT, Process4B);
10183 
 10184     // If we exited the loop from the 80-byte (MIME) pre-processing above,
 10185     // rscratch1 == -1; otherwise, rscratch1 == 0.
10186     __ cbzw(rscratch1, Exit);
10187     __ sub(length, length, 80);
10188 
10189     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10190     __ cbz(isURL, SIMDEnter);
10191     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10192 
10193     __ BIND(SIMDEnter);
10194     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10195     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10196     __ mov(rscratch1, 63);
10197     __ dup(v27, __ T16B, rscratch1);
10198 
10199     __ BIND(Process64B);
10200     __ cmp(length, (u1)64);
10201     __ br(Assembler::LT, Process32B);
10202     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10203     __ sub(length, length, 64);
10204     __ b(Process64B);
10205 
10206     __ BIND(Process32B);
10207     __ cmp(length, (u1)32);
10208     __ br(Assembler::LT, SIMDExit);
10209     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10210     __ sub(length, length, 32);
10211     __ b(Process32B);
10212 
10213     __ BIND(SIMDExit);
10214     __ cbz(length, Exit);
10215     __ movw(rscratch1, length);
10216     __ b(Process4B);
10217 
10218     __ BIND(Exit);
10219     __ sub(c_rarg0, dst, doff);
10220 
10221     __ leave();
10222     __ ret(lr);
10223 
10224     return start;
10225   }
10226 
10227   // Support for spin waits.
10228   address generate_spin_wait() {
10229     __ align(CodeEntryAlignment);
10230     StubId stub_id = StubId::stubgen_spin_wait_id;
10231     StubCodeMark mark(this, stub_id);
10232     address start = __ pc();
10233 
10234     __ spin_wait();
10235     __ ret(lr);
10236 
10237     return start;
10238   }
10239 
10240   void generate_lookup_secondary_supers_table_stub() {
10241     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10242     StubCodeMark mark(this, stub_id);
10243 
10244     const Register
10245       r_super_klass  = r0,
10246       r_array_base   = r1,
10247       r_array_length = r2,
10248       r_array_index  = r3,
10249       r_sub_klass    = r4,
10250       r_bitmap       = rscratch2,
10251       result         = r5;
10252     const FloatRegister
10253       vtemp          = v0;
10254 
10255     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10256       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10257       Label L_success;
10258       __ enter();
10259       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10260                                              r_array_base, r_array_length, r_array_index,
10261                                              vtemp, result, slot,
10262                                              /*stub_is_near*/true);
10263       __ leave();
10264       __ ret(lr);
10265     }
10266   }
10267 
10268   // Slow path implementation for UseSecondarySupersTable.
10269   address generate_lookup_secondary_supers_table_slow_path_stub() {
10270     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10271     StubCodeMark mark(this, stub_id);
10272 
10273     address start = __ pc();
10274     const Register
10275       r_super_klass  = r0,        // argument
10276       r_array_base   = r1,        // argument
10277       temp1          = r2,        // temp
10278       r_array_index  = r3,        // argument
10279       r_bitmap       = rscratch2, // argument
10280       result         = r5;        // argument
10281 
10282     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10283     __ ret(lr);
10284 
10285     return start;
10286   }
10287 
10288 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10289 
10290   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
10291   //
10292   // If LSE is in use, generate LSE versions of all the stubs. The
10293   // non-LSE versions are in atomic_aarch64.S.
10294 
10295   // class AtomicStubMark records the entry point of a stub and the
10296   // stub pointer which will point to it. The stub pointer is set to
10297   // the entry point when ~AtomicStubMark() is called, which must be
10298   // after ICache::invalidate_range. This ensures safe publication of
10299   // the generated code.
10300   class AtomicStubMark {
10301     address _entry_point;
10302     aarch64_atomic_stub_t *_stub;
10303     MacroAssembler *_masm;
10304   public:
10305     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10306       _masm = masm;
10307       __ align(32);
10308       _entry_point = __ pc();
10309       _stub = stub;
10310     }
10311     ~AtomicStubMark() {
10312       *_stub = (aarch64_atomic_stub_t)_entry_point;
10313     }
10314   };
10315 
10316   // NB: For memory_order_conservative we need a trailing membar after
10317   // LSE atomic operations but not a leading membar.
10318   //
10319   // We don't need a leading membar because a clause in the Arm ARM
10320   // says:
10321   //
10322   //   Barrier-ordered-before
10323   //
10324   //   Barrier instructions order prior Memory effects before subsequent
10325   //   Memory effects generated by the same Observer. A read or a write
 10326   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
 10327   //   Observer if and only if RW1 appears in program order before RW2
 10328   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10329   //   instruction with both Acquire and Release semantics.
10330   //
10331   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10332   // and Release semantics, therefore we don't need a leading
10333   // barrier. However, there is no corresponding Barrier-ordered-after
10334   // relationship, therefore we need a trailing membar to prevent a
10335   // later store or load from being reordered with the store in an
10336   // atomic instruction.
10337   //
10338   // This was checked by using the herd7 consistency model simulator
10339   // (http://diy.inria.fr/) with this test case:
10340   //
10341   // AArch64 LseCas
10342   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10343   // P0 | P1;
10344   // LDR W4, [X2] | MOV W3, #0;
10345   // DMB LD       | MOV W4, #1;
10346   // LDR W3, [X1] | CASAL W3, W4, [X1];
10347   //              | DMB ISH;
10348   //              | STR W4, [X2];
10349   // exists
10350   // (0:X3=0 /\ 0:X4=1)
10351   //
10352   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10353   // with the store to x in P1. Without the DMB in P1 this may happen.
10354   //
10355   // At the time of writing we don't know of any AArch64 hardware that
10356   // reorders stores in this way, but the Reference Manual permits it.
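         //
         // In C++ terms, the conservative CAS stubs generated below behave roughly like
         // the following sketch (assuming the 64-bit variant and a GCC/Clang toolchain;
         // the stub itself emits CASAL plus a trailing full barrier):
         //
         //   extern "C" uint64_t cmpxchg_8_ref(uint64_t* ptr, uint64_t compare_val,
         //                                     uint64_t exchange_val) {
         //     uint64_t prev = compare_val;
         //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
         //                                 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
         //     return prev;   // old value, whether or not the exchange happened
         //   }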
10357 
10358   void gen_cas_entry(Assembler::operand_size size,
10359                      atomic_memory_order order) {
10360     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10361       exchange_val = c_rarg2;
10362     bool acquire, release;
10363     switch (order) {
10364       case memory_order_relaxed:
10365         acquire = false;
10366         release = false;
10367         break;
10368       case memory_order_release:
10369         acquire = false;
10370         release = true;
10371         break;
10372       default:
10373         acquire = true;
10374         release = true;
10375         break;
10376     }
10377     __ mov(prev, compare_val);
10378     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10379     if (order == memory_order_conservative) {
10380       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10381     }
10382     if (size == Assembler::xword) {
10383       __ mov(r0, prev);
10384     } else {
10385       __ movw(r0, prev);
10386     }
10387     __ ret(lr);
10388   }
10389 
10390   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10391     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10392     // If not relaxed, then default to conservative.  Relaxed is the only
10393     // case we use enough to be worth specializing.
10394     if (order == memory_order_relaxed) {
10395       __ ldadd(size, incr, prev, addr);
10396     } else {
10397       __ ldaddal(size, incr, prev, addr);
10398       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10399     }
10400     if (size == Assembler::xword) {
10401       __ mov(r0, prev);
10402     } else {
10403       __ movw(r0, prev);
10404     }
10405     __ ret(lr);
10406   }
10407 
10408   void gen_swpal_entry(Assembler::operand_size size) {
10409     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10410     __ swpal(size, incr, prev, addr);
10411     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10412     if (size == Assembler::xword) {
10413       __ mov(r0, prev);
10414     } else {
10415       __ movw(r0, prev);
10416     }
10417     __ ret(lr);
10418   }
10419 
10420   void generate_atomic_entry_points() {
10421     if (! UseLSE) {
10422       return;
10423     }
10424     __ align(CodeEntryAlignment);
10425     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10426     StubCodeMark mark(this, stub_id);
10427     address first_entry = __ pc();
10428 
10429     // ADD, memory_order_conservative
10430     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10431     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10432     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10433     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10434 
10435     // ADD, memory_order_relaxed
10436     AtomicStubMark mark_fetch_add_4_relaxed
10437       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10438     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10439     AtomicStubMark mark_fetch_add_8_relaxed
10440       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10441     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10442 
10443     // XCHG, memory_order_conservative
10444     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10445     gen_swpal_entry(Assembler::word);
10446     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10447     gen_swpal_entry(Assembler::xword);
10448 
10449     // CAS, memory_order_conservative
10450     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10451     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10452     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10453     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10454     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10455     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10456 
10457     // CAS, memory_order_relaxed
10458     AtomicStubMark mark_cmpxchg_1_relaxed
10459       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10460     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10461     AtomicStubMark mark_cmpxchg_4_relaxed
10462       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10463     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10464     AtomicStubMark mark_cmpxchg_8_relaxed
10465       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10466     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10467 
10468     AtomicStubMark mark_cmpxchg_4_release
10469       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10470     gen_cas_entry(MacroAssembler::word, memory_order_release);
10471     AtomicStubMark mark_cmpxchg_8_release
10472       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10473     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10474 
10475     AtomicStubMark mark_cmpxchg_4_seq_cst
10476       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10477     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10478     AtomicStubMark mark_cmpxchg_8_seq_cst
10479       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10480     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10481 
10482     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10483   }
10484 #endif // LINUX
10485 
10486   static void save_return_registers(MacroAssembler* masm) {
10487     if (InlineTypeReturnedAsFields) {
10488       masm->push(RegSet::range(r0, r7), sp);
10489       masm->sub(sp, sp, 4 * wordSize);
10490       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10491       masm->sub(sp, sp, 4 * wordSize);
10492       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10493     } else {
10494       masm->fmovd(rscratch1, v0);
10495       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10496     }
10497   }
10498 
10499   static void restore_return_registers(MacroAssembler* masm) {
10500     if (InlineTypeReturnedAsFields) {
10501       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10502       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10503       masm->pop(RegSet::range(r0, r7), sp);
10504     } else {
10505       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10506       masm->fmovd(v0, rscratch1);
10507     }
10508   }
10509 
10510   address generate_cont_thaw(Continuation::thaw_kind kind) {
10511     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10512     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10513 
10514     address start = __ pc();
10515 
10516     if (return_barrier) {
10517       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10518       __ mov(sp, rscratch1);
10519     }
10520     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10521 
10522     if (return_barrier) {
10523       // preserve possible return value from a method returning to the return barrier
10524       save_return_registers(_masm);
10525     }
10526 
10527     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10528     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10529     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10530 
10531     if (return_barrier) {
10532       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10533       restore_return_registers(_masm);
10534     }
10535     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10536 
10537 
10538     Label thaw_success;
10539     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10540     __ cbnz(rscratch2, thaw_success);
10541     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10542     __ br(rscratch1);
10543     __ bind(thaw_success);
10544 
10545     // make room for the thawed frames
10546     __ sub(rscratch1, sp, rscratch2);
10547     __ andr(rscratch1, rscratch1, -16); // align
10548     __ mov(sp, rscratch1);
10549 
10550     if (return_barrier) {
10551       // save original return value -- again
10552       save_return_registers(_masm);
10553     }
10554 
10555     // If we want, we can templatize thaw by kind, and have three different entries
10556     __ movw(c_rarg1, (uint32_t)kind);
10557 
10558     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10559     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10560 
10561     if (return_barrier) {
10562       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10563       restore_return_registers(_masm);
10564     } else {
10565       __ mov(r0, zr); // return 0 (success) from doYield
10566     }
10567 
10568     // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
10569     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10570     __ mov(rfp, sp);
10571 
10572     if (return_barrier_exception) {
10573       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10574       __ authenticate_return_address(c_rarg1);
10575       __ verify_oop(r0);
10576       // save return value containing the exception oop in callee-saved R19
10577       __ mov(r19, r0);
10578 
10579       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10580 
10581       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10582       // __ reinitialize_ptrue();
10583 
10584       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10585 
10586       __ mov(r1, r0); // the exception handler
10587       __ mov(r0, r19); // restore return value containing the exception oop
10588       __ verify_oop(r0);
10589 
10590       __ leave();
10591       __ mov(r3, lr);
10592       __ br(r1); // the exception handler
10593     } else {
10594       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10595       __ leave();
10596       __ ret(lr);
10597     }
10598 
10599     return start;
10600   }
10601 
10602   address generate_cont_thaw() {
10603     if (!Continuations::enabled()) return nullptr;
10604 
10605     StubId stub_id = StubId::stubgen_cont_thaw_id;
10606     StubCodeMark mark(this, stub_id);
10607     address start = __ pc();
10608     generate_cont_thaw(Continuation::thaw_top);
10609     return start;
10610   }
10611 
10612   address generate_cont_returnBarrier() {
10613     if (!Continuations::enabled()) return nullptr;
10614 
10615     // TODO: will probably need multiple return barriers depending on return type
10616     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10617     StubCodeMark mark(this, stub_id);
10618     address start = __ pc();
10619 
10620     generate_cont_thaw(Continuation::thaw_return_barrier);
10621 
10622     return start;
10623   }
10624 
10625   address generate_cont_returnBarrier_exception() {
10626     if (!Continuations::enabled()) return nullptr;
10627 
10628     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10629     StubCodeMark mark(this, stub_id);
10630     address start = __ pc();
10631 
10632     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10633 
10634     return start;
10635   }
10636 
10637   address generate_cont_preempt_stub() {
10638     if (!Continuations::enabled()) return nullptr;
10639     StubId stub_id = StubId::stubgen_cont_preempt_id;
10640     StubCodeMark mark(this, stub_id);
10641     address start = __ pc();
10642 
10643     __ reset_last_Java_frame(true);
10644 
10645     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10646     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10647     __ mov(sp, rscratch2);
10648 
10649     Label preemption_cancelled;
10650     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10651     __ cbnz(rscratch1, preemption_cancelled);
10652 
10653     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10654     SharedRuntime::continuation_enter_cleanup(_masm);
10655     __ leave();
10656     __ ret(lr);
10657 
10658     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10659     __ bind(preemption_cancelled);
10660     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10661     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10662     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10663     __ ldr(rscratch1, Address(rscratch1));
10664     __ br(rscratch1);
10665 
10666     return start;
10667   }
10668 
10669   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10670   // are represented as long[5], with BITS_PER_LIMB = 26.
10671   // Pack five 26-bit limbs into three 64-bit registers.
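         // That is, for limbs l[0..4], each holding a 26-bit value, the packed integer is
         //
         //   l[0] + (l[1] << 26) + (l[2] << 52) + (l[3] << 78) + (l[4] << 104)
         //
         // with dest0 holding bits 0..63, dest1 bits 64..127, and dest2 the bits at 128 and above.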
10672   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10673     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10674     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10675     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10676     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10677 
10678     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10679     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10680     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10681     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10682 
10683     if (dest2->is_valid()) {
10684       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10685     } else {
10686 #ifdef ASSERT
10687       Label OK;
10688       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10689       __ br(__ EQ, OK);
10690       __ stop("high bits of Poly1305 integer should be zero");
10691       __ should_not_reach_here();
10692       __ bind(OK);
10693 #endif
10694     }
10695   }
10696 
10697   // As above, but return only a 128-bit integer, packed into two
10698   // 64-bit registers.
10699   void pack_26(Register dest0, Register dest1, Register src) {
10700     pack_26(dest0, dest1, noreg, src);
10701   }
10702 
10703   // Multiply and multiply-accumulate unsigned 64-bit registers.
10704   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10705     __ mul(prod_lo, n, m);
10706     __ umulh(prod_hi, n, m);
10707   }
10708   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10709     wide_mul(rscratch1, rscratch2, n, m);
10710     __ adds(sum_lo, sum_lo, rscratch1);
10711     __ adc(sum_hi, sum_hi, rscratch2);
10712   }
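
         // Equivalently, in C++ (a sketch using the 128-bit integer extension available
         // with GCC/Clang; the stub uses MUL/UMULH and ADDS/ADC instead):
         //
         //   static inline unsigned __int128 wide_mul_ref(uint64_t n, uint64_t m) {
         //     return (unsigned __int128)n * m;                 // prod_hi:prod_lo
         //   }
         //   static inline unsigned __int128 wide_madd_ref(unsigned __int128 sum,
         //                                                 uint64_t n, uint64_t m) {
         //     return sum + (unsigned __int128)n * m;           // sum_hi:sum_lo += n * m
         //   }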
10713 
10714   // Poly1305, RFC 7539
10715 
10716   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10717   // description of the tricks used to simplify and accelerate this
10718   // computation.
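         //
         // For orientation, each iteration of the block loop below computes, over the
         // prime p = 2^130 - 5 (a sketch of the math, not code):
         //
         //   acc = ((acc + block + 2^128) * r) mod p
         //
         // where r is the clamped 128-bit key. wide_mul/wide_madd build the wide product
         // and the following adds perform a partial reduction using 2^130 == 5 (mod p).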
10719 
10720   address generate_poly1305_processBlocks() {
10721     __ align(CodeEntryAlignment);
10722     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10723     StubCodeMark mark(this, stub_id);
10724     address start = __ pc();
10725     Label here;
10726     __ enter();
10727     RegSet callee_saved = RegSet::range(r19, r28);
10728     __ push(callee_saved, sp);
10729 
10730     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10731 
10732     // Arguments
10733     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10734 
10735     // R_n is the 128-bit randomly-generated key, packed into two
10736     // registers.  The caller passes this key to us as long[5], with
10737     // BITS_PER_LIMB = 26.
10738     const Register R_0 = *++regs, R_1 = *++regs;
10739     pack_26(R_0, R_1, r_start);
10740 
10741     // RR_n is (R_n >> 2) * 5
10742     const Register RR_0 = *++regs, RR_1 = *++regs;
10743     __ lsr(RR_0, R_0, 2);
10744     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10745     __ lsr(RR_1, R_1, 2);
10746     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10747 
10748     // U_n is the current checksum
10749     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10750     pack_26(U_0, U_1, U_2, acc_start);
10751 
10752     static constexpr int BLOCK_LENGTH = 16;
10753     Label DONE, LOOP;
10754 
10755     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10756     __ br(Assembler::LT, DONE); {
10757       __ bind(LOOP);
10758 
10759       // S_n is to be the sum of U_n and the next block of data
10760       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10761       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10762       __ adds(S_0, U_0, S_0);
10763       __ adcs(S_1, U_1, S_1);
10764       __ adc(S_2, U_2, zr);
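            // Add the 2^128 bit that the Poly1305 message encoding appends
            // to every full 16-byte block.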
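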
10765       __ add(S_2, S_2, 1);
10766 
10767       const Register U_0HI = *++regs, U_1HI = *++regs;
10768 
10769       // NB: this logic depends on some of the special properties of
10770       // Poly1305 keys. In particular, because we know that the top
10771       // four bits of R_0 and R_1 are zero, we can add together
10772       // partial products without any risk of needing to propagate a
10773       // carry out.
10774       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10775       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10776       __ andr(U_2, R_0, 3);
10777       __ mul(U_2, S_2, U_2);
10778 
10779       // Recycle registers S_0, S_1, S_2
10780       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10781 
10782       // Partial reduction mod 2**130 - 5
10783       __ adds(U_1, U_0HI, U_1);
10784       __ adc(U_2, U_1HI, U_2);
10785       // Sum now in U_2:U_1:U_0.
10786       // Dead: U_0HI, U_1HI.
10787       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10788 
10789       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10790 
10791       // First, U_2:U_1:U_0 += (U_2 >> 2)
10792       __ lsr(rscratch1, U_2, 2);
10793       __ andr(U_2, U_2, (u8)3);
10794       __ adds(U_0, U_0, rscratch1);
10795       __ adcs(U_1, U_1, zr);
10796       __ adc(U_2, U_2, zr);
10797       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10798       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10799       __ adcs(U_1, U_1, zr);
10800       __ adc(U_2, U_2, zr);
10801 
10802       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10803       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10804       __ br(~ Assembler::LT, LOOP);
10805     }
10806 
10807     // Further reduce modulo 2^130 - 5
10808     __ lsr(rscratch1, U_2, 2);
10809     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10810     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10811     __ adcs(U_1, U_1, zr);
10812     __ andr(U_2, U_2, (u1)3);
10813     __ adc(U_2, U_2, zr);
10814 
10815     // Unpack the sum into five 26-bit limbs and write to memory.
10816     __ ubfiz(rscratch1, U_0, 0, 26);
10817     __ ubfx(rscratch2, U_0, 26, 26);
10818     __ stp(rscratch1, rscratch2, Address(acc_start));
10819     __ ubfx(rscratch1, U_0, 52, 12);
10820     __ bfi(rscratch1, U_1, 12, 14);
10821     __ ubfx(rscratch2, U_1, 14, 26);
10822     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10823     __ ubfx(rscratch1, U_1, 40, 24);
10824     __ bfi(rscratch1, U_2, 24, 3);
10825     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10826 
10827     __ bind(DONE);
10828     __ pop(callee_saved, sp);
10829     __ leave();
10830     __ ret(lr);
10831 
10832     return start;
10833   }
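
        // In C, approximately (a reference sketch of the block loop above,
        // not the JDK sources: unsigned __int128 stands in for the register
        // pairs, load_le64 is a little-endian load helper defined in the
        // sketch, and the final reduction and repacking of the accumulator
        // into 26-bit limbs are omitted):
        //
        //   #include <stddef.h>
        //   #include <string.h>
        //   typedef unsigned long long u64;
        //   typedef unsigned __int128 u128;
        //
        //   static u64 load_le64(const unsigned char *p) {
        //     u64 v;  memcpy(&v, p, 8);  return v;   // little-endian host
        //   }
        //
        //   void poly1305_blocks(const unsigned char *in, size_t len,
        //                        u64 U[3],        // accumulator; U[2] = bits >= 128
        //                        const u64 R[2])  // clamped 128-bit key
        //   {
        //     u64 RR0 = (R[0] >> 2) * 5, RR1 = (R[1] >> 2) * 5;
        //     while (len >= 16) {
        //       // S = U + block + 2^128
        //       u128 s = (u128)U[0] + load_le64(in);
        //       u64 S0 = (u64)s;
        //       s = (s >> 64) + U[1] + load_le64(in + 8);
        //       u64 S1 = (u64)s;
        //       u64 S2 = (u64)(s >> 64) + U[2] + 1;
        //
        //       // U = S * R mod 2^130 - 5, partially reduced.  The clamped
        //       // key keeps the 128-bit sums below from overflowing.
        //       u128 lo = (u128)S0 * R[0] + (u128)S1 * RR1 + (u128)S2 * RR0;
        //       u128 hi = (u128)S0 * R[1] + (u128)S1 * R[0] + (u128)S2 * RR1
        //                 + (u64)(lo >> 64);
        //       U[0] = (u64)lo;
        //       U[1] = (u64)hi;
        //       U[2] = (u64)(hi >> 64) + S2 * (R[0] & 3);
        //
        //       // Fold the bits above 2^130 back in: U += (U[2] >> 2) * 5
        //       u64 c = U[2] >> 2;
        //       U[2] &= 3;
        //       u128 t = (u128)U[0] + c + ((u128)c << 2);
        //       U[0] = (u64)t;
        //       t = (t >> 64) + U[1];
        //       U[1] = (u64)t;
        //       U[2] += (u64)(t >> 64);
        //
        //       in += 16;
        //       len -= 16;
        //     }
        //   }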
10834 
10835   // exception handler for upcall stubs
10836   address generate_upcall_stub_exception_handler() {
10837     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10838     StubCodeMark mark(this, stub_id);
10839     address start = __ pc();
10840 
10841     // Native caller has no idea how to handle exceptions,
10842     // so we just crash here. Up to callee to catch exceptions.
10843     __ verify_oop(r0);
10844     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10845     __ blr(rscratch1);
10846     __ should_not_reach_here();
10847 
10848     return start;
10849   }
10850 
10851   // load Method* target of MethodHandle
10852   // j_rarg0 = jobject receiver
10853   // rmethod = result
10854   address generate_upcall_stub_load_target() {
10855     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10856     StubCodeMark mark(this, stub_id);
10857     address start = __ pc();
10858 
10859     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10860     // Load target method from receiver
10861     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10862     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10863     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10864     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10865                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10866                       noreg, noreg);
10867     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10868 
10869     __ ret(lr);
10870 
10871     return start;
10872   }
10873 
10874 #undef __
10875 #define __ masm->
10876 
10877   class MontgomeryMultiplyGenerator : public MacroAssembler {
10878 
10879     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10880       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10881 
10882     RegSet _toSave;
10883     bool _squaring;
10884 
10885   public:
10886     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10887       : MacroAssembler(as->code()), _squaring(squaring) {
10888 
10889       // Register allocation
10890 
10891       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10892       Pa_base = *regs;       // Argument registers
10893       if (squaring)
10894         Pb_base = Pa_base;
10895       else
10896         Pb_base = *++regs;
10897       Pn_base = *++regs;
10898       Rlen = *++regs;
10899       inv = *++regs;
10900       Pm_base = *++regs;
10901 
10902                           // Working registers:
10903       Ra =  *++regs;        // The current digit of a, b, n, and m.
10904       Rb =  *++regs;
10905       Rm =  *++regs;
10906       Rn =  *++regs;
10907 
10908       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10909       Pb =  *++regs;
10910       Pm =  *++regs;
10911       Pn =  *++regs;
10912 
10913       t0 =  *++regs;        // Three registers which form a
10914       t1 =  *++regs;        // triple-precision accumulator.
10915       t2 =  *++regs;
10916 
10917       Ri =  *++regs;        // Inner and outer loop indexes.
10918       Rj =  *++regs;
10919 
10920       Rhi_ab = *++regs;     // Product registers: low and high parts
10921       Rlo_ab = *++regs;     // of a*b and m*n.
10922       Rhi_mn = *++regs;
10923       Rlo_mn = *++regs;
10924 
10925       // r19 and up are callee-saved.
10926       _toSave = RegSet::range(r19, *regs) + Pm_base;
10927     }
10928 
10929   private:
10930     void save_regs() {
10931       push(_toSave, sp);
10932     }
10933 
10934     void restore_regs() {
10935       pop(_toSave, sp);
10936     }
10937 
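          // Invoke the member function `block` exactly `count` times.  The
          // loop body calls it twice per iteration; an odd count branches
          // into the middle of the body for its first pass, and a zero count
          // skips the loop entirely.  The second overload forwards
          // (d, s, tmp) to each call.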
10938     template <typename T>
10939     void unroll_2(Register count, T block) {
10940       Label loop, end, odd;
10941       tbnz(count, 0, odd);
10942       cbz(count, end);
10943       align(16);
10944       bind(loop);
10945       (this->*block)();
10946       bind(odd);
10947       (this->*block)();
10948       subs(count, count, 2);
10949       br(Assembler::GT, loop);
10950       bind(end);
10951     }
10952 
10953     template <typename T>
10954     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10955       Label loop, end, odd;
10956       tbnz(count, 0, odd);
10957       cbz(count, end);
10958       align(16);
10959       bind(loop);
10960       (this->*block)(d, s, tmp);
10961       bind(odd);
10962       (this->*block)(d, s, tmp);
10963       subs(count, count, 2);
10964       br(Assembler::GT, loop);
10965       bind(end);
10966     }
10967 
10968     void pre1(RegisterOrConstant i) {
10969       block_comment("pre1");
10970       // Pa = Pa_base;
10971       // Pb = Pb_base + i;
10972       // Pm = Pm_base;
10973       // Pn = Pn_base + i;
10974       // Ra = *Pa;
10975       // Rb = *Pb;
10976       // Rm = *Pm;
10977       // Rn = *Pn;
10978       ldr(Ra, Address(Pa_base));
10979       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10980       ldr(Rm, Address(Pm_base));
10981       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10982       lea(Pa, Address(Pa_base));
10983       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10984       lea(Pm, Address(Pm_base));
10985       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10986 
10987       // Zero the m*n result.
10988       mov(Rhi_mn, zr);
10989       mov(Rlo_mn, zr);
10990     }
10991 
10992     // The core multiply-accumulate step of a Montgomery
10993     // multiplication.  The idea is to schedule operations as a
10994     // pipeline so that instructions with long latencies (loads and
10995     // multiplies) have time to complete before their results are
10996     // used.  This most benefits in-order implementations of the
10997     // architecture but out-of-order ones also benefit.
10998     void step() {
10999       block_comment("step");
11000       // MACC(Ra, Rb, t0, t1, t2);
11001       // Ra = *++Pa;
11002       // Rb = *--Pb;
11003       umulh(Rhi_ab, Ra, Rb);
11004       mul(Rlo_ab, Ra, Rb);
11005       ldr(Ra, pre(Pa, wordSize));
11006       ldr(Rb, pre(Pb, -wordSize));
11007       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11008                                        // previous iteration.
11009       // MACC(Rm, Rn, t0, t1, t2);
11010       // Rm = *++Pm;
11011       // Rn = *--Pn;
11012       umulh(Rhi_mn, Rm, Rn);
11013       mul(Rlo_mn, Rm, Rn);
11014       ldr(Rm, pre(Pm, wordSize));
11015       ldr(Rn, pre(Pn, -wordSize));
11016       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11017     }
11018 
11019     void post1() {
11020       block_comment("post1");
11021 
11022       // MACC(Ra, Rb, t0, t1, t2);
11023       // Ra = *++Pa;
11024       // Rb = *--Pb;
11025       umulh(Rhi_ab, Ra, Rb);
11026       mul(Rlo_ab, Ra, Rb);
11027       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11028       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11029 
11030       // *Pm = Rm = t0 * inv;
11031       mul(Rm, t0, inv);
11032       str(Rm, Address(Pm));
11033 
11034       // MACC(Rm, Rn, t0, t1, t2);
11035       // t0 = t1; t1 = t2; t2 = 0;
11036       umulh(Rhi_mn, Rm, Rn);
11037 
11038 #ifndef PRODUCT
11039       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11040       {
11041         mul(Rlo_mn, Rm, Rn);
11042         add(Rlo_mn, t0, Rlo_mn);
11043         Label ok;
11044         cbz(Rlo_mn, ok); {
11045           stop("broken Montgomery multiply");
11046         } bind(ok);
11047       }
11048 #endif
11049       // We have very carefully set things up so that
11050       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11051       // the lower half of Rm * Rn because we know the result already:
11052       // it must be -t0.  t0 + (-t0) must generate a carry iff
11053       // t0 != 0.  So, rather than do a mul and an adds we just set
11054       // the carry flag iff t0 is nonzero.
11055       //
11056       // mul(Rlo_mn, Rm, Rn);
11057       // adds(zr, t0, Rlo_mn);
11058       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11059       adcs(t0, t1, Rhi_mn);
11060       adc(t1, t2, zr);
11061       mov(t2, zr);
11062     }
11063 
11064     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11065       block_comment("pre2");
11066       // Pa = Pa_base + i-len;
11067       // Pb = Pb_base + len;
11068       // Pm = Pm_base + i-len;
11069       // Pn = Pn_base + len;
11070 
11071       if (i.is_register()) {
11072         sub(Rj, i.as_register(), len);
11073       } else {
11074         mov(Rj, i.as_constant());
11075         sub(Rj, Rj, len);
11076       }
11077       // Rj == i-len
11078 
11079       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11080       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11081       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11082       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11083 
11084       // Ra = *++Pa;
11085       // Rb = *--Pb;
11086       // Rm = *++Pm;
11087       // Rn = *--Pn;
11088       ldr(Ra, pre(Pa, wordSize));
11089       ldr(Rb, pre(Pb, -wordSize));
11090       ldr(Rm, pre(Pm, wordSize));
11091       ldr(Rn, pre(Pn, -wordSize));
11092 
11093       mov(Rhi_mn, zr);
11094       mov(Rlo_mn, zr);
11095     }
11096 
11097     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11098       block_comment("post2");
11099       if (i.is_constant()) {
11100         mov(Rj, i.as_constant()-len.as_constant());
11101       } else {
11102         sub(Rj, i.as_register(), len);
11103       }
11104 
11105       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11106 
11107       // As soon as we know the least significant digit of our result,
11108       // store it.
11109       // Pm_base[i-len] = t0;
11110       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11111 
11112       // t0 = t1; t1 = t2; t2 = 0;
11113       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11114       adc(t1, t2, zr);
11115       mov(t2, zr);
11116     }
11117 
11118     // A carry in t0 after Montgomery multiplication means that we
11119     // should subtract multiples of n from our result in m.  We'll
11120     // keep doing that until there is no carry.
11121     void normalize(RegisterOrConstant len) {
11122       block_comment("normalize");
11123       // while (t0)
11124       //   t0 = sub(Pm_base, Pn_base, t0, len);
11125       Label loop, post, again;
11126       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11127       cbz(t0, post); {
11128         bind(again); {
11129           mov(i, zr);
11130           mov(cnt, len);
11131           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11132           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11133           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11134           align(16);
11135           bind(loop); {
11136             sbcs(Rm, Rm, Rn);
11137             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11138             add(i, i, 1);
11139             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11140             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11141             sub(cnt, cnt, 1);
11142           } cbnz(cnt, loop);
11143           sbc(t0, t0, zr);
11144         } cbnz(t0, again);
11145       } bind(post);
11146     }
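
          // In C, approximately (a sketch, not the JDK sources, of the sub()
          // helper referenced in the comment above and in the outlines that
          // follow: subtract n from m with borrow propagation and fold the
          // final borrow into the carry word t0):
          //
          //   static julong sub(julong Pm_base[], julong Pn_base[],
          //                     julong t0, int len) {
          //     julong borrow = 0;
          //     for (int i = 0; i < len; i++) {
          //       unsigned __int128 d = (unsigned __int128)Pm_base[i]
          //                             - Pn_base[i] - borrow;
          //       Pm_base[i] = (julong)d;
          //       borrow = (julong)(d >> 64) & 1;  // 1 if the subtraction wrapped
          //     }
          //     return t0 - borrow;
          //   }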
11147 
11148     // Move memory at s to d, reversing words.
11149     //    Increments d to end of copied memory
11150     //    Destroys tmp1, tmp2
11151     //    Preserves len
11152     //    Leaves s pointing to the address which was in d at start
11153     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11154       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11155       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11156 
11157       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11158       mov(tmp1, len);
11159       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11160       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11161     }
11162     // where
11163     void reverse1(Register d, Register s, Register tmp) {
11164       ldr(tmp, pre(s, -wordSize));
11165       ror(tmp, tmp, 32);
11166       str(tmp, post(d, wordSize));
11167     }
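
          // In C, approximately (a sketch of the data movement only; the code
          // above also advances d past the copy and leaves s pointing at the
          // old d).  Reversing the jint digits two at a time requires swapping
          // the two halves of each julong, hence the ror(32):
          //
          //   void reverse(julong *d, const julong *s, int len) {
          //     for (int i = 0; i < len; i++) {
          //       julong w = s[len - 1 - i];
          //       d[i] = (w >> 32) | (w << 32);  // swap 32-bit halves
          //     }
          //   }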
11168 
11169     void step_squaring() {
11170       // An extra ACC
11171       step();
11172       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11173     }
11174 
11175     void last_squaring(RegisterOrConstant i) {
11176       Label dont;
11177       // if ((i & 1) == 0) {
11178       tbnz(i.as_register(), 0, dont); {
11179         // MACC(Ra, Rb, t0, t1, t2);
11180         // Ra = *++Pa;
11181         // Rb = *--Pb;
11182         umulh(Rhi_ab, Ra, Rb);
11183         mul(Rlo_ab, Ra, Rb);
11184         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11185       } bind(dont);
11186     }
11187 
11188     void extra_step_squaring() {
11189       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11190 
11191       // MACC(Rm, Rn, t0, t1, t2);
11192       // Rm = *++Pm;
11193       // Rn = *--Pn;
11194       umulh(Rhi_mn, Rm, Rn);
11195       mul(Rlo_mn, Rm, Rn);
11196       ldr(Rm, pre(Pm, wordSize));
11197       ldr(Rn, pre(Pn, -wordSize));
11198     }
11199 
11200     void post1_squaring() {
11201       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11202 
11203       // *Pm = Rm = t0 * inv;
11204       mul(Rm, t0, inv);
11205       str(Rm, Address(Pm));
11206 
11207       // MACC(Rm, Rn, t0, t1, t2);
11208       // t0 = t1; t1 = t2; t2 = 0;
11209       umulh(Rhi_mn, Rm, Rn);
11210 
11211 #ifndef PRODUCT
11212       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11213       {
11214         mul(Rlo_mn, Rm, Rn);
11215         add(Rlo_mn, t0, Rlo_mn);
11216         Label ok;
11217         cbz(Rlo_mn, ok); {
11218           stop("broken Montgomery multiply");
11219         } bind(ok);
11220       }
11221 #endif
11222       // We have very carefully set things up so that
11223       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11224       // the lower half of Rm * Rn because we know the result already:
11225       // it must be -t0.  t0 + (-t0) must generate a carry iff
11226       // t0 != 0.  So, rather than do a mul and an adds we just set
11227       // the carry flag iff t0 is nonzero.
11228       //
11229       // mul(Rlo_mn, Rm, Rn);
11230       // adds(zr, t0, Rlo_mn);
11231       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11232       adcs(t0, t1, Rhi_mn);
11233       adc(t1, t2, zr);
11234       mov(t2, zr);
11235     }
11236 
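          // t2:t1:t0 += Rhi:Rlo, accumulating a 128-bit product into the
          // triple-precision accumulator.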
11237     void acc(Register Rhi, Register Rlo,
11238              Register t0, Register t1, Register t2) {
11239       adds(t0, t0, Rlo);
11240       adcs(t1, t1, Rhi);
11241       adc(t2, t2, zr);
11242     }
11243 
11244   public:
11245     /**
11246      * Fast Montgomery multiplication.  The derivation of the
11247      * algorithm is in A Cryptographic Library for the Motorola
11248      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11249      *
11250      * Arguments:
11251      *
11252      * Inputs for multiplication:
11253      *   c_rarg0   - int array elements a
11254      *   c_rarg1   - int array elements b
11255      *   c_rarg2   - int array elements n (the modulus)
11256      *   c_rarg3   - int length
11257      *   c_rarg4   - int inv
11258      *   c_rarg5   - int array elements m (the result)
11259      *
11260      * Inputs for squaring:
11261      *   c_rarg0   - int array elements a
11262      *   c_rarg1   - int array elements n (the modulus)
11263      *   c_rarg2   - int length
11264      *   c_rarg3   - int inv
11265      *   c_rarg4   - int array elements m (the result)
11266      *
11267      */
11268     address generate_multiply() {
11269       Label argh, nothing;
11270       bind(argh);
11271       stop("MontgomeryMultiply total_allocation must be <= 8192");
11272 
11273       align(CodeEntryAlignment);
11274       address entry = pc();
11275 
11276       cbzw(Rlen, nothing);
11277 
11278       enter();
11279 
11280       // Make room: 4 * Rlen ints of scratch for the reversed operands and the result.
11281       cmpw(Rlen, 512);
11282       br(Assembler::HI, argh);
11283       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11284       andr(sp, Ra, -2 * wordSize);
11285 
11286       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11287 
11288       {
11289         // Copy input args, reversing as we go.  We use Ra as a
11290         // temporary variable.
11291         reverse(Ra, Pa_base, Rlen, t0, t1);
11292         if (!_squaring)
11293           reverse(Ra, Pb_base, Rlen, t0, t1);
11294         reverse(Ra, Pn_base, Rlen, t0, t1);
11295       }
11296 
11297       // Push all call-saved registers and also Pm_base which we'll need
11298       // at the end.
11299       save_regs();
11300 
11301 #ifndef PRODUCT
11302       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11303       {
11304         ldr(Rn, Address(Pn_base, 0));
11305         mul(Rlo_mn, Rn, inv);
11306         subs(zr, Rlo_mn, -1);
11307         Label ok;
11308         br(EQ, ok); {
11309           stop("broken inverse in Montgomery multiply");
11310         } bind(ok);
11311       }
11312 #endif
11313 
11314       mov(Pm_base, Ra);
11315 
11316       mov(t0, zr);
11317       mov(t1, zr);
11318       mov(t2, zr);
11319 
11320       block_comment("for (int i = 0; i < len; i++) {");
11321       mov(Ri, zr); {
11322         Label loop, end;
11323         cmpw(Ri, Rlen);
11324         br(Assembler::GE, end);
11325 
11326         bind(loop);
11327         pre1(Ri);
11328 
11329         block_comment("  for (j = i; j; j--) {"); {
11330           movw(Rj, Ri);
11331           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11332         } block_comment("  } // j");
11333 
11334         post1();
11335         addw(Ri, Ri, 1);
11336         cmpw(Ri, Rlen);
11337         br(Assembler::LT, loop);
11338         bind(end);
11339         block_comment("} // i");
11340       }
11341 
11342       block_comment("for (int i = len; i < 2*len; i++) {");
11343       mov(Ri, Rlen); {
11344         Label loop, end;
11345         cmpw(Ri, Rlen, Assembler::LSL, 1);
11346         br(Assembler::GE, end);
11347 
11348         bind(loop);
11349         pre2(Ri, Rlen);
11350 
11351         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11352           lslw(Rj, Rlen, 1);
11353           subw(Rj, Rj, Ri);
11354           subw(Rj, Rj, 1);
11355           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11356         } block_comment("  } // j");
11357 
11358         post2(Ri, Rlen);
11359         addw(Ri, Ri, 1);
11360         cmpw(Ri, Rlen, Assembler::LSL, 1);
11361         br(Assembler::LT, loop);
11362         bind(end);
11363       }
11364       block_comment("} // i");
11365 
11366       normalize(Rlen);
11367 
11368       mov(Ra, Pm_base);  // Save Pm_base in Ra
11369       restore_regs();  // Restore caller's Pm_base
11370 
11371       // Copy our result into caller's Pm_base
11372       reverse(Pm_base, Ra, Rlen, t0, t1);
11373 
11374       leave();
11375       bind(nothing);
11376       ret(lr);
11377 
11378       return entry;
11379     }
11380     // In C, approximately:
11381 
11382     // void
11383     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11384     //                     julong Pn_base[], julong Pm_base[],
11385     //                     julong inv, int len) {
11386     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11387     //   julong *Pa, *Pb, *Pn, *Pm;
11388     //   julong Ra, Rb, Rn, Rm;
11389 
11390     //   int i;
11391 
11392     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11393 
11394     //   for (i = 0; i < len; i++) {
11395     //     int j;
11396 
11397     //     Pa = Pa_base;
11398     //     Pb = Pb_base + i;
11399     //     Pm = Pm_base;
11400     //     Pn = Pn_base + i;
11401 
11402     //     Ra = *Pa;
11403     //     Rb = *Pb;
11404     //     Rm = *Pm;
11405     //     Rn = *Pn;
11406 
11407     //     int iters = i;
11408     //     for (j = 0; iters--; j++) {
11409     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11410     //       MACC(Ra, Rb, t0, t1, t2);
11411     //       Ra = *++Pa;
11412     //       Rb = *--Pb;
11413     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11414     //       MACC(Rm, Rn, t0, t1, t2);
11415     //       Rm = *++Pm;
11416     //       Rn = *--Pn;
11417     //     }
11418 
11419     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11420     //     MACC(Ra, Rb, t0, t1, t2);
11421     //     *Pm = Rm = t0 * inv;
11422     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11423     //     MACC(Rm, Rn, t0, t1, t2);
11424 
11425     //     assert(t0 == 0, "broken Montgomery multiply");
11426 
11427     //     t0 = t1; t1 = t2; t2 = 0;
11428     //   }
11429 
11430     //   for (i = len; i < 2*len; i++) {
11431     //     int j;
11432 
11433     //     Pa = Pa_base + i-len;
11434     //     Pb = Pb_base + len;
11435     //     Pm = Pm_base + i-len;
11436     //     Pn = Pn_base + len;
11437 
11438     //     Ra = *++Pa;
11439     //     Rb = *--Pb;
11440     //     Rm = *++Pm;
11441     //     Rn = *--Pn;
11442 
11443     //     int iters = len*2-i-1;
11444     //     for (j = i-len+1; iters--; j++) {
11445     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11446     //       MACC(Ra, Rb, t0, t1, t2);
11447     //       Ra = *++Pa;
11448     //       Rb = *--Pb;
11449     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11450     //       MACC(Rm, Rn, t0, t1, t2);
11451     //       Rm = *++Pm;
11452     //       Rn = *--Pn;
11453     //     }
11454 
11455     //     Pm_base[i-len] = t0;
11456     //     t0 = t1; t1 = t2; t2 = 0;
11457     //   }
11458 
11459     //   while (t0)
11460     //     t0 = sub(Pm_base, Pn_base, t0, len);
11461     // }
11462 
11463     /**
11464      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11465      * multiplies than Montgomery multiplication so it should be up to
11466      * 25% faster.  However, its loop control is more complex and it
11467      * may actually run slower on some machines.
11468      *
11469      * Arguments:
11470      *
11471      * Inputs:
11472      *   c_rarg0   - int array elements a
11473      *   c_rarg1   - int array elements n (the modulus)
11474      *   c_rarg2   - int length
11475      *   c_rarg3   - int inv
11476      *   c_rarg4   - int array elements m (the result)
11477      *
11478      */
11479     address generate_square() {
11480       Label argh;
11481       bind(argh);
11482       stop("MontgomeryMultiply total_allocation must be <= 8192");
11483 
11484       align(CodeEntryAlignment);
11485       address entry = pc();
11486 
11487       enter();
11488 
11489       // Make room.
11490       cmpw(Rlen, 512);
11491       br(Assembler::HI, argh);
11492       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11493       andr(sp, Ra, -2 * wordSize);
11494 
11495       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11496 
11497       {
11498         // Copy input args, reversing as we go.  We use Ra as a
11499         // temporary variable.
11500         reverse(Ra, Pa_base, Rlen, t0, t1);
11501         reverse(Ra, Pn_base, Rlen, t0, t1);
11502       }
11503 
11504       // Push all call-saved registers and also Pm_base which we'll need
11505       // at the end.
11506       save_regs();
11507 
11508       mov(Pm_base, Ra);
11509 
11510       mov(t0, zr);
11511       mov(t1, zr);
11512       mov(t2, zr);
11513 
11514       block_comment("for (int i = 0; i < len; i++) {");
11515       mov(Ri, zr); {
11516         Label loop, end;
11517         bind(loop);
11518         cmp(Ri, Rlen);
11519         br(Assembler::GE, end);
11520 
11521         pre1(Ri);
11522 
11523         block_comment("for (j = (i+1)/2; j; j--) {"); {
11524           add(Rj, Ri, 1);
11525           lsr(Rj, Rj, 1);
11526           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11527         } block_comment("  } // j");
11528 
11529         last_squaring(Ri);
11530 
11531         block_comment("  for (j = i/2; j; j--) {"); {
11532           lsr(Rj, Ri, 1);
11533           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11534         } block_comment("  } // j");
11535 
11536         post1_squaring();
11537         add(Ri, Ri, 1);
11538         cmp(Ri, Rlen);
11539         br(Assembler::LT, loop);
11540 
11541         bind(end);
11542         block_comment("} // i");
11543       }
11544 
11545       block_comment("for (int i = len; i < 2*len; i++) {");
11546       mov(Ri, Rlen); {
11547         Label loop, end;
11548         bind(loop);
11549         cmp(Ri, Rlen, Assembler::LSL, 1);
11550         br(Assembler::GE, end);
11551 
11552         pre2(Ri, Rlen);
11553 
11554         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11555           lsl(Rj, Rlen, 1);
11556           sub(Rj, Rj, Ri);
11557           sub(Rj, Rj, 1);
11558           lsr(Rj, Rj, 1);
11559           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11560         } block_comment("  } // j");
11561 
11562         last_squaring(Ri);
11563 
11564         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11565           lsl(Rj, Rlen, 1);
11566           sub(Rj, Rj, Ri);
11567           lsr(Rj, Rj, 1);
11568           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11569         } block_comment("  } // j");
11570 
11571         post2(Ri, Rlen);
11572         add(Ri, Ri, 1);
11573         cmp(Ri, Rlen, Assembler::LSL, 1);
11574 
11575         br(Assembler::LT, loop);
11576         bind(end);
11577         block_comment("} // i");
11578       }
11579 
11580       normalize(Rlen);
11581 
11582       mov(Ra, Pm_base);  // Save Pm_base in Ra
11583       restore_regs();  // Restore caller's Pm_base
11584 
11585       // Copy our result into caller's Pm_base
11586       reverse(Pm_base, Ra, Rlen, t0, t1);
11587 
11588       leave();
11589       ret(lr);
11590 
11591       return entry;
11592     }
11593     // In C, approximately:
11594 
11595     // void
11596     // montgomery_square(julong Pa_base[], julong Pn_base[],
11597     //                   julong Pm_base[], julong inv, int len) {
11598     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11599     //   julong *Pa, *Pb, *Pn, *Pm;
11600     //   julong Ra, Rb, Rn, Rm;
11601 
11602     //   int i;
11603 
11604     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11605 
11606     //   for (i = 0; i < len; i++) {
11607     //     int j;
11608 
11609     //     Pa = Pa_base;
11610     //     Pb = Pa_base + i;
11611     //     Pm = Pm_base;
11612     //     Pn = Pn_base + i;
11613 
11614     //     Ra = *Pa;
11615     //     Rb = *Pb;
11616     //     Rm = *Pm;
11617     //     Rn = *Pn;
11618 
11619     //     int iters = (i+1)/2;
11620     //     for (j = 0; iters--; j++) {
11621     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11622     //       MACC2(Ra, Rb, t0, t1, t2);
11623     //       Ra = *++Pa;
11624     //       Rb = *--Pb;
11625     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11626     //       MACC(Rm, Rn, t0, t1, t2);
11627     //       Rm = *++Pm;
11628     //       Rn = *--Pn;
11629     //     }
11630     //     if ((i & 1) == 0) {
11631     //       assert(Ra == Pa_base[j], "must be");
11632     //       MACC(Ra, Ra, t0, t1, t2);
11633     //     }
11634     //     iters = i/2;
11635     //     assert(iters == i-j, "must be");
11636     //     for (; iters--; j++) {
11637     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11638     //       MACC(Rm, Rn, t0, t1, t2);
11639     //       Rm = *++Pm;
11640     //       Rn = *--Pn;
11641     //     }
11642 
11643     //     *Pm = Rm = t0 * inv;
11644     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11645     //     MACC(Rm, Rn, t0, t1, t2);
11646 
11647     //     assert(t0 == 0, "broken Montgomery multiply");
11648 
11649     //     t0 = t1; t1 = t2; t2 = 0;
11650     //   }
11651 
11652     //   for (i = len; i < 2*len; i++) {
11653     //     int start = i-len+1;
11654     //     int end = start + (len - start)/2;
11655     //     int j;
11656 
11657     //     Pa = Pa_base + i-len;
11658     //     Pb = Pa_base + len;
11659     //     Pm = Pm_base + i-len;
11660     //     Pn = Pn_base + len;
11661 
11662     //     Ra = *++Pa;
11663     //     Rb = *--Pb;
11664     //     Rm = *++Pm;
11665     //     Rn = *--Pn;
11666 
11667     //     int iters = (2*len-i-1)/2;
11668     //     assert(iters == end-start, "must be");
11669     //     for (j = start; iters--; j++) {
11670     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11671     //       MACC2(Ra, Rb, t0, t1, t2);
11672     //       Ra = *++Pa;
11673     //       Rb = *--Pb;
11674     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11675     //       MACC(Rm, Rn, t0, t1, t2);
11676     //       Rm = *++Pm;
11677     //       Rn = *--Pn;
11678     //     }
11679     //     if ((i & 1) == 0) {
11680     //       assert(Ra == Pa_base[j], "must be");
11681     //       MACC(Ra, Ra, t0, t1, t2);
11682     //     }
11683     //     iters =  (2*len-i)/2;
11684     //     assert(iters == len-j, "must be");
11685     //     for (; iters--; j++) {
11686     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11687     //       MACC(Rm, Rn, t0, t1, t2);
11688     //       Rm = *++Pm;
11689     //       Rn = *--Pn;
11690     //     }
11691     //     Pm_base[i-len] = t0;
11692     //     t0 = t1; t1 = t2; t2 = 0;
11693     //   }
11694 
11695     //   while (t0)
11696     //     t0 = sub(Pm_base, Pn_base, t0, len);
11697     // }
11698   };
11699 
11700   // Called from the interpreter or compiled code either to load the
11701   // multiple returned values of the inline type instance being
11702   // returned into registers, or to store the returned values into a
11703   // newly allocated inline type instance.
11704   address generate_return_value_stub(address destination, const char* name, bool has_res) {
11705     // We need to save all registers the calling convention may use so
11706     // the runtime calls read or update those registers. This needs to
11707     // be in sync with SharedRuntime::java_return_convention().
11708     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
11709     enum layout {
11710       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
11711       j_rarg6_off, j_rarg6_2,
11712       j_rarg5_off, j_rarg5_2,
11713       j_rarg4_off, j_rarg4_2,
11714       j_rarg3_off, j_rarg3_2,
11715       j_rarg2_off, j_rarg2_2,
11716       j_rarg1_off, j_rarg1_2,
11717       j_rarg0_off, j_rarg0_2,
11718 
11719       j_farg7_off, j_farg7_2,
11720       j_farg6_off, j_farg6_2,
11721       j_farg5_off, j_farg5_2,
11722       j_farg4_off, j_farg4_2,
11723       j_farg3_off, j_farg3_2,
11724       j_farg2_off, j_farg2_2,
11725       j_farg1_off, j_farg1_2,
11726       j_farg0_off, j_farg0_2,
11727 
11728       rfp_off, rfp_off2,
11729       return_off, return_off2,
11730 
11731       framesize // inclusive of return address
11732     };
11733 
11734     CodeBuffer code(name, 512, 64);
11735     MacroAssembler* masm = new MacroAssembler(&code);
11736 
11737     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11738     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11739     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11740     int frame_size_in_words = frame_size_in_bytes / wordSize;
11741 
11742     OopMapSet* oop_maps = new OopMapSet();
11743     OopMap* map = new OopMap(frame_size_in_slots, 0);
11744 
11745     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11746     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11747     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11748     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11749     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11750     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11751     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11752     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11753 
11754     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11755     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11756     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11757     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11758     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11759     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11760     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11761     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11762 
11763     address start = __ pc();
11764 
11765     __ enter(); // Save FP and LR before call
11766 
11767     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11768     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11769     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11770     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11771 
11772     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11773     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11774     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11775     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11776 
11777     int frame_complete = __ offset();
11778 
11779     // Set up last_Java_sp and last_Java_fp
11780     address the_pc = __ pc();
11781     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11782 
11783     // Call runtime
11784     __ mov(c_rarg1, r0);
11785     __ mov(c_rarg0, rthread);
11786 
11787     __ mov(rscratch1, destination);
11788     __ blr(rscratch1);
11789 
11790     oop_maps->add_gc_map(the_pc - start, map);
11791 
11792     __ reset_last_Java_frame(false);
11793 
11794     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11795     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11796     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11797     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11798 
11799     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11800     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11801     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11802     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11803 
11804     __ leave();
11805 
11806     // check for pending exceptions
11807     Label pending;
11808     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11809     __ cbnz(rscratch1, pending);
11810 
11811     if (has_res) {
11812       __ get_vm_result_oop(r0, rthread);
11813     }
11814 
11815     __ ret(lr);
11816 
11817     __ bind(pending);
11818     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11819 
11820     // -------------
11821     // make sure all code is generated
11822     masm->flush();
11823 
11824     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11825     return stub->entry_point();
11826   }
11827 
11828   // Initialization
11829   void generate_preuniverse_stubs() {
11830     // preuniverse stubs are not needed for aarch64
11831   }
11832 
11833   void generate_initial_stubs() {
11834     // Generate initial stubs and initialize the entry points
11835 
11836     // Entry points that exist on all platforms. Note: this is code
11837     // that could be shared among different platforms; however, the
11838     // benefit seems to be smaller than the disadvantage of having a
11839     // much more complicated generator structure. See also the comment
11840     // in stubRoutines.hpp.
11841 
11842     StubRoutines::_forward_exception_entry = generate_forward_exception();
11843 
11844     StubRoutines::_call_stub_entry =
11845       generate_call_stub(StubRoutines::_call_stub_return_address);
11846 
11847     // is referenced by megamorphic call
11848     StubRoutines::_catch_exception_entry = generate_catch_exception();
11849 
11850     // Initialize table for copy memory (arraycopy) check.
11851     if (UnsafeMemoryAccess::_table == nullptr) {
11852       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11853     }
11854 
11855     if (UseCRC32Intrinsics) {
11856       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11857     }
11858 
11859     if (UseCRC32CIntrinsics) {
11860       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11861     }
11862 
11863     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11864       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11865     }
11866 
11867     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11868       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11869     }
11870 
11871     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11872         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11873       StubRoutines::_hf2f = generate_float16ToFloat();
11874       StubRoutines::_f2hf = generate_floatToFloat16();
11875     }
11876 
11877     if (InlineTypeReturnedAsFields) {
11878       StubRoutines::_load_inline_type_fields_in_regs =
11879          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11880       StubRoutines::_store_inline_type_fields_to_buf =
11881          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11882     }
11883 
11884   }
11885 
11886   void generate_continuation_stubs() {
11887     // Continuation stubs:
11888     StubRoutines::_cont_thaw          = generate_cont_thaw();
11889     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11890     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11891     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11892   }
11893 
11894   void generate_final_stubs() {
11895     // support for verify_oop (must happen after universe_init)
11896     if (VerifyOops) {
11897       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11898     }
11899 
11900     // arraycopy stubs used by compilers
11901     generate_arraycopy_stubs();
11902 
11903     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11904 
11905     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11906 
11907     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11908     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11909 
11910 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11911 
11912     generate_atomic_entry_points();
11913 
11914 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11915 
11916 #ifdef COMPILER2
11917     if (UseSecondarySupersTable) {
11918       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11919       if (! InlineSecondarySupersTest) {
11920         generate_lookup_secondary_supers_table_stub();
11921       }
11922     }
11923 #endif
11924 
11925     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11926 
11927     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11928   }
11929 
11930   void generate_compiler_stubs() {
11931 #if COMPILER2_OR_JVMCI
11932 
11933     if (UseSVE == 0) {
11934       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11935     }
11936 
11937     // array equals stub for large arrays.
11938     if (!UseSimpleArrayEquals) {
11939       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11940     }
11941 
11942     // arrays_hashcode stub for large arrays.
11943     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11944     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11945     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11946     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11947     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11948 
11949     // byte_array_inflate stub for large arrays.
11950     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11951 
11952     // countPositives stub for large arrays.
11953     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11954 
11955     generate_compare_long_strings();
11956 
11957     generate_string_indexof_stubs();
11958 
11959 #ifdef COMPILER2
11960     if (UseMultiplyToLenIntrinsic) {
11961       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11962     }
11963 
11964     if (UseSquareToLenIntrinsic) {
11965       StubRoutines::_squareToLen = generate_squareToLen();
11966     }
11967 
11968     if (UseMulAddIntrinsic) {
11969       StubRoutines::_mulAdd = generate_mulAdd();
11970     }
11971 
11972     if (UseSIMDForBigIntegerShiftIntrinsics) {
11973       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11974       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11975     }
11976 
11977     if (UseMontgomeryMultiplyIntrinsic) {
11978       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11979       StubCodeMark mark(this, stub_id);
11980       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11981       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11982     }
11983 
11984     if (UseMontgomerySquareIntrinsic) {
11985       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11986       StubCodeMark mark(this, stub_id);
11987       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11988       // We use generate_multiply() rather than generate_square()
11989       // because it's faster for the sizes of modulus we care about.
11990       StubRoutines::_montgomerySquare = g.generate_multiply();
11991     }
11992 
11993 #endif // COMPILER2
11994 
11995     if (UseChaCha20Intrinsics) {
11996       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11997     }
11998 
11999     if (UseKyberIntrinsics) {
12000       StubRoutines::_kyberNtt = generate_kyberNtt();
12001       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12002       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12003       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12004       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12005       StubRoutines::_kyber12To16 = generate_kyber12To16();
12006       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12007     }
12008 
12009     if (UseDilithiumIntrinsics) {
12010       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12011       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12012       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12013       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12014       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12015     }
12016 
12017     if (UseBASE64Intrinsics) {
12018         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12019         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12020     }
12021 
12022     // data cache line writeback
12023     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12024     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12025 
12026     if (UseAESIntrinsics) {
12027       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12028       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12029       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12030       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12031       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12032     }
12033     if (UseGHASHIntrinsics) {
12034       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12035       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12036     }
12037     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12038       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12039     }
12040 
12041     if (UseMD5Intrinsics) {
12042       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12043       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12044     }
12045     if (UseSHA1Intrinsics) {
12046       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12047       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12048     }
12049     if (UseSHA256Intrinsics) {
12050       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12051       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12052     }
12053     if (UseSHA512Intrinsics) {
12054       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12055       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12056     }
12057     if (UseSHA3Intrinsics) {
12058 
12059       StubRoutines::_double_keccak         = generate_double_keccak();
12060       if (UseSIMDForSHA3Intrinsic) {
12061          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12062          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12063       } else {
12064          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12065          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12066       }
12067     }
12068 
12069     if (UsePoly1305Intrinsics) {
12070       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12071     }
12072 
12073     // generate Adler32 intrinsics code
12074     if (UseAdler32Intrinsics) {
12075       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12076     }
12077 
12078 #endif // COMPILER2_OR_JVMCI
12079   }
12080 
12081  public:
12082   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12083     switch(blob_id) {
12084     case BlobId::stubgen_preuniverse_id:
12085       generate_preuniverse_stubs();
12086       break;
12087     case BlobId::stubgen_initial_id:
12088       generate_initial_stubs();
12089       break;
12090     case BlobId::stubgen_continuation_id:
12091       generate_continuation_stubs();
12092       break;
12093     case BlobId::stubgen_compiler_id:
12094       generate_compiler_stubs();
12095       break;
12096     case BlobId::stubgen_final_id:
12097       generate_final_stubs();
12098       break;
12099     default:
12100       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12101       break;
12102     };
12103   }
12104 }; // end class declaration
12105 
12106 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12107   StubGenerator g(code, blob_id);
12108 }
12109 
12110 
12111 #if defined (LINUX)
12112 
12113 // Define pointers to atomic stubs and initialize them to point to the
12114 // code in atomic_aarch64.S.
12115 
12116 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12117   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12118     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12119   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12120     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
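
      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, roughly, to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;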
12121 
12122 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12123 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12124 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12125 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12126 DEFAULT_ATOMIC_OP(xchg, 4, )
12127 DEFAULT_ATOMIC_OP(xchg, 8, )
12128 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12129 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12130 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12131 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12132 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12133 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12134 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12135 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12136 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12137 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12138 
12139 #undef DEFAULT_ATOMIC_OP
12140 
12141 #endif // LINUX