1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp
  // (r31) into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
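    // sp now points at the lowest slot of the save area (the FPCR slot at
    // sp_after_call_off), leaving the register save area between sp and fp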
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
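    // FPCR fields adjusted below: RMode is bits [23:22], FZ bit 24, DN bit 25;
    // the exception trap enables (IOE..IXE) live in bits [12:8].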
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
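    // round sp down to a 16-byte boundary: the AArch64 ABI requires sp to be
    // 16-byte aligned whenever it is used to access memory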
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
    // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
  495 
  496   address generate_forward_exception() {
  497     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
  636   address generate_iota_indices(StubGenStubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
      // Ensure the ZVA length is divisible by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
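      // If the base is only 8-byte aligned, zero one word first so the bulk
      // zeroing below operates on a 16-byte-aligned address.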
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
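      // low_limit is in bytes while cnt is in words, hence the shift by 3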
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
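    // for a forwards copy s and d are pre-decremented by this bias so that the
    // positive offsets used in the loop below address the original block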
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     __ bind(start);
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
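       // a negative prefetch offset this large will not fit the immediate
       // form of prfm, so keep the stride in a register instead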
  913        use_stride = prefetch > 256;
  914        prefetch = -prefetch;
  915        if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
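    // main copy loop, software pipelined: store the 8 words loaded on the
    // previous iteration while loading the next 8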
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
      // source address is even-word (16-byte) aligned, target is only odd-word aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
  999       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1000       //
      // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045          use_stride = prefetch > 256;
 1046          prefetch = -prefetch;
 1047          if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 1058        // offsets
 1059        //
 1060        // t0 at offset 0
 1061        // t1 at offset 8,  t2 at offset 16
 1062        // t3 at offset 24, t4 at offset 32
 1063        // t5 at offset 40, t6 at offset 48
 1064        // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 1078        // offsets
 1079        //
 1080        // t1 at offset -8
 1081        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 1083        // t7 at offset -56, t4 at offset -48
 1084        //                   t6 at offset -64
 1085        //
 1086        // note that this matches the offsets previously noted for the
 1087        // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128        // this is the same as above but copying only 4 longs hence
 1129        // with only one intervening stp between the str instructions
 1130        // but note that the offsets and registers still follow the
 1131        // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146        // this is the same as above but copying only 2 longs hence
 1147        // there is no intervening stp between the str instructions
 1148        // but note that the offset and register patterns are still
 1149        // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d point just past the last words written
 1162 
 1163        if (direction == copy_forwards) {
 1164          __ add(s, s, 16);
 1165          __ add(d, d, 8);
 1166        }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171       }
 1172   }
 1173 
 1174   // Small copy: less than 16 bytes.
 1175   //
 1176   // NB: Ignores all of the bits of count which represent more than 15
 1177   // bytes, so a caller doesn't have to mask them.
 1178 
 1179   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1180     bool is_backwards = step < 0;
 1181     size_t granularity = g_uabs(step);
 1182     int direction = is_backwards ? -1 : 1;
 1183 
 1184     Label Lword, Lint, Lshort, Lbyte;
 1185 
 1186     assert(granularity
 1187            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1188 
 1189     const Register t0 = r3;
 1190     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1191     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1192 
 1193     // ??? I don't know if this bit-test-and-branch is the right thing
 1194     // to do.  It does a lot of jumping, resulting in several
 1195     // mispredicted branches.  It might make more sense to do this
 1196     // with something like Duff's device with a single computed branch.
 1197 
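    // count is in elements, so the bit tested below (3 - log2(granularity))
    // corresponds to 8 bytes' worth of elements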
 1198     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1199     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1200     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1201     __ bind(Lword);
 1202 
 1203     if (granularity <= sizeof (jint)) {
 1204       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1205       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1206       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1207       __ bind(Lint);
 1208     }
 1209 
 1210     if (granularity <= sizeof (jshort)) {
 1211       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1212       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1213       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1214       __ bind(Lshort);
 1215     }
 1216 
 1217     if (granularity <= sizeof (jbyte)) {
 1218       __ tbz(count, 0, Lbyte);
 1219       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1220       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1221       __ bind(Lbyte);
 1222     }
 1223   }
 1224 
 1225   Label copy_f, copy_b;
 1226   Label copy_obj_f, copy_obj_b;
 1227   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1228 
 1229   // All-singing all-dancing memory copy.
 1230   //
 1231   // Copy count units of memory from s to d.  The size of a unit is
 1232   // step, which can be positive or negative depending on the direction
 1233   // of copy.  If is_aligned is false, we align the source address.
 1234   //
 1235 
 1236   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1237                    Register s, Register d, Register count, int step) {
 1238     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1239     bool is_backwards = step < 0;
 1240     unsigned int granularity = g_uabs(step);
 1241     const Register t0 = r3, t1 = r4;
 1242 
    // Copy <= 80 (or 96 for SIMD) bytes inline. Direction doesn't matter because we always
 1244     // load all the data before writing anything
 1245     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1246     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1247     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1248     const Register send = r17, dend = r16;
 1249     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1250     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1251     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1252 
 1253     if (PrefetchCopyIntervalInBytes > 0)
 1254       __ prfm(Address(s, 0), PLDL1KEEP);
 1255     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1256     __ br(Assembler::HI, copy_big);
 1257 
 1258     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1259     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
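    // send/dend point one element past the end of the source/destination;
    // the small cases below copy from both ends and rely on all loads being
    // issued before any stores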
 1260 
 1261     __ cmp(count, u1(16/granularity));
 1262     __ br(Assembler::LS, copy16);
 1263 
 1264     __ cmp(count, u1(64/granularity));
 1265     __ br(Assembler::HI, copy80);
 1266 
 1267     __ cmp(count, u1(32/granularity));
 1268     __ br(Assembler::LS, copy32);
 1269 
 1270     // 33..64 bytes
 1271     if (UseSIMDForMemoryOps) {
 1272       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1273       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1274       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1275       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1276     } else {
 1277       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1278       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1279       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1280       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1281 
 1282       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1283       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1284       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1285       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1286     }
 1287     __ b(finish);
 1288 
 1289     // 17..32 bytes
 1290     __ bind(copy32);
 1291     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1292     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1293 
 1294     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1295     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1296     __ b(finish);
 1297 
 1298     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
 1300     __ bind(copy80);
 1301     if (UseSIMDForMemoryOps) {
 1302       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1303       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1304       // Unaligned pointers can be an issue for copying.
      // The issue is more likely to happen when the granularity of the data is
      // less than 4 bytes (sizeof(jint)). Pointers for arrays of jint are at least
 1307       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
 1308       // The most performance drop has been seen for the range 65-80 bytes.
 1309       // For such cases using the pair of ldp/stp instead of the third pair of
 1310       // ldpq/stpq fixes the performance issue.
 1311       if (granularity < sizeof (jint)) {
 1312         Label copy96;
 1313         __ cmp(count, u1(80/granularity));
 1314         __ br(Assembler::HI, copy96);
 1315         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1316 
 1317         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1318         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1319 
 1320         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1321         __ b(finish);
 1322 
 1323         __ bind(copy96);
 1324       }
 1325       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1326 
 1327       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1328       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1329 
 1330       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1331     } else {
 1332       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1333       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1334       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1335       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1336       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1337 
 1338       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1339       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1340       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1341       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1342       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1343     }
 1344     __ b(finish);
 1345 
 1346     // 0..16 bytes
 1347     __ bind(copy16);
 1348     __ cmp(count, u1(8/granularity));
 1349     __ br(Assembler::LO, copy8);
 1350 
 1351     // 8..16 bytes
 1352     bs.copy_load_at_8(t0, Address(s, 0));
 1353     bs.copy_load_at_8(t1, Address(send, -8));
 1354     bs.copy_store_at_8(Address(d, 0), t0);
 1355     bs.copy_store_at_8(Address(dend, -8), t1);
 1356     __ b(finish);
 1357 
 1358     if (granularity < 8) {
 1359       // 4..7 bytes
 1360       __ bind(copy8);
 1361       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1362       __ ldrw(t0, Address(s, 0));
 1363       __ ldrw(t1, Address(send, -4));
 1364       __ strw(t0, Address(d, 0));
 1365       __ strw(t1, Address(dend, -4));
 1366       __ b(finish);
 1367       if (granularity < 4) {
 1368         // 0..3 bytes
 1369         __ bind(copy4);
 1370         __ cbz(count, finish); // get rid of 0 case
 1371         if (granularity == 2) {
 1372           __ ldrh(t0, Address(s, 0));
 1373           __ strh(t0, Address(d, 0));
 1374         } else { // granularity == 1
 1375           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1376           // the first and last byte.
 1377           // Handle the 3 byte case by loading and storing base + count/2
 1378           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1379           // This means that in the 1 byte case we load/store the same
 1380           // byte 3 times.
 1381           __ lsr(count, count, 1);
 1382           __ ldrb(t0, Address(s, 0));
 1383           __ ldrb(t1, Address(send, -1));
 1384           __ ldrb(t2, Address(s, count));
 1385           __ strb(t0, Address(d, 0));
 1386           __ strb(t1, Address(dend, -1));
 1387           __ strb(t2, Address(d, count));
 1388         }
 1389         __ b(finish);
 1390       }
 1391     }
 1392 
 1393     __ bind(copy_big);
 1394     if (is_backwards) {
 1395       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1396       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1397     }
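          // For a backwards copy, s and d arrive pointing at the start of each
          // range; advance them by count elements so they point just past the
          // end and the copy can walk downwards.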
 1398 
 1399     // Now that we've got the small case out of the way, we can align the
 1400     // source address on a 2-word boundary.
 1401 
 1402     // Here we will materialize a count in r15, which is used by copy_memory_small
 1403     // and the various generate_copy_longs stubs that we use for 2-word-aligned bytes.
 1404     // Up until here we have used t9, which aliases r15, but from here on that
 1405     // register cannot be used as a temp register, as it contains the count.
 1406 
 1407     Label aligned;
 1408 
 1409     if (is_aligned) {
 1410       // We may have to adjust by 1 word to get s 2-word-aligned.
 1411       __ tbz(s, exact_log2(wordSize), aligned);
 1412       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1413       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1414       __ sub(count, count, wordSize/granularity);
 1415     } else {
 1416       if (is_backwards) {
 1417         __ andr(r15, s, 2 * wordSize - 1);
 1418       } else {
 1419         __ neg(r15, s);
 1420         __ andr(r15, r15, 2 * wordSize - 1);
 1421       }
 1422       // r15 is the byte adjustment needed to align s.
 1423       __ cbz(r15, aligned);
 1424       int shift = exact_log2(granularity);
 1425       if (shift > 0) {
 1426         __ lsr(r15, r15, shift);
 1427       }
 1428       __ sub(count, count, r15);
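            // r15 now counts elements rather than bytes: the number of elements
            // that must be copied before s reaches a 2-word boundary. They are
            // removed from count and copied by copy_memory_small below.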
 1429 
 1430 #if 0
 1431       // ?? This code is only correct for a disjoint copy.  It may or
 1432       // may not make sense to use it in that case.
 1433 
 1434       // Copy the first pair; s and d may not be aligned.
 1435       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1436       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1437 
 1438       // Align s and d, adjust count
 1439       if (is_backwards) {
 1440         __ sub(s, s, r15);
 1441         __ sub(d, d, r15);
 1442       } else {
 1443         __ add(s, s, r15);
 1444         __ add(d, d, r15);
 1445       }
 1446 #else
 1447       copy_memory_small(decorators, type, s, d, r15, step);
 1448 #endif
 1449     }
 1450 
 1451     __ bind(aligned);
 1452 
 1453     // s is now 2-word-aligned.
 1454 
 1455     // We have a count of units and some trailing bytes. Adjust the
 1456     // count and do a bulk copy of words. If the shift is zero
 1457     // perform a move instead to benefit from zero latency moves.
 1458     int shift = exact_log2(wordSize/granularity);
 1459     if (shift > 0) {
 1460       __ lsr(r15, count, shift);
 1461     } else {
 1462       __ mov(r15, count);
 1463     }
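          // r15 now holds the number of 8-byte words for the bulk copy stub;
          // the sub-word remainder (encoded in the low bits of count) is
          // handled by the tail copy after the call.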
 1464     if (direction == copy_forwards) {
 1465       if (type != T_OBJECT) {
 1466         __ bl(copy_f);
 1467       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1468         __ bl(copy_obj_uninit_f);
 1469       } else {
 1470         __ bl(copy_obj_f);
 1471       }
 1472     } else {
 1473       if (type != T_OBJECT) {
 1474         __ bl(copy_b);
 1475       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1476         __ bl(copy_obj_uninit_b);
 1477       } else {
 1478         __ bl(copy_obj_b);
 1479       }
 1480     }
 1481 
 1482     // And the tail.
 1483     copy_memory_small(decorators, type, s, d, count, step);
 1484 
 1485     if (granularity >= 8) __ bind(copy8);
 1486     if (granularity >= 4) __ bind(copy4);
 1487     __ bind(finish);
 1488   }
 1489 
 1490 
 1491   void clobber_registers() {
 1492 #ifdef ASSERT
 1493     RegSet clobbered
 1494       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
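          // Materialize the pattern 0xdeadbeefdeadbeef and splat it into every
          // call-clobbered general-purpose register (other than rscratch1, which
          // holds the pattern) so that any reliance on stale register contents
          // shows up quickly in debug builds.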
 1495     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1496     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1497     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1498       __ mov(*it, rscratch1);
 1499     }
 1500 #endif
 1501 
 1502   }
 1503 
 1504   // Scan over array at a for count oops, verifying each one.
 1505   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
 1506   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1507     Label loop, end;
 1508     __ mov(rscratch1, a);
 1509     __ mov(rscratch2, zr);
 1510     __ bind(loop);
 1511     __ cmp(rscratch2, count);
 1512     __ br(Assembler::HS, end);
 1513     if (size == wordSize) {
 1514       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1515       __ verify_oop(temp);
 1516     } else {
 1517       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1518       __ decode_heap_oop(temp); // calls verify_oop
 1519     }
 1520     __ add(rscratch2, rscratch2, 1);
 1521     __ b(loop);
 1522     __ bind(end);
 1523   }
 1524 
 1525   // Arguments:
 1526   //   stub_id - is used to name the stub and identify all details of
 1527   //             how to perform the copy.
 1528   //
 1529   //   entry - is assigned to the stub's post push entry point unless
 1530   //           it is null
 1531   //
 1532   // Inputs:
 1533   //   c_rarg0   - source array address
 1534   //   c_rarg1   - destination array address
 1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1536   //
 1537   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1538   // the hardware handle it.  The two dwords within qwords that span
 1539   // cache line boundaries will still be loaded and stored atomically.
 1540   //
 1541   // Side Effects: entry is set to the (post push) entry point so it
 1542   //               can be used by the corresponding conjoint copy
 1543   //               method
 1544   //
 1545   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1546     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1547     RegSet saved_reg = RegSet::of(s, d, count);
 1548     int size;
 1549     bool aligned;
 1550     bool is_oop;
 1551     bool dest_uninitialized;
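          // Decode the stub id into the element size, whether the caller
          // guarantees alignment, whether the elements are oops, and whether
          // the destination is known to be uninitialized.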
 1552     switch (stub_id) {
 1553     case jbyte_disjoint_arraycopy_id:
 1554       size = sizeof(jbyte);
 1555       aligned = false;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case arrayof_jbyte_disjoint_arraycopy_id:
 1560       size = sizeof(jbyte);
 1561       aligned = true;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case jshort_disjoint_arraycopy_id:
 1566       size = sizeof(jshort);
 1567       aligned = false;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case arrayof_jshort_disjoint_arraycopy_id:
 1572       size = sizeof(jshort);
 1573       aligned = true;
 1574       is_oop = false;
 1575       dest_uninitialized = false;
 1576       break;
 1577     case jint_disjoint_arraycopy_id:
 1578       size = sizeof(jint);
 1579       aligned = false;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case arrayof_jint_disjoint_arraycopy_id:
 1584       size = sizeof(jint);
 1585       aligned = true;
 1586       is_oop = false;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case jlong_disjoint_arraycopy_id:
 1590       // since this is always aligned we can (should!) use the same
 1591       // stub as for case arrayof_jlong_disjoint_arraycopy
 1592       ShouldNotReachHere();
 1593       break;
 1594     case arrayof_jlong_disjoint_arraycopy_id:
 1595       size = sizeof(jlong);
 1596       aligned = true;
 1597       is_oop = false;
 1598       dest_uninitialized = false;
 1599       break;
 1600     case oop_disjoint_arraycopy_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = false;
 1605       break;
 1606     case arrayof_oop_disjoint_arraycopy_id:
 1607       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1608       aligned = !UseCompressedOops;
 1609       is_oop = true;
 1610       dest_uninitialized = false;
 1611       break;
 1612     case oop_disjoint_arraycopy_uninit_id:
 1613       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1614       aligned = !UseCompressedOops;
 1615       is_oop = true;
 1616       dest_uninitialized = true;
 1617       break;
 1618     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1619       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1620       aligned = !UseCompressedOops;
 1621       is_oop = true;
 1622       dest_uninitialized = true;
 1623       break;
 1624     default:
 1625       ShouldNotReachHere();
 1626       break;
 1627     }
 1628 
 1629     __ align(CodeEntryAlignment);
 1630     StubCodeMark mark(this, stub_id);
 1631     address start = __ pc();
 1632     __ enter();
 1633 
 1634     if (entry != nullptr) {
 1635       *entry = __ pc();
 1636       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1637       BLOCK_COMMENT("Entry:");
 1638     }
 1639 
 1640     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1641     if (dest_uninitialized) {
 1642       decorators |= IS_DEST_UNINITIALIZED;
 1643     }
 1644     if (aligned) {
 1645       decorators |= ARRAYCOPY_ALIGNED;
 1646     }
 1647 
 1648     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1649     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1650 
 1651     if (is_oop) {
 1652       // save regs before copy_memory
 1653       __ push(RegSet::of(d, count), sp);
 1654     }
 1655     {
 1656       // UnsafeMemoryAccess page error: continue after unsafe access
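            // When add_entry is true this region is registered with
            // UnsafeMemoryAccess, so a fault taken inside the copy (e.g. from
            // Unsafe.copyMemory touching an unmapped page) resumes at the
            // common error exit instead of crashing the VM.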
 1657       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1658       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1659       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1660     }
 1661 
 1662     if (is_oop) {
 1663       __ pop(RegSet::of(d, count), sp);
 1664       if (VerifyOops)
 1665         verify_oop_array(size, d, count, r16);
 1666     }
 1667 
 1668     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1669 
 1670     __ leave();
 1671     __ mov(r0, zr); // return 0
 1672     __ ret(lr);
 1673     return start;
 1674   }
 1675 
 1676   // Arguments:
 1677   //   stub_id - is used to name the stub and identify all details of
 1678   //             how to perform the copy.
 1679   //
 1680   //   nooverlap_target - identifies the (post push) entry for the
 1681   //             corresponding disjoint copy routine which can be
 1682   //             jumped to if the ranges do not actually overlap
 1683   //
 1684   //   entry - is assigned to the stub's post push entry point unless
 1685   //           it is null
 1686   //
 1687   //
 1688   // Inputs:
 1689   //   c_rarg0   - source array address
 1690   //   c_rarg1   - destination array address
 1691   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1692   //
 1693   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1694   // the hardware handle it.  The two dwords within qwords that span
 1695   // cache line boundaries will still be loaded and stored atomically.
 1696   //
 1697   // Side Effects:
 1698   //   entry is set to the no-overlap entry point so it can be used by
 1699   //   some other conjoint copy method
 1700   //
 1701   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1702     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1703     RegSet saved_regs = RegSet::of(s, d, count);
 1704     int size;
 1705     bool aligned;
 1706     bool is_oop;
 1707     bool dest_uninitialized;
 1708     switch (stub_id) {
 1709     case jbyte_arraycopy_id:
 1710       size = sizeof(jbyte);
 1711       aligned = false;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case arrayof_jbyte_arraycopy_id:
 1716       size = sizeof(jbyte);
 1717       aligned = true;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case jshort_arraycopy_id:
 1722       size = sizeof(jshort);
 1723       aligned = false;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case arrayof_jshort_arraycopy_id:
 1728       size = sizeof(jshort);
 1729       aligned = true;
 1730       is_oop = false;
 1731       dest_uninitialized = false;
 1732       break;
 1733     case jint_arraycopy_id:
 1734       size = sizeof(jint);
 1735       aligned = false;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case arrayof_jint_arraycopy_id:
 1740       size = sizeof(jint);
 1741       aligned = true;
 1742       is_oop = false;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case jlong_arraycopy_id:
 1746       // since this is always aligned we can (should!) use the same
 1747       // stub as for case arrayof_jlong_arraycopy
 1748       ShouldNotReachHere();
 1749       break;
 1750     case arrayof_jlong_arraycopy_id:
 1751       size = sizeof(jlong);
 1752       aligned = true;
 1753       is_oop = false;
 1754       dest_uninitialized = false;
 1755       break;
 1756     case oop_arraycopy_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = false;
 1761       break;
 1762     case arrayof_oop_arraycopy_id:
 1763       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1764       aligned = !UseCompressedOops;
 1765       is_oop = true;
 1766       dest_uninitialized = false;
 1767       break;
 1768     case oop_arraycopy_uninit_id:
 1769       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1770       aligned = !UseCompressedOops;
 1771       is_oop = true;
 1772       dest_uninitialized = true;
 1773       break;
 1774     case arrayof_oop_arraycopy_uninit_id:
 1775       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1776       aligned = !UseCompressedOops;
 1777       is_oop = true;
 1778       dest_uninitialized = true;
 1779       break;
 1780     default:
 1781       ShouldNotReachHere();
 1782     }
 1783 
 1784     StubCodeMark mark(this, stub_id);
 1785     address start = __ pc();
 1786     __ enter();
 1787 
 1788     if (entry != nullptr) {
 1789       *entry = __ pc();
 1790       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1791       BLOCK_COMMENT("Entry:");
 1792     }
 1793 
 1794     // use fwd copy when (d-s) above_equal (count*size)
 1795     __ sub(rscratch1, d, s);
 1796     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1797     __ br(Assembler::HS, nooverlap_target);
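          // The comparison is unsigned: if d precedes s the subtraction wraps to
          // a large value, so that case is also sent to the forward (disjoint)
          // stub, where a forward copy is safe even if the ranges overlap.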
 1798 
 1799     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1800     if (dest_uninitialized) {
 1801       decorators |= IS_DEST_UNINITIALIZED;
 1802     }
 1803     if (aligned) {
 1804       decorators |= ARRAYCOPY_ALIGNED;
 1805     }
 1806 
 1807     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1808     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1809 
 1810     if (is_oop) {
 1811       // save regs before copy_memory
 1812       __ push(RegSet::of(d, count), sp);
 1813     }
 1814     {
 1815       // UnsafeMemoryAccess page error: continue after unsafe access
 1816       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1817       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1818       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1819     }
 1820     if (is_oop) {
 1821       __ pop(RegSet::of(d, count), sp);
 1822       if (VerifyOops)
 1823         verify_oop_array(size, d, count, r16);
 1824     }
 1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1826     __ leave();
 1827     __ mov(r0, zr); // return 0
 1828     __ ret(lr);
 1829     return start;
 1830   }
 1831 
 1832   // Helper for generating a dynamic type check.
 1833   // Smashes rscratch1, rscratch2.
 1834   void generate_type_check(Register sub_klass,
 1835                            Register super_check_offset,
 1836                            Register super_klass,
 1837                            Register temp1,
 1838                            Register temp2,
 1839                            Register result,
 1840                            Label& L_success) {
 1841     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1842 
 1843     BLOCK_COMMENT("type_check:");
 1844 
 1845     Label L_miss;
 1846 
 1847     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1848                                      super_check_offset);
 1849     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1850 
 1851     // Fall through on failure!
 1852     __ BIND(L_miss);
 1853   }
 1854 
 1855   //
 1856   //  Generate checkcasting array copy stub
 1857   //
 1858   //  Input:
 1859   //    c_rarg0   - source array address
 1860   //    c_rarg1   - destination array address
 1861   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1862   //    c_rarg3   - size_t ckoff (super_check_offset)
 1863   //    c_rarg4   - oop ckval (super_klass)
 1864   //
 1865   //  Output:
 1866   //    r0 ==  0  -  success
 1867   //    r0 == -1^K - failure, where K is partial transfer count
 1868   //
 1869   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1870     bool dest_uninitialized;
 1871     switch (stub_id) {
 1872     case checkcast_arraycopy_id:
 1873       dest_uninitialized = false;
 1874       break;
 1875     case checkcast_arraycopy_uninit_id:
 1876       dest_uninitialized = true;
 1877       break;
 1878     default:
 1879       ShouldNotReachHere();
 1880     }
 1881 
 1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1883 
 1884     // Input registers (after setup_arg_regs)
 1885     const Register from        = c_rarg0;   // source array address
 1886     const Register to          = c_rarg1;   // destination array address
 1887     const Register count       = c_rarg2;   // elements count
 1888     const Register ckoff       = c_rarg3;   // super_check_offset
 1889     const Register ckval       = c_rarg4;   // super_klass
 1890 
 1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1892     RegSet wb_post_saved_regs = RegSet::of(count);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (entry != nullptr) {
 1931       *entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935     // Empty array:  Nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_orig = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
 2015     __ br(Assembler::EQ, L_done_pop);
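          // eon with zr computes ~K, i.e. -1 ^ K, the encoding the caller expects;
          // the flags are still those of the subs, so EQ (K == 0) means no oops
          // were stored and the card-marking epilogue can be skipped.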
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
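          // A 32-bit register move onto itself zero-extends the value, clearing
          // any stale bits in the upper half of the 64-bit register.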
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
 2085   //
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
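          // The OR of source, destination and byte count has a low bit set
          // wherever any of the three is misaligned, so testing its low bits
          // picks the widest element size that all of them support.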
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // 'from', 'to', 'count' registers should be set in this order
 2290     // since they are the same registers as 'src', 'src_pos', 'dst'.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
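          // Encoding: 0 = byte, 1 = short, 2 = int, 3 = long. Bit 1 separates
          // {byte, short} from {int, long}; bit 0 then picks within each pair.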
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubGenStubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // source array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
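          // At this point the fill value has been replicated across the low
          // 32 bits of 'value' for every element type; it is widened to 64 bits
          // below, just before the bulk fill.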
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One-byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two-byte misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
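          // cnt_words is the number of 8-byte words to fill; count now holds
          // only the left-over elements, i.e. strictly less than one word's worth.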
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // Remaining count is less than 8 bytes. Fill it by a single store.
 2547     // Note that the total length is no less than 8 bytes.
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
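            // The store above ends exactly at the end of the region and may
            // rewrite elements that were already filled, which is harmless
            // because it writes the same value.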
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_unsafecopy_common_error_exit() {
 2592     address start_pc = __ pc();
 2593     __ leave();
 2594     __ mov(r0, 0);
 2595     __ ret(lr);
 2596     return start_pc;
 2597   }
 2598 
 2599   //
 2600   //  Generate 'unsafe' set memory stub
 2601   //  Though just as safe as the other stubs, it takes an unscaled
 2602   //  size_t (# bytes) argument instead of an element count.
 2603   //
 2604   //  This fill operation is atomicity preserving: as long as the
 2605   //  address supplied is sufficiently aligned, all writes of up to 64
 2606   //  bits in size are single-copy atomic.
 2607   //
 2608   //  Input:
 2609   //    c_rarg0   - destination array address
 2610   //    c_rarg1   - byte count (size_t)
 2611   //    c_rarg2   - byte value
 2612   //
 2613   address generate_unsafe_setmemory() {
 2614     __ align(CodeEntryAlignment);
 2615     StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id);
 2616     address start = __ pc();
 2617 
 2618     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2619     Label tail;
 2620 
 2621     UnsafeMemoryAccessMark umam(this, true, false);
 2622 
 2623     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2624 
 2625     __ dup(v0, __ T16B, value);
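          // Replicate the byte value into all 16 lanes of v0 so that each SIMD
          // store below writes 16 copies of it.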
 2626 
 2627     if (AvoidUnalignedAccesses) {
 2628       __ cmp(count, (u1)16);
 2629       __ br(__ LO, tail);
 2630 
 2631       __ mov(rscratch1, 16);
 2632       __ andr(rscratch2, dest, 15);
 2633       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2634       __ strq(v0, Address(dest));
 2635       __ sub(count, count, rscratch1);
 2636       __ add(dest, dest, rscratch1);
 2637     }
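          // The unaligned strq above has filled the first 16 bytes; dest was then
          // advanced to the next 16-byte boundary, so subsequent stores are aligned.
          // A few bytes may be written twice, which is harmless for a fill.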
 2638 
 2639     __ subs(count, count, (u1)64);
 2640     __ br(__ LO, tail);
 2641     {
 2642       Label again;
 2643       __ bind(again);
 2644       __ stpq(v0, v0, Address(dest));
 2645       __ stpq(v0, v0, Address(dest, 32));
 2646 
 2647       __ subs(count, count, 64);
 2648       __ add(dest, dest, 64);
 2649       __ br(__ HS, again);
 2650     }
 2651 
 2652     __ bind(tail);
 2653     // The count of bytes is off by 64, but we don't need to correct
 2654     // it because we're only going to use the least-significant few
 2655     // count bits from here on.
 2656     // __ add(count, count, 64);
 2657 
 2658     {
 2659       Label dont;
 2660       __ tbz(count, exact_log2(32), dont);
 2661       __ stpq(v0, v0, __ post(dest, 32));
 2662       __ bind(dont);
 2663     }
 2664     {
 2665       Label dont;
 2666       __ tbz(count, exact_log2(16), dont);
 2667       __ strq(v0, __ post(dest, 16));
 2668       __ bind(dont);
 2669     }
 2670     {
 2671       Label dont;
 2672       __ tbz(count, exact_log2(8), dont);
 2673       __ strd(v0, __ post(dest, 8));
 2674       __ bind(dont);
 2675     }
 2676 
 2677     Label finished;
 2678     __ tst(count, 7);
 2679     __ br(__ EQ, finished);
 2680 
 2681     {
 2682       Label dont;
 2683       __ tbz(count, exact_log2(4), dont);
 2684       __ strs(v0, __ post(dest, 4));
 2685       __ bind(dont);
 2686     }
 2687     {
 2688       Label dont;
 2689       __ tbz(count, exact_log2(2), dont);
 2690       __ bfi(value, value, 8, 8);
 2691       __ strh(value, __ post(dest, 2));
 2692       __ bind(dont);
 2693     }
 2694     {
 2695       Label dont;
 2696       __ tbz(count, exact_log2(1), dont);
 2697       __ strb(value, Address(dest));
 2698       __ bind(dont);
 2699     }
 2700 
 2701     __ bind(finished);
 2702     __ leave();
 2703     __ ret(lr);
 2704 
 2705     return start;
 2706   }
 2707 
 2708   address generate_data_cache_writeback() {
 2709     const Register line        = c_rarg0;  // address of line to write back
 2710 
 2711     __ align(CodeEntryAlignment);
 2712 
 2713     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2714     StubCodeMark mark(this, stub_id);
 2715 
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cache_wb(Address(line, 0));
 2719     __ leave();
 2720     __ ret(lr);
 2721 
 2722     return start;
 2723   }
 2724 
 2725   address generate_data_cache_writeback_sync() {
 2726     const Register is_pre     = c_rarg0;  // pre or post sync
 2727 
 2728     __ align(CodeEntryAlignment);
 2729 
 2730     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2731     StubCodeMark mark(this, stub_id);
 2732 
 2733     // pre wbsync is a no-op
 2734     // post wbsync translates to a memory barrier
 2735 
 2736     Label skip;
 2737     address start = __ pc();
 2738     __ enter();
 2739     __ cbnz(is_pre, skip);
 2740     __ cache_wbsync(false);
 2741     __ bind(skip);
 2742     __ leave();
 2743     __ ret(lr);
 2744 
 2745     return start;
 2746   }
 2747 
 2748   void generate_arraycopy_stubs() {
 2749     address entry;
 2750     address entry_jbyte_arraycopy;
 2751     address entry_jshort_arraycopy;
 2752     address entry_jint_arraycopy;
 2753     address entry_oop_arraycopy;
 2754     address entry_jlong_arraycopy;
 2755     address entry_checkcast_arraycopy;
 2756 
 2757     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
 2758     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
 2759 
 2760     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2761     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2762 
 2763     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2764     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2765 
 2766     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2767     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2768 
 2769     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2770 
 2771     //*** jbyte
 2772     // Always need aligned and unaligned versions
 2773     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2774     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2775     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2776     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2777 
 2778     //*** jshort
 2779     // Always need aligned and unaligned versions
 2780     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2781     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2782     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2783     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2784 
 2785     //*** jint
 2786     // Aligned versions
 2787     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2788     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2789     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2790     // entry_jint_arraycopy always points to the unaligned version
 2791     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2792     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2793 
 2794     //*** jlong
 2795     // It is always aligned
 2796     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2797     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2798     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2799     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2800 
 2801     //*** oops
 2802     {
 2803       // With compressed oops we need unaligned versions; notice that
 2804       // we overwrite entry_oop_arraycopy.
 2805       bool aligned = !UseCompressedOops;
 2806 
 2807       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2808         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2809       StubRoutines::_arrayof_oop_arraycopy
 2810         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2811       // Aligned versions without pre-barriers
 2812       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2813         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2814       StubRoutines::_arrayof_oop_arraycopy_uninit
 2815         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2816     }
 2817 
 2818     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2819     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2820     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2821     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2822 
 2823     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2824     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2825 
 2826     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2827                                                               entry_jshort_arraycopy,
 2828                                                               entry_jint_arraycopy,
 2829                                                               entry_jlong_arraycopy);
 2830 
 2831     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2832                                                                entry_jshort_arraycopy,
 2833                                                                entry_jint_arraycopy,
 2834                                                                entry_oop_arraycopy,
 2835                                                                entry_jlong_arraycopy,
 2836                                                                entry_checkcast_arraycopy);
 2837 
 2838     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2839     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2840     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2841     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2842     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2843     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2844   }
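        // Note (comment only, not generated code): the entry_* addresses captured
        // above are entry points into the type-specific copy stubs. The unsafe and
        // generic arraycopy stubs generated last are small dispatchers that jump to
        // one of them, roughly:
        //
        //    // pseudocode; argument checking elided
        //    generic_arraycopy(src, src_pos, dst, dst_pos, length) {
        //      switch (element type/size) {
        //        case 1 byte : goto entry_jbyte_arraycopy;
        //        case 2 bytes: goto entry_jshort_arraycopy;
        //        case 4 bytes: goto entry_jint_arraycopy;
        //        case 8 bytes: goto entry_jlong_arraycopy;
        //        case oop    : goto entry_oop_arraycopy;   // or entry_checkcast_arraycopy
        //      }
        //    }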
 2845 
 2846   void generate_math_stubs() { Unimplemented(); }
 2847 
 2848   // Arguments:
 2849   //
 2850   // Inputs:
 2851   //   c_rarg0   - source byte array address
 2852   //   c_rarg1   - destination byte array address
 2853   //   c_rarg2   - K (key) in little endian int array
 2854   //
 2855   address generate_aescrypt_encryptBlock() {
 2856     __ align(CodeEntryAlignment);
 2857     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2858     StubCodeMark mark(this, stub_id);
 2859 
 2860     const Register from        = c_rarg0;  // source array address
 2861     const Register to          = c_rarg1;  // destination array address
 2862     const Register key         = c_rarg2;  // key array address
 2863     const Register keylen      = rscratch1;
 2864 
 2865     address start = __ pc();
 2866     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2867 
 2868     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2869 
 2870     __ aesenc_loadkeys(key, keylen);
 2871     __ aesecb_encrypt(from, to, keylen);
 2872 
 2873     __ mov(r0, 0);
 2874 
 2875     __ leave();
 2876     __ ret(lr);
 2877 
 2878     return start;
 2879   }
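        // For reference (comment only): the number of AES rounds is derived from the
        // length of the expanded key array read into keylen above. With the key
        // supplied as an int[] of round-key words the usual mapping is
        //
        //    keylen == 44  ->  AES-128, 10 rounds
        //    keylen == 52  ->  AES-192, 12 rounds
        //    keylen == 60  ->  AES-256, 14 rounds
        //
        // and aesenc_loadkeys/aesecb_encrypt use keylen to decide how many round keys
        // to load and apply.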
 2880 
 2881   // Arguments:
 2882   //
 2883   // Inputs:
 2884   //   c_rarg0   - source byte array address
 2885   //   c_rarg1   - destination byte array address
 2886   //   c_rarg2   - K (key) in little endian int array
 2887   //
 2888   address generate_aescrypt_decryptBlock() {
 2889     assert(UseAES, "need AES cryptographic extension support");
 2890     __ align(CodeEntryAlignment);
 2891     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2892     StubCodeMark mark(this, stub_id);
 2893     Label L_doLast;
 2894 
 2895     const Register from        = c_rarg0;  // source array address
 2896     const Register to          = c_rarg1;  // destination array address
 2897     const Register key         = c_rarg2;  // key array address
 2898     const Register keylen      = rscratch1;
 2899 
 2900     address start = __ pc();
 2901     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2902 
 2903     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2904 
 2905     __ aesecb_decrypt(from, to, key, keylen);
 2906 
 2907     __ mov(r0, 0);
 2908 
 2909     __ leave();
 2910     __ ret(lr);
 2911 
 2912     return start;
 2913   }
 2914 
 2915   // Arguments:
 2916   //
 2917   // Inputs:
 2918   //   c_rarg0   - source byte array address
 2919   //   c_rarg1   - destination byte array address
 2920   //   c_rarg2   - K (key) in little endian int array
 2921   //   c_rarg3   - r vector byte array address
 2922   //   c_rarg4   - input length
 2923   //
 2924   // Output:
 2925   //   r0        - input length
 2926   //
 2927   address generate_cipherBlockChaining_encryptAESCrypt() {
 2928     assert(UseAES, "need AES cryptographic extension support");
 2929     __ align(CodeEntryAlignment);
 2930     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2931     StubCodeMark mark(this, stub_id);
 2932 
 2933     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2934 
 2935     const Register from        = c_rarg0;  // source array address
 2936     const Register to          = c_rarg1;  // destination array address
 2937     const Register key         = c_rarg2;  // key array address
 2938     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) array,
 2939                                            // and left holding the last encrypted block on exit
 2940     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2941     const Register keylen      = rscratch1;
 2942 
 2943     address start = __ pc();
 2944 
 2945       __ enter();
 2946 
 2947       __ movw(rscratch2, len_reg);
 2948 
 2949       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2950 
 2951       __ ld1(v0, __ T16B, rvec);
 2952 
 2953       __ cmpw(keylen, 52);
 2954       __ br(Assembler::CC, L_loadkeys_44);
 2955       __ br(Assembler::EQ, L_loadkeys_52);
 2956 
 2957       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2958       __ rev32(v17, __ T16B, v17);
 2959       __ rev32(v18, __ T16B, v18);
 2960     __ BIND(L_loadkeys_52);
 2961       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2962       __ rev32(v19, __ T16B, v19);
 2963       __ rev32(v20, __ T16B, v20);
 2964     __ BIND(L_loadkeys_44);
 2965       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2966       __ rev32(v21, __ T16B, v21);
 2967       __ rev32(v22, __ T16B, v22);
 2968       __ rev32(v23, __ T16B, v23);
 2969       __ rev32(v24, __ T16B, v24);
 2970       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2971       __ rev32(v25, __ T16B, v25);
 2972       __ rev32(v26, __ T16B, v26);
 2973       __ rev32(v27, __ T16B, v27);
 2974       __ rev32(v28, __ T16B, v28);
 2975       __ ld1(v29, v30, v31, __ T16B, key);
 2976       __ rev32(v29, __ T16B, v29);
 2977       __ rev32(v30, __ T16B, v30);
 2978       __ rev32(v31, __ T16B, v31);
 2979 
 2980     __ BIND(L_aes_loop);
 2981       __ ld1(v1, __ T16B, __ post(from, 16));
 2982       __ eor(v0, __ T16B, v0, v1);
 2983 
 2984       __ br(Assembler::CC, L_rounds_44);
 2985       __ br(Assembler::EQ, L_rounds_52);
 2986 
 2987       __ aese(v0, v17); __ aesmc(v0, v0);
 2988       __ aese(v0, v18); __ aesmc(v0, v0);
 2989     __ BIND(L_rounds_52);
 2990       __ aese(v0, v19); __ aesmc(v0, v0);
 2991       __ aese(v0, v20); __ aesmc(v0, v0);
 2992     __ BIND(L_rounds_44);
 2993       __ aese(v0, v21); __ aesmc(v0, v0);
 2994       __ aese(v0, v22); __ aesmc(v0, v0);
 2995       __ aese(v0, v23); __ aesmc(v0, v0);
 2996       __ aese(v0, v24); __ aesmc(v0, v0);
 2997       __ aese(v0, v25); __ aesmc(v0, v0);
 2998       __ aese(v0, v26); __ aesmc(v0, v0);
 2999       __ aese(v0, v27); __ aesmc(v0, v0);
 3000       __ aese(v0, v28); __ aesmc(v0, v0);
 3001       __ aese(v0, v29); __ aesmc(v0, v0);
 3002       __ aese(v0, v30);
 3003       __ eor(v0, __ T16B, v0, v31);
 3004 
 3005       __ st1(v0, __ T16B, __ post(to, 16));
 3006 
 3007       __ subw(len_reg, len_reg, 16);
 3008       __ cbnzw(len_reg, L_aes_loop);
 3009 
 3010       __ st1(v0, __ T16B, rvec);
 3011 
 3012       __ mov(r0, rscratch2);
 3013 
 3014       __ leave();
 3015       __ ret(lr);
 3016 
 3017     return start;
 3018   }
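        // Rough C-style sketch (comment only) of the CBC encryption above; the stub
        // keeps the chaining value in v0 and the round keys in v17..v31:
        //
        //    r = load16(rvec);                          // chaining value, initially the IV
        //    for (i = 0; i < len; i += 16) {
        //      r = AES_encrypt(load16(from + i) ^ r);   // c_i = E_K(p_i ^ c_{i-1})
        //      store16(to + i, r);
        //    }
        //    store16(rvec, r);                          // leave the last ciphertext block in rvec
        //    return len;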
 3019 
 3020   // Arguments:
 3021   //
 3022   // Inputs:
 3023   //   c_rarg0   - source byte array address
 3024   //   c_rarg1   - destination byte array address
 3025   //   c_rarg2   - K (key) in little endian int array
 3026   //   c_rarg3   - r vector byte array address
 3027   //   c_rarg4   - input length
 3028   //
 3029   // Output:
 3030   //   r0        - input length
 3031   //
 3032   address generate_cipherBlockChaining_decryptAESCrypt() {
 3033     assert(UseAES, "need AES cryptographic extension support");
 3034     __ align(CodeEntryAlignment);
 3035     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 3036     StubCodeMark mark(this, stub_id);
 3037 
 3038     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3039 
 3040     const Register from        = c_rarg0;  // source array address
 3041     const Register to          = c_rarg1;  // destination array address
 3042     const Register key         = c_rarg2;  // key array address
 3043     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) array,
 3044                                            // and left holding the last input ciphertext block on exit
 3045     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3046     const Register keylen      = rscratch1;
 3047 
 3048     address start = __ pc();
 3049 
 3050       __ enter();
 3051 
 3052       __ movw(rscratch2, len_reg);
 3053 
 3054       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3055 
 3056       __ ld1(v2, __ T16B, rvec);
 3057 
 3058       __ ld1(v31, __ T16B, __ post(key, 16));
 3059       __ rev32(v31, __ T16B, v31);
 3060 
 3061       __ cmpw(keylen, 52);
 3062       __ br(Assembler::CC, L_loadkeys_44);
 3063       __ br(Assembler::EQ, L_loadkeys_52);
 3064 
 3065       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3066       __ rev32(v17, __ T16B, v17);
 3067       __ rev32(v18, __ T16B, v18);
 3068     __ BIND(L_loadkeys_52);
 3069       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3070       __ rev32(v19, __ T16B, v19);
 3071       __ rev32(v20, __ T16B, v20);
 3072     __ BIND(L_loadkeys_44);
 3073       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3074       __ rev32(v21, __ T16B, v21);
 3075       __ rev32(v22, __ T16B, v22);
 3076       __ rev32(v23, __ T16B, v23);
 3077       __ rev32(v24, __ T16B, v24);
 3078       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3079       __ rev32(v25, __ T16B, v25);
 3080       __ rev32(v26, __ T16B, v26);
 3081       __ rev32(v27, __ T16B, v27);
 3082       __ rev32(v28, __ T16B, v28);
 3083       __ ld1(v29, v30, __ T16B, key);
 3084       __ rev32(v29, __ T16B, v29);
 3085       __ rev32(v30, __ T16B, v30);
 3086 
 3087     __ BIND(L_aes_loop);
 3088       __ ld1(v0, __ T16B, __ post(from, 16));
 3089       __ orr(v1, __ T16B, v0, v0);  // keep a copy of the ciphertext block; it becomes the next chaining value
 3090 
 3091       __ br(Assembler::CC, L_rounds_44);
 3092       __ br(Assembler::EQ, L_rounds_52);
 3093 
 3094       __ aesd(v0, v17); __ aesimc(v0, v0);
 3095       __ aesd(v0, v18); __ aesimc(v0, v0);
 3096     __ BIND(L_rounds_52);
 3097       __ aesd(v0, v19); __ aesimc(v0, v0);
 3098       __ aesd(v0, v20); __ aesimc(v0, v0);
 3099     __ BIND(L_rounds_44);
 3100       __ aesd(v0, v21); __ aesimc(v0, v0);
 3101       __ aesd(v0, v22); __ aesimc(v0, v0);
 3102       __ aesd(v0, v23); __ aesimc(v0, v0);
 3103       __ aesd(v0, v24); __ aesimc(v0, v0);
 3104       __ aesd(v0, v25); __ aesimc(v0, v0);
 3105       __ aesd(v0, v26); __ aesimc(v0, v0);
 3106       __ aesd(v0, v27); __ aesimc(v0, v0);
 3107       __ aesd(v0, v28); __ aesimc(v0, v0);
 3108       __ aesd(v0, v29); __ aesimc(v0, v0);
 3109       __ aesd(v0, v30);
 3110       __ eor(v0, __ T16B, v0, v31);
 3111       __ eor(v0, __ T16B, v0, v2);
 3112 
 3113       __ st1(v0, __ T16B, __ post(to, 16));
 3114       __ orr(v2, __ T16B, v1, v1);  // saved ciphertext block is the chaining value for the next iteration
 3115 
 3116       __ subw(len_reg, len_reg, 16);
 3117       __ cbnzw(len_reg, L_aes_loop);
 3118 
 3119       __ st1(v2, __ T16B, rvec);
 3120 
 3121       __ mov(r0, rscratch2);
 3122 
 3123       __ leave();
 3124       __ ret(lr);
 3125 
 3126     return start;
 3127   }
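        // Rough C-style sketch (comment only) of the CBC decryption above. The
        // chaining value for block i is the previous *ciphertext* block, which is why
        // the loop copies the loaded input (v0 -> v1) before decrypting in place and
        // only promotes it to the chaining register (v2) after the store:
        //
        //    r = load16(rvec);                          // chaining value, initially the IV
        //    for (i = 0; i < len; i += 16) {
        //      c = load16(from + i);
        //      store16(to + i, AES_decrypt(c) ^ r);     // p_i = D_K(c_i) ^ c_{i-1}
        //      r = c;
        //    }
        //    store16(rvec, r);                          // leave the last ciphertext block in rvec
        //    return len;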
 3128 
 3129   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3130   // Inputs: in (the 128-bit addend) and inc (the 64-bit increment).
 3131   // The least-significant 64-bit word is in the upper dword of each vector.
 3132   // inc is preserved; its lower dword must be zero.
 3133   // Output: result (which may alias in).
 3134   void be_add_128_64(FloatRegister result, FloatRegister in,
 3135                      FloatRegister inc, FloatRegister tmp) {
 3136     assert_different_registers(result, tmp, inc);
 3137 
 3138     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3139                                            // input
 3140     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3141     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3142                                            // MSD == 0 (must be!) to LSD
 3143     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3144   }
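        // Worked example (comment only), assuming inc holds 1 in its upper dword:
        // for in == 0x00000000_00000001:0xFFFFFFFF_FFFFFFFF (MSD:LSD) the lane-wise
        // add wraps the LSD to zero, the unsigned compare leaves an all-ones mask in
        // the LSD lane, ext moves that mask into the MSD lane, and subtracting -1
        // adds the carry, giving 0x00000000_00000002:0x00000000_00000000.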
 3145 
 3146   // CTR AES crypt.
 3147   // Arguments:
 3148   //
 3149   // Inputs:
 3150   //   c_rarg0   - source byte array address
 3151   //   c_rarg1   - destination byte array address
 3152   //   c_rarg2   - K (key) in little endian int array
 3153   //   c_rarg3   - counter vector byte array address
 3154   //   c_rarg4   - input length
 3155   //   c_rarg5   - saved encryptedCounter start
 3156   //   c_rarg6   - saved used length
 3157   //
 3158   // Output:
 3159   //   r0       - input length
 3160   //
 3161   address generate_counterMode_AESCrypt() {
 3162     const Register in = c_rarg0;
 3163     const Register out = c_rarg1;
 3164     const Register key = c_rarg2;
 3165     const Register counter = c_rarg3;
 3166     const Register saved_len = c_rarg4, len = r10;
 3167     const Register saved_encrypted_ctr = c_rarg5;
 3168     const Register used_ptr = c_rarg6, used = r12;
 3169 
 3170     const Register offset = r7;
 3171     const Register keylen = r11;
 3172 
 3173     const unsigned char block_size = 16;
 3174     const int bulk_width = 4;
 3175     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3176     // performance with larger data sizes, but it also means that the
 3177     // fast path isn't used until you have at least 8 blocks, and up
 3178     // to 127 bytes of data will be processed on the slow path. For
 3179     // that reason, and also so as not to blow away too much icache, 4
 3180     // blocks seems like a sensible compromise.
 3181 
 3182     // Algorithm:
 3183     //
 3184     //    if (len == 0) {
 3185     //        goto DONE;
 3186     //    }
 3187     //    int result = len;
 3188     //    do {
 3189     //        if (used >= blockSize) {
 3190     //            if (len >= bulk_width * blockSize) {
 3191     //                CTR_large_block();
 3192     //                if (len == 0)
 3193     //                    goto DONE;
 3194     //            }
 3195     //            for (;;) {
 3196     //                16ByteVector v0 = counter;
 3197     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3198     //                used = 0;
 3199     //                if (len < blockSize)
 3200     //                    break;    /* goto NEXT */
 3201     //                16ByteVector v1 = load16Bytes(in, offset);
 3202     //                v1 = v1 ^ encryptedCounter;
 3203     //                store16Bytes(v1, out, offset);
 3204     //                used = blockSize;
 3205     //                offset += blockSize;
 3206     //                len -= blockSize;
 3207     //                if (len == 0)
 3208     //                    goto DONE;
 3209     //            }
 3210     //        }
 3211     //      NEXT:
 3212     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3213     //        len--;
 3214     //    } while (len != 0);
 3215     //  DONE:
 3216     //    return result;
 3217     //
 3218     // CTR_large_block()
 3219     //    Wide bulk encryption of whole blocks.
 3220 
 3221     __ align(CodeEntryAlignment);
 3222     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3223     StubCodeMark mark(this, stub_id);
 3224     const address start = __ pc();
 3225     __ enter();
 3226 
 3227     Label DONE, CTR_large_block, large_block_return;
 3228     __ ldrw(used, Address(used_ptr));
 3229     __ cbzw(saved_len, DONE);
 3230 
 3231     __ mov(len, saved_len);
 3232     __ mov(offset, 0);
 3233 
 3234     // Compute #rounds for AES based on the length of the key array
 3235     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3236 
 3237     __ aesenc_loadkeys(key, keylen);
 3238 
 3239     {
 3240       Label L_CTR_loop, NEXT;
 3241 
 3242       __ bind(L_CTR_loop);
 3243 
 3244       __ cmp(used, block_size);
 3245       __ br(__ LO, NEXT);
 3246 
 3247       // Maybe we have a lot of data
 3248       __ subsw(rscratch1, len, bulk_width * block_size);
 3249       __ br(__ HS, CTR_large_block);
 3250       __ BIND(large_block_return);
 3251       __ cbzw(len, DONE);
 3252 
 3253       // Setup the counter
 3254       __ movi(v4, __ T4S, 0);
 3255       __ movi(v5, __ T4S, 1);
 3256       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3257 
 3258       // 128-bit big-endian increment
 3259       __ ld1(v0, __ T16B, counter);
 3260       __ rev64(v16, __ T16B, v0);
 3261       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3262       __ rev64(v16, __ T16B, v16);
 3263       __ st1(v16, __ T16B, counter);
 3264       // Previous counter value is in v0
 3265       // v4 contains { 0, 1 }
 3266 
 3267       {
 3268         // We have fewer than bulk_width blocks of data left. Encrypt
 3269         // them one by one until there is less than a full block
 3270         // remaining, being careful to save both the encrypted counter
 3271         // and the counter.
 3272 
 3273         Label inner_loop;
 3274         __ bind(inner_loop);
 3275         // Counter to encrypt is in v0
 3276         __ aesecb_encrypt(noreg, noreg, keylen);
 3277         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3278 
 3279         // Do we have a remaining full block?
 3280 
 3281         __ mov(used, 0);
 3282         __ cmp(len, block_size);
 3283         __ br(__ LO, NEXT);
 3284 
 3285         // Yes, we have a full block
 3286         __ ldrq(v1, Address(in, offset));
 3287         __ eor(v1, __ T16B, v1, v0);
 3288         __ strq(v1, Address(out, offset));
 3289         __ mov(used, block_size);
 3290         __ add(offset, offset, block_size);
 3291 
 3292         __ subw(len, len, block_size);
 3293         __ cbzw(len, DONE);
 3294 
 3295         // Increment the counter, store it back
 3296         __ orr(v0, __ T16B, v16, v16);
 3297         __ rev64(v16, __ T16B, v16);
 3298         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3299         __ rev64(v16, __ T16B, v16);
 3300         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3301 
 3302         __ b(inner_loop);
 3303       }
 3304 
 3305       __ BIND(NEXT);
 3306 
 3307       // Encrypt a single byte, and loop.
 3308       // We expect this to be a rare event.
 3309       __ ldrb(rscratch1, Address(in, offset));
 3310       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3311       __ eor(rscratch1, rscratch1, rscratch2);
 3312       __ strb(rscratch1, Address(out, offset));
 3313       __ add(offset, offset, 1);
 3314       __ add(used, used, 1);
 3315       __ subw(len, len, 1);
 3316       __ cbnzw(len, L_CTR_loop);
 3317     }
 3318 
 3319     __ bind(DONE);
 3320     __ strw(used, Address(used_ptr));
 3321     __ mov(r0, saved_len);
 3322 
 3323     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3324     __ ret(lr);
 3325 
 3326     // Bulk encryption
 3327 
 3328     __ BIND(CTR_large_block);
 3329     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3330 
 3331     if (bulk_width == 8) {
 3332       __ sub(sp, sp, 4 * 16);
 3333       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3334     }
 3335     __ sub(sp, sp, 4 * 16);
 3336     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3337     RegSet saved_regs = (RegSet::of(in, out, offset)
 3338                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3339     __ push(saved_regs, sp);
 3340     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3341     __ add(in, in, offset);
 3342     __ add(out, out, offset);
 3343 
 3344     // Keys should already be loaded into the correct registers
 3345 
 3346     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3347     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3348 
 3349     // AES/CTR loop
 3350     {
 3351       Label L_CTR_loop;
 3352       __ BIND(L_CTR_loop);
 3353 
 3354       // Setup the counters
 3355       __ movi(v8, __ T4S, 0);
 3356       __ movi(v9, __ T4S, 1);
 3357       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3358 
 3359       for (int i = 0; i < bulk_width; i++) {
 3360         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3361         __ rev64(v0_ofs, __ T16B, v16);
 3362         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3363       }
 3364 
 3365       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3366 
 3367       // Encrypt the counters
 3368       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3369 
 3370       if (bulk_width == 8) {
 3371         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3372       }
 3373 
 3374       // XOR the encrypted counters with the inputs
 3375       for (int i = 0; i < bulk_width; i++) {
 3376         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3377         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3378         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3379       }
 3380 
 3381       // Write the encrypted data
 3382       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3383       if (bulk_width == 8) {
 3384         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3385       }
 3386 
 3387       __ subw(len, len, 16 * bulk_width);
 3388       __ cbnzw(len, L_CTR_loop);
 3389     }
 3390 
 3391     // Save the counter back where it goes
 3392     __ rev64(v16, __ T16B, v16);
 3393     __ st1(v16, __ T16B, counter);
 3394 
 3395     __ pop(saved_regs, sp);
 3396 
 3397     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3398     if (bulk_width == 8) {
 3399       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3400     }
 3401 
 3402     __ andr(rscratch1, len, -16 * bulk_width);
 3403     __ sub(len, len, rscratch1);
 3404     __ add(offset, offset, rscratch1);
 3405     __ mov(used, 16);
 3406     __ strw(used, Address(used_ptr));
 3407     __ b(large_block_return);
 3408 
 3409     return start;
 3410   }
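        // For reference (comment only), each full CTR block above reduces to
        //
        //    keystream = AES_encrypt(counter);   // also saved to saved_encrypted_ctr
        //    out[i..i+15] = in[i..i+15] ^ keystream;
        //    counter = counter + 1;              // 128-bit big-endian add, see be_add_128_64
        //
        // Partial blocks consume leftover keystream bytes via 'used', which is why
        // both the encrypted counter and 'used' are written back for the caller.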
 3411 
 3412   // Vector AES Galois Counter Mode implementation. Parameters:
 3413   //
 3414   // in = c_rarg0
 3415   // len = c_rarg1
 3416   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3417   // out = c_rarg3
 3418   // key = c_rarg4
 3419   // state = c_rarg5 - GHASH.state
 3420   // subkeyHtbl = c_rarg6 - powers of H
 3421   // counter = c_rarg7 - 16 bytes of CTR
 3422   // return - number of processed bytes
 3423   address generate_galoisCounterMode_AESCrypt() {
 3424     address ghash_polynomial = __ pc();
 3425     __ emit_int64(0x87);  // The low-order bits of the field
 3426                           // polynomial (i.e. p = z^7+z^2+z+1)
 3427                           // repeated in the low and high parts of a
 3428                           // 128-bit vector
 3429     __ emit_int64(0x87);
 3430 
 3431     __ align(CodeEntryAlignment);
 3432     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3433     StubCodeMark mark(this, stub_id);
 3434     address start = __ pc();
 3435     __ enter();
 3436 
 3437     const Register in = c_rarg0;
 3438     const Register len = c_rarg1;
 3439     const Register ct = c_rarg2;
 3440     const Register out = c_rarg3;
 3442 
 3443     const Register key = c_rarg4;
 3444     const Register state = c_rarg5;
 3445 
 3446     const Register subkeyHtbl = c_rarg6;
 3447 
 3448     const Register counter = c_rarg7;  // 16 bytes of CTR; updated with the incremented counter on exit
 3449 
 3450     const Register keylen = r10;
 3451     // Save state before entering routine
 3452     __ sub(sp, sp, 4 * 16);
 3453     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3454     __ sub(sp, sp, 4 * 16);
 3455     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3456 
 3458     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3459     __ str(len, __ pre(sp, -2 * wordSize));
 3460 
 3461     Label DONE;
 3462     __ cbz(len, DONE);
 3463 
 3464     // Compute #rounds for AES based on the length of the key array
 3465     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3466 
 3467     __ aesenc_loadkeys(key, keylen);
 3468     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3469     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3470 
 3471     // AES/CTR loop
 3472     {
 3473       Label L_CTR_loop;
 3474       __ BIND(L_CTR_loop);
 3475 
 3476       // Setup the counters
 3477       __ movi(v8, __ T4S, 0);
 3478       __ movi(v9, __ T4S, 1);
 3479       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3480 
 3481       assert(v0->encoding() < v8->encoding(), "");
 3482       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3483         FloatRegister f = as_FloatRegister(i);
 3484         __ rev32(f, __ T16B, v16);
 3485         __ addv(v16, __ T4S, v16, v8);
 3486       }
 3487 
 3488       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3489 
 3490       // Encrypt the counters
 3491       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3492 
 3493       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3494 
 3495       // XOR the encrypted counters with the inputs
 3496       for (int i = 0; i < 8; i++) {
 3497         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3498         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3499         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3500       }
 3501       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3502       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3503 
 3504       __ subw(len, len, 16 * 8);
 3505       __ cbnzw(len, L_CTR_loop);
 3506     }
 3507 
 3508     __ rev32(v16, __ T16B, v16);
 3509     __ st1(v16, __ T16B, counter);
 3510 
 3511     __ ldr(len, Address(sp));
 3512     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3513 
 3514     // GHASH/CTR loop
 3515     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3516                                 len, /*unrolls*/4);
 3517 
 3518 #ifdef ASSERT
 3519     { Label L;
 3520       __ cmp(len, (unsigned char)0);
 3521       __ br(Assembler::EQ, L);
 3522       __ stop("stubGenerator: abort");
 3523       __ bind(L);
 3524     }
 3525 #endif
 3526 
 3527     __ bind(DONE);
 3528     // Return the number of bytes processed
 3529     __ ldr(r0, __ post(sp, 2 * wordSize));
 3530 
 3531     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3532     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3533 
 3534     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3535     __ ret(lr);
 3536     return start;
 3537   }
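        // For reference (comment only): GCM here is CTR-mode encryption followed by a
        // GHASH pass over the ciphertext. GHASH folds each 16-byte block into the
        // running state with a carry-less multiply in GF(2^128), reduced modulo
        // x^128 + x^7 + x^2 + x + 1 (hence the 0x87 constant emitted above):
        //
        //    for each 16-byte ciphertext block C_i:
        //      state = (state ^ C_i) * H       // H and its powers come from subkeyHtbl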
 3538 
 3539   class Cached64Bytes {
 3540   private:
 3541     MacroAssembler *_masm;
 3542     Register _regs[8];
 3543 
 3544   public:
 3545     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3546       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3547       auto it = rs.begin();
 3548       for (auto &r: _regs) {
 3549         r = *it;
 3550         ++it;
 3551       }
 3552     }
 3553 
 3554     void gen_loads(Register base) {
 3555       for (int i = 0; i < 8; i += 2) {
 3556         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3557       }
 3558     }
 3559 
 3560     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3561     void extract_u32(Register dest, int i) {
 3562       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3563     }
 3564   };
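        // Example (comment only): after gen_loads, _regs[2] holds bytes 16..23 of the
        // block, so extract_u32(dest, 5) returns bits [32, 64) of _regs[2], i.e. the
        // little-endian word at byte offset 20.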
 3565 
 3566   // Utility routines for md5.
 3567   // Clobbers r10 and r11.
 3568   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3569               int k, int s, int t) {
 3570     Register rscratch3 = r10;
 3571     Register rscratch4 = r11;
 3572 
 3573     __ eorw(rscratch3, r3, r4);
 3574     __ movw(rscratch2, t);
 3575     __ andw(rscratch3, rscratch3, r2);
 3576     __ addw(rscratch4, r1, rscratch2);
 3577     reg_cache.extract_u32(rscratch1, k);
 3578     __ eorw(rscratch3, rscratch3, r4);
 3579     __ addw(rscratch4, rscratch4, rscratch1);
 3580     __ addw(rscratch3, rscratch3, rscratch4);
 3581     __ rorw(rscratch2, rscratch3, 32 - s);
 3582     __ addw(r1, rscratch2, r2);
 3583   }
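        // For reference (comment only): md5_FF computes one round-1 step of RFC 1321,
        //
        //    r1 = r2 + rotl32(r1 + F(r2, r3, r4) + x[k] + t, s)
        //    with F(x, y, z) = (x & y) | (~x & z)
        //
        // using the equivalent selector form F(x, y, z) = z ^ (x & (y ^ z)), which is
        // what the eorw/andw/eorw sequence above implements.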
 3584 
 3585   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3586               int k, int s, int t) {
 3587     Register rscratch3 = r10;
 3588     Register rscratch4 = r11;
 3589 
 3590     reg_cache.extract_u32(rscratch1, k);
 3591     __ movw(rscratch2, t);
 3592     __ addw(rscratch4, r1, rscratch2);
 3593     __ addw(rscratch4, rscratch4, rscratch1);
 3594     __ bicw(rscratch2, r3, r4);
 3595     __ andw(rscratch3, r2, r4);
 3596     __ addw(rscratch2, rscratch2, rscratch4);
 3597     __ addw(rscratch2, rscratch2, rscratch3);
 3598     __ rorw(rscratch2, rscratch2, 32 - s);
 3599     __ addw(r1, rscratch2, r2);
 3600   }
 3601 
 3602   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3603               int k, int s, int t) {
 3604     Register rscratch3 = r10;
 3605     Register rscratch4 = r11;
 3606 
 3607     __ eorw(rscratch3, r3, r4);
 3608     __ movw(rscratch2, t);
 3609     __ addw(rscratch4, r1, rscratch2);
 3610     reg_cache.extract_u32(rscratch1, k);
 3611     __ eorw(rscratch3, rscratch3, r2);
 3612     __ addw(rscratch4, rscratch4, rscratch1);
 3613     __ addw(rscratch3, rscratch3, rscratch4);
 3614     __ rorw(rscratch2, rscratch3, 32 - s);
 3615     __ addw(r1, rscratch2, r2);
 3616   }
 3617 
 3618   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3619               int k, int s, int t) {
 3620     Register rscratch3 = r10;
 3621     Register rscratch4 = r11;
 3622 
 3623     __ movw(rscratch3, t);
 3624     __ ornw(rscratch2, r2, r4);
 3625     __ addw(rscratch4, r1, rscratch3);
 3626     reg_cache.extract_u32(rscratch1, k);
 3627     __ eorw(rscratch3, rscratch2, r3);
 3628     __ addw(rscratch4, rscratch4, rscratch1);
 3629     __ addw(rscratch3, rscratch3, rscratch4);
 3630     __ rorw(rscratch2, rscratch3, 32 - s);
 3631     __ addw(r1, rscratch2, r2);
 3632   }
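        // For reference (comment only), the remaining RFC 1321 round functions are
        //
        //    G(x, y, z) = (x & z) | (y & ~z)    // md5_GG: andw + bicw; the two terms are
        //                                       // disjoint, so they are added rather than or-ed
        //    H(x, y, z) = x ^ y ^ z             // md5_HH: two eorw
        //    I(x, y, z) = y ^ (x | ~z)          // md5_II: ornw + eorw
        //
        // each used as r1 = r2 + rotl32(r1 + f(r2, r3, r4) + x[k] + t, s).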
 3633 
 3634   // Arguments:
 3635   //
 3636   // Inputs:
 3637   //   c_rarg0   - byte[]  source+offset
 3638   //   c_rarg1   - int[]   SHA.state
 3639   //   c_rarg2   - int     offset
 3640   //   c_rarg3   - int     limit
 3641   //
 3642   address generate_md5_implCompress(StubGenStubId stub_id) {
 3643     bool multi_block;
 3644     switch (stub_id) {
 3645     case md5_implCompress_id:
 3646       multi_block = false;
 3647       break;
 3648     case md5_implCompressMB_id:
 3649       multi_block = true;
 3650       break;
 3651     default:
 3652       ShouldNotReachHere();
 3653     }
 3654     __ align(CodeEntryAlignment);
 3655 
 3656     StubCodeMark mark(this, stub_id);
 3657     address start = __ pc();
 3658 
 3659     Register buf       = c_rarg0;
 3660     Register state     = c_rarg1;
 3661     Register ofs       = c_rarg2;
 3662     Register limit     = c_rarg3;
 3663     Register a         = r4;
 3664     Register b         = r5;
 3665     Register c         = r6;
 3666     Register d         = r7;
 3667     Register rscratch3 = r10;
 3668     Register rscratch4 = r11;
 3669 
 3670     Register state_regs[2] = { r12, r13 };
 3671     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3672     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3673 
 3674     __ push(saved_regs, sp);
 3675 
 3676     __ ldp(state_regs[0], state_regs[1], Address(state));
 3677     __ ubfx(a, state_regs[0],  0, 32);
 3678     __ ubfx(b, state_regs[0], 32, 32);
 3679     __ ubfx(c, state_regs[1],  0, 32);
 3680     __ ubfx(d, state_regs[1], 32, 32);
 3681 
 3682     Label md5_loop;
 3683     __ BIND(md5_loop);
 3684 
 3685     reg_cache.gen_loads(buf);
 3686 
 3687     // Round 1
 3688     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3689     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3690     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3691     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3692     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3693     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3694     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3695     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3696     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3697     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3698     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3699     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3700     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3701     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3702     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3703     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3704 
 3705     // Round 2
 3706     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3707     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3708     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3709     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3710     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3711     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3712     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3713     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3714     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3715     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3716     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3717     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3718     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3719     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3720     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3721     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3722 
 3723     // Round 3
 3724     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3725     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3726     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3727     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3728     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3729     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3730     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3731     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3732     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3733     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3734     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3735     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3736     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3737     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3738     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3739     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3740 
 3741     // Round 4
 3742     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3743     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3744     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3745     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3746     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3747     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3748     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3749     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3750     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3751     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3752     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3753     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3754     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3755     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3756     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3757     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3758 
 3759     __ addw(a, state_regs[0], a);
 3760     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3761     __ addw(b, rscratch2, b);
 3762     __ addw(c, state_regs[1], c);
 3763     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3764     __ addw(d, rscratch4, d);
 3765 
 3766     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3767     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3768 
 3769     if (multi_block) {
 3770       __ add(buf, buf, 64);
 3771       __ add(ofs, ofs, 64);
 3772       __ cmp(ofs, limit);
 3773       __ br(Assembler::LE, md5_loop);
 3774       __ mov(c_rarg0, ofs); // return ofs
 3775     }
 3776 
 3777     // write hash values back in the correct order
 3778     __ stp(state_regs[0], state_regs[1], Address(state));
 3779 
 3780     __ pop(saved_regs, sp);
 3781 
 3782     __ ret(lr);
 3783 
 3784     return start;
 3785   }
 3786 
 3787   // Arguments:
 3788   //
 3789   // Inputs:
 3790   //   c_rarg0   - byte[]  source+offset
 3791   //   c_rarg1   - int[]   SHA.state
 3792   //   c_rarg2   - int     offset
 3793   //   c_rarg3   - int     limit
 3794   //
 3795   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3796     bool multi_block;
 3797     switch (stub_id) {
 3798     case sha1_implCompress_id:
 3799       multi_block = false;
 3800       break;
 3801     case sha1_implCompressMB_id:
 3802       multi_block = true;
 3803       break;
 3804     default:
 3805       ShouldNotReachHere();
 3806     }
 3807 
 3808     __ align(CodeEntryAlignment);
 3809 
 3810     StubCodeMark mark(this, stub_id);
 3811     address start = __ pc();
 3812 
 3813     Register buf   = c_rarg0;
 3814     Register state = c_rarg1;
 3815     Register ofs   = c_rarg2;
 3816     Register limit = c_rarg3;
 3817 
 3818     Label keys;
 3819     Label sha1_loop;
 3820 
 3821     // load the keys into v0..v3
 3822     __ adr(rscratch1, keys);
 3823     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3824     // load 5 words state into v6, v7
 3825     __ ldrq(v6, Address(state, 0));
 3826     __ ldrs(v7, Address(state, 16));
 3827 
 3828 
 3829     __ BIND(sha1_loop);
 3830     // load 64 bytes of data into v16..v19
 3831     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3832     __ rev32(v16, __ T16B, v16);
 3833     __ rev32(v17, __ T16B, v17);
 3834     __ rev32(v18, __ T16B, v18);
 3835     __ rev32(v19, __ T16B, v19);
 3836 
 3837     // do the sha1
 3838     __ addv(v4, __ T4S, v16, v0);
 3839     __ orr(v20, __ T16B, v6, v6);
 3840 
 3841     FloatRegister d0 = v16;
 3842     FloatRegister d1 = v17;
 3843     FloatRegister d2 = v18;
 3844     FloatRegister d3 = v19;
 3845 
 3846     for (int round = 0; round < 20; round++) {
 3847       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3848       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3849       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3850       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3851       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3852 
 3853       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3854       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3855       __ sha1h(tmp2, __ T4S, v20);
 3856       if (round < 5)
 3857         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3858       else if (round < 10 || round >= 15)
 3859         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3860       else
 3861         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3862       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3863 
 3864       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3865     }
 3866 
 3867     __ addv(v7, __ T2S, v7, v21);
 3868     __ addv(v6, __ T4S, v6, v20);
 3869 
 3870     if (multi_block) {
 3871       __ add(ofs, ofs, 64);
 3872       __ cmp(ofs, limit);
 3873       __ br(Assembler::LE, sha1_loop);
 3874       __ mov(c_rarg0, ofs); // return ofs
 3875     }
 3876 
 3877     __ strq(v6, Address(state, 0));
 3878     __ strs(v7, Address(state, 16));
 3879 
 3880     __ ret(lr);
 3881 
 3882     __ bind(keys);
 3883     __ emit_int32(0x5a827999);
 3884     __ emit_int32(0x6ed9eba1);
 3885     __ emit_int32(0x8f1bbcdc);
 3886     __ emit_int32(0xca62c1d6);
 3887 
 3888     return start;
 3889   }
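        // Note (comment only): the loop above performs the 80 SHA-1 rounds four at a
        // time. The constants emitted at 'keys' are K0..K3 of FIPS 180-4, applied to
        // rounds 0-19, 20-39, 40-59 and 60-79, and the sha1c/sha1p/sha1m selection
        // matches the Ch/Parity/Maj round functions for those same ranges while
        // d0..d3 rotate through the 16-word message schedule.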
 3890 
 3891 
 3892   // Arguments:
 3893   //
 3894   // Inputs:
 3895   //   c_rarg0   - byte[]  source+offset
 3896   //   c_rarg1   - int[]   SHA.state
 3897   //   c_rarg2   - int     offset
 3898   //   c_rarg3   - int     limit
 3899   //
 3900   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3901     bool multi_block;
 3902     switch (stub_id) {
 3903     case sha256_implCompress_id:
 3904       multi_block = false;
 3905       break;
 3906     case sha256_implCompressMB_id:
 3907       multi_block = true;
 3908       break;
 3909     default:
 3910       ShouldNotReachHere();
 3911     }
 3912 
 3913     static const uint32_t round_consts[64] = {
 3914       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3915       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3916       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3917       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3918       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3919       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3920       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3921       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3922       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3923       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3924       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3925       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3926       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3927       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3928       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3929       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3930     };
 3931 
 3932     __ align(CodeEntryAlignment);
 3933 
 3934     StubCodeMark mark(this, stub_id);
 3935     address start = __ pc();
 3936 
 3937     Register buf   = c_rarg0;
 3938     Register state = c_rarg1;
 3939     Register ofs   = c_rarg2;
 3940     Register limit = c_rarg3;
 3941 
 3942     Label sha256_loop;
 3943 
 3944     __ stpd(v8, v9, __ pre(sp, -32));
 3945     __ stpd(v10, v11, Address(sp, 16));
 3946 
 3947 // dga == v0
 3948 // dgb == v1
 3949 // dg0 == v2
 3950 // dg1 == v3
 3951 // dg2 == v4
 3952 // t0 == v6
 3953 // t1 == v7
 3954 
 3955     // load 16 keys to v16..v31
 3956     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3957     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3958     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3959     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3960     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3961 
 3962     // load 8 words (256 bits) state
 3963     __ ldpq(v0, v1, state);
 3964 
 3965     __ BIND(sha256_loop);
 3966     // load 64 bytes of data into v8..v11
 3967     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3968     __ rev32(v8, __ T16B, v8);
 3969     __ rev32(v9, __ T16B, v9);
 3970     __ rev32(v10, __ T16B, v10);
 3971     __ rev32(v11, __ T16B, v11);
 3972 
 3973     __ addv(v6, __ T4S, v8, v16);
 3974     __ orr(v2, __ T16B, v0, v0);
 3975     __ orr(v3, __ T16B, v1, v1);
 3976 
 3977     FloatRegister d0 = v8;
 3978     FloatRegister d1 = v9;
 3979     FloatRegister d2 = v10;
 3980     FloatRegister d3 = v11;
 3981 
 3982 
 3983     for (int round = 0; round < 16; round++) {
 3984       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3985       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3986       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3987       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3988 
 3989       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3990       __ orr(v4, __ T16B, v2, v2);
 3991       if (round < 15)
 3992         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3993       __ sha256h(v2, __ T4S, v3, tmp2);
 3994       __ sha256h2(v3, __ T4S, v4, tmp2);
 3995       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3996 
 3997       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3998     }
 3999 
 4000     __ addv(v0, __ T4S, v0, v2);
 4001     __ addv(v1, __ T4S, v1, v3);
 4002 
 4003     if (multi_block) {
 4004       __ add(ofs, ofs, 64);
 4005       __ cmp(ofs, limit);
 4006       __ br(Assembler::LE, sha256_loop);
 4007       __ mov(c_rarg0, ofs); // return ofs
 4008     }
 4009 
 4010     __ ldpd(v10, v11, Address(sp, 16));
 4011     __ ldpd(v8, v9, __ post(sp, 32));
 4012 
 4013     __ stpq(v0, v1, state);
 4014 
 4015     __ ret(lr);
 4016 
 4017     return start;
 4018   }
 4019 
 4020   // Double rounds for sha512.
 4021   void sha512_dround(int dr,
 4022                      FloatRegister vi0, FloatRegister vi1,
 4023                      FloatRegister vi2, FloatRegister vi3,
 4024                      FloatRegister vi4, FloatRegister vrc0,
 4025                      FloatRegister vrc1, FloatRegister vin0,
 4026                      FloatRegister vin1, FloatRegister vin2,
 4027                      FloatRegister vin3, FloatRegister vin4) {
 4028       if (dr < 36) {
 4029         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4030       }
 4031       __ addv(v5, __ T2D, vrc0, vin0);
 4032       __ ext(v6, __ T16B, vi2, vi3, 8);
 4033       __ ext(v5, __ T16B, v5, v5, 8);
 4034       __ ext(v7, __ T16B, vi1, vi2, 8);
 4035       __ addv(vi3, __ T2D, vi3, v5);
 4036       if (dr < 32) {
 4037         __ ext(v5, __ T16B, vin3, vin4, 8);
 4038         __ sha512su0(vin0, __ T2D, vin1);
 4039       }
 4040       __ sha512h(vi3, __ T2D, v6, v7);
 4041       if (dr < 32) {
 4042         __ sha512su1(vin0, __ T2D, vin2, v5);
 4043       }
 4044       __ addv(vi4, __ T2D, vi1, vi3);
 4045       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4046   }
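        // For reference (comment only): each sha512_dround call covers two of the 80
        // SHA-512 rounds (40 calls per block). Calls with dr < 32 also extend the
        // message schedule (sha512su0/sha512su1) for later rounds, and calls with
        // dr < 36 prefetch the next pair of round constants into vrc1; the final
        // rounds need neither, which is what the two guards express.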
 4047 
 4048   // Arguments:
 4049   //
 4050   // Inputs:
 4051   //   c_rarg0   - byte[]  source+offset
 4052   //   c_rarg1   - int[]   SHA.state
 4053   //   c_rarg2   - int     offset
 4054   //   c_rarg3   - int     limit
 4055   //
 4056   address generate_sha512_implCompress(StubGenStubId stub_id) {
 4057     bool multi_block;
 4058     switch (stub_id) {
 4059     case sha512_implCompress_id:
 4060       multi_block = false;
 4061       break;
 4062     case sha512_implCompressMB_id:
 4063       multi_block = true;
 4064       break;
 4065     default:
 4066       ShouldNotReachHere();
 4067     }
 4068 
 4069     static const uint64_t round_consts[80] = {
 4070       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4071       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4072       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4073       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4074       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4075       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4076       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4077       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4078       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4079       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4080       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4081       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4082       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4083       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4084       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4085       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4086       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4087       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4088       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4089       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4090       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4091       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4092       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4093       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4094       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4095       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4096       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4097     };
 4098 
 4099     __ align(CodeEntryAlignment);
 4100 
 4101     StubCodeMark mark(this, stub_id);
 4102     address start = __ pc();
 4103 
 4104     Register buf   = c_rarg0;
 4105     Register state = c_rarg1;
 4106     Register ofs   = c_rarg2;
 4107     Register limit = c_rarg3;
 4108 
 4109     __ stpd(v8, v9, __ pre(sp, -64));
 4110     __ stpd(v10, v11, Address(sp, 16));
 4111     __ stpd(v12, v13, Address(sp, 32));
 4112     __ stpd(v14, v15, Address(sp, 48));
 4113 
 4114     Label sha512_loop;
 4115 
 4116     // load state
 4117     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4118 
 4119     // load first 4 round constants
 4120     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4121     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4122 
 4123     __ BIND(sha512_loop);
 4124     // load 128B of data into v12..v19
 4125     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4126     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4127     __ rev64(v12, __ T16B, v12);
 4128     __ rev64(v13, __ T16B, v13);
 4129     __ rev64(v14, __ T16B, v14);
 4130     __ rev64(v15, __ T16B, v15);
 4131     __ rev64(v16, __ T16B, v16);
 4132     __ rev64(v17, __ T16B, v17);
 4133     __ rev64(v18, __ T16B, v18);
 4134     __ rev64(v19, __ T16B, v19);
 4135 
 4136     __ mov(rscratch2, rscratch1);
 4137 
 4138     __ mov(v0, __ T16B, v8);
 4139     __ mov(v1, __ T16B, v9);
 4140     __ mov(v2, __ T16B, v10);
 4141     __ mov(v3, __ T16B, v11);
 4142 
 4143     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4144     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4145     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4146     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4147     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4148     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4149     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4150     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4151     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4152     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4153     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4154     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4155     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4156     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4157     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4158     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4159     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4160     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4161     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4162     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4163     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4164     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4165     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4166     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4167     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4168     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4169     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4170     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4171     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4172     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4173     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4174     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4175     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4176     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4177     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4178     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4179     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4180     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4181     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4182     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4183 
 4184     __ addv(v8, __ T2D, v8, v0);
 4185     __ addv(v9, __ T2D, v9, v1);
 4186     __ addv(v10, __ T2D, v10, v2);
 4187     __ addv(v11, __ T2D, v11, v3);
 4188 
 4189     if (multi_block) {
 4190       __ add(ofs, ofs, 128);
 4191       __ cmp(ofs, limit);
 4192       __ br(Assembler::LE, sha512_loop);
 4193       __ mov(c_rarg0, ofs); // return ofs
 4194     }
 4195 
 4196     __ st1(v8, v9, v10, v11, __ T2D, state);
 4197 
 4198     __ ldpd(v14, v15, Address(sp, 48));
 4199     __ ldpd(v12, v13, Address(sp, 32));
 4200     __ ldpd(v10, v11, Address(sp, 16));
 4201     __ ldpd(v8, v9, __ post(sp, 64));
 4202 
 4203     __ ret(lr);
 4204 
 4205     return start;
 4206   }
 4207 
 4208   // Execute one round of keccak of two computations in parallel.
 4209   // One of the states should be loaded into the lower halves of
 4210   // the vector registers v0-v24, the other should be loaded into
 4211   // the upper halves of those registers. The ld1r instruction loads
 4212   // the round constant into both halves of register v31.
 4213   // Intermediate results c0...c5 and d0...d5 are computed
 4214   // in registers v25...v30.
 4215   // All vector instructions that are used operate on both register
 4216   // halves in parallel.
 4217   // If only a single computation is needed, one can load only the lower halves.
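        //
        // For reference only (not part of the generated stub): a minimal scalar
        // sketch of one Keccak-f[1600] round in the same theta/rho-pi/chi/iota
        // order that the vector code below interleaves. Helper and constant
        // names are illustrative, not taken from this file.
        //
        //   static uint64_t rol64(uint64_t x, int n) { return (x << n) | (x >> (64 - n)); }
        //
        //   static void keccak_round_ref(uint64_t a[25], uint64_t rc) {
        //     uint64_t c[5], d[5], t, u;
        //     for (int x = 0; x < 5; x++)                    // theta
        //       c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        //     for (int x = 0; x < 5; x++)
        //       d[x] = c[(x + 4) % 5] ^ rol64(c[(x + 1) % 5], 1);
        //     for (int i = 0; i < 25; i++)
        //       a[i] ^= d[i % 5];
        //     static const int piln[24] = { 10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
        //                                   15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1 };
        //     static const int rotc[24] = {  1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
        //                                   27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44 };
        //     t = a[1];
        //     for (int i = 0; i < 24; i++) {                 // rho + pi
        //       u = a[piln[i]];
        //       a[piln[i]] = rol64(t, rotc[i]);
        //       t = u;
        //     }
        //     for (int y = 0; y < 25; y += 5) {              // chi
        //       uint64_t row[5];
        //       for (int x = 0; x < 5; x++) row[x] = a[y + x];
        //       for (int x = 0; x < 5; x++)
        //         a[y + x] = row[x] ^ (~row[(x + 1) % 5] & row[(x + 2) % 5]);
        //     }
        //     a[0] ^= rc;                                    // iota
        //   }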
 4218   void keccak_round(Register rscratch1) {
 4219   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4220   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4221   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4222   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4223   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4224   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4225   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4226   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4227   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4228   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4229 
 4230   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4231   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4232   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4233   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4234   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4235 
 4236   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4237   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4238   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4239   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4240   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4241   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4242   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4243   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4244   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4245   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4246   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4247   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4248   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4249   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4250   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4251   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4252   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4253   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4254   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4255   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4256   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4257   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4258   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4259   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4260   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4261 
 4262   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4263   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4264   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4265   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4266   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4267 
 4268   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4269 
 4270   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4271   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4272   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4273   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4274   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4275 
 4276   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4277   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4278   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4279   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4280   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4281 
 4282   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4283   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4284   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4285   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4286   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4287 
 4288   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4289   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4290   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4291   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4292   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4293 
 4294   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4295   }
 4296 
 4297   // Arguments:
 4298   //
 4299   // Inputs:
 4300   //   c_rarg0   - byte[]  source+offset
 4301   //   c_rarg1   - byte[]  SHA.state
 4302   //   c_rarg2   - int     block_size
 4303   //   c_rarg3   - int     offset
 4304   //   c_rarg4   - int     limit
 4305   //
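        //   block_size is the sponge rate in bytes and is what the bit tests
        //   below dispatch on: 72 -> SHA3-512, 104 -> SHA3-384,
        //   136 -> SHA3-256 or SHAKE256, 144 -> SHA3-224, 168 -> SHAKE128.
        //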
 4306   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4307     bool multi_block;
 4308     switch (stub_id) {
 4309     case sha3_implCompress_id:
 4310       multi_block = false;
 4311       break;
 4312     case sha3_implCompressMB_id:
 4313       multi_block = true;
 4314       break;
 4315     default:
 4316       ShouldNotReachHere();
 4317     }
 4318 
 4319     static const uint64_t round_consts[24] = {
 4320       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4321       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4322       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4323       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4324       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4325       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4326       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4327       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4328     };
 4329 
 4330     __ align(CodeEntryAlignment);
 4331 
 4332     StubCodeMark mark(this, stub_id);
 4333     address start = __ pc();
 4334 
 4335     Register buf           = c_rarg0;
 4336     Register state         = c_rarg1;
 4337     Register block_size    = c_rarg2;
 4338     Register ofs           = c_rarg3;
 4339     Register limit         = c_rarg4;
 4340 
 4341     Label sha3_loop, rounds24_loop;
 4342     Label sha3_512_or_sha3_384, shake128;
 4343 
 4344     __ stpd(v8, v9, __ pre(sp, -64));
 4345     __ stpd(v10, v11, Address(sp, 16));
 4346     __ stpd(v12, v13, Address(sp, 32));
 4347     __ stpd(v14, v15, Address(sp, 48));
 4348 
 4349     // load state
 4350     __ add(rscratch1, state, 32);
 4351     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4352     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4353     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4354     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4355     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4356     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4357     __ ld1(v24, __ T1D, rscratch1);
 4358 
 4359     __ BIND(sha3_loop);
 4360 
 4361     // 24 keccak rounds
 4362     __ movw(rscratch2, 24);
 4363 
 4364     // load round_constants base
 4365     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4366 
 4367     // load input
 4368     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4369     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4370     __ eor(v0, __ T8B, v0, v25);
 4371     __ eor(v1, __ T8B, v1, v26);
 4372     __ eor(v2, __ T8B, v2, v27);
 4373     __ eor(v3, __ T8B, v3, v28);
 4374     __ eor(v4, __ T8B, v4, v29);
 4375     __ eor(v5, __ T8B, v5, v30);
 4376     __ eor(v6, __ T8B, v6, v31);
 4377 
 4378     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4379     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4380 
 4381     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4382     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4383     __ eor(v7, __ T8B, v7, v25);
 4384     __ eor(v8, __ T8B, v8, v26);
 4385     __ eor(v9, __ T8B, v9, v27);
 4386     __ eor(v10, __ T8B, v10, v28);
 4387     __ eor(v11, __ T8B, v11, v29);
 4388     __ eor(v12, __ T8B, v12, v30);
 4389     __ eor(v13, __ T8B, v13, v31);
 4390 
 4391     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4392     __ eor(v14, __ T8B, v14, v25);
 4393     __ eor(v15, __ T8B, v15, v26);
 4394     __ eor(v16, __ T8B, v16, v27);
 4395 
 4396     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4397     __ andw(c_rarg5, block_size, 48);
 4398     __ cbzw(c_rarg5, rounds24_loop);
 4399 
 4400     __ tbnz(block_size, 5, shake128);
 4401     // block_size == 144, bit5 == 0, SHA3-224
 4402     __ ldrd(v28, __ post(buf, 8));
 4403     __ eor(v17, __ T8B, v17, v28);
 4404     __ b(rounds24_loop);
 4405 
 4406     __ BIND(shake128);
 4407     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4408     __ eor(v17, __ T8B, v17, v28);
 4409     __ eor(v18, __ T8B, v18, v29);
 4410     __ eor(v19, __ T8B, v19, v30);
 4411     __ eor(v20, __ T8B, v20, v31);
 4412     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4413 
 4414     __ BIND(sha3_512_or_sha3_384);
 4415     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4416     __ eor(v7, __ T8B, v7, v25);
 4417     __ eor(v8, __ T8B, v8, v26);
 4418     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4419 
 4420     // SHA3-384
 4421     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4422     __ eor(v9,  __ T8B, v9,  v27);
 4423     __ eor(v10, __ T8B, v10, v28);
 4424     __ eor(v11, __ T8B, v11, v29);
 4425     __ eor(v12, __ T8B, v12, v30);
 4426 
 4427     __ BIND(rounds24_loop);
 4428     __ subw(rscratch2, rscratch2, 1);
 4429 
 4430     keccak_round(rscratch1);
 4431 
 4432     __ cbnzw(rscratch2, rounds24_loop);
 4433 
 4434     if (multi_block) {
 4435       __ add(ofs, ofs, block_size);
 4436       __ cmp(ofs, limit);
 4437       __ br(Assembler::LE, sha3_loop);
 4438       __ mov(c_rarg0, ofs); // return ofs
 4439     }
 4440 
 4441     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4442     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4443     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4444     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4445     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4446     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4447     __ st1(v24, __ T1D, state);
 4448 
 4449     // restore callee-saved registers
 4450     __ ldpd(v14, v15, Address(sp, 48));
 4451     __ ldpd(v12, v13, Address(sp, 32));
 4452     __ ldpd(v10, v11, Address(sp, 16));
 4453     __ ldpd(v8, v9, __ post(sp, 64));
 4454 
 4455     __ ret(lr);
 4456 
 4457     return start;
 4458   }
 4459 
 4460   // Inputs:
 4461   //   c_rarg0   - long[]  state0
 4462   //   c_rarg1   - long[]  state1
 4463   address generate_double_keccak() {
 4464     static const uint64_t round_consts[24] = {
 4465       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4466       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4467       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4468       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4469       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4470       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4471       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4472       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4473     };
 4474 
 4475     // Implements the double_keccak() method of the
 4476     // sun.security.provider.SHA3Parallel class
 4477     __ align(CodeEntryAlignment);
 4478     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4479     address start = __ pc();
 4480     __ enter();
 4481 
 4482     Register state0        = c_rarg0;
 4483     Register state1        = c_rarg1;
 4484 
 4485     Label rounds24_loop;
 4486 
 4487     // save callee-saved registers
 4488     __ stpd(v8, v9, __ pre(sp, -64));
 4489     __ stpd(v10, v11, Address(sp, 16));
 4490     __ stpd(v12, v13, Address(sp, 32));
 4491     __ stpd(v14, v15, Address(sp, 48));
 4492 
 4493     // load states
 4494     __ add(rscratch1, state0, 32);
 4495     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4496     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4497     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4498     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4499     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4500     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4501     __ ld1(v24, __ D, 0, rscratch1);
 4502     __ add(rscratch1, state1, 32);
 4503     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4504     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4505     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4506     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4507     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4508     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4509     __ ld1(v24, __ D, 1, rscratch1);
 4510 
 4511     // 24 keccak rounds
 4512     __ movw(rscratch2, 24);
 4513 
 4514     // load round_constants base
 4515     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4516 
 4517     __ BIND(rounds24_loop);
 4518     __ subw(rscratch2, rscratch2, 1);
 4519     keccak_round(rscratch1);
 4520     __ cbnzw(rscratch2, rounds24_loop);
 4521 
 4522     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4523     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4524     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4525     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4526     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4527     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4528     __ st1(v24, __ D, 0, state0);
 4529     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4530     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4531     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4532     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4533     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4534     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4535     __ st1(v24, __ D, 1, state1);
 4536 
 4537     // restore callee-saved vector registers
 4538     __ ldpd(v14, v15, Address(sp, 48));
 4539     __ ldpd(v12, v13, Address(sp, 32));
 4540     __ ldpd(v10, v11, Address(sp, 16));
 4541     __ ldpd(v8, v9, __ post(sp, 64));
 4542 
 4543     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4544     __ mov(r0, zr); // return 0
 4545     __ ret(lr);
 4546 
 4547     return start;
 4548   }
 4549 
 4550   // ChaCha20 block function.  This version parallelizes the 32-bit
 4551   // state elements on each of 16 vectors, producing 4 blocks of
 4552   // keystream at a time.
 4553   //
 4554   // state (int[16]) = c_rarg0
 4555   // keystream (byte[256]) = c_rarg1
 4556   // return - number of bytes of produced keystream (always 256)
 4557   //
 4558   // This implementation takes each 32-bit integer from the state
 4559   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4560   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4561   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4562   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4563   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4564   // operations represented by one QUARTERROUND function, we instead stack all
 4565   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4566   // and then do the same for the second set of 4 quarter rounds.  This removes
 4567   // some latency that would otherwise be incurred by waiting for an add to
 4568   // complete before performing an xor (which depends on the result of the
 4569   // add), etc. An adjustment happens between the first and second groups of 4
 4570   // quarter rounds, but this is done only in the inputs to the macro functions
 4571   // that generate the assembly instructions - these adjustments themselves are
 4572   // not part of the resulting assembly.
 4573   // The 4 registers v0-v3 are used during the quarter round operations as
 4574   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4575   // registers become the vectors involved in adding the start state back onto
 4576   // the post-QR working state.  After the adds are complete, each of the 16
 4577   // vectors write their first lane back to the keystream buffer, followed
 4578   // by the second lane from all vectors and so on.
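        //
        // For reference only (not part of the generated stub): the scalar quarter
        // round from RFC 7539 that each cc20_qr_* bundle below applies to four
        // (a, b, c, d) tuples at once. Names are illustrative.
        //
        //   static uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
        //
        //   static void cc20_quarter_round_ref(uint32_t st[16], int a, int b, int c, int d) {
        //     st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 16);
        //     st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 12);
        //     st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 8);
        //     st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 7);
        //   }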
 4579   address generate_chacha20Block_blockpar() {
 4580     Label L_twoRounds, L_cc20_const;
 4581     // The constant data is broken into two 128-bit segments to be loaded
 4582     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4583     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4584     // The second 128 bits hold a table constant used for 8-bit left rotations.
 4585     __ BIND(L_cc20_const);
 4586     __ emit_int64(0x0000000100000000UL);
 4587     __ emit_int64(0x0000000300000002UL);
 4588     __ emit_int64(0x0605040702010003UL);
 4589     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4590 
 4591     __ align(CodeEntryAlignment);
 4592     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4593     StubCodeMark mark(this, stub_id);
 4594     address start = __ pc();
 4595     __ enter();
 4596 
 4597     int i, j;
 4598     const Register state = c_rarg0;
 4599     const Register keystream = c_rarg1;
 4600     const Register loopCtr = r10;
 4601     const Register tmpAddr = r11;
 4602     const FloatRegister ctrAddOverlay = v28;
 4603     const FloatRegister lrot8Tbl = v29;
 4604 
 4605     // Organize SIMD registers in an array that facilitates
 4606     // putting repetitive opcodes into loop structures.  It is
 4607     // important that each grouping of 4 registers is monotonically
 4608     // increasing to support the requirements of multi-register
 4609     // instructions (e.g. ld4r, st4, etc.)
 4610     const FloatRegister workSt[16] = {
 4611          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4612         v20, v21, v22, v23, v24, v25, v26, v27
 4613     };
 4614 
 4615     // Pull in constant data.  The first 16 bytes are the add overlay
 4616     // which is applied to the vector holding the counter (state[12]).
 4617     // The second 16 bytes are the index vector for the 8-bit left
 4618     // rotation tbl instruction.
 4619     __ adr(tmpAddr, L_cc20_const);
 4620     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4621 
 4622     // Load from memory and interlace across 16 SIMD registers,
 4623     // with each word from memory being broadcast to all lanes of
 4624     // each successive SIMD register.
 4625     //      Addr(0) -> All lanes in workSt[i]
 4626     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4627     __ mov(tmpAddr, state);
 4628     for (i = 0; i < 16; i += 4) {
 4629       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4630           __ post(tmpAddr, 16));
 4631     }
 4632     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4633 
 4634     // Before entering the loop, create 5 4-register arrays.  These
 4635     // will hold the 4 registers that represent the a/b/c/d fields
 4636     // in the quarter round operation.  For instance the "b" field
 4637     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4638     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4639     // since it is part of a diagonal organization.  The aSet and scratch
 4640     // register sets are defined at declaration time because they do not change
 4641     // organization at any point during the 20-round processing.
 4642     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4643     FloatRegister bSet[4];
 4644     FloatRegister cSet[4];
 4645     FloatRegister dSet[4];
 4646     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4647 
 4648     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4649     __ mov(loopCtr, 10);
 4650     __ BIND(L_twoRounds);
 4651 
 4652     // Set to columnar organization and do the following 4 quarter-rounds:
 4653     // QUARTERROUND(0, 4, 8, 12)
 4654     // QUARTERROUND(1, 5, 9, 13)
 4655     // QUARTERROUND(2, 6, 10, 14)
 4656     // QUARTERROUND(3, 7, 11, 15)
 4657     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4658     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4659     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4660 
 4661     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4662     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4663     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4664 
 4665     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4666     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4667     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4668 
 4669     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4670     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4671     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4672 
 4673     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4674     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4675     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4676 
 4677     // Set to diagonal organization and do the next 4 quarter-rounds:
 4678     // QUARTERROUND(0, 5, 10, 15)
 4679     // QUARTERROUND(1, 6, 11, 12)
 4680     // QUARTERROUND(2, 7, 8, 13)
 4681     // QUARTERROUND(3, 4, 9, 14)
 4682     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4683     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4684     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4685 
 4686     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4687     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4688     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4689 
 4690     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4691     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4692     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4693 
 4694     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4695     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4696     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4697 
 4698     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4699     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4700     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4701 
 4702     // Decrement and iterate
 4703     __ sub(loopCtr, loopCtr, 1);
 4704     __ cbnz(loopCtr, L_twoRounds);
 4705 
 4706     __ mov(tmpAddr, state);
 4707 
 4708     // Add the starting state back to the post-loop keystream
 4709     // state.  We read/interlace the state array from memory into
 4710     // 4 registers similar to what we did in the beginning.  Then
 4711     // add the counter overlay onto workSt[12] at the end.
 4712     for (i = 0; i < 16; i += 4) {
 4713       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4714       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4715       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4716       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4717       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4718     }
 4719     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4720 
 4721     // Write working state into the keystream buffer.  This is accomplished
 4722     // by taking the lane "i" from each of the four vectors and writing
 4723     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4724     // repeating with the next 4 vectors until all 16 vectors have been used.
 4725     // Then move to the next lane and repeat the process until all lanes have
 4726     // been written.
 4727     for (i = 0; i < 4; i++) {
 4728       for (j = 0; j < 16; j += 4) {
 4729         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4730             __ post(keystream, 16));
 4731       }
 4732     }
 4733 
 4734     __ mov(r0, 256);             // Return length of output keystream
 4735     __ leave();
 4736     __ ret(lr);
 4737 
 4738     return start;
 4739   }
 4740 
 4741   // Helpers to schedule parallel operation bundles across vector
 4742   // register sequences of size 2, 4 or 8.
 4743 
 4744   // Implement various primitive computations across vector sequences
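        //
        // The vs_* helpers below all follow the same pattern: a VSeq<N> names N
        // vector registers (with the default register step these are consecutive,
        // e.g. VSeq<4>(16) covers v16..v19) and v[i] selects the i-th register of
        // the sequence, so each helper expands one logical operation into N SIMD
        // instructions. A minimal usage sketch (illustration only):
        //
        //   VSeq<4> va(0), vb(4), vc(8);   // v0..v3, v4..v7, v8..v11
        //   vs_addv(va, __ T8H, vb, vc);   // emits 4 addv instructions:
        //                                  //   v0 = v4 + v8, ... , v3 = v7 + v11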
 4745 
 4746   template<int N>
 4747   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4748                const VSeq<N>& v1, const VSeq<N>& v2) {
 4749     // output must not be constant
 4750     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4751     // output cannot overwrite pending inputs
 4752     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4753     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4754     for (int i = 0; i < N; i++) {
 4755       __ addv(v[i], T, v1[i], v2[i]);
 4756     }
 4757   }
 4758 
 4759   template<int N>
 4760   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4761                const VSeq<N>& v1, const VSeq<N>& v2) {
 4762     // output must not be constant
 4763     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4764     // output cannot overwrite pending inputs
 4765     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4766     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4767     for (int i = 0; i < N; i++) {
 4768       __ subv(v[i], T, v1[i], v2[i]);
 4769     }
 4770   }
 4771 
 4772   template<int N>
 4773   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4774                const VSeq<N>& v1, const VSeq<N>& v2) {
 4775     // output must not be constant
 4776     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4777     // output cannot overwrite pending inputs
 4778     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4779     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4780     for (int i = 0; i < N; i++) {
 4781       __ mulv(v[i], T, v1[i], v2[i]);
 4782     }
 4783   }
 4784 
 4785   template<int N>
 4786   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4787     // output must not be constant
 4788     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4789     // output cannot overwrite pending inputs
 4790     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4791     for (int i = 0; i < N; i++) {
 4792       __ negr(v[i], T, v1[i]);
 4793     }
 4794   }
 4795 
 4796   template<int N>
 4797   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4798                const VSeq<N>& v1, int shift) {
 4799     // output must not be constant
 4800     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4801     // output cannot overwrite pending inputs
 4802     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4803     for (int i = 0; i < N; i++) {
 4804       __ sshr(v[i], T, v1[i], shift);
 4805     }
 4806   }
 4807 
 4808   template<int N>
 4809   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4810     // output must not be constant
 4811     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4812     // output cannot overwrite pending inputs
 4813     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4814     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4815     for (int i = 0; i < N; i++) {
 4816       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4817     }
 4818   }
 4819 
 4820   template<int N>
 4821   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4822     // output must not be constant
 4823     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4824     // output cannot overwrite pending inputs
 4825     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4826     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4827     for (int i = 0; i < N; i++) {
 4828       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4829     }
 4830   }
 4831 
 4832   template<int N>
 4833   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4834     // output must not be constant
 4835     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4836     // output cannot overwrite pending inputs
 4837     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4838     for (int i = 0; i < N; i++) {
 4839       __ notr(v[i], __ T16B, v1[i]);
 4840     }
 4841   }
 4842 
 4843   template<int N>
 4844   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4845     // output must not be constant
 4846     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4847     // output cannot overwrite pending inputs
 4848     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4849     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4850     for (int i = 0; i < N; i++) {
 4851       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4852     }
 4853   }
 4854 
 4855   template<int N>
 4856   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4857     // output must not be constant
 4858     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4859     // output cannot overwrite pending inputs
 4860     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4861     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4862     for (int i = 0; i < N; i++) {
 4863       __ mlsv(v[i], T, v1[i], v2[i]);
 4864     }
 4865   }
 4866 
 4867   // load N/2 successive pairs of quadword values from memory in order
 4868   // into N successive vector registers of the sequence via the
 4869   // address supplied in base.
 4870   template<int N>
 4871   void vs_ldpq(const VSeq<N>& v, Register base) {
 4872     for (int i = 0; i < N; i += 2) {
 4873       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4874     }
 4875   }
 4876 
 4877   // load N/2 successive pairs of quadword values from memory in order
 4878   // into N vector registers of the sequence via the address supplied
 4879   // in base using post-increment addressing
 4880   template<int N>
 4881   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4882     static_assert((N & 1) == 0, "sequence length must be even");
 4883     for (int i = 0; i < N; i += 2) {
 4884       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4885     }
 4886   }
 4887 
 4888   // store N successive vector registers of the sequence into N/2
 4889   // successive pairs of quadword memory locations via the address
 4890   // supplied in base using post-increment addressing
 4891   template<int N>
 4892   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4893     static_assert((N & 1) == 0, "sequence length must be even");
 4894     for (int i = 0; i < N; i += 2) {
 4895       __ stpq(v[i], v[i+1], __ post(base, 32));
 4896     }
 4897   }
 4898 
 4899   // load N/2 pairs of quadword values from memory de-interleaved into
 4900   // N vector registers 2 at a time via the address supplied in base
 4901   // using post-increment addressing.
 4902   template<int N>
 4903   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4904     static_assert((N & 1) == 0, "sequence length must be even");
 4905     for (int i = 0; i < N; i += 2) {
 4906       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4907     }
 4908   }
 4909 
 4910   // store N vector registers interleaved into N/2 pairs of quadword
 4911   // memory locations via the address supplied in base using
 4912   // post-increment addressing.
 4913   template<int N>
 4914   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4915     static_assert((N & 1) == 0, "sequence length must be even");
 4916     for (int i = 0; i < N; i += 2) {
 4917       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4918     }
 4919   }
 4920 
 4921   // load N quadword values from memory de-interleaved into N vector
 4922   // registers 3 elements at a time via the address supplied in base.
 4923   template<int N>
 4924   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4925     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4926     for (int i = 0; i < N; i += 3) {
 4927       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4928     }
 4929   }
 4930 
 4931   // load N quadword values from memory de-interleaved into N vector
 4932   // registers 3 elements at a time via the address supplied in base
 4933   // using post-increment addressing.
 4934   template<int N>
 4935   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4936     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4937     for (int i = 0; i < N; i += 3) {
 4938       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4939     }
 4940   }
 4941 
 4942   // load N/2 pairs of quadword values from memory into N vector
 4943   // registers via the address supplied in base with each pair indexed
 4944   // using the start offset plus the corresponding entry in the
 4945   // offsets array
 4946   template<int N>
 4947   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4948     for (int i = 0; i < N/2; i++) {
 4949       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4950     }
 4951   }
 4952 
 4953   // store N vector registers into N/2 pairs of quadword memory
 4954   // locations via the address supplied in base with each pair indexed
 4955   // using the start offset plus the corresponding entry in the
 4956   // offsets array
 4957   template<int N>
 4958   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4959     for (int i = 0; i < N/2; i++) {
 4960       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4961     }
 4962   }
 4963 
 4964   // load N single quadword values from memory into N vector registers
 4965   // via the address supplied in base with each value indexed using
 4966   // the start offset plus the corresponding entry in the offsets
 4967   // array
 4968   template<int N>
 4969   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4970                       int start, int (&offsets)[N]) {
 4971     for (int i = 0; i < N; i++) {
 4972       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4973     }
 4974   }
 4975 
 4976   // store N vector registers into N single quadword memory locations
 4977   // via the address supplied in base with each value indexed using
 4978   // the start offset plus the corresponding entry in the offsets
 4979   // array
 4980   template<int N>
 4981   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4982                       int start, int (&offsets)[N]) {
 4983     for (int i = 0; i < N; i++) {
 4984       __ str(v[i], T, Address(base, start + offsets[i]));
 4985     }
 4986   }
 4987 
 4988   // load N/2 pairs of quadword values from memory de-interleaved into
 4989   // N vector registers 2 at a time via the address supplied in base
 4990   // with each pair indexed using the start offset plus the
 4991   // corresponding entry in the offsets array
 4992   template<int N>
 4993   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4994                       Register tmp, int start, int (&offsets)[N/2]) {
 4995     for (int i = 0; i < N/2; i++) {
 4996       __ add(tmp, base, start + offsets[i]);
 4997       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4998     }
 4999   }
 5000 
 5001   // store N vector registers 2 at a time interleaved into N/2 pairs
 5002   // of quadword memory locations via the address supplied in base
 5003   // with each pair indexed using the start offset plus the
 5004   // corresponding entry in the offsets array
 5005   template<int N>
 5006   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5007                       Register tmp, int start, int (&offsets)[N/2]) {
 5008     for (int i = 0; i < N/2; i++) {
 5009       __ add(tmp, base, start + offsets[i]);
 5010       __ st2(v[2*i], v[2*i+1], T, tmp);
 5011     }
 5012   }
 5013 
 5014   // Helper routines for various flavours of Montgomery multiply
 5015 
 5016   // Perform 16 32-bit (4x4S) or 32 16-bit (4x8H) Montgomery
 5017   // multiplications in parallel
 5018   //
 5019 
 5020   // See the montMul() method of the sun.security.provider.ML_DSA
 5021   // class.
 5022   //
 5023   // Computes 4x4S results or 4x8H results
 5024   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5025   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5026   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5027   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5028   // Outputs: va - 4x4S or 4x8H vector register sequences
 5029   // vb, vc, vtmp and vq must all be disjoint
 5030   // va must be disjoint from all other inputs/temps or must equal vc
 5031   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5032   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
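        //
        // For reference only (not part of the generated stub): the scalar
        // Montgomery multiplication each 16-bit lane computes, written in the
        // style of the ML-KEM/ML-DSA reference code. Names are illustrative.
        //
        //   // returns a value congruent to b * c * 2^-16 mod q
        //   // (bounded by q when the inputs are suitably reduced)
        //   static int16_t montmul16_ref(int16_t b, int16_t c,
        //                                int16_t q, int16_t qinv /* q^-1 mod 2^16 */) {
        //     int32_t prod = (int32_t)b * c;
        //     int16_t m = (int16_t)((int16_t)prod * qinv);     // m = lo16(prod) * qinv mod 2^16
        //     return (int16_t)((prod - (int32_t)m * q) >> 16); // exact division by 2^16
        //   }
        //
        // The vector code below reaches the same result without widening: sqdmulh
        // and mulv provide the high and low halves of the products, and the final
        // halving subtract (shsubv) undoes the doubling performed by sqdmulh.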
 5033   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5034                    Assembler::SIMD_Arrangement T,
 5035                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5036     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5037     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5038     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5039     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5040 
 5041     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5042     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5043 
 5044     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5045 
 5046     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5047     assert(vs_disjoint(va, vb), "va and vb overlap");
 5048     assert(vs_disjoint(va, vq), "va and vq overlap");
 5049     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5050     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5051 
 5052     // schedule 4 streams of instructions across the vector sequences
 5053     for (int i = 0; i < 4; i++) {
 5054       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5055       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5056     }
 5057 
 5058     for (int i = 0; i < 4; i++) {
 5059       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5060     }
 5061 
 5062     for (int i = 0; i < 4; i++) {
 5063       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5064     }
 5065 
 5066     for (int i = 0; i < 4; i++) {
 5067       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5068     }
 5069   }
 5070 
 5071   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5072   // multiplications in parallel
 5073   //
 5074 
 5075   // See the montMul() method of the sun.security.provider.ML_DSA
 5076   // class.
 5077   //
 5078   // Computes 2x4S results or 2x8H results
 5079   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5080   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5081   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5082   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5083   // Outputs: va - 2x4S or 2x8H vector register sequences
 5084   // vb, vc, vtmp and vq must all be disjoint
 5085   // va must be disjoint from all other inputs/temps or must equal vc
 5086   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5087   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5088   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5089                    Assembler::SIMD_Arrangement T,
 5090                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5091     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5092     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5093     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5094     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5095 
 5096     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5097     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5098 
 5099     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5100 
 5101     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5102     assert(vs_disjoint(va, vb), "va and vb overlap");
 5103     assert(vs_disjoint(va, vq), "va and vq overlap");
 5104     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5105     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5106 
 5107     // schedule 2 streams of instructions across the vector sequences
 5108     for (int i = 0; i < 2; i++) {
 5109       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5110       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5111     }
 5112 
 5113     for (int i = 0; i < 2; i++) {
 5114       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5115     }
 5116 
 5117     for (int i = 0; i < 2; i++) {
 5118       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5119     }
 5120 
 5121     for (int i = 0; i < 2; i++) {
 5122       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5123     }
 5124   }
 5125 
 5126   // Perform 16 16-bit Montgomery multiplications in parallel.
 5127   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5128                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5129     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5130     // It will assert that the register use is valid
 5131     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5132   }
 5133 
 5134   // Perform 32 16-bit Montgomery multiplications in parallel.
 5135   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5136                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5137     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5138     // It will assert that the register use is valid
 5139     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5140   }
 5141 
 5142   // Perform 64 16-bit Montgomery multiplications in parallel.
 5143   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5144                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5145     // Schedule two successive 4x8H multiplies via the montmul helper
 5146     // on the front and back halves of va, vb and vc. The helper will
 5147     // assert that the register use has no overlap conflicts on each
 5148     // individual call but we also need to ensure that the necessary
 5149     // disjoint/equality constraints are met across both calls.
 5150 
 5151     // vb, vc, vtmp and vq must be disjoint. va must either be
 5152     // disjoint from all other registers or equal vc
 5153 
 5154     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5155     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5156     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5157 
 5158     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5159     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5160 
 5161     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5162 
 5163     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5164     assert(vs_disjoint(va, vb), "va and vb overlap");
 5165     assert(vs_disjoint(va, vq), "va and vq overlap");
 5166     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5167 
 5168     // we multiply the front and back halves of each sequence 4 at a
 5169     // time because
 5170     //
 5171     // 1) we are currently only able to get 4-way instruction
 5172     // parallelism at best
 5173     //
 5174     // 2) we need registers for the constants in vq and temporary
 5175     // scratch registers to hold intermediate results so vtmp can only
 5176     // be a VSeq<4> which means we only have 4 scratch slots
 5177 
 5178     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5179     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5180   }
 5181 
 5182   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5183                                const VSeq<4>& vc,
 5184                                const VSeq<4>& vtmp,
 5185                                const VSeq<2>& vq) {
 5186     // compute a = montmul(a1, c)
 5187     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5188     // output a1 = a0 - a
 5189     vs_subv(va1, __ T8H, va0, vc);
 5190     //    and a0 = a0 + a
 5191     vs_addv(va0, __ T8H, va0, vc);
 5192   }
 5193 
 5194   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5195                                const VSeq<4>& vb,
 5196                                const VSeq<4>& vtmp1,
 5197                                const VSeq<4>& vtmp2,
 5198                                const VSeq<2>& vq) {
 5199     // compute c = a0 - a1
 5200     vs_subv(vtmp1, __ T8H, va0, va1);
 5201     // output a0 = a0 + a1
 5202     vs_addv(va0, __ T8H, va0, va1);
 5203     // output a1 = b montmul c
 5204     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5205   }
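        // Taken together, the two helpers above are the usual NTT butterflies in
        // scalar form (illustration only; zeta denotes the Montgomery-domain
        // twiddle factor passed in via vc/vb):
        //
        //   kyber_montmul32_sub_add:  t = montmul(a1, zeta); a1 = a0 - t; a0 = a0 + t;
        //   kyber_sub_add_montmul32:  t = a0 - a1; a0 = a0 + a1; a1 = montmul(t, zeta);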
 5206 
 5207   void load64shorts(const VSeq<8>& v, Register shorts) {
 5208     vs_ldpq_post(v, shorts);
 5209   }
 5210 
 5211   void load32shorts(const VSeq<4>& v, Register shorts) {
 5212     vs_ldpq_post(v, shorts);
 5213   }
 5214 
 5215   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5216     vs_stpq_post(v, tmpAddr);
 5217   }
 5218 
 5219   // Kyber NTT function.
 5220   // Implements
 5221   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5222   //
 5223   // coeffs (short[256]) = c_rarg0
 5224   // ntt_zetas (short[256]) = c_rarg1
 5225   address generate_kyberNtt() {
 5226 
 5227     __ align(CodeEntryAlignment);
 5228     StubGenStubId stub_id = StubGenStubId::kyberNtt_id;
 5229     StubCodeMark mark(this, stub_id);
 5230     address start = __ pc();
 5231     __ enter();
 5232 
 5233     const Register coeffs = c_rarg0;
 5234     const Register zetas = c_rarg1;
 5235 
 5236     const Register kyberConsts = r10;
 5237     const Register tmpAddr = r11;
 5238 
 5239     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5240     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5241     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5242 
 5243     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5244     // load the montmul constants
 5245     vs_ldpq(vq, kyberConsts);
 5246 
 5247     // Each level corresponds to an iteration of the outermost loop of the
 5248     // Java method seilerNTT(int[] coeffs). There are some differences
 5249     // from what is done in the seilerNTT() method, though:
 5250     // 1. The computation uses 16-bit signed values; we do not convert them
 5251     // to ints here.
 5252     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5253     // this array for each level, which makes it easier to fill up the vector
 5254     // registers.
 5255     // 3. The seilerNTT() method uses R = 2^20 for the Montgomery
 5256     // multiplications (chosen so that there is no overflow during the
 5257     // inverse NTT computation); here we use R = 2^16 so that we can use
 5258     // 16-bit arithmetic in the vector unit.
 5259     //
 5260     // On each level, we fill up the vector registers in such a way that the
 5261     // array elements that need to be multiplied by the zetas go into one
 5262     // set of vector registers while the corresponding ones that don't need to
 5263     // be multiplied, go into another set.
 5264     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5265     // registers interleaving the steps of 4 identical computations,
 5266     // each done on 8 16-bit values per register.
 5267 
 5268     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5269     // to the zetas occur in discrete blocks whose size is some multiple
 5270     // of 32.
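          //
          // For reference only: one level of the scalar forward NTT that each of
          // the level 0-6 blocks below vectorizes, with len = 128 >> level and
          // montmul as sketched above (the zeta indexing here is illustrative;
          // the stub consumes pre-expanded zetas from the ntt_zetas argument):
          //
          //   for (int start = 0; start < 256; start += 2 * len) {
          //     int16_t zeta = zetas[k++];
          //     for (int j = start; j < start + len; j++) {
          //       int16_t t = montmul(zeta, coeffs[j + len]);
          //       coeffs[j + len] = coeffs[j] - t;
          //       coeffs[j]       = coeffs[j] + t;
          //     }
          //   }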
 5271 
 5272     // level 0
 5273     __ add(tmpAddr, coeffs, 256);
 5274     load64shorts(vs1, tmpAddr);
 5275     load64shorts(vs2, zetas);
 5276     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5277     __ add(tmpAddr, coeffs, 0);
 5278     load64shorts(vs1, tmpAddr);
 5279     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5280     vs_addv(vs1, __ T8H, vs1, vs2);
 5281     __ add(tmpAddr, coeffs, 0);
 5282     vs_stpq_post(vs1, tmpAddr);
 5283     __ add(tmpAddr, coeffs, 256);
 5284     vs_stpq_post(vs3, tmpAddr);
 5285     // restore montmul constants
 5286     vs_ldpq(vq, kyberConsts);
 5287     load64shorts(vs1, tmpAddr);
 5288     load64shorts(vs2, zetas);
 5289     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5290     __ add(tmpAddr, coeffs, 128);
 5291     load64shorts(vs1, tmpAddr);
 5292     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5293     vs_addv(vs1, __ T8H, vs1, vs2);
 5294     __ add(tmpAddr, coeffs, 128);
 5295     store64shorts(vs1, tmpAddr);
 5296     __ add(tmpAddr, coeffs, 384);
 5297     store64shorts(vs3, tmpAddr);
 5298 
 5299     // level 1
 5300     // restore montmul constants
 5301     vs_ldpq(vq, kyberConsts);
 5302     __ add(tmpAddr, coeffs, 128);
 5303     load64shorts(vs1, tmpAddr);
 5304     load64shorts(vs2, zetas);
 5305     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5306     __ add(tmpAddr, coeffs, 0);
 5307     load64shorts(vs1, tmpAddr);
 5308     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5309     vs_addv(vs1, __ T8H, vs1, vs2);
 5310     __ add(tmpAddr, coeffs, 0);
 5311     store64shorts(vs1, tmpAddr);
 5312     store64shorts(vs3, tmpAddr);
 5313     vs_ldpq(vq, kyberConsts);
 5314     __ add(tmpAddr, coeffs, 384);
 5315     load64shorts(vs1, tmpAddr);
 5316     load64shorts(vs2, zetas);
 5317     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5318     __ add(tmpAddr, coeffs, 256);
 5319     load64shorts(vs1, tmpAddr);
 5320     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5321     vs_addv(vs1, __ T8H, vs1, vs2);
 5322     __ add(tmpAddr, coeffs, 256);
 5323     store64shorts(vs1, tmpAddr);
 5324     store64shorts(vs3, tmpAddr);
 5325 
 5326     // level 2
 5327     vs_ldpq(vq, kyberConsts);
 5328     int offsets1[4] = { 0, 32, 128, 160 };
 5329     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5330     load64shorts(vs2, zetas);
 5331     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5332     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5333     // kyber_subv_addv64();
 5334     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5335     vs_addv(vs1, __ T8H, vs1, vs2);
 5336     __ add(tmpAddr, coeffs, 0);
 5337     vs_stpq_post(vs_front(vs1), tmpAddr);
 5338     vs_stpq_post(vs_front(vs3), tmpAddr);
 5339     vs_stpq_post(vs_back(vs1), tmpAddr);
 5340     vs_stpq_post(vs_back(vs3), tmpAddr);
 5341     vs_ldpq(vq, kyberConsts);
 5342     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5343     load64shorts(vs2, zetas);
 5344     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5345     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5346     // kyber_subv_addv64();
 5347     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5348     vs_addv(vs1, __ T8H, vs1, vs2);
 5349     __ add(tmpAddr, coeffs, 256);
 5350     vs_stpq_post(vs_front(vs1), tmpAddr);
 5351     vs_stpq_post(vs_front(vs3), tmpAddr);
 5352     vs_stpq_post(vs_back(vs1), tmpAddr);
 5353     vs_stpq_post(vs_back(vs3), tmpAddr);
 5354 
 5355     // level 3
 5356     vs_ldpq(vq, kyberConsts);
 5357     int offsets2[4] = { 0, 64, 128, 192 };
 5358     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5359     load64shorts(vs2, zetas);
 5360     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5361     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5362     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5363     vs_addv(vs1, __ T8H, vs1, vs2);
 5364     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5365     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5366 
 5367     vs_ldpq(vq, kyberConsts);
 5368     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5369     load64shorts(vs2, zetas);
 5370     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5371     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5372     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5373     vs_addv(vs1, __ T8H, vs1, vs2);
 5374     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5375     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5376 
 5377     // level 4
 5378     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5379     // so they are loaded using an ldr at 8 distinct offsets.
 5380 
 5381     vs_ldpq(vq, kyberConsts);
 5382     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5383     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5384     load64shorts(vs2, zetas);
 5385     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5386     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5387     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5388     vs_addv(vs1, __ T8H, vs1, vs2);
 5389     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5390     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5391 
 5392     vs_ldpq(vq, kyberConsts);
 5393     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5394     load64shorts(vs2, zetas);
 5395     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5396     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5397     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5398     vs_addv(vs1, __ T8H, vs1, vs2);
 5399     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5400     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5401 
 5402     // level 5
 5403     // At level 5 related coefficients occur in discrete blocks of size 8, so
 5404     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5405 
 5406     vs_ldpq(vq, kyberConsts);
 5407     int offsets4[4] = { 0, 32, 64, 96 };
 5408     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5409     load32shorts(vs_front(vs2), zetas);
 5410     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5411     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5412     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5413     load32shorts(vs_front(vs2), zetas);
 5414     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5415     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5416     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5417     load32shorts(vs_front(vs2), zetas);
 5418     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5419     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5420 
 5421     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5422     load32shorts(vs_front(vs2), zetas);
 5423     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5424     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5425 
 5426     // level 6
 5427     // At level 6 related coefficients occur in discrete blocks of size 4, so
 5428     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5429 
 5430     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5431     load32shorts(vs_front(vs2), zetas);
 5432     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5433     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5434     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5436     load32shorts(vs_front(vs2), zetas);
 5437     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5438     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5439 
 5440     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5441     load32shorts(vs_front(vs2), zetas);
 5442     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5443     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5444 
 5445     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5446     load32shorts(vs_front(vs2), zetas);
 5447     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5448     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5449 
 5450     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5451     __ mov(r0, zr); // return 0
 5452     __ ret(lr);
 5453 
 5454     return start;
 5455   }
 5456 
 5457   // Kyber Inverse NTT function
 5458   // Implements
 5459   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5460   //
 5461   // coeffs (short[256]) = c_rarg0
 5462   // ntt_zetas (short[256]) = c_rarg1
 5463   address generate_kyberInverseNtt() {
 5464 
 5465     __ align(CodeEntryAlignment);
 5466     StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id;
 5467     StubCodeMark mark(this, stub_id);
 5468     address start = __ pc();
 5469     __ enter();
 5470 
 5471     const Register coeffs = c_rarg0;
 5472     const Register zetas = c_rarg1;
 5473 
 5474     const Register kyberConsts = r10;
 5475     const Register tmpAddr = r11;
 5476     const Register tmpAddr2 = c_rarg2;
 5477 
 5478     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5479     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5480     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5481 
 5482     __ lea(kyberConsts,
 5483              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5484 
 5485     // level 0
 5486     // At level 0 related coefficients occur in discrete blocks of size 4, so
 5487     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5488 
 5489     vs_ldpq(vq, kyberConsts);
 5490     int offsets4[4] = { 0, 32, 64, 96 };
 5491     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5492     load32shorts(vs_front(vs2), zetas);
 5493     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5494                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5495     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5496     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5497     load32shorts(vs_front(vs2), zetas);
 5498     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5499                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5500     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5501     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5502     load32shorts(vs_front(vs2), zetas);
 5503     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5504                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5505     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5506     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5507     load32shorts(vs_front(vs2), zetas);
 5508     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5509                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5510     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5511 
 5512     // level 1
 5513     // At level 1 related coefficients occur in discrete blocks of size 8, so
 5514     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5515 
 5516     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5517     load32shorts(vs_front(vs2), zetas);
 5518     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5519                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5520     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5521     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5522     load32shorts(vs_front(vs2), zetas);
 5523     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5524                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5525     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5526 
 5527     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5528     load32shorts(vs_front(vs2), zetas);
 5529     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5530                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5531     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5532     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5533     load32shorts(vs_front(vs2), zetas);
 5534     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5535                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5536     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5537 
 5538     // level 2
 5539     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5540     // so they are loaded using an ldr at 8 distinct offsets.
 5541 
 5542     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5543     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5544     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5545     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5546     vs_subv(vs1, __ T8H, vs1, vs2);
 5547     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5548     load64shorts(vs2, zetas);
 5549     vs_ldpq(vq, kyberConsts);
 5550     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5551     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5552 
 5553     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5554     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5555     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5556     vs_subv(vs1, __ T8H, vs1, vs2);
 5557     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5558     load64shorts(vs2, zetas);
 5559     vs_ldpq(vq, kyberConsts);
 5560     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5561     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5562 
 5563     // Barrett reduction at indexes where overflow may happen
 5564 
 5565     // load q and the multiplier for the Barrett reduction
 5566     __ add(tmpAddr, kyberConsts, 16);
 5567     vs_ldpq(vq, tmpAddr);
 5568 
 5569     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5570     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5571     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
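          // In scalar terms each reduction below computes (a sketch; the
          // multiplier is assumed to be the usual round(2^26 / q) value):
          //
          //   int16_t t = (int16_t)(((int32_t)a * barrettMultiplier) >> 26);
          //   a -= (int16_t)(t * q);
          //
          // sqdmulh yields (2 * a * multiplier) >> 16 and the sshr by 11
          // completes the >> 26; mlsv then subtracts t * q.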
 5572     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5573     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5574     vs_sshr(vs2, __ T8H, vs2, 11);
 5575     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5576     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5577     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5578     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5579     vs_sshr(vs2, __ T8H, vs2, 11);
 5580     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5581     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5582 
 5583     // level 3
 5584     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5585     // some multiple of 32 so they can be loaded using ldpq and suitable indexes.
 5586 
 5587     int offsets2[4] = { 0, 64, 128, 192 };
 5588     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5589     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5590     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5591     vs_subv(vs1, __ T8H, vs1, vs2);
 5592     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5593     load64shorts(vs2, zetas);
 5594     vs_ldpq(vq, kyberConsts);
 5595     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5596     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5597 
 5598     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5599     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5600     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5601     vs_subv(vs1, __ T8H, vs1, vs2);
 5602     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5603     load64shorts(vs2, zetas);
 5604     vs_ldpq(vq, kyberConsts);
 5605     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5606     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5607 
 5608     // level 4
 5609 
 5610     int offsets1[4] = { 0, 32, 128, 160 };
 5611     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5612     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5613     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5614     vs_subv(vs1, __ T8H, vs1, vs2);
 5615     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5616     load64shorts(vs2, zetas);
 5617     vs_ldpq(vq, kyberConsts);
 5618     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5619     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5620 
 5621     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5622     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5623     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5624     vs_subv(vs1, __ T8H, vs1, vs2);
 5625     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5626     load64shorts(vs2, zetas);
 5627     vs_ldpq(vq, kyberConsts);
 5628     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5629     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5630 
 5631     // level 5
 5632 
 5633     __ add(tmpAddr, coeffs, 0);
 5634     load64shorts(vs1, tmpAddr);
 5635     __ add(tmpAddr, coeffs, 128);
 5636     load64shorts(vs2, tmpAddr);
 5637     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5638     vs_subv(vs1, __ T8H, vs1, vs2);
 5639     __ add(tmpAddr, coeffs, 0);
 5640     store64shorts(vs3, tmpAddr);
 5641     load64shorts(vs2, zetas);
 5642     vs_ldpq(vq, kyberConsts);
 5643     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5644     __ add(tmpAddr, coeffs, 128);
 5645     store64shorts(vs2, tmpAddr);
 5646 
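          // n.b. tmpAddr now points at coeffs + 256 because store64shorts
          // post-increments it by 128 bytes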
 5647     load64shorts(vs1, tmpAddr);
 5648     __ add(tmpAddr, coeffs, 384);
 5649     load64shorts(vs2, tmpAddr);
 5650     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5651     vs_subv(vs1, __ T8H, vs1, vs2);
 5652     __ add(tmpAddr, coeffs, 256);
 5653     store64shorts(vs3, tmpAddr);
 5654     load64shorts(vs2, zetas);
 5655     vs_ldpq(vq, kyberConsts);
 5656     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5657     __ add(tmpAddr, coeffs, 384);
 5658     store64shorts(vs2, tmpAddr);
 5659 
 5660     // Barrett reduction at indexes where overflow may happen
 5661 
 5662     // load q and the multiplier for the Barrett reduction
 5663     __ add(tmpAddr, kyberConsts, 16);
 5664     vs_ldpq(vq, tmpAddr);
 5665 
 5666     int offsets0[2] = { 0, 256 };
 5667     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5668     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5669     vs_sshr(vs2, __ T8H, vs2, 11);
 5670     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5671     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5672 
 5673     // level 6
 5674 
 5675     __ add(tmpAddr, coeffs, 0);
 5676     load64shorts(vs1, tmpAddr);
 5677     __ add(tmpAddr, coeffs, 256);
 5678     load64shorts(vs2, tmpAddr);
 5679     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5680     vs_subv(vs1, __ T8H, vs1, vs2);
 5681     __ add(tmpAddr, coeffs, 0);
 5682     store64shorts(vs3, tmpAddr);
 5683     load64shorts(vs2, zetas);
 5684     vs_ldpq(vq, kyberConsts);
 5685     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5686     __ add(tmpAddr, coeffs, 256);
 5687     store64shorts(vs2, tmpAddr);
 5688 
 5689     __ add(tmpAddr, coeffs, 128);
 5690     load64shorts(vs1, tmpAddr);
 5691     __ add(tmpAddr, coeffs, 384);
 5692     load64shorts(vs2, tmpAddr);
 5693     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5694     vs_subv(vs1, __ T8H, vs1, vs2);
 5695     __ add(tmpAddr, coeffs, 128);
 5696     store64shorts(vs3, tmpAddr);
 5697     load64shorts(vs2, zetas);
 5698     vs_ldpq(vq, kyberConsts);
 5699     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5700     __ add(tmpAddr, coeffs, 384);
 5701     store64shorts(vs2, tmpAddr);
 5702 
 5703     // multiply by 2^-n
 5704 
 5705     // load toMont(2^-n mod q)
 5706     __ add(tmpAddr, kyberConsts, 48);
 5707     __ ldr(v29, __ Q, tmpAddr);
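          // Montgomery-multiplying a coefficient by toMont(2^-n mod q) yields
          // a plain (non-Montgomery) result scaled by 2^-n: the R factor
          // introduced by toMont cancels against the R^-1 applied by the
          // Montgomery reduction.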
 5708 
 5709     vs_ldpq(vq, kyberConsts);
 5710     __ add(tmpAddr, coeffs, 0);
 5711     load64shorts(vs1, tmpAddr);
 5712     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5713     __ add(tmpAddr, coeffs, 0);
 5714     store64shorts(vs2, tmpAddr);
 5715 
 5716     // now tmpAddr contains coeffs + 128 because store64shorts post-increments it
 5717     load64shorts(vs1, tmpAddr);
 5718     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5719     __ add(tmpAddr, coeffs, 128);
 5720     store64shorts(vs2, tmpAddr);
 5721 
 5722     // now tmpAddr contains coeffs + 256
 5723     load64shorts(vs1, tmpAddr);
 5724     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5725     __ add(tmpAddr, coeffs, 256);
 5726     store64shorts(vs2, tmpAddr);
 5727 
 5728     // now tmpAddr contains coeffs + 384
 5729     load64shorts(vs1, tmpAddr);
 5730     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5731     __ add(tmpAddr, coeffs, 384);
 5732     store64shorts(vs2, tmpAddr);
 5733 
 5734     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5735     __ mov(r0, zr); // return 0
 5736     __ ret(lr);
 5737 
 5738     return start;
 5739   }
 5740 
 5741   // Kyber multiply polynomials in the NTT domain.
 5742   // Implements
 5743   // static int implKyberNttMult(
 5744   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5745   //
 5746   // result (short[256]) = c_rarg0
 5747   // ntta (short[256]) = c_rarg1
 5748   // nttb (short[256]) = c_rarg2
 5749   // zetas (short[128]) = c_rarg3
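        //
        // The multiplication is performed pairwise in the NTT domain, i.e. each
        // pair (a0, a1) of adjacent coefficients of ntta is multiplied by the
        // corresponding pair (b0, b1) of nttb modulo (X^2 - zeta). A scalar
        // sketch of one such step (before the final scaling by
        // montRSquareModQ), with every product a Montgomery multiplication:
        //
        //   c0 = montmul(a0, b0) + montmul(montmul(a1, b1), zeta);
        //   c1 = montmul(a0, b1) + montmul(a1, b0);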
 5750   address generate_kyberNttMult() {
 5751 
 5752     __ align(CodeEntryAlignment);
 5753     StubGenStubId stub_id = StubGenStubId::kyberNttMult_id;
 5754     StubCodeMark mark(this, stub_id);
 5755     address start = __ pc();
 5756     __ enter();
 5757 
 5758     const Register result = c_rarg0;
 5759     const Register ntta = c_rarg1;
 5760     const Register nttb = c_rarg2;
 5761     const Register zetas = c_rarg3;
 5762 
 5763     const Register kyberConsts = r10;
 5764     const Register limit = r11;
 5765 
 5766     VSeq<4> vs1(0), vs2(4);  // 4 sets of 4x8H inputs/outputs/tmps
 5767     VSeq<4> vs3(16), vs4(20);
 5768     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5769     VSeq<2> vz(28);          // pair of zetas
 5770     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5771 
 5772     __ lea(kyberConsts,
 5773              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5774 
 5775     Label kyberNttMult_loop;
 5776 
 5777     __ add(limit, result, 512);
 5778 
 5779     // load q and qinv
 5780     vs_ldpq(vq, kyberConsts);
 5781 
 5782     // load R^2 mod q (to convert back from Montgomery representation)
 5783     __ add(kyberConsts, kyberConsts, 64);
 5784     __ ldr(v27, __ Q, kyberConsts);
 5785 
 5786     __ BIND(kyberNttMult_loop);
 5787 
 5788     // load 16 zetas
 5789     vs_ldpq_post(vz, zetas);
 5790 
 5791     // load 2 sets of 32 coefficients from the two input arrays
 5792     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5793     // are striped across pairs of vector registers
 5794     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5795     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5796     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5797     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5798 
 5799     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5800     // i.e. montmul the first and second halves of vs1 in order and
 5801     // then with one sequence reversed storing the two results in vs3
 5802     //
 5803     // vs3[0] <- montmul(a0, b0)
 5804     // vs3[1] <- montmul(a1, b1)
 5805     // vs3[2] <- montmul(a0, b1)
 5806     // vs3[3] <- montmul(a1, b0)
 5807     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5808     kyber_montmul16(vs_back(vs3),
 5809                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5810 
 5811     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5812     // i.e. montmul the first and second halves of vs4 in order and
 5813     // then with one sequence reversed storing the two results in vs1
 5814     //
 5815     // vs1[0] <- montmul(a2, b2)
 5816     // vs1[1] <- montmul(a3, b3)
 5817     // vs1[2] <- montmul(a2, b3)
 5818     // vs1[3] <- montmul(a3, b2)
 5819     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5820     kyber_montmul16(vs_back(vs1),
 5821                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5822 
 5823     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5824     // We can schedule two montmuls at a time if we use a suitable vector
 5825     // sequence <vs3[1], vs1[1]>.
 5826     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5827     VSeq<2> vs5(vs3[1], delta);
 5828 
 5829     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5830     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5831     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5832 
 5833     // add results in pairs storing in vs3
 5834     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5835     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5836     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5837 
 5838     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5839     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5840     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5841 
 5842     // vs1 <- montmul(vs3, montRSquareModQ)
 5843     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5844 
 5845     // store back the two pairs of result vectors de-interleaved as 8H elements
 5846     // i.e. storing each pair of shorts striped across a register pair adjacent
 5847     // in memory
 5848     vs_st2_post(vs1, __ T8H, result);
 5849 
 5850     __ cmp(result, limit);
 5851     __ br(Assembler::NE, kyberNttMult_loop);
 5852 
 5853     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5854     __ mov(r0, zr); // return 0
 5855     __ ret(lr);
 5856 
 5857     return start;
 5858   }
 5859 
 5860   // Kyber add 2 polynomials.
 5861   // Implements
 5862   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5863   //
 5864   // result (short[256]) = c_rarg0
 5865   // a (short[256]) = c_rarg1
 5866   // b (short[256]) = c_rarg2
 5867   address generate_kyberAddPoly_2() {
 5868 
 5869     __ align(CodeEntryAlignment);
 5870     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id;
 5871     StubCodeMark mark(this, stub_id);
 5872     address start = __ pc();
 5873     __ enter();
 5874 
 5875     const Register result = c_rarg0;
 5876     const Register a = c_rarg1;
 5877     const Register b = c_rarg2;
 5878 
 5879     const Register kyberConsts = r11;
 5880 
 5881     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5882     // So, we can load, add and store the data in 3 groups of 11,
 5883     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5884     // registers. A further constraint is that the mapping needs
 5885     // to skip callee saves. So, we allocate the register
 5886     // sequences using two 8 sequences, two 2 sequences and two
 5887     // single registers.
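          // In scalar terms each pass of the loop below computes, for 80 or
          // 88 lanes at a time, result[i] = (short)(a[i] + b[i] + q), where q
          // is the constant loaded into v31 below.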
 5888     VSeq<8> vs1_1(0);
 5889     VSeq<2> vs1_2(16);
 5890     FloatRegister vs1_3 = v28;
 5891     VSeq<8> vs2_1(18);
 5892     VSeq<2> vs2_2(26);
 5893     FloatRegister vs2_3 = v29;
 5894 
 5895     // two constant vector sequences
 5896     VSeq<8> vc_1(31, 0);
 5897     VSeq<2> vc_2(31, 0);
 5898 
 5899     FloatRegister vc_3 = v31;
 5900     __ lea(kyberConsts,
 5901              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5902 
 5903     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5904     for (int i = 0; i < 3; i++) {
 5905       // load 80 or 88 values from a into vs1_1/2/3
 5906       vs_ldpq_post(vs1_1, a);
 5907       vs_ldpq_post(vs1_2, a);
 5908       if (i < 2) {
 5909         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5910       }
 5911       // load 80 or 88 values from b into vs2_1/2/3
 5912       vs_ldpq_post(vs2_1, b);
 5913       vs_ldpq_post(vs2_2, b);
 5914       if (i < 2) {
 5915         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5916       }
 5917       // sum 80 or 88 values across vs1 and vs2 into vs1
 5918       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5919       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5920       if (i < 2) {
 5921         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5922       }
 5923       // add constant to all 80 or 88 results
 5924       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5925       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5926       if (i < 2) {
 5927         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5928       }
 5929       // store 80 or 88 values
 5930       vs_stpq_post(vs1_1, result);
 5931       vs_stpq_post(vs1_2, result);
 5932       if (i < 2) {
 5933         __ str(vs1_3, __ Q, __ post(result, 16));
 5934       }
 5935     }
 5936 
 5937     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5938     __ mov(r0, zr); // return 0
 5939     __ ret(lr);
 5940 
 5941     return start;
 5942   }
 5943 
 5944   // Kyber add 3 polynomials.
 5945   // Implements
 5946   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5947   //
 5948   // result (short[256]) = c_rarg0
 5949   // a (short[256]) = c_rarg1
 5950   // b (short[256]) = c_rarg2
 5951   // c (short[256]) = c_rarg3
 5952   address generate_kyberAddPoly_3() {
 5953 
 5954     __ align(CodeEntryAlignment);
 5955     StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id;
 5956     StubCodeMark mark(this, stub_id);
 5957     address start = __ pc();
 5958     __ enter();
 5959 
 5960     const Register result = c_rarg0;
 5961     const Register a = c_rarg1;
 5962     const Register b = c_rarg2;
 5963     const Register c = c_rarg3;
 5964 
 5965     const Register kyberConsts = r11;
 5966 
 5967     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5968     // quadwords.  So, we can load, add and store the data in 3
 5969     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5970     // of 10 or 11 registers. A further constraint is that the
 5971     // mapping needs to skip callee saves. So, we allocate the
 5972     // register sequences using two 8 sequences, two 2 sequences
 5973     // and two single registers.
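          // In scalar terms each pass of the loop below computes, for 80 or
          // 88 lanes at a time, result[i] = (short)(a[i] + b[i] + c[i] + q),
          // where q is the constant loaded into v31 below.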
 5974     VSeq<8> vs1_1(0);
 5975     VSeq<2> vs1_2(16);
 5976     FloatRegister vs1_3 = v28;
 5977     VSeq<8> vs2_1(18);
 5978     VSeq<2> vs2_2(26);
 5979     FloatRegister vs2_3 = v29;
 5980 
 5981     // two constant vector sequences
 5982     VSeq<8> vc_1(31, 0);
 5983     VSeq<2> vc_2(31, 0);
 5984 
 5985     FloatRegister vc_3 = v31;
 5986 
 5987     __ lea(kyberConsts,
 5988              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5989 
 5990     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5991     for (int i = 0; i < 3; i++) {
 5992       // load 80 or 88 values from a into vs1_1/2/3
 5993       vs_ldpq_post(vs1_1, a);
 5994       vs_ldpq_post(vs1_2, a);
 5995       if (i < 2) {
 5996         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5997       }
 5998       // load 80 or 88 values from b into vs2_1/2/3
 5999       vs_ldpq_post(vs2_1, b);
 6000       vs_ldpq_post(vs2_2, b);
 6001       if (i < 2) {
 6002         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6003       }
 6004       // sum 80 or 88 values across vs1 and vs2 into vs1
 6005       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6006       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6007       if (i < 2) {
 6008         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6009       }
 6010       // load 80 or 88 values from c into vs2_1/2/3
 6011       vs_ldpq_post(vs2_1, c);
 6012       vs_ldpq_post(vs2_2, c);
 6013       if (i < 2) {
 6014         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6015       }
 6016       // sum 80 or 88 values across vs1 and vs2 into vs1
 6017       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6018       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6019       if (i < 2) {
 6020         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6021       }
 6022       // add constant to all 80 or 88 results
 6023       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6024       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6025       if (i < 2) {
 6026         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6027       }
 6028       // store 80 or 88 values
 6029       vs_stpq_post(vs1_1, result);
 6030       vs_stpq_post(vs1_2, result);
 6031       if (i < 2) {
 6032         __ str(vs1_3, __ Q, __ post(result, 16));
 6033       }
 6034     }
 6035 
 6036     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6037     __ mov(r0, zr); // return 0
 6038     __ ret(lr);
 6039 
 6040     return start;
 6041   }
 6042 
 6043   // Kyber parse XOF output to polynomial coefficient candidates
 6044   // or decodePoly(12, ...).
 6045   // Implements
 6046   // static int implKyber12To16(
 6047   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6048   //
 6049   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6050   //
 6051   // condensed (byte[]) = c_rarg0
 6052   // condensedIndex = c_rarg1
 6053   // parsed (short[112 or 256]) = c_rarg2
 6054   // parsedLength (112 or 256) = c_rarg3
 6055   address generate_kyber12To16() {
 6056     Label L_F00, L_loop, L_end;
 6057 
 6058     __ BIND(L_F00);
 6059     __ emit_int64(0x0f000f000f000f00);
 6060     __ emit_int64(0x0f000f000f000f00);
 6061 
 6062     __ align(CodeEntryAlignment);
 6063     StubGenStubId stub_id = StubGenStubId::kyber12To16_id;
 6064     StubCodeMark mark(this, stub_id);
 6065     address start = __ pc();
 6066     __ enter();
 6067 
 6068     const Register condensed = c_rarg0;
 6069     const Register condensedOffs = c_rarg1;
 6070     const Register parsed = c_rarg2;
 6071     const Register parsedLength = c_rarg3;
 6072 
 6073     const Register tmpAddr = r11;
 6074 
 6075     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6076     // quadwords so we need a 6 vector sequence for the inputs.
 6077     // Parsing produces 64 shorts, employing two 8 vector
 6078     // sequences to store and combine the intermediate data.
 6079     VSeq<6> vin(24);
 6080     VSeq<8> va(0), vb(16);
 6081 
 6082     __ adr(tmpAddr, L_F00);
 6083     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6084     __ add(condensed, condensed, condensedOffs);
 6085 
 6086     __ BIND(L_loop);
 6087     // load 96 (6 x 16B) byte values
 6088     vs_ld3_post(vin, __ T16B, condensed);
 6089 
 6090     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6091     // holds 48 (16x3) contiguous bytes from memory striped
 6092     // horizontally across each of the 16 byte lanes. Equivalently,
 6093     // that is 16 pairs of 12-bit integers. Likewise the back half
 6094     // holds the next 48 bytes in the same arrangement.
 6095 
 6096     // Each vector in the front half can also be viewed as a vertical
 6097     // strip across the 16 pairs of 12 bit integers. Each byte in
 6098     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6099     // byte in vin[1] stores the high 4 bits of the first int and the
 6100     // low 4 bits of the second int. Each byte in vin[2] stores the
 6101     // high 8 bits of the second int. Likewise the vectors in second
 6102     // half.
 6103 
 6104     // Converting the data to 16-bit shorts requires first of all
 6105     // expanding each of the 6 x 16B vectors into 6 corresponding
 6106     // pairs of 8H vectors. Mask, shift and add operations on the
 6107     // resulting vector pairs can be used to combine 4 and 8 bit
 6108     // parts of related 8H vector elements.
 6109     //
 6110     // The middle vectors (vin[1] and vin[4]) are actually expanded
 6111     // twice, one copy manipulated to provide the high 4 bits
 6112     // belonging to the first short in a pair and another copy
 6113     // manipulated to provide the low 4 bits belonging to the
 6114     // second short in a pair. This is why the vector sequences va
 6115     // and vb used to hold the expanded 8H elements are of length 8.
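          // For reference, a scalar sketch of how one 3-byte group
          // (c0, c1, c2) unpacks into two 12-bit values, which is what the
          // shift/mask/add steps below implement lane-wise:
          //
          //   t0 = c0 | ((c1 & 0xf) << 8);
          //   t1 = (c1 >> 4) | (c2 << 4);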
 6116 
 6117     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6118     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6119     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6120     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6121     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6122     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6123     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6124     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6125 
 6126     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6127     // and vb[4:5]
 6128     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6129     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6130     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6131     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6132     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6133     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6134 
 6135     // shift lo byte of copy 1 of the middle stripe into the high byte
 6136     __ shl(va[2], __ T8H, va[2], 8);
 6137     __ shl(va[3], __ T8H, va[3], 8);
 6138     __ shl(vb[2], __ T8H, vb[2], 8);
 6139     __ shl(vb[3], __ T8H, vb[3], 8);
 6140 
 6141     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6142     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6143     // are in bit positions [4..11].
 6144     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6145     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6146     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6147     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6148 
 6149     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6150     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6151     // copy2
 6152     __ andr(va[2], __ T16B, va[2], v31);
 6153     __ andr(va[3], __ T16B, va[3], v31);
 6154     __ ushr(va[4], __ T8H, va[4], 4);
 6155     __ ushr(va[5], __ T8H, va[5], 4);
 6156     __ andr(vb[2], __ T16B, vb[2], v31);
 6157     __ andr(vb[3], __ T16B, vb[3], v31);
 6158     __ ushr(vb[4], __ T8H, vb[4], 4);
 6159     __ ushr(vb[5], __ T8H, vb[5], 4);
 6160 
 6161     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6162     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6163     // n.b. the ordering ensures: i) inputs are consumed before they
 6164     // are overwritten ii) the order of 16-bit results across successive
 6165     // pairs of vectors in va and then vb reflects the order of the
 6166     // corresponding 12-bit inputs
 6167     __ addv(va[0], __ T8H, va[0], va[2]);
 6168     __ addv(va[2], __ T8H, va[1], va[3]);
 6169     __ addv(va[1], __ T8H, va[4], va[6]);
 6170     __ addv(va[3], __ T8H, va[5], va[7]);
 6171     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6172     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6173     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6174     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6175 
 6176     // store 64 results interleaved as shorts
 6177     vs_st2_post(vs_front(va), __ T8H, parsed);
 6178     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6179 
 6180     __ sub(parsedLength, parsedLength, 64);
 6181     __ cmp(parsedLength, (u1)64);
 6182     __ br(Assembler::GE, L_loop);
 6183     __ cbz(parsedLength, L_end);
 6184 
 6185     // If anything is left it should be a final 72 bytes of input,
 6186     // i.e. a final 48 12-bit values. So we handle this by loading
 6187     // 48 bytes into all 16B lanes of front(vin) and only 24
 6188     // bytes into the lower 8B lanes of back(vin).
 6189     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6190     vs_ld3(vs_back(vin), __ T8B, condensed);
 6191 
 6192     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6193     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6194     // 5 and target element 2 of vb duplicates element 4.
 6195     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6196     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6197     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6198     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6199     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6200     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6201 
 6202     // This time expand just the lower 8 lanes
 6203     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6204     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6205     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6206 
 6207     // shift lo byte of copy 1 of the middle stripe into the high byte
 6208     __ shl(va[2], __ T8H, va[2], 8);
 6209     __ shl(va[3], __ T8H, va[3], 8);
 6210     __ shl(vb[2], __ T8H, vb[2], 8);
 6211 
 6212     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6213     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6214     // int are in bit positions [4..11].
 6215     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6216     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6217     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6218 
 6219     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6220     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6221     // copy2
 6222     __ andr(va[2], __ T16B, va[2], v31);
 6223     __ andr(va[3], __ T16B, va[3], v31);
 6224     __ ushr(va[4], __ T8H, va[4], 4);
 6225     __ ushr(va[5], __ T8H, va[5], 4);
 6226     __ andr(vb[2], __ T16B, vb[2], v31);
 6227     __ ushr(vb[4], __ T8H, vb[4], 4);
 6228 
 6231     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6232     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6233 
 6234     // n.b. ordering ensures: i) inputs are consumed before they are
 6235     // overwritten ii) order of 16-bit results across successive
 6236     // pairs of vectors in va and then lower half of vb reflects order
 6237     // of corresponding 12-bit inputs
 6238     __ addv(va[0], __ T8H, va[0], va[2]);
 6239     __ addv(va[2], __ T8H, va[1], va[3]);
 6240     __ addv(va[1], __ T8H, va[4], va[6]);
 6241     __ addv(va[3], __ T8H, va[5], va[7]);
 6242     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6243     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6244 
 6245     // store 48 results interleaved as shorts
 6246     vs_st2_post(vs_front(va), __ T8H, parsed);
 6247     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6248 
 6249     __ BIND(L_end);
 6250 
 6251     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6252     __ mov(r0, zr); // return 0
 6253     __ ret(lr);
 6254 
 6255     return start;
 6256   }
 6257 
 6258   // Kyber Barrett reduce function.
 6259   // Implements
 6260   // static int implKyberBarrettReduce(short[] coeffs) {}
 6261   //
 6262   // coeffs (short[256]) = c_rarg0
 6263   address generate_kyberBarrettReduce() {
 6264 
 6265     __ align(CodeEntryAlignment);
 6266     StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
 6267     StubCodeMark mark(this, stub_id);
 6268     address start = __ pc();
 6269     __ enter();
 6270 
 6271     const Register coeffs = c_rarg0;
 6272 
 6273     const Register kyberConsts = r10;
 6274     const Register result = r11;
 6275 
 6276     // As above we process 256 sets of values in total i.e. 32 x
 6277     // 8H quadwords. So, we can load, reduce and store the data in 3
 6278     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6279     // of 10 or 11 registers. A further constraint is that the
 6280     // mapping needs to skip callee saves. So, we allocate the
 6281     // register sequences using two 8 sequences, two 2 sequences
 6282     // and two single registers.
 6283     VSeq<8> vs1_1(0);
 6284     VSeq<2> vs1_2(16);
 6285     FloatRegister vs1_3 = v28;
 6286     VSeq<8> vs2_1(18);
 6287     VSeq<2> vs2_2(26);
 6288     FloatRegister vs2_3 = v29;
 6289 
 6290     // we also need a pair of corresponding constant sequences
 6291 
 6292     VSeq<8> vc1_1(30, 0);
 6293     VSeq<2> vc1_2(30, 0);
 6294     FloatRegister vc1_3 = v30; // for kyber_q
 6295 
 6296     VSeq<8> vc2_1(31, 0);
 6297     VSeq<2> vc2_2(31, 0);
 6298     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6299 
 6300     __ add(result, coeffs, 0);
 6301     __ lea(kyberConsts,
 6302              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6303 
 6304     // load q and the multiplier for the Barrett reduction
 6305     __ add(kyberConsts, kyberConsts, 16);
 6306     __ ldpq(vc1_3, vc2_3, kyberConsts);
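          // In scalar terms each iteration of the loop below reduces 80 or 88
          // coefficients as follows (a sketch; the multiplier is assumed to be
          // the usual round(2^26 / kyber_q) value):
          //
          //   int16_t t = (int16_t)(((int32_t)a * kyberBarrettMultiplier) >> 26);
          //   a -= (int16_t)(t * kyber_q);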
 6307 
 6308     for (int i = 0; i < 3; i++) {
 6309       // load 80 or 88 coefficients
 6310       vs_ldpq_post(vs1_1, coeffs);
 6311       vs_ldpq_post(vs1_2, coeffs);
 6312       if (i < 2) {
 6313         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6314       }
 6315 
 6316       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6317       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6318       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6319       if (i < 2) {
 6320         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6321       }
 6322 
 6323       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6324       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6325       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6326       if (i < 2) {
 6327         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6328       }
 6329 
 6330       // vs1 <- vs1 - vs2 * kyber_q
 6331       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6332       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6333       if (i < 2) {
 6334         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6335       }
 6336 
 6337       vs_stpq_post(vs1_1, result);
 6338       vs_stpq_post(vs1_2, result);
 6339       if (i < 2) {
 6340         __ str(vs1_3, __ Q, __ post(result, 16));
 6341       }
 6342     }
 6343 
 6344     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6345     __ mov(r0, zr); // return 0
 6346     __ ret(lr);
 6347 
 6348     return start;
 6349   }
 6350 
 6351 
 6352   // Dilithium-specific montmul helper routines that generate parallel
 6353   // code for, respectively, a single 4x4s vector sequence montmul or
 6354   // two such multiplies in a row.
 6355 
 6356   // Perform 16 32-bit Montgomery multiplications in parallel
 6357   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6358                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6359     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6360     // It will assert that the register use is valid
 6361     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6362   }
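        // For reference, a scalar sketch of what the Montgomery multiply above
        // computes in each 4S lane (one common formulation; qinv denotes
        // q^-1 mod 2^32):
        //
        //   int64_t t = (int64_t)a * b;
        //   int32_t m = (int32_t)t * qinv;
        //   int32_t r = (int32_t)((t - (int64_t)m * q) >> 32);  // == a*b*2^-32 mod q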
 6363 
 6364   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6365   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6366                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6367     // Schedule two successive 4x4S multiplies via the montmul helper
 6368     // on the front and back halves of va, vb and vc. The helper will
 6369     // assert that the register use has no overlap conflicts on each
 6370     // individual call but we also need to ensure that the necessary
 6371     // disjoint/equality constraints are met across both calls.
 6372 
 6373     // vb, vc, vtmp and vq must be disjoint. va must either be
 6374     // disjoint from all other registers or equal vc
 6375 
 6376     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6377     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6378     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6379 
 6380     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6381     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6382 
 6383     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6384 
 6385     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6386     assert(vs_disjoint(va, vb), "va and vb overlap");
 6387     assert(vs_disjoint(va, vq), "va and vq overlap");
 6388     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6389 
 6390     // We multiply the front and back halves of each sequence 4 at a
 6391     // time because
 6392     //
 6393     // 1) we are currently only able to get 4-way instruction
 6394     // parallelism at best
 6395     //
 6396     // 2) we need registers for the constants in vq and temporary
 6397     // scratch registers to hold intermediate results so vtmp can only
 6398     // be a VSeq<4> which means we only have 4 scratch slots.
 6399 
 6400     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6401     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6402   }
 6403 
 6404   // Perform combined montmul then add/sub on 4x4S vectors.
 6405   void dilithium_montmul16_sub_add(
 6406           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6407           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6408     // compute a = montmul(a1, c)
 6409     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6410     // output a1 = a0 - a
 6411     vs_subv(va1, __ T4S, va0, vc);
 6412     //    and a0 = a0 + a
 6413     vs_addv(va0, __ T4S, va0, vc);
 6414   }
 6415 
 6416   // Perform combined add/sub then montmul on 4x4S vectors.
 6417   void dilithium_sub_add_montmul16(
 6418           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6419           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6420     // compute c = a0 - a1
 6421     vs_subv(vtmp1, __ T4S, va0, va1);
 6422     // output a0 = a0 + a1
 6423     vs_addv(va0, __ T4S, va0, va1);
 6424     // output a1 = b montmul c
 6425     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6426   }
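        // Taken together, these two helpers implement what are usually called
        // the Cooley-Tukey (forward) and Gentleman-Sande (inverse) NTT
        // butterflies on 4x4S lanes:
        //
        //   montmul16_sub_add: (a0, a1) -> (a0 + c*a1, a0 - c*a1)  (c*a1 via montmul)
        //   sub_add_montmul16: (a0, a1) -> (a0 + a1, (a0 - a1)*b)  (*b via montmul)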
 6427 
 6428   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6429   // in the Java implementation come in sequences of at least 8, so we
 6430   // can use ldpq to collect the corresponding data into pairs of vector
 6431   // registers.
 6432   // We collect the coefficients corresponding to the 'j+l' indexes into
 6433   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6434   // then we do the (Montgomery) multiplications by the zetas in parallel
 6435   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6436   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6437   // v0-v7 and finally save the results back to the coeffs array.
 6438   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6439     const Register coeffs, const Register zetas) {
 6440     int c1 = 0;
 6441     int c2 = 512;
 6442     int startIncr;
 6443     // don't use callee save registers v8 - v15
 6444     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6445     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6446     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6447     int offsets[4] = { 0, 32, 64, 96 };
 6448 
 6449     for (int level = 0; level < 5; level++) {
 6450       int c1Start = c1;
 6451       int c2Start = c2;
 6452       if (level == 3) {
 6453         offsets[1] = 32;
 6454         offsets[2] = 128;
 6455         offsets[3] = 160;
 6456       } else if (level == 4) {
 6457         offsets[1] = 64;
 6458         offsets[2] = 128;
 6459         offsets[3] = 192;
 6460       }
 6461 
 6462       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
 6463       // time at 4 different offsets and multiply them in order by the
 6464       // next set of input values. So we employ indexed load and store
 6465       // pair instructions with arrangement 4S.
 6466       for (int i = 0; i < 4; i++) {
 6467         // reload q and qinv
 6468         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6469         // load 8x4S coefficients via second start pos == c2
 6470         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6471         // load next 8x4S inputs == b
 6472         vs_ldpq_post(vs2, zetas);
 6473         // compute a == c2 * b mod MONT_Q
 6474         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6475         // load 8x4s coefficients via first start pos == c1
 6476         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6477         // compute a1 =  c1 + a
 6478         vs_addv(vs3, __ T4S, vs1, vs2);
 6479         // compute a2 =  c1 - a
 6480         vs_subv(vs1, __ T4S, vs1, vs2);
 6481         // output a1 and a2
 6482         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6483         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6484 
 6485         int k = 4 * level + i;
 6486 
 6487         if (k > 7) {
 6488           startIncr = 256;
 6489         } else if (k == 5) {
 6490           startIncr = 384;
 6491         } else {
 6492           startIncr = 128;
 6493         }
 6494 
 6495         c1Start += startIncr;
 6496         c2Start += startIncr;
 6497       }
 6498 
 6499       c2 /= 2;
 6500     }
 6501   }
 6502 
 6503   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6504   // Implements the method
 6505   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 6506   // of the Java class sun.security.provider.ML_DSA.
 6507   //
 6508   // coeffs (int[256]) = c_rarg0
 6509   // zetas (int[256]) = c_rarg1
 6510   address generate_dilithiumAlmostNtt() {
 6511 
 6512     __ align(CodeEntryAlignment);
 6513     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 6514     StubCodeMark mark(this, stub_id);
 6515     address start = __ pc();
 6516     __ enter();
 6517 
 6518     const Register coeffs = c_rarg0;
 6519     const Register zetas = c_rarg1;
 6520 
 6521     const Register tmpAddr = r9;
 6522     const Register dilithiumConsts = r10;
 6523     const Register result = r11;
 6524     // don't use callee save registers v8 - v15
 6525     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6526     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6527     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6528     int offsets[4] = { 0, 32, 64, 96};
 6529     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6530     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6531     __ add(result, coeffs, 0);
 6532     __ lea(dilithiumConsts,
 6533              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6534 
 6535     // Each level represents one iteration of the outer for loop of the Java version.
 6536 
 6537     // level 0-4
 6538     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6539 
 6540     // level 5
 6541 
 6542     // At level 5 the coefficients we need to combine with the zetas
 6543     // are grouped in memory in blocks of size 4. So, for both sets of
 6544     // coefficients we load 4 adjacent values at 8 different offsets
 6545     // using an indexed ldr with register variant Q and multiply them
 6546     // in sequence order by the next set of inputs. Likewise we store
 6547     // the results using an indexed str with register variant Q.
 6548     for (int i = 0; i < 1024; i += 256) {
 6549       // reload constants q, qinv each iteration as they get clobbered later
 6550       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6551       // load 32 (8x4S) coefficients via first offsets = c1
 6552       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6553       // load next 32 (8x4S) inputs = b
 6554       vs_ldpq_post(vs2, zetas);
 6555       // a = b montmul c1
 6556       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6557       // load 32 (8x4S) coefficients via second offsets = c2
 6558       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6559       // add/sub with result of multiply
 6560       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6561       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6562       // write back new coefficients using same offsets
 6563       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6564       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6565     }
 6566 
 6567     // level 6
 6568     // At level 6 the coefficients we need to combine with the zetas
 6569     // are grouped in memory in pairs, each pair of add/sub inputs
 6570     // being followed by a pair of montmul inputs. We can still implement
 6571     // the montmul+sub+add using 4-way parallelism but only if we
 6572     // combine the coefficients with the zetas 16 at a time. We load 8
 6573     // adjacent values at 4 different offsets using an ld2 load with
 6574     // arrangement 2D. That interleaves the lower and upper halves of
 6575     // each pair of quadwords into successive vector registers. We
 6576     // then need to montmul the 4 odd elements of the coefficients
 6577     // register sequence by the zetas in order and then add/sub the
 6578     // products with the 4 even elements of that sequence. We use an
 6579     // equivalent st2 operation to store the results back into memory
 6580     // de-interleaved.
 6581     for (int i = 0; i < 1024; i += 128) {
 6582       // reload constants q, qinv each iteration as they get clobbered later
 6583       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6584       // load interleaved 16 (4x2D) coefficients via offsets
 6585       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6586       // load next 16 (4x4S) inputs
 6587       vs_ldpq_post(vs_front(vs2), zetas);
 6588       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6589       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6590                                   vs_front(vs2), vtmp, vq);
 6591       // store interleaved 16 (4x2D) coefficients via offsets
 6592       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6593     }
 6594 
 6595     // level 7
 6596     // At level 7 the coefficients we need to combine with the zetas
 6597     // occur singly with montmul inputs alternating with add/sub
 6598     // inputs. Once again we can use 4-way parallelism to combine 16
 6599     // zetas at a time. However, we have to load 8 adjacent values at
 6600     // 4 different offsets using an ld2 load with arrangement 4S. That
 6601     // interleaves the even words of each pair into one
 6602     // coefficients vector register and the odd words of the pair
 6603     // into the next register. We then need to montmul the 4 odd
 6604     // elements of the coefficients register sequence by the zetas in
 6605     // order and then add/sub the products with the 4 even elements
 6606     // of that sequence. We use an equivalent st2 operation to store
 6607     // the results back into memory de-interleaved.
 6608 
 6609     for (int i = 0; i < 1024; i += 128) {
 6610       // reload constants q, qinv each iteration as they get clobbered later
 6611       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6612       // load interleaved 16 (4x4S) coefficients via offsets
 6613       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6614       // load next 16 (4x4S) inputs
 6615       vs_ldpq_post(vs_front(vs2), zetas);
 6616       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6617       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6618                                   vs_front(vs2), vtmp, vq);
 6619       // store interleaved 16 (4x4S) coefficients via offsets
 6620       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6621     }
 6622     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6623     __ mov(r0, zr); // return 0
 6624     __ ret(lr);
 6625 
 6626     return start;
 6627   }
 6628 
 6629   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6630   // in the Java implementation come in sequences of at least 8, so we
 6631   // can use ldpq to collect the corresponding data into pairs of vector
 6632   // registers
 6633   // We collect the coefficients that correspond to the 'j's into vs1
 6634   // the coefficients that correspond to the 'j+l's into vs2 then
 6635   // do the additions into vs3 and the subtractions into vs1 then
 6636   // save the result of the additions, load the zetas into vs2
 6637   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6638   // finally save the results back to the coeffs array
 6639   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6640     const Register coeffs, const Register zetas) {
 6641     int c1 = 0;
 6642     int c2 = 32;
 6643     int startIncr;
 6644     int offsets[4];
 6645     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6646     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6647     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6648 
 6649     offsets[0] = 0;
 6650 
 6651     for (int level = 3; level < 8; level++) {
 6652       int c1Start = c1;
 6653       int c2Start = c2;
 6654       if (level == 3) {
 6655         offsets[1] = 64;
 6656         offsets[2] = 128;
 6657         offsets[3] = 192;
 6658       } else if (level == 4) {
 6659         offsets[1] = 32;
 6660         offsets[2] = 128;
 6661         offsets[3] = 160;
 6662       } else {
 6663         offsets[1] = 32;
 6664         offsets[2] = 64;
 6665         offsets[3] = 96;
 6666       }
 6667 
 6668       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6669       // time at 4 different offsets and multiply them in order by the
 6670       // next set of input values. So we employ indexed load and store
 6671       // pair instructions with arrangement 4S.
 6672       for (int i = 0; i < 4; i++) {
 6673         // load v1 32 (8x4S) coefficients relative to first start index
 6674         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6675         // load v2 32 (8x4S) coefficients relative to second start index
 6676         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6677         // a0 = v1 + v2 -- n.b. clobbers vqs
 6678         vs_addv(vs3, __ T4S, vs1, vs2);
 6679         // a1 = v1 - v2
 6680         vs_subv(vs1, __ T4S, vs1, vs2);
 6681         // save a0 relative to first start index
 6682         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6683         // load constants q, qinv each iteration as they get clobbered above
 6684         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6685         // load b next 32 (8x4S) inputs
 6686         vs_ldpq_post(vs2, zetas);
 6687         // a = a1 montmul b
 6688         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6689         // save a relative to second start index
 6690         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6691 
 6692         int k = 4 * level + i;
 6693 
 6694         if (k < 24) {
 6695           startIncr = 256;
 6696         } else if (k == 25) {
 6697           startIncr = 384;
 6698         } else {
 6699           startIncr = 128;
 6700         }
 6701 
 6702         c1Start += startIncr;
 6703         c2Start += startIncr;
 6704       }
 6705 
 6706       c2 *= 2;
 6707     }
 6708   }
 6709 
 6710   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6711   // Implements the method
 6712   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6713   // the sun.security.provider.ML_DSA class.
 6714   //
 6715   // coeffs (int[256]) = c_rarg0
 6716   // zetas (int[256]) = c_rarg1
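        //
        // The eight 'levels' below correspond to the iterations of the outer
        // loop in the Java code; the distance between butterfly partners
        // roughly doubles from one level to the next (adjacent elements at
        // level 0, pairs at level 1, blocks of 4 at level 2, and so on).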
 6717   address generate_dilithiumAlmostInverseNtt() {
 6718 
 6719     __ align(CodeEntryAlignment);
 6720     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 6721     StubCodeMark mark(this, stub_id);
 6722     address start = __ pc();
 6723     __ enter();
 6724 
 6725     const Register coeffs = c_rarg0;
 6726     const Register zetas = c_rarg1;
 6727 
 6728     const Register tmpAddr = r9;
 6729     const Register dilithiumConsts = r10;
 6730     const Register result = r11;
 6731     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6732     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6733     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6734     int offsets[4] = { 0, 32, 64, 96 };
 6735     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6736     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6737 
 6738     __ add(result, coeffs, 0);
 6739     __ lea(dilithiumConsts,
 6740              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6741 
 6742     // Each level represents one iteration of the outer for loop of the Java version
 6743 
 6744     // level 0
 6745     // At level 0 we need to interleave adjacent quartets of
 6746     // coefficients before we multiply and add/sub by the next 16
 6747     // zetas just as we did for level 7 in the multiply code. So we
 6748     // load and store the values using an ld2/st2 with arrangement 4S.
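          // n.b. an ld2 with arrangement 4S pulls 8 consecutive 32-bit values
          // into two registers, de-interleaving them: even-indexed elements go
          // to the first register, odd-indexed elements to the second. The st2
          // below performs the inverse interleaving when storing back.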
 6749     for (int i = 0; i < 1024; i += 128) {
 6750       // load constants q, qinv
 6751       // n.b. this can be moved out of the loop as they do not get
 6752       // clobbered by first two loops
 6753       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6754       // a0/a1 load interleaved 32 (8x4S) coefficients
 6755       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6756       // b load next 32 (8x4S) inputs
 6757       vs_ldpq_post(vs_front(vs2), zetas);
 6758       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6759       // n.b. second half of vs2 provides temporary register storage
 6760       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6761                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6762       // a0/a1 store interleaved 32 (8x4S) coefficients
 6763       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6764     }
 6765 
 6766     // level 1
 6767     // At level 1 we need to interleave pairs of adjacent pairs of
 6768     // coefficients before we multiply by the next 16 zetas just as we
 6769     // did for level 6 in the multiply code. So we load and store the
 6770     // values using an ld2/st2 with arrangement 2D.
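          // n.b. with arrangement 2D the ld2/st2 de-interleave at 64-bit
          // granularity, so alternate pairs of 32-bit coefficients land in the
          // first and second register of each pair, which is exactly the
          // grouping this level needs.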
 6771     for (int i = 0; i < 1024; i += 128) {
 6772       // a0/a1 load interleaved 32 (8x2D) coefficients
 6773       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6774       // b load next 16 (4x4S) inputs
 6775       vs_ldpq_post(vs_front(vs2), zetas);
 6776       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6777       // n.b. second half of vs2 provides temporary register storage
 6778       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6779                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6780       // a0/a1 store interleaved 32 (8x2D) coefficients
 6781       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6782     }
 6783 
 6784     // level 2
 6785     // At level 2 coefficients come in blocks of 4. So, we load 4
 6786     // adjacent coefficients at 8 distinct offsets for both the first
 6787     // and second coefficient sequences, using an ldr with register
 6788     // variant Q then combine them with next set of 32 zetas. Likewise
 6789     // we store the results using an str with register variant Q.
 6790     for (int i = 0; i < 1024; i += 256) {
 6791       // c0 load 32 (8x4S) coefficients via first offsets
 6792       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6793       // c1 load 32 (8x4S) coefficients via second offsets
 6794       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6795       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6796       vs_addv(vs3, __ T4S, vs1, vs2);
 6797       // c = c0 - c1
 6798       vs_subv(vs1, __ T4S, vs1, vs2);
 6799       // store a0 32 (8x4S) coefficients via first offsets
 6800       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6801       // b load 32 (8x4S) next inputs
 6802       vs_ldpq_post(vs2, zetas);
 6803       // reload constants q, qinv -- they were clobbered earlier
 6804       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6805       // compute a1 = b montmul c
 6806       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6807       // store a1 32 (8x4S) coefficients via second offsets
 6808       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6809     }
 6810 
 6811     // level 3-7
 6812     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6813 
 6814     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6815     __ mov(r0, zr); // return 0
 6816     __ ret(lr);
 6817 
 6818     return start;
 6819   }
 6820 
 6821   // Dilithium multiply polynomials in the NTT domain.
 6822   // Straightforward implementation of the method
 6823   // static int implDilithiumNttMult(
 6824   //              int[] result, int[] ntta, int[] nttb) {} of
 6825   // the sun.security.provider.ML_DSA class.
 6826   //
 6827   // result (int[256]) = c_rarg0
 6828   // poly1 (int[256]) = c_rarg1
 6829   // poly2 (int[256]) = c_rarg2
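        //
        // A note on the two montmul calls in the loop below (a sketch, assuming
        // the usual convention montmul(a, b) == a * b * R^-1 mod q with R == 2^32):
        //   montmul(montmul(a, b), R^2) == a * b * R^-1 * R^2 * R^-1 == a * b mod q
        // i.e. the extra multiplication by the precomputed rSquare constant cancels
        // the R^-1 factor introduced by the first montmul.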
 6830   address generate_dilithiumNttMult() {
 6831 
 6832     __ align(CodeEntryAlignment);
 6833     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 6834     StubCodeMark mark(this, stub_id);
 6835     address start = __ pc();
 6836     __ enter();
 6837 
 6838     Label L_loop;
 6839 
 6840     const Register result = c_rarg0;
 6841     const Register poly1 = c_rarg1;
 6842     const Register poly2 = c_rarg2;
 6843 
 6844     const Register dilithiumConsts = r10;
 6845     const Register len = r11;
 6846 
 6847     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6848     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6849     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6850     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6851 
 6852     __ lea(dilithiumConsts,
 6853              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6854 
 6855     // load constants q, qinv
 6856     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6857     // load constant rSquare into v29
 6858     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6859 
 6860     __ mov(len, zr);
 6861     __ add(len, len, 1024);
 6862 
 6863     __ BIND(L_loop);
 6864 
 6865     // b load 32 (8x4S) next inputs from poly1
 6866     vs_ldpq_post(vs1, poly1);
 6867     // c load 32 (8x4S) next inputs from poly2
 6868     vs_ldpq_post(vs2, poly2);
 6869     // compute a = b montmul c
 6870     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6871     // compute a = rsquare montmul a
 6872     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6873     // save a 32 (8x4S) results
 6874     vs_stpq_post(vs2, result);
 6875 
 6876     __ sub(len, len, 128);
 6877     __ cmp(len, (u1)128);
 6878     __ br(Assembler::GE, L_loop);
 6879 
 6880     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6881     __ mov(r0, zr); // return 0
 6882     __ ret(lr);
 6883 
 6884     return start;
 6885   }
 6886 
 6887   // Dilithium Montgomery multiply an array by a constant.
 6888   // A straightforward implementation of the method
 6889   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6890   // of the sun.security.provider.ML_DSA class
 6891   //
 6892   // coeffs (int[256]) = c_rarg0
 6893   // constant (int) = c_rarg1
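        //
        // Each output is montmul(constant, coeffs[i]); under the montmul convention
        // sketched above this leaves an R^-1 factor in the result, which the Java
        // caller is presumably expected to account for in its choice of constant.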
 6894   address generate_dilithiumMontMulByConstant() {
 6895 
 6896     __ align(CodeEntryAlignment);
 6897     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 6898     StubCodeMark mark(this, stub_id);
 6899     address start = __ pc();
 6900     __ enter();
 6901 
 6902     Label L_loop;
 6903 
 6904     const Register coeffs = c_rarg0;
 6905     const Register constant = c_rarg1;
 6906 
 6907     const Register dilithiumConsts = r10;
 6908     const Register result = r11;
 6909     const Register len = r12;
 6910 
 6911     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6912     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6913     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6914     VSeq<8> vconst(29, 0);             // for montmul by constant
 6915 
 6916     // results track inputs
 6917     __ add(result, coeffs, 0);
 6918     __ lea(dilithiumConsts,
 6919              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6920 
 6921     // load constants q, qinv -- they do not get clobbered by first two loops
 6922     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6923     // copy caller supplied constant across vconst
 6924     __ dup(vconst[0], __ T4S, constant);
 6925     __ mov(len, zr);
 6926     __ add(len, len, 1024);
 6927 
 6928     __ BIND(L_loop);
 6929 
 6930     // load next 32 inputs
 6931     vs_ldpq_post(vs2, coeffs);
 6932     // mont mul by constant
 6933     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6934     // write next 32 results
 6935     vs_stpq_post(vs2, result);
 6936 
 6937     __ sub(len, len, 128);
 6938     __ cmp(len, (u1)128);
 6939     __ br(Assembler::GE, L_loop);
 6940 
 6941     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6942     __ mov(r0, zr); // return 0
 6943     __ ret(lr);
 6944 
 6945     return start;
 6946   }
 6947 
 6948   // Dilithium decompose poly.
 6949   // Implements the method
 6950   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
        //                                       int[] highPart, int twoGamma2, int multiplier) {}
 6951   // of the sun.security.provider.ML_DSA class
 6952   //
 6953   // input (int[256]) = c_rarg0
 6954   // lowPart (int[256]) = c_rarg1
 6955   // highPart (int[256]) = c_rarg2
 6956   // twoGamma2  (int) = c_rarg3
 6957   // multiplier (int) = c_rarg4
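        //
        // Roughly, per coefficient (mirroring the inline comments below, which
        // follow the FIPS 204 Decompose routine): reduce r mod q, split it as
        // r == r1 * twoGamma2 + r0 with |r0| <= gamma2, and in the special case
        // r - r0 == q - 1 force r1 to 0 and decrement r0.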
 6958   address generate_dilithiumDecomposePoly() {
 6959 
 6960     __ align(CodeEntryAlignment);
 6961     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 6962     StubCodeMark mark(this, stub_id);
 6963     address start = __ pc();
 6964     Label L_loop;
 6965 
 6966     const Register input = c_rarg0;
 6967     const Register lowPart = c_rarg1;
 6968     const Register highPart = c_rarg2;
 6969     const Register twoGamma2 = c_rarg3;
 6970     const Register multiplier = c_rarg4;
 6971 
 6972     const Register len = r9;
 6973     const Register dilithiumConsts = r10;
 6974     const Register tmp = r11;
 6975 
 6976     // 6 independent sets of 4x4s values
 6977     VSeq<4> vs1(0), vs2(4), vs3(8);
 6978     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6979 
 6980     // 7 constants for cross-multiplying
 6981     VSeq<4> one(25, 0);
 6982     VSeq<4> qminus1(26, 0);
 6983     VSeq<4> g2(27, 0);
 6984     VSeq<4> twog2(28, 0);
 6985     VSeq<4> mult(29, 0);
 6986     VSeq<4> q(30, 0);
 6987     VSeq<4> qadd(31, 0);
 6988 
 6989     __ enter();
 6990 
 6991     __ lea(dilithiumConsts,
 6992              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6993 
 6994     // save callee-saved registers
 6995     __ stpd(v8, v9, __ pre(sp, -64));
 6996     __ stpd(v10, v11, Address(sp, 16));
 6997     __ stpd(v12, v13, Address(sp, 32));
 6998     __ stpd(v14, v15, Address(sp, 48));
 6999 
 7000     // populate constant registers
 7001     __ mov(tmp, zr);
 7002     __ add(tmp, tmp, 1);
 7003     __ dup(one[0], __ T4S, tmp); // 1
 7004     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7005     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7006     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7007     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7008     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7009     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7010 
 7011     __ mov(len, zr);
 7012     __ add(len, len, 1024);
 7013 
 7014     __ BIND(L_loop);
 7015 
 7016     // load next 4x4S inputs interleaved: rplus --> vs1
 7017     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7018 
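          // n.b. q == 8380417 == 2^23 - 2^13 + 1, so (rplus + qadd) >> 23
          // approximates rplus / q (qadd is the rounding addend loaded above);
          // subtracting that multiple of q is a partial reduction which the
          // sign-based correction in the next block completes.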
 7019     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7020     vs_addv(vtmp, __ T4S, vs1, qadd);
 7021     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7022     vs_mulv(vtmp, __ T4S, vtmp, q);
 7023     vs_subv(vs1, __ T4S, vs1, vtmp);
 7024 
 7025     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7026     vs_sshr(vtmp, __ T4S, vs1, 31);
 7027     vs_andr(vtmp, vtmp, q);
 7028     vs_addv(vs1, __ T4S, vs1, vtmp);
 7029 
 7030     // quotient --> vs2
 7031     // int quotient = (rplus * multiplier) >> 22;
 7032     vs_mulv(vtmp, __ T4S, vs1, mult);
 7033     vs_sshr(vs2, __ T4S, vtmp, 22);
 7034 
 7035     // r0 --> vs3
 7036     // int r0 = rplus - quotient * twoGamma2;
 7037     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7038     vs_subv(vs3, __ T4S, vs1, vtmp);
 7039 
 7040     // mask --> vs4
 7041     // int mask = (twoGamma2 - r0) >> 22;
 7042     vs_subv(vtmp, __ T4S, twog2, vs3);
 7043     vs_sshr(vs4, __ T4S, vtmp, 22);
 7044 
 7045     // r0 -= (mask & twoGamma2);
 7046     vs_andr(vtmp, vs4, twog2);
 7047     vs_subv(vs3, __ T4S, vs3, vtmp);
 7048 
 7049     //  quotient += (mask & 1);
 7050     vs_andr(vtmp, vs4, one);
 7051     vs_addv(vs2, __ T4S, vs2, vtmp);
 7052 
 7053     // mask = (twoGamma2 / 2 - r0) >> 31;
 7054     vs_subv(vtmp, __ T4S, g2, vs3);
 7055     vs_sshr(vs4, __ T4S, vtmp, 31);
 7056 
 7057     // r0 -= (mask & twoGamma2);
 7058     vs_andr(vtmp, vs4, twog2);
 7059     vs_subv(vs3, __ T4S, vs3, vtmp);
 7060 
 7061     // quotient += (mask & 1);
 7062     vs_andr(vtmp, vs4, one);
 7063     vs_addv(vs2, __ T4S, vs2, vtmp);
 7064 
 7065     // r1 --> vs5
 7066     // int r1 = rplus - r0 - (dilithium_q - 1);
 7067     vs_subv(vtmp, __ T4S, vs1, vs3);
 7068     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7069 
 7070     // r1 --> vs1 (overwriting rplus)
 7071     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7072     vs_negr(vtmp, __ T4S, vs5);
 7073     vs_orr(vtmp, vs5, vtmp);
 7074     vs_sshr(vs1, __ T4S, vtmp, 31);
 7075 
 7076     // r0 += ~r1;
 7077     vs_notr(vtmp, vs1);
 7078     vs_addv(vs3, __ T4S, vs3, vtmp);
 7079 
 7080     // r1 = r1 & quotient;
 7081     vs_andr(vs1, vs2, vs1);
 7082 
 7083     // store results interleaved
 7084     // lowPart[m] = r0;
 7085     // highPart[m] = r1;
 7086     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7087     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7088 
 7089     __ sub(len, len, 64);
 7090     __ cmp(len, (u1)64);
 7091     __ br(Assembler::GE, L_loop);
 7092 
 7093     // restore callee-saved vector registers
 7094     __ ldpd(v14, v15, Address(sp, 48));
 7095     __ ldpd(v12, v13, Address(sp, 32));
 7096     __ ldpd(v10, v11, Address(sp, 16));
 7097     __ ldpd(v8, v9, __ post(sp, 64));
 7098 
 7099     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7100     __ mov(r0, zr); // return 0
 7101     __ ret(lr);
 7102 
 7103     return start;
 7104   }
 7105 
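        // bcax5 applies the Keccak 'chi' step to one 5-lane row:
        //   a[x] ^= ~a[x + 1] & a[x + 2]   (indices mod 5)
        // implemented with bic/eor, ordered so that each lane is only
        // overwritten once its original value is no longer needed.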
 7106   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7107              Register tmp0, Register tmp1, Register tmp2) {
 7108     __ bic(tmp0, a2, a1); // for a0
 7109     __ bic(tmp1, a3, a2); // for a1
 7110     __ bic(tmp2, a4, a3); // for a2
 7111     __ eor(a2, a2, tmp2);
 7112     __ bic(tmp2, a0, a4); // for a3
 7113     __ eor(a3, a3, tmp2);
 7114     __ bic(tmp2, a1, a0); // for a4
 7115     __ eor(a0, a0, tmp0);
 7116     __ eor(a1, a1, tmp1);
 7117     __ eor(a4, a4, tmp2);
 7118   }
 7119 
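        // One round of Keccak-f[1600] with the 25 64-bit lanes held in general
        // purpose registers: the eor3/rax1 sequence computes and applies the
        // theta column parities, the chain of rol instructions performs the
        // combined rho/pi permutation, bcax5 applies chi row by row, and the
        // final ldr/eor applies iota using the next round constant from rc.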
 7120   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7121                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7122                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7123                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7124                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7125                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7126                         Register tmp0, Register tmp1, Register tmp2) {
 7127     __ eor3(tmp1, a4, a9, a14);
 7128     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7129     __ eor3(tmp2, a1, a6, a11);
 7130     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7131     __ rax1(tmp2, tmp0, tmp1); // d0
 7132     {
 7133 
 7134       Register tmp3, tmp4;
 7135       if (can_use_fp && can_use_r18) {
 7136         tmp3 = rfp;
 7137         tmp4 = r18_tls;
 7138       } else {
 7139         tmp3 = a4;
 7140         tmp4 = a9;
 7141         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7142       }
 7143 
 7144       __ eor3(tmp3, a0, a5, a10);
 7145       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7146       __ eor(a0, a0, tmp2);
 7147       __ eor(a5, a5, tmp2);
 7148       __ eor(a10, a10, tmp2);
 7149       __ eor(a15, a15, tmp2);
 7150       __ eor(a20, a20, tmp2); // d0(tmp2)
 7151       __ eor3(tmp3, a2, a7, a12);
 7152       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7153       __ rax1(tmp3, tmp4, tmp2); // d1
 7154       __ eor(a1, a1, tmp3);
 7155       __ eor(a6, a6, tmp3);
 7156       __ eor(a11, a11, tmp3);
 7157       __ eor(a16, a16, tmp3);
 7158       __ eor(a21, a21, tmp3); // d1(tmp3)
 7159       __ rax1(tmp3, tmp2, tmp0); // d3
 7160       __ eor3(tmp2, a3, a8, a13);
 7161       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7162       __ eor(a3, a3, tmp3);
 7163       __ eor(a8, a8, tmp3);
 7164       __ eor(a13, a13, tmp3);
 7165       __ eor(a18, a18, tmp3);
 7166       __ eor(a23, a23, tmp3);
 7167       __ rax1(tmp2, tmp1, tmp0); // d2
 7168       __ eor(a2, a2, tmp2);
 7169       __ eor(a7, a7, tmp2);
 7170       __ eor(a12, a12, tmp2);
 7171       __ rax1(tmp0, tmp0, tmp4); // d4
 7172       if (!can_use_fp || !can_use_r18) {
 7173         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7174       }
 7175       __ eor(a17, a17, tmp2);
 7176       __ eor(a22, a22, tmp2);
 7177       __ eor(a4, a4, tmp0);
 7178       __ eor(a9, a9, tmp0);
 7179       __ eor(a14, a14, tmp0);
 7180       __ eor(a19, a19, tmp0);
 7181       __ eor(a24, a24, tmp0);
 7182     }
 7183 
 7184     __ rol(tmp0, a10, 3);
 7185     __ rol(a10, a1, 1);
 7186     __ rol(a1, a6, 44);
 7187     __ rol(a6, a9, 20);
 7188     __ rol(a9, a22, 61);
 7189     __ rol(a22, a14, 39);
 7190     __ rol(a14, a20, 18);
 7191     __ rol(a20, a2, 62);
 7192     __ rol(a2, a12, 43);
 7193     __ rol(a12, a13, 25);
 7194     __ rol(a13, a19, 8) ;
 7195     __ rol(a19, a23, 56);
 7196     __ rol(a23, a15, 41);
 7197     __ rol(a15, a4, 27);
 7198     __ rol(a4, a24, 14);
 7199     __ rol(a24, a21, 2);
 7200     __ rol(a21, a8, 55);
 7201     __ rol(a8, a16, 45);
 7202     __ rol(a16, a5, 36);
 7203     __ rol(a5, a3, 28);
 7204     __ rol(a3, a18, 21);
 7205     __ rol(a18, a17, 15);
 7206     __ rol(a17, a11, 10);
 7207     __ rol(a11, a7, 6);
 7208     __ mov(a7, tmp0);
 7209 
 7210     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7211     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7212     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7213     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7214     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7215 
 7216     __ ldr(tmp1, __ post(rc, 8));
 7217     __ eor(a0, a0, tmp1);
 7218 
 7219   }
 7220 
 7221   // Arguments:
 7222   //
 7223   // Inputs:
 7224   //   c_rarg0   - byte[]  source+offset
 7225   //   c_rarg1   - byte[]  SHA.state
 7226   //   c_rarg2   - int     block_size
 7227   //   c_rarg3   - int     offset
 7228   //   c_rarg4   - int     limit
 7229   //
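        // block_size is the sponge rate in bytes and identifies the variant:
        //   72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256 / SHAKE256,
        //   144 -> SHA3-224, 168 -> SHAKE128
        //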
 7230   address generate_sha3_implCompress_gpr(StubGenStubId stub_id) {
 7231     bool multi_block;
 7232     switch (stub_id) {
 7233     case sha3_implCompress_id:
 7234       multi_block = false;
 7235       break;
 7236     case sha3_implCompressMB_id:
 7237       multi_block = true;
 7238       break;
 7239     default:
 7240       ShouldNotReachHere();
 7241     }
 7242 
 7243     static const uint64_t round_consts[24] = {
 7244       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7245       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7246       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7247       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7248       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7249       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7250       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7251       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7252     };
 7253 
 7254     __ align(CodeEntryAlignment);
 7255     StubCodeMark mark(this, stub_id);
 7256     address start = __ pc();
 7257 
 7258     Register buf           = c_rarg0;
 7259     Register state         = c_rarg1;
 7260     Register block_size    = c_rarg2;
 7261     Register ofs           = c_rarg3;
 7262     Register limit         = c_rarg4;
 7263 
 7264     // use r3..r17, r19..r28 to keep a0..a24.
 7265     // a0..a24 are respective locals from SHA3.java
 7266     Register a0 = r25,
 7267              a1 = r26,
 7268              a2 = r27,
 7269              a3 = r3,
 7270              a4 = r4,
 7271              a5 = r5,
 7272              a6 = r6,
 7273              a7 = r7,
 7274              a8 = rscratch1, // r8
 7275              a9 = rscratch2, // r9
 7276              a10 = r10,
 7277              a11 = r11,
 7278              a12 = r12,
 7279              a13 = r13,
 7280              a14 = r14,
 7281              a15 = r15,
 7282              a16 = r16,
 7283              a17 = r17,
 7284              a18 = r28,
 7285              a19 = r19,
 7286              a20 = r20,
 7287              a21 = r21,
 7288              a22 = r22,
 7289              a23 = r23,
 7290              a24 = r24;
 7291 
 7292     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7293 
 7294     Label sha3_loop, rounds24_preloop, loop_body;
 7295     Label sha3_512_or_sha3_384, shake128;
 7296 
 7297     bool can_use_r18 = false;
 7298 #ifndef R18_RESERVED
 7299     can_use_r18 = true;
 7300 #endif
 7301     bool can_use_fp = !PreserveFramePointer;
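          // n.b. rfp and r18 can only be pressed into service as extra scratch
          // registers when the frame pointer is not being preserved and r18 is
          // not reserved as a platform register; otherwise keccak_round_gpr
          // spills two state lanes around its theta step instead.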
 7302 
 7303     __ enter();
 7304 
 7305     // save almost all of the as-yet-unsaved GPRs on the stack
 7306     __ str(block_size, __ pre(sp, -128));
 7307     if (multi_block) {
 7308       __ stpw(ofs, limit, Address(sp, 8));
 7309     }
 7310     // 8 bytes at sp+16 will be used to keep buf
 7311     __ stp(r19, r20, Address(sp, 32));
 7312     __ stp(r21, r22, Address(sp, 48));
 7313     __ stp(r23, r24, Address(sp, 64));
 7314     __ stp(r25, r26, Address(sp, 80));
 7315     __ stp(r27, r28, Address(sp, 96));
 7316     if (can_use_r18 && can_use_fp) {
 7317       __ stp(r18_tls, state, Address(sp, 112));
 7318     } else {
 7319       __ str(state, Address(sp, 112));
 7320     }
 7321 
 7322     // begin sha3 calculations: loading a0..a24 from the state array
 7323     __ ldp(a0, a1, state);
 7324     __ ldp(a2, a3, Address(state, 16));
 7325     __ ldp(a4, a5, Address(state, 32));
 7326     __ ldp(a6, a7, Address(state, 48));
 7327     __ ldp(a8, a9, Address(state, 64));
 7328     __ ldp(a10, a11, Address(state, 80));
 7329     __ ldp(a12, a13, Address(state, 96));
 7330     __ ldp(a14, a15, Address(state, 112));
 7331     __ ldp(a16, a17, Address(state, 128));
 7332     __ ldp(a18, a19, Address(state, 144));
 7333     __ ldp(a20, a21, Address(state, 160));
 7334     __ ldp(a22, a23, Address(state, 176));
 7335     __ ldr(a24, Address(state, 192));
 7336 
 7337     __ BIND(sha3_loop);
 7338 
 7339     // load input
 7340     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7341     __ eor(a0, a0, tmp3);
 7342     __ eor(a1, a1, tmp2);
 7343     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7344     __ eor(a2, a2, tmp3);
 7345     __ eor(a3, a3, tmp2);
 7346     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7347     __ eor(a4, a4, tmp3);
 7348     __ eor(a5, a5, tmp2);
 7349     __ ldr(tmp3, __ post(buf, 8));
 7350     __ eor(a6, a6, tmp3);
 7351 
 7352     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7353     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7354 
 7355     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7356     __ eor(a7, a7, tmp3);
 7357     __ eor(a8, a8, tmp2);
 7358     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7359     __ eor(a9, a9, tmp3);
 7360     __ eor(a10, a10, tmp2);
 7361     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7362     __ eor(a11, a11, tmp3);
 7363     __ eor(a12, a12, tmp2);
 7364     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7365     __ eor(a13, a13, tmp3);
 7366     __ eor(a14, a14, tmp2);
 7367     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7368     __ eor(a15, a15, tmp3);
 7369     __ eor(a16, a16, tmp2);
 7370 
 7371     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7372     __ andw(tmp2, block_size, 48);
 7373     __ cbzw(tmp2, rounds24_preloop);
 7374     __ tbnz(block_size, 5, shake128);
 7375     // block_size == 144, bit5 == 0, SHA3-224
 7376     __ ldr(tmp3, __ post(buf, 8));
 7377     __ eor(a17, a17, tmp3);
 7378     __ b(rounds24_preloop);
 7379 
 7380     __ BIND(shake128);
 7381     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7382     __ eor(a17, a17, tmp3);
 7383     __ eor(a18, a18, tmp2);
 7384     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7385     __ eor(a19, a19, tmp3);
 7386     __ eor(a20, a20, tmp2);
 7387     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7388 
 7389     __ BIND(sha3_512_or_sha3_384);
 7390     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7391     __ eor(a7, a7, tmp3);
 7392     __ eor(a8, a8, tmp2);
 7393     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7394 
 7395     // SHA3-384
 7396     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7397     __ eor(a9, a9, tmp3);
 7398     __ eor(a10, a10, tmp2);
 7399     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7400     __ eor(a11, a11, tmp3);
 7401     __ eor(a12, a12, tmp2);
 7402 
 7403     __ BIND(rounds24_preloop);
 7404     __ fmovs(v0, 24.0); // float loop counter,
 7405     __ fmovs(v1, 1.0);  // exact representation
 7406 
 7407     __ str(buf, Address(sp, 16));
 7408     __ lea(tmp3, ExternalAddress((address) round_consts));
 7409 
 7410     __ BIND(loop_body);
 7411     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7412                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7413                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7414                      tmp0, tmp1, tmp2);
 7415     __ fsubs(v0, v0, v1);
 7416     __ fcmps(v0, 0.0);
 7417     __ br(__ NE, loop_body);
 7418 
 7419     if (multi_block) {
 7420       __ ldrw(block_size, sp); // block_size
 7421       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7422       __ addw(tmp2, tmp2, block_size);
 7423       __ cmpw(tmp2, tmp1);
 7424       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7425       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7426       __ br(Assembler::LE, sha3_loop);
 7427       __ movw(c_rarg0, tmp2); // return offset
 7428     }
 7429     if (can_use_fp && can_use_r18) {
 7430       __ ldp(r18_tls, state, Address(sp, 112));
 7431     } else {
 7432       __ ldr(state, Address(sp, 112));
 7433     }
 7434     // save calculated sha3 state
 7435     __ stp(a0, a1, Address(state));
 7436     __ stp(a2, a3, Address(state, 16));
 7437     __ stp(a4, a5, Address(state, 32));
 7438     __ stp(a6, a7, Address(state, 48));
 7439     __ stp(a8, a9, Address(state, 64));
 7440     __ stp(a10, a11, Address(state, 80));
 7441     __ stp(a12, a13, Address(state, 96));
 7442     __ stp(a14, a15, Address(state, 112));
 7443     __ stp(a16, a17, Address(state, 128));
 7444     __ stp(a18, a19, Address(state, 144));
 7445     __ stp(a20, a21, Address(state, 160));
 7446     __ stp(a22, a23, Address(state, 176));
 7447     __ str(a24, Address(state, 192));
 7448 
 7449     // restore required registers from stack
 7450     __ ldp(r19, r20, Address(sp, 32));
 7451     __ ldp(r21, r22, Address(sp, 48));
 7452     __ ldp(r23, r24, Address(sp, 64));
 7453     __ ldp(r25, r26, Address(sp, 80));
 7454     __ ldp(r27, r28, Address(sp, 96));
 7455     if (can_use_fp && can_use_r18) {
 7456       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7457     } // else no need to recalculate rfp, since it wasn't changed
 7458 
 7459     __ leave();
 7460 
 7461     __ ret(lr);
 7462 
 7463     return start;
 7464   }
 7465 
 7466   /**
 7467    *  Arguments:
 7468    *
 7469    * Inputs:
 7470    *   c_rarg0   - int crc
 7471    *   c_rarg1   - byte* buf
 7472    *   c_rarg2   - int length
 7473    *
 7474    * Output:
 7475    *       r0   - int crc result
 7476    */
 7477   address generate_updateBytesCRC32() {
 7478     assert(UseCRC32Intrinsics, "what are we doing here?");
 7479 
 7480     __ align(CodeEntryAlignment);
 7481     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 7482     StubCodeMark mark(this, stub_id);
 7483 
 7484     address start = __ pc();
 7485 
 7486     const Register crc   = c_rarg0;  // crc
 7487     const Register buf   = c_rarg1;  // source java byte array address
 7488     const Register len   = c_rarg2;  // length
 7489     const Register table0 = c_rarg3; // crc_table address
 7490     const Register table1 = c_rarg4;
 7491     const Register table2 = c_rarg5;
 7492     const Register table3 = c_rarg6;
 7493     const Register tmp3 = c_rarg7;
 7494 
 7495     BLOCK_COMMENT("Entry:");
 7496     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7497 
 7498     __ kernel_crc32(crc, buf, len,
 7499               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7500 
 7501     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7502     __ ret(lr);
 7503 
 7504     return start;
 7505   }
 7506 
 7507   /**
 7508    *  Arguments:
 7509    *
 7510    * Inputs:
 7511    *   c_rarg0   - int crc
 7512    *   c_rarg1   - byte* buf
 7513    *   c_rarg2   - int length
 7514    *   c_rarg3   - int* table
 7515    *
 7516    * Output:
 7517    *       r0   - int crc result
 7518    */
 7519   address generate_updateBytesCRC32C() {
 7520     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7521 
 7522     __ align(CodeEntryAlignment);
 7523     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 7524     StubCodeMark mark(this, stub_id);
 7525 
 7526     address start = __ pc();
 7527 
 7528     const Register crc   = c_rarg0;  // crc
 7529     const Register buf   = c_rarg1;  // source java byte array address
 7530     const Register len   = c_rarg2;  // length
 7531     const Register table0 = c_rarg3; // crc_table address
 7532     const Register table1 = c_rarg4;
 7533     const Register table2 = c_rarg5;
 7534     const Register table3 = c_rarg6;
 7535     const Register tmp3 = c_rarg7;
 7536 
 7537     BLOCK_COMMENT("Entry:");
 7538     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7539 
 7540     __ kernel_crc32c(crc, buf, len,
 7541               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7542 
 7543     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7544     __ ret(lr);
 7545 
 7546     return start;
 7547   }
 7548 
 7549   /**
 7550    *  Arguments:
 7551    *
 7552    *  Inputs:
 7553    *   c_rarg0   - int   adler
 7554    *   c_rarg1   - byte* buff
 7555    *   c_rarg2   - int   len
 7556    *
 7557    * Output:
 7558    *   c_rarg0   - int adler result
 7559    */
 7560   address generate_updateBytesAdler32() {
 7561     __ align(CodeEntryAlignment);
 7562     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 7563     StubCodeMark mark(this, stub_id);
 7564     address start = __ pc();
 7565 
 7566     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7567 
 7568     // Aliases
 7569     Register adler  = c_rarg0;
 7570     Register s1     = c_rarg0;
 7571     Register s2     = c_rarg3;
 7572     Register buff   = c_rarg1;
 7573     Register len    = c_rarg2;
 7574     Register nmax  = r4;
 7575     Register base  = r5;
 7576     Register count = r6;
 7577     Register temp0 = rscratch1;
 7578     Register temp1 = rscratch2;
 7579     FloatRegister vbytes = v0;
 7580     FloatRegister vs1acc = v1;
 7581     FloatRegister vs2acc = v2;
 7582     FloatRegister vtable = v3;
 7583 
 7584     // Max number of bytes we can process before having to take the mod
 7585     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
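          // (e.g. for n == 5552: 255*5552*5553/2 + 5553*(BASE-1)
          //  == 3930857640 + 363832560 == 4294690200 <= 4294967295,
          //  whereas n == 5553 already exceeds 2^32-1)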
 7586     uint64_t BASE = 0xfff1;
 7587     uint64_t NMAX = 0x15B0;
 7588 
 7589     __ mov(base, BASE);
 7590     __ mov(nmax, NMAX);
 7591 
 7592     // Load accumulation coefficients for the upper 16 bits
 7593     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7594     __ ld1(vtable, __ T16B, Address(temp0));
 7595 
 7596     // s1 is initialized to the lower 16 bits of adler
 7597     // s2 is initialized to the upper 16 bits of adler
 7598     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7599     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7600 
 7601     // The pipelined loop needs at least 16 elements for one iteration.
 7602     // It checks this itself, but it is more efficient to skip to the cleanup loop up front.
 7603     __ cmp(len, (u1)16);
 7604     __ br(Assembler::HS, L_nmax);
 7605     __ cbz(len, L_combine);
 7606 
 7607     __ bind(L_simple_by1_loop);
 7608     __ ldrb(temp0, Address(__ post(buff, 1)));
 7609     __ add(s1, s1, temp0);
 7610     __ add(s2, s2, s1);
 7611     __ subs(len, len, 1);
 7612     __ br(Assembler::HI, L_simple_by1_loop);
 7613 
 7614     // s1 = s1 % BASE
 7615     __ subs(temp0, s1, base);
 7616     __ csel(s1, temp0, s1, Assembler::HS);
 7617 
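          // Since BASE == 65521 == 2^16 - 15, we have 2^16 == 15 (mod BASE), so a
          // value x can be partially reduced by folding:
          //   x == (x >> 16) * 15 + (x & 0xffff)  (mod BASE)
          // The lsr/lsl/sub sequence below computes (x >> 16) * 15 as
          // (x >> 16) * 16 - (x >> 16); the final conditional subtract of BASE
          // then completes the reduction.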
 7618     // s2 = s2 % BASE
 7619     __ lsr(temp0, s2, 16);
 7620     __ lsl(temp1, temp0, 4);
 7621     __ sub(temp1, temp1, temp0);
 7622     __ add(s2, temp1, s2, ext::uxth);
 7623 
 7624     __ subs(temp0, s2, base);
 7625     __ csel(s2, temp0, s2, Assembler::HS);
 7626 
 7627     __ b(L_combine);
 7628 
 7629     __ bind(L_nmax);
 7630     __ subs(len, len, nmax);
 7631     __ sub(count, nmax, 16);
 7632     __ br(Assembler::LO, L_by16);
 7633 
 7634     __ bind(L_nmax_loop);
 7635 
 7636     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7637                                       vbytes, vs1acc, vs2acc, vtable);
 7638 
 7639     __ subs(count, count, 16);
 7640     __ br(Assembler::HS, L_nmax_loop);
 7641 
 7642     // s1 = s1 % BASE
 7643     __ lsr(temp0, s1, 16);
 7644     __ lsl(temp1, temp0, 4);
 7645     __ sub(temp1, temp1, temp0);
 7646     __ add(temp1, temp1, s1, ext::uxth);
 7647 
 7648     __ lsr(temp0, temp1, 16);
 7649     __ lsl(s1, temp0, 4);
 7650     __ sub(s1, s1, temp0);
 7651     __ add(s1, s1, temp1, ext::uxth);
 7652 
 7653     __ subs(temp0, s1, base);
 7654     __ csel(s1, temp0, s1, Assembler::HS);
 7655 
 7656     // s2 = s2 % BASE
 7657     __ lsr(temp0, s2, 16);
 7658     __ lsl(temp1, temp0, 4);
 7659     __ sub(temp1, temp1, temp0);
 7660     __ add(temp1, temp1, s2, ext::uxth);
 7661 
 7662     __ lsr(temp0, temp1, 16);
 7663     __ lsl(s2, temp0, 4);
 7664     __ sub(s2, s2, temp0);
 7665     __ add(s2, s2, temp1, ext::uxth);
 7666 
 7667     __ subs(temp0, s2, base);
 7668     __ csel(s2, temp0, s2, Assembler::HS);
 7669 
 7670     __ subs(len, len, nmax);
 7671     __ sub(count, nmax, 16);
 7672     __ br(Assembler::HS, L_nmax_loop);
 7673 
 7674     __ bind(L_by16);
 7675     __ adds(len, len, count);
 7676     __ br(Assembler::LO, L_by1);
 7677 
 7678     __ bind(L_by16_loop);
 7679 
 7680     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7681                                       vbytes, vs1acc, vs2acc, vtable);
 7682 
 7683     __ subs(len, len, 16);
 7684     __ br(Assembler::HS, L_by16_loop);
 7685 
 7686     __ bind(L_by1);
 7687     __ adds(len, len, 15);
 7688     __ br(Assembler::LO, L_do_mod);
 7689 
 7690     __ bind(L_by1_loop);
 7691     __ ldrb(temp0, Address(__ post(buff, 1)));
 7692     __ add(s1, temp0, s1);
 7693     __ add(s2, s2, s1);
 7694     __ subs(len, len, 1);
 7695     __ br(Assembler::HS, L_by1_loop);
 7696 
 7697     __ bind(L_do_mod);
 7698     // s1 = s1 % BASE
 7699     __ lsr(temp0, s1, 16);
 7700     __ lsl(temp1, temp0, 4);
 7701     __ sub(temp1, temp1, temp0);
 7702     __ add(temp1, temp1, s1, ext::uxth);
 7703 
 7704     __ lsr(temp0, temp1, 16);
 7705     __ lsl(s1, temp0, 4);
 7706     __ sub(s1, s1, temp0);
 7707     __ add(s1, s1, temp1, ext::uxth);
 7708 
 7709     __ subs(temp0, s1, base);
 7710     __ csel(s1, temp0, s1, Assembler::HS);
 7711 
 7712     // s2 = s2 % BASE
 7713     __ lsr(temp0, s2, 16);
 7714     __ lsl(temp1, temp0, 4);
 7715     __ sub(temp1, temp1, temp0);
 7716     __ add(temp1, temp1, s2, ext::uxth);
 7717 
 7718     __ lsr(temp0, temp1, 16);
 7719     __ lsl(s2, temp0, 4);
 7720     __ sub(s2, s2, temp0);
 7721     __ add(s2, s2, temp1, ext::uxth);
 7722 
 7723     __ subs(temp0, s2, base);
 7724     __ csel(s2, temp0, s2, Assembler::HS);
 7725 
 7726     // Combine lower bits and higher bits
 7727     __ bind(L_combine);
 7728     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7729 
 7730     __ ret(lr);
 7731 
 7732     return start;
 7733   }
 7734 
 7735   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7736           Register temp0, Register temp1, FloatRegister vbytes,
 7737           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7738     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7739     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7740     // In non-vectorized code, we update s1 and s2 as:
 7741     //   s1 <- s1 + b1
 7742     //   s2 <- s2 + s1
 7743     //   s1 <- s1 + b2
 7744     //   s2 <- s2 + s1
 7745     //   ...
 7746     //   s1 <- s1 + b16
 7747     //   s2 <- s2 + s1
 7748     // Putting above assignments together, we have:
 7749     //   s1_new = s1 + b1 + b2 + ... + b16
 7750     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7751     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7752     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
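          // (The weights (16, 15, ..., 1) are assumed to be what _adler_table,
          // loaded into vtable by the caller, holds; the umull/umlal pair below
          // then computes the 16-byte dot product against those weights.)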
 7753     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7754 
 7755     // s2 = s2 + s1 * 16
 7756     __ add(s2, s2, s1, Assembler::LSL, 4);
 7757 
 7758     // vs1acc = b1 + b2 + b3 + ... + b16
 7759     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7760     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7761     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7762     __ uaddlv(vs1acc, __ T16B, vbytes);
 7763     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7764 
 7765     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7766     __ fmovd(temp0, vs1acc);
 7767     __ fmovd(temp1, vs2acc);
 7768     __ add(s1, s1, temp0);
 7769     __ add(s2, s2, temp1);
 7770   }
 7771 
 7772   /**
 7773    *  Arguments:
 7774    *
 7775    *  Input:
 7776    *    c_rarg0   - x address
 7777    *    c_rarg1   - x length
 7778    *    c_rarg2   - y address
 7779    *    c_rarg3   - y length
 7780    *    c_rarg4   - z address
 7781    */
 7782   address generate_multiplyToLen() {
 7783     __ align(CodeEntryAlignment);
 7784     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 7785     StubCodeMark mark(this, stub_id);
 7786 
 7787     address start = __ pc();
 7788     const Register x     = r0;
 7789     const Register xlen  = r1;
 7790     const Register y     = r2;
 7791     const Register ylen  = r3;
 7792     const Register z     = r4;
 7793 
 7794     const Register tmp0  = r5;
 7795     const Register tmp1  = r10;
 7796     const Register tmp2  = r11;
 7797     const Register tmp3  = r12;
 7798     const Register tmp4  = r13;
 7799     const Register tmp5  = r14;
 7800     const Register tmp6  = r15;
 7801     const Register tmp7  = r16;
 7802 
 7803     BLOCK_COMMENT("Entry:");
 7804     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7805     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7806     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7807     __ ret(lr);
 7808 
 7809     return start;
 7810   }
 7811 
 7812   address generate_squareToLen() {
 7813     // squareToLen algorithm for sizes 1..127 described in java code works
 7814     // faster than multiply_to_len on some CPUs and slower on others, but
 7815     // multiply_to_len shows a bit better overall results
 7816     __ align(CodeEntryAlignment);
 7817     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 7818     StubCodeMark mark(this, stub_id);
 7819     address start = __ pc();
 7820 
 7821     const Register x     = r0;
 7822     const Register xlen  = r1;
 7823     const Register z     = r2;
 7824     const Register y     = r4; // == x
 7825     const Register ylen  = r5; // == xlen
 7826 
 7827     const Register tmp0  = r3;
 7828     const Register tmp1  = r10;
 7829     const Register tmp2  = r11;
 7830     const Register tmp3  = r12;
 7831     const Register tmp4  = r13;
 7832     const Register tmp5  = r14;
 7833     const Register tmp6  = r15;
 7834     const Register tmp7  = r16;
 7835 
 7836     RegSet spilled_regs = RegSet::of(y, ylen);
 7837     BLOCK_COMMENT("Entry:");
 7838     __ enter();
 7839     __ push(spilled_regs, sp);
 7840     __ mov(y, x);
 7841     __ mov(ylen, xlen);
 7842     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7843     __ pop(spilled_regs, sp);
 7844     __ leave();
 7845     __ ret(lr);
 7846     return start;
 7847   }
 7848 
 7849   address generate_mulAdd() {
 7850     __ align(CodeEntryAlignment);
 7851     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 7852     StubCodeMark mark(this, stub_id);
 7853 
 7854     address start = __ pc();
 7855 
 7856     const Register out     = r0;
 7857     const Register in      = r1;
 7858     const Register offset  = r2;
 7859     const Register len     = r3;
 7860     const Register k       = r4;
 7861 
 7862     BLOCK_COMMENT("Entry:");
 7863     __ enter();
 7864     __ mul_add(out, in, offset, len, k);
 7865     __ leave();
 7866     __ ret(lr);
 7867 
 7868     return start;
 7869   }
 7870 
 7871   // Arguments:
 7872   //
 7873   // Input:
 7874   //   c_rarg0   - newArr address
 7875   //   c_rarg1   - oldArr address
 7876   //   c_rarg2   - newIdx
 7877   //   c_rarg3   - shiftCount
 7878   //   c_rarg4   - numIter
 7879   //
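        // In effect (roughly), for each i in [0, numIter):
        //   newArr[newIdx + i] =
        //     (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
        //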
 7880   address generate_bigIntegerRightShift() {
 7881     __ align(CodeEntryAlignment);
 7882     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 7883     StubCodeMark mark(this, stub_id);
 7884     address start = __ pc();
 7885 
 7886     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7887 
 7888     Register newArr        = c_rarg0;
 7889     Register oldArr        = c_rarg1;
 7890     Register newIdx        = c_rarg2;
 7891     Register shiftCount    = c_rarg3;
 7892     Register numIter       = c_rarg4;
 7893     Register idx           = numIter;
 7894 
 7895     Register newArrCur     = rscratch1;
 7896     Register shiftRevCount = rscratch2;
 7897     Register oldArrCur     = r13;
 7898     Register oldArrNext    = r14;
 7899 
 7900     FloatRegister oldElem0        = v0;
 7901     FloatRegister oldElem1        = v1;
 7902     FloatRegister newElem         = v2;
 7903     FloatRegister shiftVCount     = v3;
 7904     FloatRegister shiftVRevCount  = v4;
 7905 
 7906     __ cbz(idx, Exit);
 7907 
 7908     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7909 
 7910     // left shift count
 7911     __ movw(shiftRevCount, 32);
 7912     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7913 
 7914     // numIter too small to allow a 4-word SIMD loop, fall back
 7915     __ cmp(numIter, (u1)4);
 7916     __ br(Assembler::LT, ShiftThree);
 7917 
 7918     __ dup(shiftVCount,    __ T4S, shiftCount);
 7919     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7920     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7921 
 7922     __ BIND(ShiftSIMDLoop);
 7923 
 7924     // Calculate the load addresses
 7925     __ sub(idx, idx, 4);
 7926     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7927     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7928     __ add(oldArrCur,  oldArrNext, 4);
 7929 
 7930     // Load 4 words and process
 7931     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7932     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7933     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7934     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7935     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7936     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7937 
 7938     __ cmp(idx, (u1)4);
 7939     __ br(Assembler::LT, ShiftTwoLoop);
 7940     __ b(ShiftSIMDLoop);
 7941 
 7942     __ BIND(ShiftTwoLoop);
 7943     __ cbz(idx, Exit);
 7944     __ cmp(idx, (u1)1);
 7945     __ br(Assembler::EQ, ShiftOne);
 7946 
 7947     // Calculate the load addresses
 7948     __ sub(idx, idx, 2);
 7949     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7950     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7951     __ add(oldArrCur,  oldArrNext, 4);
 7952 
 7953     // Load 2 words and process
 7954     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7955     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7956     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7957     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7958     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7959     __ st1(newElem,   __ T2S, Address(newArrCur));
 7960     __ b(ShiftTwoLoop);
 7961 
 7962     __ BIND(ShiftThree);
 7963     __ tbz(idx, 1, ShiftOne);
 7964     __ tbz(idx, 0, ShiftTwo);
 7965     __ ldrw(r10,  Address(oldArr, 12));
 7966     __ ldrw(r11,  Address(oldArr, 8));
 7967     __ lsrvw(r10, r10, shiftCount);
 7968     __ lslvw(r11, r11, shiftRevCount);
 7969     __ orrw(r12,  r10, r11);
 7970     __ strw(r12,  Address(newArr, 8));
 7971 
 7972     __ BIND(ShiftTwo);
 7973     __ ldrw(r10,  Address(oldArr, 8));
 7974     __ ldrw(r11,  Address(oldArr, 4));
 7975     __ lsrvw(r10, r10, shiftCount);
 7976     __ lslvw(r11, r11, shiftRevCount);
 7977     __ orrw(r12,  r10, r11);
 7978     __ strw(r12,  Address(newArr, 4));
 7979 
 7980     __ BIND(ShiftOne);
 7981     __ ldrw(r10,  Address(oldArr, 4));
 7982     __ ldrw(r11,  Address(oldArr));
 7983     __ lsrvw(r10, r10, shiftCount);
 7984     __ lslvw(r11, r11, shiftRevCount);
 7985     __ orrw(r12,  r10, r11);
 7986     __ strw(r12,  Address(newArr));
 7987 
 7988     __ BIND(Exit);
 7989     __ ret(lr);
 7990 
 7991     return start;
 7992   }
 7993 
 7994   // Arguments:
 7995   //
 7996   // Input:
 7997   //   c_rarg0   - newArr address
 7998   //   c_rarg1   - oldArr address
 7999   //   c_rarg2   - newIdx
 8000   //   c_rarg3   - shiftCount
 8001   //   c_rarg4   - numIter
 8002   //
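        // In effect (roughly), for each i in [0, numIter):
        //   newArr[newIdx + i] =
        //     (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
        //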
 8003   address generate_bigIntegerLeftShift() {
 8004     __ align(CodeEntryAlignment);
 8005     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 8006     StubCodeMark mark(this, stub_id);
 8007     address start = __ pc();
 8008 
 8009     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8010 
 8011     Register newArr        = c_rarg0;
 8012     Register oldArr        = c_rarg1;
 8013     Register newIdx        = c_rarg2;
 8014     Register shiftCount    = c_rarg3;
 8015     Register numIter       = c_rarg4;
 8016 
 8017     Register shiftRevCount = rscratch1;
 8018     Register oldArrNext    = rscratch2;
 8019 
 8020     FloatRegister oldElem0        = v0;
 8021     FloatRegister oldElem1        = v1;
 8022     FloatRegister newElem         = v2;
 8023     FloatRegister shiftVCount     = v3;
 8024     FloatRegister shiftVRevCount  = v4;
 8025 
 8026     __ cbz(numIter, Exit);
 8027 
 8028     __ add(oldArrNext, oldArr, 4);
 8029     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8030 
 8031     // right shift count
 8032     __ movw(shiftRevCount, 32);
 8033     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8034 
 8035     // numIter too small to allow a 4-word SIMD loop, fall back
 8036     __ cmp(numIter, (u1)4);
 8037     __ br(Assembler::LT, ShiftThree);
 8038 
 8039     __ dup(shiftVCount,     __ T4S, shiftCount);
 8040     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8041     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8042 
 8043     __ BIND(ShiftSIMDLoop);
 8044 
 8045     // load 4 words and process
 8046     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8047     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8048     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8049     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8050     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8051     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8052     __ sub(numIter,   numIter, 4);
 8053 
 8054     __ cmp(numIter, (u1)4);
 8055     __ br(Assembler::LT, ShiftTwoLoop);
 8056     __ b(ShiftSIMDLoop);
 8057 
 8058     __ BIND(ShiftTwoLoop);
 8059     __ cbz(numIter, Exit);
 8060     __ cmp(numIter, (u1)1);
 8061     __ br(Assembler::EQ, ShiftOne);
 8062 
 8063     // load 2 words and process
 8064     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8065     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8066     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8067     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8068     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8069     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8070     __ sub(numIter,   numIter, 2);
 8071     __ b(ShiftTwoLoop);
 8072 
 8073     __ BIND(ShiftThree);
 8074     __ ldrw(r10,  __ post(oldArr, 4));
 8075     __ ldrw(r11,  __ post(oldArrNext, 4));
 8076     __ lslvw(r10, r10, shiftCount);
 8077     __ lsrvw(r11, r11, shiftRevCount);
 8078     __ orrw(r12,  r10, r11);
 8079     __ strw(r12,  __ post(newArr, 4));
 8080     __ tbz(numIter, 1, Exit);
 8081     __ tbz(numIter, 0, ShiftOne);
 8082 
 8083     __ BIND(ShiftTwo);
 8084     __ ldrw(r10,  __ post(oldArr, 4));
 8085     __ ldrw(r11,  __ post(oldArrNext, 4));
 8086     __ lslvw(r10, r10, shiftCount);
 8087     __ lsrvw(r11, r11, shiftRevCount);
 8088     __ orrw(r12,  r10, r11);
 8089     __ strw(r12,  __ post(newArr, 4));
 8090 
 8091     __ BIND(ShiftOne);
 8092     __ ldrw(r10,  Address(oldArr));
 8093     __ ldrw(r11,  Address(oldArrNext));
 8094     __ lslvw(r10, r10, shiftCount);
 8095     __ lsrvw(r11, r11, shiftRevCount);
 8096     __ orrw(r12,  r10, r11);
 8097     __ strw(r12,  Address(newArr));
 8098 
 8099     __ BIND(Exit);
 8100     __ ret(lr);
 8101 
 8102     return start;
 8103   }
 8104 
 8105   address generate_count_positives(address &count_positives_long) {
 8106     const u1 large_loop_size = 64;
 8107     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
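          // A byte is negative exactly when its top (sign) bit is set, so OR-ing
          // words together and testing the result against UPPER_BIT_MASK detects a
          // negative byte anywhere in the bytes examined.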
 8108     int dcache_line = VM_Version::dcache_line_size();
 8109 
 8110     Register ary1 = r1, len = r2, result = r0;
 8111 
 8112     __ align(CodeEntryAlignment);
 8113 
 8114     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 8115     StubCodeMark mark(this, stub_id);
 8116 
 8117     address entry = __ pc();
 8118 
 8119     __ enter();
 8120     // precondition: a copy of len is already in result
 8121     // __ mov(result, len);
 8122 
 8123   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8124         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8125 
 8126   __ cmp(len, (u1)15);
 8127   __ br(Assembler::GT, LEN_OVER_15);
 8128   // The only case when execution falls into this code is when the pointer is
 8129   // near the end of a memory page and we have to avoid reading the next page
 8130   __ add(ary1, ary1, len);
 8131   __ subs(len, len, 8);
 8132   __ br(Assembler::GT, LEN_OVER_8);
 8133   __ ldr(rscratch2, Address(ary1, -8));
 8134   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8135   __ lsrv(rscratch2, rscratch2, rscratch1);
 8136   __ tst(rscratch2, UPPER_BIT_MASK);
 8137   __ csel(result, zr, result, Assembler::NE);
 8138   __ leave();
 8139   __ ret(lr);
 8140   __ bind(LEN_OVER_8);
 8141   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8142   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8143   __ tst(rscratch2, UPPER_BIT_MASK);
 8144   __ br(Assembler::NE, RET_NO_POP);
 8145   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8146   __ lsrv(rscratch1, rscratch1, rscratch2);
 8147   __ tst(rscratch1, UPPER_BIT_MASK);
 8148   __ bind(RET_NO_POP);
 8149   __ csel(result, zr, result, Assembler::NE);
 8150   __ leave();
 8151   __ ret(lr);
 8152 
 8153   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8154   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8155 
 8156   count_positives_long = __ pc(); // 2nd entry point
 8157 
 8158   __ enter();
 8159 
 8160   __ bind(LEN_OVER_15);
 8161     __ push(spilled_regs, sp);
 8162     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8163     __ cbz(rscratch2, ALIGNED);
 8164     __ ldp(tmp6, tmp1, Address(ary1));
 8165     __ mov(tmp5, 16);
 8166     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8167     __ add(ary1, ary1, rscratch1);
 8168     __ orr(tmp6, tmp6, tmp1);
 8169     __ tst(tmp6, UPPER_BIT_MASK);
 8170     __ br(Assembler::NE, RET_ADJUST);
 8171     __ sub(len, len, rscratch1);
 8172 
 8173   __ bind(ALIGNED);
 8174     __ cmp(len, large_loop_size);
 8175     __ br(Assembler::LT, CHECK_16);
 8176     // Perform a 16-byte load as an early return in the pre-loop to handle the
 8177     // situation when an initially aligned large array has negative values at its
 8178     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
 8179     // the worst case), which is slower. Cases with negative bytes further ahead
 8180     // won't be affected much; in fact, they'll be faster due to the early loads,
 8181     // fewer instructions and fewer branches in LARGE_LOOP.
 8182     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8183     __ sub(len, len, 16);
 8184     __ orr(tmp6, tmp6, tmp1);
 8185     __ tst(tmp6, UPPER_BIT_MASK);
 8186     __ br(Assembler::NE, RET_ADJUST_16);
 8187     __ cmp(len, large_loop_size);
 8188     __ br(Assembler::LT, CHECK_16);
 8189 
 8190     if (SoftwarePrefetchHintDistance >= 0
 8191         && SoftwarePrefetchHintDistance >= dcache_line) {
 8192       // initial prefetch
 8193       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8194     }
 8195   __ bind(LARGE_LOOP);
 8196     if (SoftwarePrefetchHintDistance >= 0) {
 8197       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8198     }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 "orr(...); tst(...); br(...)" triples (one per
    // ldp), it is better to generate 7 orr(...) + 1 tst(...) + 1 br(...), which
    // uses fewer instructions and branches; the trade-off is that early return is
    // no longer possible, so all 64 bytes are loaded and checked every time.
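    // The seven orr instructions below form a reduction tree that folds the eight
    // loaded 8-byte words into a single register, so a single tst against
    // UPPER_BIT_MASK checks all 64 bytes for a set sign bit at once.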
 8204     __ ldp(tmp2, tmp3, Address(ary1));
 8205     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8206     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8207     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8208     __ add(ary1, ary1, large_loop_size);
 8209     __ sub(len, len, large_loop_size);
 8210     __ orr(tmp2, tmp2, tmp3);
 8211     __ orr(tmp4, tmp4, tmp5);
 8212     __ orr(rscratch1, rscratch1, rscratch2);
 8213     __ orr(tmp6, tmp6, tmp1);
 8214     __ orr(tmp2, tmp2, tmp4);
 8215     __ orr(rscratch1, rscratch1, tmp6);
 8216     __ orr(tmp2, tmp2, rscratch1);
 8217     __ tst(tmp2, UPPER_BIT_MASK);
 8218     __ br(Assembler::NE, RET_ADJUST_LONG);
 8219     __ cmp(len, large_loop_size);
 8220     __ br(Assembler::GE, LARGE_LOOP);
 8221 
 8222   __ bind(CHECK_16); // small 16-byte load pre-loop
 8223     __ cmp(len, (u1)16);
 8224     __ br(Assembler::LT, POST_LOOP16);
 8225 
 8226   __ bind(LOOP16); // small 16-byte load loop
 8227     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8228     __ sub(len, len, 16);
 8229     __ orr(tmp2, tmp2, tmp3);
 8230     __ tst(tmp2, UPPER_BIT_MASK);
 8231     __ br(Assembler::NE, RET_ADJUST_16);
 8232     __ cmp(len, (u1)16);
 8233     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8234 
 8235   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8236     __ cmp(len, (u1)8);
 8237     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8238     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8239     __ tst(tmp3, UPPER_BIT_MASK);
 8240     __ br(Assembler::NE, RET_ADJUST);
 8241     __ sub(len, len, 8);
 8242 
 8243   __ bind(POST_LOOP16_LOAD_TAIL);
 8244     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8245     __ ldr(tmp1, Address(ary1));
 8246     __ mov(tmp2, 64);
 8247     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8248     __ lslv(tmp1, tmp1, tmp4);
 8249     __ tst(tmp1, UPPER_BIT_MASK);
 8250     __ br(Assembler::NE, RET_ADJUST);
 8251     // Fallthrough
 8252 
 8253   __ bind(RET_LEN);
 8254     __ pop(spilled_regs, sp);
 8255     __ leave();
 8256     __ ret(lr);
 8257 
    // The difference result - len is the count of bytes that are guaranteed to be
    // positive.
 8260 
 8261   __ bind(RET_ADJUST_LONG);
 8262     __ add(len, len, (u1)(large_loop_size - 16));
 8263   __ bind(RET_ADJUST_16);
 8264     __ add(len, len, 16);
 8265   __ bind(RET_ADJUST);
 8266     __ pop(spilled_regs, sp);
 8267     __ leave();
 8268     __ sub(result, result, len);
 8269     __ ret(lr);
 8270 
 8271     return entry;
 8272   }
 8273 
 8274   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8275         bool usePrefetch, Label &NOT_EQUAL) {
 8276     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8277         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8278         tmp7 = r12, tmp8 = r13;
 8279     Label LOOP;
 8280 
 8281     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8282     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8283     __ bind(LOOP);
 8284     if (usePrefetch) {
 8285       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8286       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8287     }
 8288     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8289     __ eor(tmp1, tmp1, tmp2);
 8290     __ eor(tmp3, tmp3, tmp4);
 8291     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8292     __ orr(tmp1, tmp1, tmp3);
 8293     __ cbnz(tmp1, NOT_EQUAL);
 8294     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8295     __ eor(tmp5, tmp5, tmp6);
 8296     __ eor(tmp7, tmp7, tmp8);
 8297     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8298     __ orr(tmp5, tmp5, tmp7);
 8299     __ cbnz(tmp5, NOT_EQUAL);
 8300     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8301     __ eor(tmp1, tmp1, tmp2);
 8302     __ eor(tmp3, tmp3, tmp4);
 8303     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8304     __ orr(tmp1, tmp1, tmp3);
 8305     __ cbnz(tmp1, NOT_EQUAL);
 8306     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8307     __ eor(tmp5, tmp5, tmp6);
 8308     __ sub(cnt1, cnt1, 8 * wordSize);
 8309     __ eor(tmp7, tmp7, tmp8);
 8310     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than cmp)
    // because subs allows an unlimited range of immediate operands.
 8313     __ subs(tmp6, cnt1, loopThreshold);
 8314     __ orr(tmp5, tmp5, tmp7);
 8315     __ cbnz(tmp5, NOT_EQUAL);
 8316     __ br(__ GE, LOOP);
 8317     // post-loop
 8318     __ eor(tmp1, tmp1, tmp2);
 8319     __ eor(tmp3, tmp3, tmp4);
 8320     __ orr(tmp1, tmp1, tmp3);
 8321     __ sub(cnt1, cnt1, 2 * wordSize);
 8322     __ cbnz(tmp1, NOT_EQUAL);
 8323   }
 8324 
 8325   void generate_large_array_equals_loop_simd(int loopThreshold,
 8326         bool usePrefetch, Label &NOT_EQUAL) {
 8327     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8328         tmp2 = rscratch2;
 8329     Label LOOP;
 8330 
 8331     __ bind(LOOP);
 8332     if (usePrefetch) {
 8333       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8334       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8335     }
 8336     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8337     __ sub(cnt1, cnt1, 8 * wordSize);
 8338     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8339     __ subs(tmp1, cnt1, loopThreshold);
 8340     __ eor(v0, __ T16B, v0, v4);
 8341     __ eor(v1, __ T16B, v1, v5);
 8342     __ eor(v2, __ T16B, v2, v6);
 8343     __ eor(v3, __ T16B, v3, v7);
 8344     __ orr(v0, __ T16B, v0, v1);
 8345     __ orr(v1, __ T16B, v2, v3);
 8346     __ orr(v0, __ T16B, v0, v1);
 8347     __ umov(tmp1, v0, __ D, 0);
 8348     __ umov(tmp2, v0, __ D, 1);
 8349     __ orr(tmp1, tmp1, tmp2);
 8350     __ cbnz(tmp1, NOT_EQUAL);
 8351     __ br(__ GE, LOOP);
 8352   }
 8353 
 8354   // a1 = r1 - array1 address
 8355   // a2 = r2 - array2 address
 8356   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
 8358   // r3-r5 are reserved temporary registers
 8359   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
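  //
  // Sketch of the approach: corresponding 8-byte (or, with SIMD, 16-byte) chunks
  // of the two arrays are XOR-ed and the results OR-ed together; any nonzero
  // value means the arrays differ. The loop variants below differ only in how
  // many bytes they process per iteration and whether they issue software
  // prefetches.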
 8360   address generate_large_array_equals() {
 8361     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8362         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8363         tmp7 = r12, tmp8 = r13;
 8364     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8365         SMALL_LOOP, POST_LOOP;
 8366     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // loop threshold: ensures that at least 32 prefetched bytes are actually used
 8368     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8369     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8370     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8371     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8372         tmp5, tmp6, tmp7, tmp8);
 8373 
 8374     __ align(CodeEntryAlignment);
 8375 
 8376     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 8377     StubCodeMark mark(this, stub_id);
 8378 
 8379     address entry = __ pc();
 8380     __ enter();
 8381     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8382     // also advance pointers to use post-increment instead of pre-increment
 8383     __ add(a1, a1, wordSize);
 8384     __ add(a2, a2, wordSize);
 8385     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which incur a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, do one additional
      // 8-byte load to make at least the first address 16-byte aligned.
 8391       Label ALIGNED16;
 8392       __ tbz(a1, 3, ALIGNED16);
 8393       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8394       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8395       __ sub(cnt1, cnt1, wordSize);
 8396       __ eor(tmp1, tmp1, tmp2);
 8397       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8398       __ bind(ALIGNED16);
 8399     }
 8400     if (UseSIMDForArrayEquals) {
 8401       if (SoftwarePrefetchHintDistance >= 0) {
 8402         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8403         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8404         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8405             /* prfm = */ true, NOT_EQUAL);
 8406         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8407         __ br(__ LT, TAIL);
 8408       }
 8409       __ bind(NO_PREFETCH_LARGE_LOOP);
 8410       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8411           /* prfm = */ false, NOT_EQUAL);
 8412     } else {
 8413       __ push(spilled_regs, sp);
 8414       if (SoftwarePrefetchHintDistance >= 0) {
 8415         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8416         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8417         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8418             /* prfm = */ true, NOT_EQUAL);
 8419         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8420         __ br(__ LT, TAIL);
 8421       }
 8422       __ bind(NO_PREFETCH_LARGE_LOOP);
 8423       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8424           /* prfm = */ false, NOT_EQUAL);
 8425     }
 8426     __ bind(TAIL);
 8427       __ cbz(cnt1, EQUAL);
 8428       __ subs(cnt1, cnt1, wordSize);
 8429       __ br(__ LE, POST_LOOP);
 8430     __ bind(SMALL_LOOP);
 8431       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8432       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8433       __ subs(cnt1, cnt1, wordSize);
 8434       __ eor(tmp1, tmp1, tmp2);
 8435       __ cbnz(tmp1, NOT_EQUAL);
 8436       __ br(__ GT, SMALL_LOOP);
 8437     __ bind(POST_LOOP);
 8438       __ ldr(tmp1, Address(a1, cnt1));
 8439       __ ldr(tmp2, Address(a2, cnt1));
 8440       __ eor(tmp1, tmp1, tmp2);
 8441       __ cbnz(tmp1, NOT_EQUAL);
 8442     __ bind(EQUAL);
 8443       __ mov(result, true);
 8444     __ bind(NOT_EQUAL);
 8445       if (!UseSIMDForArrayEquals) {
 8446         __ pop(spilled_regs, sp);
 8447       }
 8448     __ bind(NOT_EQUAL_NO_POP);
 8449     __ leave();
 8450     __ ret(lr);
 8451     return entry;
 8452   }
 8453 
 8454   // result = r0 - return value. Contains initial hashcode value on entry.
 8455   // ary = r1 - array address
 8456   // cnt = r2 - elements count
 8457   // Clobbers: v0-v13, rscratch1, rscratch2
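  //
  // Conceptually (illustrative only, not the exact instruction sequence), the
  // stub computes the same value as the scalar loop
  //
  //   int h = result;
  //   for (int i = 0; i < cnt; i++) h = 31 * h + a[i];
  //
  // vectorized by keeping partial sums in the four 32-bit lanes of the
  // accumulators: each iteration rescales every lane by a precomputed power of 31
  // and adds freshly loaded elements, and the epilogues fold the lanes back into
  // a single scalar using vpow = <31^3, 31^2, 31^1, 31^0>.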
 8458   address generate_large_arrays_hashcode(BasicType eltype) {
 8459     const Register result = r0, ary = r1, cnt = r2;
 8460     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8461     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8462     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8463     const FloatRegister vpowm = v13;
 8464 
 8465     ARRAYS_HASHCODE_REGISTERS;
 8466 
 8467     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8468 
 8469     unsigned int vf; // vectorization factor
 8470     bool multiply_by_halves;
 8471     Assembler::SIMD_Arrangement load_arrangement;
 8472     switch (eltype) {
 8473     case T_BOOLEAN:
 8474     case T_BYTE:
 8475       load_arrangement = Assembler::T8B;
 8476       multiply_by_halves = true;
 8477       vf = 8;
 8478       break;
 8479     case T_CHAR:
 8480     case T_SHORT:
 8481       load_arrangement = Assembler::T8H;
 8482       multiply_by_halves = true;
 8483       vf = 8;
 8484       break;
 8485     case T_INT:
 8486       load_arrangement = Assembler::T4S;
 8487       multiply_by_halves = false;
 8488       vf = 4;
 8489       break;
 8490     default:
 8491       ShouldNotReachHere();
 8492     }
 8493 
 8494     // Unroll factor
 8495     const unsigned uf = 4;
 8496 
 8497     // Effective vectorization factor
 8498     const unsigned evf = vf * uf;
 8499 
 8500     __ align(CodeEntryAlignment);
 8501 
 8502     StubGenStubId stub_id;
 8503     switch (eltype) {
 8504     case T_BOOLEAN:
 8505       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 8506       break;
 8507     case T_BYTE:
 8508       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 8509       break;
 8510     case T_CHAR:
 8511       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 8512       break;
 8513     case T_SHORT:
 8514       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 8515       break;
 8516     case T_INT:
 8517       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 8518       break;
 8519     default:
 8520       stub_id = StubGenStubId::NO_STUBID;
 8521       ShouldNotReachHere();
 8522     };
 8523 
 8524     StubCodeMark mark(this, stub_id);
 8525 
 8526     address entry = __ pc();
 8527     __ enter();
 8528 
    // Put the 0th to 3rd powers of 31 together into a single SIMD register. The
    // register will be used in the SMALL and LARGE LOOPs' epilogues. The
    // initialization is hoisted here; the register's value doesn't change
    // throughout either loop.
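    // In those epilogues, mulv by vpow scales lane j of the accumulator by
    // 31^(3 - j) and the following addv sums across lanes, folding the four
    // partial sums into a single scalar.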
 8532     __ movw(rscratch1, intpow(31U, 3));
 8533     __ mov(vpow, Assembler::S, 0, rscratch1);
 8534     __ movw(rscratch1, intpow(31U, 2));
 8535     __ mov(vpow, Assembler::S, 1, rscratch1);
 8536     __ movw(rscratch1, intpow(31U, 1));
 8537     __ mov(vpow, Assembler::S, 2, rscratch1);
 8538     __ movw(rscratch1, intpow(31U, 0));
 8539     __ mov(vpow, Assembler::S, 3, rscratch1);
 8540 
 8541     __ mov(vmul0, Assembler::T16B, 0);
 8542     __ mov(vmul0, Assembler::S, 3, result);
 8543 
 8544     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8545     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8546 
 8547     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8548     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8549 
 8550     // SMALL LOOP
 8551     __ bind(SMALL_LOOP);
 8552 
 8553     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8554     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8555     __ subsw(rscratch2, rscratch2, vf);
 8556 
 8557     if (load_arrangement == Assembler::T8B) {
 8558       // Extend 8B to 8H to be able to use vector multiply
 8559       // instructions
 8560       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8561       if (is_signed_subword_type(eltype)) {
 8562         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8563       } else {
 8564         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8565       }
 8566     }
 8567 
 8568     switch (load_arrangement) {
 8569     case Assembler::T4S:
 8570       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8571       break;
 8572     case Assembler::T8B:
 8573     case Assembler::T8H:
 8574       assert(is_subword_type(eltype), "subword type expected");
 8575       if (is_signed_subword_type(eltype)) {
 8576         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8577       } else {
 8578         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8579       }
 8580       break;
 8581     default:
 8582       __ should_not_reach_here();
 8583     }
 8584 
 8585     // Process the upper half of a vector
 8586     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8587       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8588       if (is_signed_subword_type(eltype)) {
 8589         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8590       } else {
 8591         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8592       }
 8593     }
 8594 
 8595     __ br(Assembler::HI, SMALL_LOOP);
 8596 
    // SMALL LOOP'S EPILOGUE
 8598     __ lsr(rscratch2, cnt, exact_log2(evf));
 8599     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8600 
 8601     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8602     __ addv(vmul0, Assembler::T4S, vmul0);
 8603     __ umov(result, vmul0, Assembler::S, 0);
 8604 
 8605     // TAIL
 8606     __ bind(TAIL);
 8607 
    // The andr computes cnt % vf. The shifted subtract then offsets the branch
    // target past vf - 1 - (cnt % vf) of the load + madd pairs below, i.e. only
    // cnt % vf load + madd pairs are executed.
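    // This is effectively a computed branch (Duff's device) into the unrolled
    // tail: each unrolled iteration occupies a fixed number of code bytes (8, or
    // 16 when the Cortex-A53 nops are emitted, hence the shift of 3 or 4 below).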
 8610     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8611     __ andr(rscratch2, cnt, vf - 1);
 8612     __ bind(TAIL_SHORTCUT);
 8613     __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the shift is 4 because 2 nops are generated per iteration.
 8615     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8616     __ movw(rscratch2, 0x1f);
 8617     __ br(rscratch1);
 8618 
 8619     for (size_t i = 0; i < vf - 1; ++i) {
 8620       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8621                                    eltype);
 8622       __ maddw(result, result, rscratch2, rscratch1);
 8623       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8624       // Generate 2nd nop to have 4 instructions per iteration.
 8625       if (VM_Version::supports_a53mac()) {
 8626         __ nop();
 8627       }
 8628     }
 8629     __ bind(BR_BASE);
 8630 
 8631     __ leave();
 8632     __ ret(lr);
 8633 
 8634     // LARGE LOOP
 8635     __ bind(LARGE_LOOP_PREHEADER);
 8636 
 8637     __ lsr(rscratch2, cnt, exact_log2(evf));
 8638 
 8639     if (multiply_by_halves) {
 8640       // 31^4 - multiplier between lower and upper parts of a register
 8641       __ movw(rscratch1, intpow(31U, vf / 2));
 8642       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - the remainder of the iteration multiplier, 28 = 32 - 4
 8644       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8645       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8646     } else {
 8647       // 31^16
 8648       __ movw(rscratch1, intpow(31U, evf));
 8649       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8650     }
 8651 
 8652     __ mov(vmul3, Assembler::T16B, 0);
 8653     __ mov(vmul2, Assembler::T16B, 0);
 8654     __ mov(vmul1, Assembler::T16B, 0);
 8655 
 8656     __ bind(LARGE_LOOP);
 8657 
 8658     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8659     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8660     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8661     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8662 
 8663     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8664            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8665 
 8666     if (load_arrangement == Assembler::T8B) {
 8667       // Extend 8B to 8H to be able to use vector multiply
 8668       // instructions
 8669       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8670       if (is_signed_subword_type(eltype)) {
 8671         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8672         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8673         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8674         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8675       } else {
 8676         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8677         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8678         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8679         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8680       }
 8681     }
 8682 
 8683     switch (load_arrangement) {
 8684     case Assembler::T4S:
 8685       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8686       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8687       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8688       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8689       break;
 8690     case Assembler::T8B:
 8691     case Assembler::T8H:
 8692       assert(is_subword_type(eltype), "subword type expected");
 8693       if (is_signed_subword_type(eltype)) {
 8694         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8695         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8696         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8697         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8698       } else {
 8699         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8700         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8701         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8702         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8703       }
 8704       break;
 8705     default:
 8706       __ should_not_reach_here();
 8707     }
 8708 
 8709     // Process the upper half of a vector
 8710     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8711       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8712       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8713       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8714       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8715       if (is_signed_subword_type(eltype)) {
 8716         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8717         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8718         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8719         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8720       } else {
 8721         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8722         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8723         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8724         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8725       }
 8726     }
 8727 
 8728     __ subsw(rscratch2, rscratch2, 1);
 8729     __ br(Assembler::HI, LARGE_LOOP);
 8730 
 8731     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8732     __ addv(vmul3, Assembler::T4S, vmul3);
 8733     __ umov(result, vmul3, Assembler::S, 0);
 8734 
 8735     __ mov(rscratch2, intpow(31U, vf));
 8736 
 8737     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8738     __ addv(vmul2, Assembler::T4S, vmul2);
 8739     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8740     __ maddw(result, result, rscratch2, rscratch1);
 8741 
 8742     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8743     __ addv(vmul1, Assembler::T4S, vmul1);
 8744     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8745     __ maddw(result, result, rscratch2, rscratch1);
 8746 
 8747     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8748     __ addv(vmul0, Assembler::T4S, vmul0);
 8749     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8750     __ maddw(result, result, rscratch2, rscratch1);
 8751 
 8752     __ andr(rscratch2, cnt, vf - 1);
 8753     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8754 
 8755     __ leave();
 8756     __ ret(lr);
 8757 
 8758     return entry;
 8759   }
 8760 
 8761   address generate_dsin_dcos(bool isCos) {
 8762     __ align(CodeEntryAlignment);
 8763     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 8764     StubCodeMark mark(this, stub_id);
 8765     address start = __ pc();
 8766     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8767         (address)StubRoutines::aarch64::_two_over_pi,
 8768         (address)StubRoutines::aarch64::_pio2,
 8769         (address)StubRoutines::aarch64::_dsin_coef,
 8770         (address)StubRoutines::aarch64::_dcos_coef);
 8771     return start;
 8772   }
 8773 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 8775   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8776       Label &DIFF2) {
 8777     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8778     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8779 
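    // The Latin1 side is inflated on the fly: zip1/zip2 with the all-zero vtmpZ
    // interleave each Latin1 byte with a 0x00 byte, turning the 16 loaded Latin1
    // bytes into 16 UTF-16 chars that can be compared 64 bits at a time against
    // the UTF-16 side.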
 8780     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8781     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8782     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8783     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8784 
 8785     __ fmovd(tmpL, vtmp3);
 8786     __ eor(rscratch2, tmp3, tmpL);
 8787     __ cbnz(rscratch2, DIFF2);
 8788 
 8789     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8790     __ umov(tmpL, vtmp3, __ D, 1);
 8791     __ eor(rscratch2, tmpU, tmpL);
 8792     __ cbnz(rscratch2, DIFF1);
 8793 
 8794     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8795     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8796     __ fmovd(tmpL, vtmp);
 8797     __ eor(rscratch2, tmp3, tmpL);
 8798     __ cbnz(rscratch2, DIFF2);
 8799 
 8800     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8801     __ umov(tmpL, vtmp, __ D, 1);
 8802     __ eor(rscratch2, tmpU, tmpL);
 8803     __ cbnz(rscratch2, DIFF1);
 8804   }
 8805 
 8806   // r0  = result
 8807   // r1  = str1
 8808   // r2  = cnt1
 8809   // r3  = str2
 8810   // r4  = cnt2
 8811   // r10 = tmp1
 8812   // r11 = tmp2
 8813   address generate_compare_long_string_different_encoding(bool isLU) {
 8814     __ align(CodeEntryAlignment);
 8815     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 8816     StubCodeMark mark(this, stub_id);
 8817     address entry = __ pc();
 8818     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8819         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8820         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8821     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8822         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8823     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8824     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8825 
 8826     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8827 
 8828     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 8831     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8832     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8833     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8834     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // 4 symbols already loaded. The last 4 are a special case.
 8836     __ eor(rscratch2, tmp1, tmp2);
 8837     __ mov(rscratch1, tmp2);
 8838     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8839     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8840              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8841     __ push(spilled_regs, sp);
 8842     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8843     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8844 
 8845     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8846 
 8847     if (SoftwarePrefetchHintDistance >= 0) {
 8848       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8849       __ br(__ LT, NO_PREFETCH);
 8850       __ bind(LARGE_LOOP_PREFETCH);
 8851         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8852         __ mov(tmp4, 2);
 8853         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8854         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8855           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8856           __ subs(tmp4, tmp4, 1);
 8857           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8858           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8859           __ mov(tmp4, 2);
 8860         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8861           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8862           __ subs(tmp4, tmp4, 1);
 8863           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8864           __ sub(cnt2, cnt2, 64);
 8865           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8866           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8867     }
 8868     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8869     __ bind(NO_PREFETCH);
 8870     __ subs(cnt2, cnt2, 16);
 8871     __ br(__ LT, TAIL);
 8872     __ align(OptoLoopAlignment);
 8873     __ bind(SMALL_LOOP); // smaller loop
 8874       __ subs(cnt2, cnt2, 16);
 8875       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8876       __ br(__ GE, SMALL_LOOP);
 8877       __ cmn(cnt2, (u1)16);
 8878       __ br(__ EQ, LOAD_LAST);
 8879     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8880       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8881       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8882       __ ldr(tmp3, Address(cnt1, -8));
 8883       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8884       __ b(LOAD_LAST);
 8885     __ bind(DIFF2);
 8886       __ mov(tmpU, tmp3);
 8887     __ bind(DIFF1);
 8888       __ pop(spilled_regs, sp);
 8889       __ b(CALCULATE_DIFFERENCE);
 8890     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
 8893       __ mov(tmpU, tmp3);
 8894       __ pop(spilled_regs, sp);
 8895 
 8896       // tmp2 points to the address of the last 4 Latin1 characters right now
 8897       __ ldrs(vtmp, Address(tmp2));
 8898       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8899       __ fmovd(tmpL, vtmp);
 8900 
 8901       __ eor(rscratch2, tmpU, tmpL);
 8902       __ cbz(rscratch2, DONE);
 8903 
 8904     // Find the first different characters in the longwords and
 8905     // compute their difference.
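    // rev + clz yield the bit offset of the lowest-order differing byte; anding
    // with -16 rounds it down to a 16-bit char boundary, and the shift/extend
    // pairs below then extract the two differing chars whose difference is
    // returned.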
 8906     __ bind(CALCULATE_DIFFERENCE);
 8907       __ rev(rscratch2, rscratch2);
 8908       __ clz(rscratch2, rscratch2);
 8909       __ andr(rscratch2, rscratch2, -16);
 8910       __ lsrv(tmp1, tmp1, rscratch2);
 8911       __ uxthw(tmp1, tmp1);
 8912       __ lsrv(rscratch1, rscratch1, rscratch2);
 8913       __ uxthw(rscratch1, rscratch1);
 8914       __ subw(result, tmp1, rscratch1);
 8915     __ bind(DONE);
 8916       __ ret(lr);
 8917     return entry;
 8918   }
 8919 
 8920   // r0 = input (float16)
 8921   // v0 = result (float)
 8922   // v1 = temporary float register
 8923   address generate_float16ToFloat() {
 8924     __ align(CodeEntryAlignment);
 8925     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 8926     StubCodeMark mark(this, stub_id);
 8927     address entry = __ pc();
 8928     BLOCK_COMMENT("Entry:");
 8929     __ flt16_to_flt(v0, r0, v1);
 8930     __ ret(lr);
 8931     return entry;
 8932   }
 8933 
 8934   // v0 = input (float)
 8935   // r0 = result (float16)
 8936   // v1 = temporary float register
 8937   address generate_floatToFloat16() {
 8938     __ align(CodeEntryAlignment);
 8939     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 8940     StubCodeMark mark(this, stub_id);
 8941     address entry = __ pc();
 8942     BLOCK_COMMENT("Entry:");
 8943     __ flt_to_flt16(r0, v0, v1);
 8944     __ ret(lr);
 8945     return entry;
 8946   }
 8947 
 8948   address generate_method_entry_barrier() {
 8949     __ align(CodeEntryAlignment);
 8950     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 8951     StubCodeMark mark(this, stub_id);
 8952 
 8953     Label deoptimize_label;
 8954 
 8955     address start = __ pc();
 8956 
 8957     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8958 
 8959     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8960       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8961       // We can get here despite the nmethod being good, if we have not
 8962       // yet applied our cross modification fence (or data fence).
 8963       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8964       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8965       __ ldrw(rscratch2, rscratch2);
 8966       __ strw(rscratch2, thread_epoch_addr);
 8967       __ isb();
 8968       __ membar(__ LoadLoad);
 8969     }
 8970 
 8971     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8972 
 8973     __ enter();
 8974     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8975 
 8976     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8977 
 8978     __ push_call_clobbered_registers();
 8979 
 8980     __ mov(c_rarg0, rscratch2);
 8981     __ call_VM_leaf
 8982          (CAST_FROM_FN_PTR
 8983           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8984 
 8985     __ reset_last_Java_frame(true);
 8986 
 8987     __ mov(rscratch1, r0);
 8988 
 8989     __ pop_call_clobbered_registers();
 8990 
 8991     __ cbnz(rscratch1, deoptimize_label);
 8992 
 8993     __ leave();
 8994     __ ret(lr);
 8995 
 8996     __ BIND(deoptimize_label);
 8997 
 8998     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8999     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9000 
 9001     __ mov(sp, rscratch1);
 9002     __ br(rscratch2);
 9003 
 9004     return start;
 9005   }
 9006 
 9007   // r0  = result
 9008   // r1  = str1
 9009   // r2  = cnt1
 9010   // r3  = str2
 9011   // r4  = cnt2
 9012   // r10 = tmp1
 9013   // r11 = tmp2
 9014   address generate_compare_long_string_same_encoding(bool isLL) {
 9015     __ align(CodeEntryAlignment);
 9016     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 9017     StubCodeMark mark(this, stub_id);
 9018     address entry = __ pc();
 9019     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9020         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9021 
 9022     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9023 
    // Exit the large loop when there are fewer than 64 bytes left to read or when
    // we are about to prefetch memory past the array boundary.
 9026     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9027 
    // 8 bytes have already been pre-loaded before jumping to the stub, so do the comparison directly
 9029     __ eor(rscratch2, tmp1, tmp2);
 9030     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9031 
 9032     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9033     // update pointers, because of previous read
 9034     __ add(str1, str1, wordSize);
 9035     __ add(str2, str2, wordSize);
 9036     if (SoftwarePrefetchHintDistance >= 0) {
 9037       __ align(OptoLoopAlignment);
 9038       __ bind(LARGE_LOOP_PREFETCH);
 9039         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9040         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9041 
 9042         for (int i = 0; i < 4; i++) {
 9043           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9044           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9045           __ cmp(tmp1, tmp2);
 9046           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9047           __ br(Assembler::NE, DIFF);
 9048         }
 9049         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9050         __ add(str1, str1, 64);
 9051         __ add(str2, str2, 64);
 9052         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9053         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9054         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9055     }
 9056 
 9057     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9058     __ br(Assembler::LE, LESS16);
 9059     __ align(OptoLoopAlignment);
 9060     __ bind(LOOP_COMPARE16);
 9061       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9062       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9063       __ cmp(tmp1, tmp2);
 9064       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9065       __ br(Assembler::NE, DIFF);
 9066       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9067       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9068       __ br(Assembler::LT, LESS16);
 9069 
 9070       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9071       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9072       __ cmp(tmp1, tmp2);
 9073       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9074       __ br(Assembler::NE, DIFF);
 9075       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9076       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9077       __ br(Assembler::GE, LOOP_COMPARE16);
 9078       __ cbz(cnt2, LENGTH_DIFF);
 9079 
 9080     __ bind(LESS16);
 9081       // each 8 compare
 9082       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9083       __ br(Assembler::LE, LESS8);
 9084       __ ldr(tmp1, Address(__ post(str1, 8)));
 9085       __ ldr(tmp2, Address(__ post(str2, 8)));
 9086       __ eor(rscratch2, tmp1, tmp2);
 9087       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9088       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9089 
 9090     __ bind(LESS8); // directly load last 8 bytes
 9091       if (!isLL) {
 9092         __ add(cnt2, cnt2, cnt2);
 9093       }
 9094       __ ldr(tmp1, Address(str1, cnt2));
 9095       __ ldr(tmp2, Address(str2, cnt2));
 9096       __ eor(rscratch2, tmp1, tmp2);
 9097       __ cbz(rscratch2, LENGTH_DIFF);
 9098       __ b(CAL_DIFFERENCE);
 9099 
 9100     __ bind(DIFF);
 9101       __ cmp(tmp1, tmp2);
 9102       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9103       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9104       // reuse rscratch2 register for the result of eor instruction
 9105       __ eor(rscratch2, tmp1, tmp2);
 9106 
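    // rev + clz locate the lowest-order differing byte; rounding down to the
    // character size (8 bits for Latin1, 16 for UTF-16) and shifting/extending
    // extracts the two differing characters whose difference is the result.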
 9107     __ bind(CAL_DIFFERENCE);
 9108       __ rev(rscratch2, rscratch2);
 9109       __ clz(rscratch2, rscratch2);
 9110       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9111       __ lsrv(tmp1, tmp1, rscratch2);
 9112       __ lsrv(tmp2, tmp2, rscratch2);
 9113       if (isLL) {
 9114         __ uxtbw(tmp1, tmp1);
 9115         __ uxtbw(tmp2, tmp2);
 9116       } else {
 9117         __ uxthw(tmp1, tmp1);
 9118         __ uxthw(tmp2, tmp2);
 9119       }
 9120       __ subw(result, tmp1, tmp2);
 9121 
 9122     __ bind(LENGTH_DIFF);
 9123       __ ret(lr);
 9124     return entry;
 9125   }
 9126 
 9127   enum string_compare_mode {
 9128     LL,
 9129     LU,
 9130     UL,
 9131     UU,
 9132   };
 9133 
 9134   // The following registers are declared in aarch64.ad
 9135   // r0  = result
 9136   // r1  = str1
 9137   // r2  = cnt1
 9138   // r3  = str2
 9139   // r4  = cnt2
 9140   // r10 = tmp1
 9141   // r11 = tmp2
 9142   // z0  = ztmp1
 9143   // z1  = ztmp2
 9144   // p0  = pgtmp1
 9145   // p1  = pgtmp2
 9146   address generate_compare_long_string_sve(string_compare_mode mode) {
 9147     StubGenStubId stub_id;
 9148     switch (mode) {
 9149       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 9150       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 9151       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 9152       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 9153       default: ShouldNotReachHere();
 9154     }
 9155 
 9156     __ align(CodeEntryAlignment);
 9157     address entry = __ pc();
 9158     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9159              tmp1 = r10, tmp2 = r11;
 9160 
 9161     Label LOOP, DONE, MISMATCH;
 9162     Register vec_len = tmp1;
 9163     Register idx = tmp2;
 9164     // The minimum of the string lengths has been stored in cnt2.
 9165     Register cnt = cnt2;
 9166     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9167     PRegister pgtmp1 = p0, pgtmp2 = p1;
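    // This is a predicated SVE loop: sve_whilelt builds a governing predicate for
    // the elements still in range, so the final, possibly partial, vector is
    // handled by recomputing the predicate after the loop rather than by a scalar
    // tail.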
 9168 
 9169 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9170     switch (mode) {                                                            \
 9171       case LL:                                                                 \
 9172         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9173         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9174         break;                                                                 \
 9175       case LU:                                                                 \
 9176         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9177         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9178         break;                                                                 \
 9179       case UL:                                                                 \
 9180         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9181         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9182         break;                                                                 \
 9183       case UU:                                                                 \
 9184         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9185         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9186         break;                                                                 \
 9187       default:                                                                 \
 9188         ShouldNotReachHere();                                                  \
 9189     }
 9190 
 9191     StubCodeMark mark(this, stub_id);
 9192 
 9193     __ mov(idx, 0);
 9194     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9195 
 9196     if (mode == LL) {
 9197       __ sve_cntb(vec_len);
 9198     } else {
 9199       __ sve_cnth(vec_len);
 9200     }
 9201 
 9202     __ sub(rscratch1, cnt, vec_len);
 9203 
 9204     __ bind(LOOP);
 9205 
 9206       // main loop
 9207       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9208       __ add(idx, idx, vec_len);
 9209       // Compare strings.
 9210       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9211       __ br(__ NE, MISMATCH);
 9212       __ cmp(idx, rscratch1);
 9213       __ br(__ LT, LOOP);
 9214 
 9215     // post loop, last iteration
 9216     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9217 
 9218     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9219     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9220     __ br(__ EQ, DONE);
 9221 
 9222     __ bind(MISMATCH);
 9223 
    // Crop the predicate so that only the elements before the first mismatch remain active.
 9225     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9226     // Extract the first different characters of each string.
 9227     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9228     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9229 
 9230     // Compute the difference of the first different characters.
 9231     __ sub(result, rscratch1, rscratch2);
 9232 
 9233     __ bind(DONE);
 9234     __ ret(lr);
 9235 #undef LOAD_PAIR
 9236     return entry;
 9237   }
 9238 
 9239   void generate_compare_long_strings() {
 9240     if (UseSVE == 0) {
 9241       StubRoutines::aarch64::_compare_long_string_LL
 9242           = generate_compare_long_string_same_encoding(true);
 9243       StubRoutines::aarch64::_compare_long_string_UU
 9244           = generate_compare_long_string_same_encoding(false);
 9245       StubRoutines::aarch64::_compare_long_string_LU
 9246           = generate_compare_long_string_different_encoding(true);
 9247       StubRoutines::aarch64::_compare_long_string_UL
 9248           = generate_compare_long_string_different_encoding(false);
 9249     } else {
 9250       StubRoutines::aarch64::_compare_long_string_LL
 9251           = generate_compare_long_string_sve(LL);
 9252       StubRoutines::aarch64::_compare_long_string_UU
 9253           = generate_compare_long_string_sve(UU);
 9254       StubRoutines::aarch64::_compare_long_string_LU
 9255           = generate_compare_long_string_sve(LU);
 9256       StubRoutines::aarch64::_compare_long_string_UL
 9257           = generate_compare_long_string_sve(UL);
 9258     }
 9259   }
 9260 
 9261   // R0 = result
 9262   // R1 = str2
 9263   // R2 = cnt1
 9264   // R3 = str1
 9265   // R4 = cnt2
 9266   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9267   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since its
  // length is >= 8), which lets us skip the initial load (helps on systems with
  // a single load pipeline)
  // 2) we can use a "fast" algorithm to find the first character, searching for
  // it with fewer branches (1 branch per loaded register instead of one per
  // symbol); this is where constants like 0x0101...01, 0x00010001...0001,
  // 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it can
  // be used to search for every occurrence of the 1st character, saving a few
  // loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
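  //
  // The "fast" first-character search in (2) is the classic SWAR zero-byte test.
  // With tmp1 = 0x0101...01 (0x00010001...0001 for UTF-16), 'first' holding the
  // pattern's first character replicated across the register, and ch2 holding a
  // register-full of str2, the idea is roughly:
  //
  //   x     = ch2 ^ first;                    // matching positions become zero
  //   zeros = (x - tmp1) & ~x & 0x8080...80;  // 0x8000...8000 for UTF-16
  //
  // which sets the top bit of every byte/char position where str2 matches the
  // first pattern character. In the code below the "& ~x & 0x80..." part is
  // expressed with orr against 0x7f7f...7f / 0x7fff...7fff followed by bics.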
 9282   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9283     StubGenStubId stub_id;
 9284     if (str1_isL) {
 9285       if (str2_isL) {
 9286         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 9287       } else {
 9288         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 9289       }
 9290     } else {
 9291       if (str2_isL) {
 9292         ShouldNotReachHere();
 9293       } else {
 9294         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 9295       }
 9296     }
 9297     __ align(CodeEntryAlignment);
 9298     StubCodeMark mark(this, stub_id);
 9299     address entry = __ pc();
 9300 
 9301     int str1_chr_size = str1_isL ? 1 : 2;
 9302     int str2_chr_size = str2_isL ? 1 : 2;
 9303     int str1_chr_shift = str1_isL ? 0 : 1;
 9304     int str2_chr_shift = str2_isL ? 0 : 1;
 9305     bool isL = str1_isL && str2_isL;
 9306    // parameters
 9307     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9308     // temporary registers
 9309     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9310     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9311     // redefinitions
 9312     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9313 
 9314     __ push(spilled_regs, sp);
 9315     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9316         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9317         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9318         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9319         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9320         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9321     // Read whole register from str1. It is safe, because length >=8 here
 9322     __ ldr(ch1, Address(str1));
 9323     // Read whole register from str2. It is safe, because length >=8 here
 9324     __ ldr(ch2, Address(str2));
 9325     __ sub(cnt2, cnt2, cnt1);
 9326     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9327     if (str1_isL != str2_isL) {
 9328       __ eor(v0, __ T16B, v0, v0);
 9329     }
 9330     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9331     __ mul(first, first, tmp1);
 9332     // check if we have less than 1 register to check
 9333     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9334     if (str1_isL != str2_isL) {
 9335       __ fmovd(v1, ch1);
 9336     }
 9337     __ br(__ LE, L_SMALL);
 9338     __ eor(ch2, first, ch2);
 9339     if (str1_isL != str2_isL) {
 9340       __ zip1(v1, __ T16B, v1, v0);
 9341     }
 9342     __ sub(tmp2, ch2, tmp1);
 9343     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9344     __ bics(tmp2, tmp2, ch2);
 9345     if (str1_isL != str2_isL) {
 9346       __ fmovd(ch1, v1);
 9347     }
 9348     __ br(__ NE, L_HAS_ZERO);
 9349     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9350     __ add(result, result, wordSize/str2_chr_size);
 9351     __ add(str2, str2, wordSize);
 9352     __ br(__ LT, L_POST_LOOP);
 9353     __ BIND(L_LOOP);
 9354       __ ldr(ch2, Address(str2));
 9355       __ eor(ch2, first, ch2);
 9356       __ sub(tmp2, ch2, tmp1);
 9357       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9358       __ bics(tmp2, tmp2, ch2);
 9359       __ br(__ NE, L_HAS_ZERO);
 9360     __ BIND(L_LOOP_PROCEED);
 9361       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9362       __ add(str2, str2, wordSize);
 9363       __ add(result, result, wordSize/str2_chr_size);
 9364       __ br(__ GE, L_LOOP);
 9365     __ BIND(L_POST_LOOP);
 9366       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9367       __ br(__ LE, NOMATCH);
 9368       __ ldr(ch2, Address(str2));
 9369       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9370       __ eor(ch2, first, ch2);
 9371       __ sub(tmp2, ch2, tmp1);
 9372       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9373       __ mov(tmp4, -1); // all bits set
 9374       __ b(L_SMALL_PROCEED);
 9375     __ align(OptoLoopAlignment);
 9376     __ BIND(L_SMALL);
 9377       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9378       __ eor(ch2, first, ch2);
 9379       if (str1_isL != str2_isL) {
 9380         __ zip1(v1, __ T16B, v1, v0);
 9381       }
 9382       __ sub(tmp2, ch2, tmp1);
 9383       __ mov(tmp4, -1); // all bits set
 9384       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9385       if (str1_isL != str2_isL) {
 9386         __ fmovd(ch1, v1); // move converted 4 symbols
 9387       }
 9388     __ BIND(L_SMALL_PROCEED);
 9389       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9390       __ bic(tmp2, tmp2, ch2);
 9391       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9392       __ rbit(tmp2, tmp2);
 9393       __ br(__ EQ, NOMATCH);
 9394     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
 9396       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9397       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9398       if (str2_isL) { // LL
 9399         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9400         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9401         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9402         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9403         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9404       } else {
 9405         __ mov(ch2, 0xE); // all bits in byte set except last one
 9406         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9407         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9408         __ lslv(tmp2, tmp2, tmp4);
 9409         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9410         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9411         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9412         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9413       }
 9414       __ cmp(ch1, ch2);
 9415       __ mov(tmp4, wordSize/str2_chr_size);
 9416       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9417     __ BIND(L_SMALL_CMP_LOOP);
 9418       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9419                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9420       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9421                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9422       __ add(tmp4, tmp4, 1);
 9423       __ cmp(tmp4, cnt1);
 9424       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9425       __ cmp(first, ch2);
 9426       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9427     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9428       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9429       __ clz(tmp4, tmp2);
 9430       __ add(result, result, 1); // advance index
 9431       __ add(str2, str2, str2_chr_size); // advance pointer
 9432       __ b(L_SMALL_HAS_ZERO_LOOP);
 9433     __ align(OptoLoopAlignment);
 9434     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9435       __ cmp(first, ch2);
 9436       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9437       __ b(DONE);
 9438     __ align(OptoLoopAlignment);
 9439     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9440       if (str2_isL) { // LL
 9441         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9442         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9443         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9444         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9445         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9446       } else {
 9447         __ mov(ch2, 0xE); // all bits in byte set except last one
 9448         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9449         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9450         __ lslv(tmp2, tmp2, tmp4);
 9451         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9452         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9453         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9454         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9455       }
 9456       __ cmp(ch1, ch2);
 9457       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9458       __ b(DONE);
 9459     __ align(OptoLoopAlignment);
 9460     __ BIND(L_HAS_ZERO);
 9461       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the two counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this loop;
      // they are simply restored on exit, so cnt1 can be re-used in this loop.
 9466       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9467       __ sub(result, result, 1);
 9468     __ BIND(L_HAS_ZERO_LOOP);
 9469       __ mov(cnt1, wordSize/str2_chr_size);
 9470       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9471       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9472       if (str2_isL) {
 9473         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9474         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9475         __ lslv(tmp2, tmp2, tmp4);
 9476         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9477         __ add(tmp4, tmp4, 1);
 9478         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9479         __ lsl(tmp2, tmp2, 1);
 9480         __ mov(tmp4, wordSize/str2_chr_size);
 9481       } else {
 9482         __ mov(ch2, 0xE);
 9483         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9484         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9485         __ lslv(tmp2, tmp2, tmp4);
 9486         __ add(tmp4, tmp4, 1);
 9487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9488         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9489         __ lsl(tmp2, tmp2, 1);
 9490         __ mov(tmp4, wordSize/str2_chr_size);
 9491         __ sub(str2, str2, str2_chr_size);
 9492       }
 9493       __ cmp(ch1, ch2);
 9494       __ mov(tmp4, wordSize/str2_chr_size);
 9495       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9496     __ BIND(L_CMP_LOOP);
 9497       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9498                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9499       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9500                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9501       __ add(tmp4, tmp4, 1);
 9502       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9503       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9504       __ cmp(cnt1, ch2);
 9505       __ br(__ EQ, L_CMP_LOOP);
 9506     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
 9508       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9509       __ clz(tmp4, tmp2);
 9510       __ add(str2, str2, str2_chr_size); // advance pointer
 9511       __ b(L_HAS_ZERO_LOOP);
 9512     __ align(OptoLoopAlignment);
 9513     __ BIND(L_CMP_LOOP_LAST_CMP);
 9514       __ cmp(cnt1, ch2);
 9515       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9516       __ b(DONE);
 9517     __ align(OptoLoopAlignment);
 9518     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9519       if (str2_isL) {
 9520         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9521         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9522         __ lslv(tmp2, tmp2, tmp4);
 9523         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9524         __ add(tmp4, tmp4, 1);
 9525         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9526         __ lsl(tmp2, tmp2, 1);
 9527       } else {
 9528         __ mov(ch2, 0xE);
 9529         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9530         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9531         __ lslv(tmp2, tmp2, tmp4);
 9532         __ add(tmp4, tmp4, 1);
 9533         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9534         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9535         __ lsl(tmp2, tmp2, 1);
 9536         __ sub(str2, str2, str2_chr_size);
 9537       }
 9538       __ cmp(ch1, ch2);
 9539       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9540       __ b(DONE);
 9541     __ align(OptoLoopAlignment);
 9542     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Up to the L_HAS_ZERO block the index was
      // a multiple of wordSize/str2_chr_size. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 points at the respective start address and must be
      // advanced to the next octet.
 9553       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9554       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9555       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9556       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9557       __ movw(cnt2, cnt2);
 9558       __ b(L_LOOP_PROCEED);
 9559     __ align(OptoLoopAlignment);
 9560     __ BIND(NOMATCH);
 9561       __ mov(result, -1);
 9562     __ BIND(DONE);
 9563       __ pop(spilled_regs, sp);
 9564       __ ret(lr);
 9565     return entry;
 9566   }
 9567 
 9568   void generate_string_indexof_stubs() {
 9569     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9570     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9571     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9572   }
 9573 
 9574   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9575       FloatRegister src1, FloatRegister src2) {
 9576     Register dst = r1;
 9577     __ zip1(v1, __ T16B, src1, v0);
 9578     __ zip2(v2, __ T16B, src1, v0);
 9579     if (generatePrfm) {
 9580       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9581     }
 9582     __ zip1(v3, __ T16B, src2, v0);
 9583     __ zip2(v4, __ T16B, src2, v0);
 9584     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9585   }
 9586 
 9587   // R0 = src
 9588   // R1 = dst
 9589   // R2 = len
 9590   // R3 = len >> 3
  // V0 = 0
  // V1 = first 8 input bytes, already loaded
 9593   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
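  // "Inflating" widens each Latin-1 byte to a 16-bit char by interleaving it
  // with a zero byte (zip1/zip2 against V0 == 0), which is the UTF-16LE
  // encoding of the same character.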
 9594   address generate_large_byte_array_inflate() {
 9595     __ align(CodeEntryAlignment);
 9596     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 9597     StubCodeMark mark(this, stub_id);
 9598     address entry = __ pc();
 9599     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9600     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9601     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
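    // The prefetching loop below is only used while more than
    // large_loop_threshold 8-byte units remain (see the subs/br pairs).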
 9602 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
 9605     __ ldrd(v2, __ post(src, 8));
 9606     __ sub(octetCounter, octetCounter, 2);
 9607     __ zip1(v1, __ T16B, v1, v0);
 9608     __ zip1(v2, __ T16B, v2, v0);
 9609     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9610     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9611     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9612     __ br(__ LE, LOOP_START);
 9613     __ b(LOOP_PRFM_START);
 9614     __ bind(LOOP_PRFM);
 9615       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9616     __ bind(LOOP_PRFM_START);
 9617       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9618       __ sub(octetCounter, octetCounter, 8);
 9619       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9620       inflate_and_store_2_fp_registers(true, v3, v4);
 9621       inflate_and_store_2_fp_registers(true, v5, v6);
 9622       __ br(__ GT, LOOP_PRFM);
 9623       __ cmp(octetCounter, (u1)8);
 9624       __ br(__ LT, DONE);
 9625     __ bind(LOOP);
 9626       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9627       __ bind(LOOP_START);
 9628       __ sub(octetCounter, octetCounter, 8);
 9629       __ cmp(octetCounter, (u1)8);
 9630       inflate_and_store_2_fp_registers(false, v3, v4);
 9631       inflate_and_store_2_fp_registers(false, v5, v6);
 9632       __ br(__ GE, LOOP);
 9633     __ bind(DONE);
 9634       __ ret(lr);
 9635     return entry;
 9636   }
 9637 
 9638   /**
 9639    *  Arguments:
 9640    *
 9641    *  Input:
 9642    *  c_rarg0   - current state address
 9643    *  c_rarg1   - H key address
 9644    *  c_rarg2   - data address
 9645    *  c_rarg3   - number of blocks
 9646    *
 9647    *  Output:
 9648    *  Updated state at c_rarg0
 9649    */
 9650   address generate_ghash_processBlocks() {
 9651     // Bafflingly, GCM uses little-endian for the byte order, but
 9652     // big-endian for the bit order.  For example, the polynomial 1 is
 9653     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9654     //
 9655     // So, we must either reverse the bytes in each word and do
 9656     // everything big-endian or reverse the bits in each byte and do
 9657     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9658     // the bits in each byte (we have an instruction, RBIT, to do
 9659     // that) and keep the data in little-endian bit order through the
 9660     // calculation, bit-reversing the inputs and outputs.
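    //
    // Schematically, the loop below computes, per 16-byte block (a sketch
    // with every value kept bit-reversed, not the exact register usage):
    //   state ^= rbit(block);
    //   state  = reduce(clmul(state, rbit(subkeyH)));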
 9661 
 9662     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 9663     StubCodeMark mark(this, stub_id);
 9664     __ align(wordSize * 2);
 9665     address p = __ pc();
 9666     __ emit_int64(0x87);  // The low-order bits of the field
 9667                           // polynomial (i.e. p = z^7+z^2+z+1)
 9668                           // repeated in the low and high parts of a
 9669                           // 128-bit vector
 9670     __ emit_int64(0x87);
 9671 
 9672     __ align(CodeEntryAlignment);
 9673     address start = __ pc();
 9674 
 9675     Register state   = c_rarg0;
 9676     Register subkeyH = c_rarg1;
 9677     Register data    = c_rarg2;
 9678     Register blocks  = c_rarg3;
 9679 
 9680     FloatRegister vzr = v30;
 9681     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9682 
 9683     __ ldrq(v24, p);    // The field polynomial
 9684 
 9685     __ ldrq(v0, Address(state));
 9686     __ ldrq(v1, Address(subkeyH));
 9687 
 9688     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9689     __ rbit(v0, __ T16B, v0);
 9690     __ rev64(v1, __ T16B, v1);
 9691     __ rbit(v1, __ T16B, v1);
 9692 
 9693     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9694     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9695 
 9696     {
 9697       Label L_ghash_loop;
 9698       __ bind(L_ghash_loop);
 9699 
 9700       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9701                                                  // reversing each byte
 9702       __ rbit(v2, __ T16B, v2);
 9703       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9704 
 9705       // Multiply state in v2 by subkey in v1
 9706       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9707                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9708                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9709       // Reduce v7:v5 by the field polynomial
 9710       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9711 
 9712       __ sub(blocks, blocks, 1);
 9713       __ cbnz(blocks, L_ghash_loop);
 9714     }
 9715 
 9716     // The bit-reversed result is at this point in v0
 9717     __ rev64(v0, __ T16B, v0);
 9718     __ rbit(v0, __ T16B, v0);
 9719 
 9720     __ st1(v0, __ T16B, state);
 9721     __ ret(lr);
 9722 
 9723     return start;
 9724   }
 9725 
 9726   address generate_ghash_processBlocks_wide() {
 9727     address small = generate_ghash_processBlocks();
 9728 
 9729     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 9730     StubCodeMark mark(this, stub_id);
 9731     __ align(wordSize * 2);
 9732     address p = __ pc();
 9733     __ emit_int64(0x87);  // The low-order bits of the field
 9734                           // polynomial (i.e. p = z^7+z^2+z+1)
 9735                           // repeated in the low and high parts of a
 9736                           // 128-bit vector
 9737     __ emit_int64(0x87);
 9738 
 9739     __ align(CodeEntryAlignment);
 9740     address start = __ pc();
 9741 
 9742     Register state   = c_rarg0;
 9743     Register subkeyH = c_rarg1;
 9744     Register data    = c_rarg2;
 9745     Register blocks  = c_rarg3;
 9746 
 9747     const int unroll = 4;
 9748 
 9749     __ cmp(blocks, (unsigned char)(unroll * 2));
 9750     __ br(__ LT, small);
 9751 
 9752     if (unroll > 1) {
      // Save state before entering routine
 9754       __ sub(sp, sp, 4 * 16);
 9755       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9756       __ sub(sp, sp, 4 * 16);
 9757       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9758     }
 9759 
 9760     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9761 
 9762     if (unroll > 1) {
 9763       // And restore state
 9764       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9765       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9766     }
 9767 
 9768     __ cmp(blocks, (unsigned char)0);
 9769     __ br(__ GT, small);
 9770 
 9771     __ ret(lr);
 9772 
 9773     return start;
 9774   }
 9775 
 9776   void generate_base64_encode_simdround(Register src, Register dst,
 9777         FloatRegister codec, u8 size) {
 9778 
 9779     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9780     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9781     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
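    // Each round consumes 3 * size input bytes and emits 4 * size Base64
    // characters: ld3 de-interleaves the input, the shift/orr sequences below
    // extract the four 6-bit groups of every 3-byte triple, and tbl maps each
    // group through the 64-entry codec table.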
 9782 
 9783     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9784 
 9785     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9786 
 9787     __ ushr(ind0, arrangement, in0,  2);
 9788 
 9789     __ ushr(ind1, arrangement, in1,  2);
 9790     __ shl(in0,   arrangement, in0,  6);
 9791     __ orr(ind1,  arrangement, ind1, in0);
 9792     __ ushr(ind1, arrangement, ind1, 2);
 9793 
 9794     __ ushr(ind2, arrangement, in2,  4);
 9795     __ shl(in1,   arrangement, in1,  4);
 9796     __ orr(ind2,  arrangement, in1,  ind2);
 9797     __ ushr(ind2, arrangement, ind2, 2);
 9798 
 9799     __ shl(ind3,  arrangement, in2,  2);
 9800     __ ushr(ind3, arrangement, ind3, 2);
 9801 
 9802     __ tbl(out0,  arrangement, codec,  4, ind0);
 9803     __ tbl(out1,  arrangement, codec,  4, ind1);
 9804     __ tbl(out2,  arrangement, codec,  4, ind2);
 9805     __ tbl(out3,  arrangement, codec,  4, ind3);
 9806 
 9807     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9808   }
 9809 
 9810    /**
 9811    *  Arguments:
 9812    *
 9813    *  Input:
 9814    *  c_rarg0   - src_start
 9815    *  c_rarg1   - src_offset
 9816    *  c_rarg2   - src_length
 9817    *  c_rarg3   - dest_start
 9818    *  c_rarg4   - dest_offset
 9819    *  c_rarg5   - isURL
 9820    *
 9821    */
 9822   address generate_base64_encodeBlock() {
 9823 
 9824     static const char toBase64[64] = {
 9825       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9826       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9827       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9828       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9829       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9830     };
 9831 
 9832     static const char toBase64URL[64] = {
 9833       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9834       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9835       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9836       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9837       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9838     };
 9839 
 9840     __ align(CodeEntryAlignment);
 9841     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 9842     StubCodeMark mark(this, stub_id);
 9843     address start = __ pc();
 9844 
 9845     Register src   = c_rarg0;  // source array
 9846     Register soff  = c_rarg1;  // source start offset
 9847     Register send  = c_rarg2;  // source end offset
 9848     Register dst   = c_rarg3;  // dest array
 9849     Register doff  = c_rarg4;  // position for writing to dest array
 9850     Register isURL = c_rarg5;  // Base64 or URL character set
 9851 
 9852     // c_rarg6 and c_rarg7 are free to use as temps
 9853     Register codec  = c_rarg6;
 9854     Register length = c_rarg7;
 9855 
 9856     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9857 
 9858     __ add(src, src, soff);
 9859     __ add(dst, dst, doff);
 9860     __ sub(length, send, soff);
 9861 
 9862     // load the codec base address
 9863     __ lea(codec, ExternalAddress((address) toBase64));
 9864     __ cbz(isURL, ProcessData);
 9865     __ lea(codec, ExternalAddress((address) toBase64URL));
 9866 
 9867     __ BIND(ProcessData);
 9868 
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
 9870     __ cmp(length, (u1)24);
 9871     __ br(Assembler::LT, Process3B);
 9872 
 9873     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9874 
 9875     __ BIND(Process48B);
 9876     __ cmp(length, (u1)48);
 9877     __ br(Assembler::LT, Process24B);
 9878     generate_base64_encode_simdround(src, dst, v0, 16);
 9879     __ sub(length, length, 48);
 9880     __ b(Process48B);
 9881 
 9882     __ BIND(Process24B);
 9883     __ cmp(length, (u1)24);
 9884     __ br(Assembler::LT, SIMDExit);
 9885     generate_base64_encode_simdround(src, dst, v0, 8);
 9886     __ sub(length, length, 24);
 9887 
 9888     __ BIND(SIMDExit);
 9889     __ cbz(length, Exit);
 9890 
 9891     __ BIND(Process3B);
 9892     //  3 src bytes, 24 bits
 9893     __ ldrb(r10, __ post(src, 1));
 9894     __ ldrb(r11, __ post(src, 1));
 9895     __ ldrb(r12, __ post(src, 1));
 9896     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9897     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9898     // codec index
 9899     __ ubfmw(r15, r12, 18, 23);
 9900     __ ubfmw(r14, r12, 12, 17);
 9901     __ ubfmw(r13, r12, 6,  11);
 9902     __ andw(r12,  r12, 63);
 9903     // get the code based on the codec
 9904     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9905     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9906     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9907     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9908     __ strb(r15, __ post(dst, 1));
 9909     __ strb(r14, __ post(dst, 1));
 9910     __ strb(r13, __ post(dst, 1));
 9911     __ strb(r12, __ post(dst, 1));
 9912     __ sub(length, length, 3);
 9913     __ cbnz(length, Process3B);
 9914 
 9915     __ BIND(Exit);
 9916     __ ret(lr);
 9917 
 9918     return start;
 9919   }
 9920 
 9921   void generate_base64_decode_simdround(Register src, Register dst,
 9922         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9923 
 9924     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9925     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9926 
 9927     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9928     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9929 
 9930     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9931 
 9932     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9933 
 9934     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9935 
    // We need an unsigned saturating subtract to make sure every input value
    // in the range [0, 63] yields 0 in the higher-half lookup.
 9938     __ uqsubv(decH0, __ T16B, in0, v27);
 9939     __ uqsubv(decH1, __ T16B, in1, v27);
 9940     __ uqsubv(decH2, __ T16B, in2, v27);
 9941     __ uqsubv(decH3, __ T16B, in3, v27);
 9942 
 9943     // lower half lookup
 9944     __ tbl(decL0, arrangement, codecL, 4, in0);
 9945     __ tbl(decL1, arrangement, codecL, 4, in1);
 9946     __ tbl(decL2, arrangement, codecL, 4, in2);
 9947     __ tbl(decL3, arrangement, codecL, 4, in3);
 9948 
 9949     // higher half lookup
 9950     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9951     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9952     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9953     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9954 
 9955     // combine lower and higher
 9956     __ orr(decL0, arrangement, decL0, decH0);
 9957     __ orr(decL1, arrangement, decL1, decH1);
 9958     __ orr(decL2, arrangement, decL2, decH2);
 9959     __ orr(decL3, arrangement, decL3, decH3);
 9960 
 9961     // check illegal inputs, value larger than 63 (maximum of 6 bits)
 9962     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9963     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9964     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9965     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9966     __ orr(in0, arrangement, decH0, decH1);
 9967     __ orr(in1, arrangement, decH2, decH3);
 9968     __ orr(in2, arrangement, in0,   in1);
 9969     __ umaxv(in3, arrangement, in2);
 9970     __ umov(rscratch2, in3, __ B, 0);
 9971 
 9972     // get the data to output
 9973     __ shl(out0,  arrangement, decL0, 2);
 9974     __ ushr(out1, arrangement, decL1, 4);
 9975     __ orr(out0,  arrangement, out0,  out1);
 9976     __ shl(out1,  arrangement, decL1, 4);
 9977     __ ushr(out2, arrangement, decL2, 2);
 9978     __ orr(out1,  arrangement, out1,  out2);
 9979     __ shl(out2,  arrangement, decL2, 6);
 9980     __ orr(out2,  arrangement, out2,  decL3);
 9981 
 9982     __ cbz(rscratch2, NoIllegalData);
 9983 
 9984     // handle illegal input
 9985     __ umov(r10, in2, __ D, 0);
 9986     if (size == 16) {
 9987       __ cbnz(r10, ErrorInLowerHalf);
 9988 
 9989       // illegal input is in higher half, store the lower half now.
 9990       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9991 
 9992       __ umov(r10, in2,  __ D, 1);
 9993       __ umov(r11, out0, __ D, 1);
 9994       __ umov(r12, out1, __ D, 1);
 9995       __ umov(r13, out2, __ D, 1);
 9996       __ b(StoreLegalData);
 9997 
 9998       __ BIND(ErrorInLowerHalf);
 9999     }
10000     __ umov(r11, out0, __ D, 0);
10001     __ umov(r12, out1, __ D, 0);
10002     __ umov(r13, out2, __ D, 0);
10003 
10004     __ BIND(StoreLegalData);
10005     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10006     __ strb(r11, __ post(dst, 1));
10007     __ strb(r12, __ post(dst, 1));
10008     __ strb(r13, __ post(dst, 1));
10009     __ lsr(r10, r10, 8);
10010     __ lsr(r11, r11, 8);
10011     __ lsr(r12, r12, 8);
10012     __ lsr(r13, r13, 8);
10013     __ b(StoreLegalData);
10014 
10015     __ BIND(NoIllegalData);
10016     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10017   }
10018 
10019 
10020    /**
10021    *  Arguments:
10022    *
10023    *  Input:
10024    *  c_rarg0   - src_start
10025    *  c_rarg1   - src_offset
10026    *  c_rarg2   - src_length
10027    *  c_rarg3   - dest_start
10028    *  c_rarg4   - dest_offset
10029    *  c_rarg5   - isURL
10030    *  c_rarg6   - isMIME
10031    *
10032    */
10033   address generate_base64_decodeBlock() {
10034 
10035     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10036     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10037     // titled "Base64 decoding".
10038 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while
    // fromBase(URL)64ForNoSIMD['='] = 255 here.
10042     static const uint8_t fromBase64ForNoSIMD[256] = {
10043       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10044       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10045       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10046        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10047       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10048        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10049       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10050        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10051       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10054       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10056       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10059     };
10060 
10061     static const uint8_t fromBase64URLForNoSIMD[256] = {
10062       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10063       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10064       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10065        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10066       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10067        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10068       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10069        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10070       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10072       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10073       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10074       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10075       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10076       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10077       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10078     };
10079 
    // A legal Base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The first table
    // lookup uses tbl, so out-of-range indices are set to 0 in the destination.
    // The second table lookup uses tbx, so out-of-range indices leave the
    // destination unchanged. Input [64..126] is mapped to index [65, 127] in the
    // second lookup. The entry at index 64 is set to 0, so that we know the
    // decoded data was already obtained by the first lookup.
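    // For example, 'A' (65) misses the 64-byte first table (tbl yields 0); the
    // second lookup uses index 65 - 63 = 2, i.e. entry 66 of the full 128-byte
    // table, which holds 0 -- the decoded value of 'A'. '+' (43) is decoded
    // entirely by the first lookup (entry 43 == 62); its second-lookup index
    // saturates to 0, whose entry is deliberately 0.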
10087     static const uint8_t fromBase64ForSIMD[128] = {
10088       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10089       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10090       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10091        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10092         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10093        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10094       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10095        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10096     };
10097 
10098     static const uint8_t fromBase64URLForSIMD[128] = {
10099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10102        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10103         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10104        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10105        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10106        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10107     };
10108 
10109     __ align(CodeEntryAlignment);
10110     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
10111     StubCodeMark mark(this, stub_id);
10112     address start = __ pc();
10113 
10114     Register src    = c_rarg0;  // source array
10115     Register soff   = c_rarg1;  // source start offset
10116     Register send   = c_rarg2;  // source end offset
10117     Register dst    = c_rarg3;  // dest array
10118     Register doff   = c_rarg4;  // position for writing to dest array
10119     Register isURL  = c_rarg5;  // Base64 or URL character set
10120     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10121 
10122     Register length = send;    // reuse send as length of source data to process
10123 
10124     Register simd_codec   = c_rarg6;
10125     Register nosimd_codec = c_rarg7;
10126 
10127     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10128 
10129     __ enter();
10130 
10131     __ add(src, src, soff);
10132     __ add(dst, dst, doff);
10133 
10134     __ mov(doff, dst);
10135 
10136     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the low 2 bits of length (round down to a multiple of 4)
10138 
10139     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10140     __ cbz(isURL, ProcessData);
10141     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10142 
10143     __ BIND(ProcessData);
10144     __ mov(rscratch1, length);
10145     __ cmp(length, (u1)144); // 144 = 80 + 64
10146     __ br(Assembler::LT, Process4B);
10147 
10148     // In the MIME case, the line length cannot be more than 76
10149     // bytes (see RFC 2045). This is too short a block for SIMD
10150     // to be worthwhile, so we use non-SIMD here.
10151     __ movw(rscratch1, 79);
10152 
10153     __ BIND(Process4B);
10154     __ ldrw(r14, __ post(src, 4));
10155     __ ubfxw(r10, r14, 0,  8);
10156     __ ubfxw(r11, r14, 8,  8);
10157     __ ubfxw(r12, r14, 16, 8);
10158     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10160     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10161     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10162     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10163     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10164     // error detection, 255u indicates an illegal input
10165     __ orrw(r14, r10, r11);
10166     __ orrw(r15, r12, r13);
10167     __ orrw(r14, r14, r15);
10168     __ tbnz(r14, 7, Exit);
10169     // recover the data
10170     __ lslw(r14, r10, 10);
10171     __ bfiw(r14, r11, 4, 6);
10172     __ bfmw(r14, r12, 2, 5);
10173     __ rev16w(r14, r14);
10174     __ bfiw(r13, r12, 6, 2);
10175     __ strh(r14, __ post(dst, 2));
10176     __ strb(r13, __ post(dst, 1));
10177     // non-simd loop
10178     __ subsw(rscratch1, rscratch1, 4);
10179     __ br(Assembler::GT, Process4B);
10180 
    // If we exit here after the 80-byte pre-processing pass (length >= 144),
    // rscratch1 == -1; otherwise rscratch1 == 0.
10183     __ cbzw(rscratch1, Exit);
10184     __ sub(length, length, 80);
10185 
10186     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10187     __ cbz(isURL, SIMDEnter);
10188     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10189 
10190     __ BIND(SIMDEnter);
10191     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10192     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10193     __ mov(rscratch1, 63);
10194     __ dup(v27, __ T16B, rscratch1);
10195 
10196     __ BIND(Process64B);
10197     __ cmp(length, (u1)64);
10198     __ br(Assembler::LT, Process32B);
10199     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10200     __ sub(length, length, 64);
10201     __ b(Process64B);
10202 
10203     __ BIND(Process32B);
10204     __ cmp(length, (u1)32);
10205     __ br(Assembler::LT, SIMDExit);
10206     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10207     __ sub(length, length, 32);
10208     __ b(Process32B);
10209 
10210     __ BIND(SIMDExit);
10211     __ cbz(length, Exit);
10212     __ movw(rscratch1, length);
10213     __ b(Process4B);
10214 
10215     __ BIND(Exit);
10216     __ sub(c_rarg0, dst, doff);
10217 
10218     __ leave();
10219     __ ret(lr);
10220 
10221     return start;
10222   }
10223 
10224   // Support for spin waits.
10225   address generate_spin_wait() {
10226     __ align(CodeEntryAlignment);
10227     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
10228     StubCodeMark mark(this, stub_id);
10229     address start = __ pc();
10230 
10231     __ spin_wait();
10232     __ ret(lr);
10233 
10234     return start;
10235   }
10236 
10237   void generate_lookup_secondary_supers_table_stub() {
10238     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
10239     StubCodeMark mark(this, stub_id);
10240 
10241     const Register
10242       r_super_klass  = r0,
10243       r_array_base   = r1,
10244       r_array_length = r2,
10245       r_array_index  = r3,
10246       r_sub_klass    = r4,
10247       r_bitmap       = rscratch2,
10248       result         = r5;
10249     const FloatRegister
10250       vtemp          = v0;
10251 
10252     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10253       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10254       Label L_success;
10255       __ enter();
10256       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10257                                              r_array_base, r_array_length, r_array_index,
10258                                              vtemp, result, slot,
10259                                              /*stub_is_near*/true);
10260       __ leave();
10261       __ ret(lr);
10262     }
10263   }
10264 
10265   // Slow path implementation for UseSecondarySupersTable.
10266   address generate_lookup_secondary_supers_table_slow_path_stub() {
10267     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
10268     StubCodeMark mark(this, stub_id);
10269 
10270     address start = __ pc();
10271     const Register
10272       r_super_klass  = r0,        // argument
10273       r_array_base   = r1,        // argument
10274       temp1          = r2,        // temp
10275       r_array_index  = r3,        // argument
10276       r_bitmap       = rscratch2, // argument
10277       result         = r5;        // argument
10278 
10279     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10280     __ ret(lr);
10281 
10282     return start;
10283   }
10284 
10285 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10286 
10287   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
10288   //
10289   // If LSE is in use, generate LSE versions of all the stubs. The
10290   // non-LSE versions are in atomic_aarch64.S.
10291 
10292   // class AtomicStubMark records the entry point of a stub and the
10293   // stub pointer which will point to it. The stub pointer is set to
10294   // the entry point when ~AtomicStubMark() is called, which must be
10295   // after ICache::invalidate_range. This ensures safe publication of
10296   // the generated code.
10297   class AtomicStubMark {
10298     address _entry_point;
10299     aarch64_atomic_stub_t *_stub;
10300     MacroAssembler *_masm;
10301   public:
10302     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10303       _masm = masm;
10304       __ align(32);
10305       _entry_point = __ pc();
10306       _stub = stub;
10307     }
10308     ~AtomicStubMark() {
10309       *_stub = (aarch64_atomic_stub_t)_entry_point;
10310     }
10311   };
10312 
10313   // NB: For memory_order_conservative we need a trailing membar after
10314   // LSE atomic operations but not a leading membar.
10315   //
10316   // We don't need a leading membar because a clause in the Arm ARM
10317   // says:
10318   //
10319   //   Barrier-ordered-before
10320   //
10321   //   Barrier instructions order prior Memory effects before subsequent
10322   //   Memory effects generated by the same Observer. A read or a write
10323   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
10324   //   Observer if and only if RW1 appears in program order before RW 2
10325   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
10326   //   instruction with both Acquire and Release semantics.
10327   //
10328   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10329   // and Release semantics, therefore we don't need a leading
10330   // barrier. However, there is no corresponding Barrier-ordered-after
10331   // relationship, therefore we need a trailing membar to prevent a
10332   // later store or load from being reordered with the store in an
10333   // atomic instruction.
10334   //
10335   // This was checked by using the herd7 consistency model simulator
10336   // (http://diy.inria.fr/) with this test case:
10337   //
10338   // AArch64 LseCas
10339   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10340   // P0 | P1;
10341   // LDR W4, [X2] | MOV W3, #0;
10342   // DMB LD       | MOV W4, #1;
10343   // LDR W3, [X1] | CASAL W3, W4, [X1];
10344   //              | DMB ISH;
10345   //              | STR W4, [X2];
10346   // exists
10347   // (0:X3=0 /\ 0:X4=1)
10348   //
10349   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10350   // with the store to x in P1. Without the DMB in P1 this may happen.
10351   //
10352   // At the time of writing we don't know of any AArch64 hardware that
10353   // reorders stores in this way, but the Reference Manual permits it.
10354 
10355   void gen_cas_entry(Assembler::operand_size size,
10356                      atomic_memory_order order) {
10357     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10358       exchange_val = c_rarg2;
10359     bool acquire, release;
10360     switch (order) {
10361       case memory_order_relaxed:
10362         acquire = false;
10363         release = false;
10364         break;
10365       case memory_order_release:
10366         acquire = false;
10367         release = true;
10368         break;
10369       default:
10370         acquire = true;
10371         release = true;
10372         break;
10373     }
10374     __ mov(prev, compare_val);
10375     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10376     if (order == memory_order_conservative) {
10377       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10378     }
10379     if (size == Assembler::xword) {
10380       __ mov(r0, prev);
10381     } else {
10382       __ movw(r0, prev);
10383     }
10384     __ ret(lr);
10385   }
10386 
10387   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10388     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10389     // If not relaxed, then default to conservative.  Relaxed is the only
10390     // case we use enough to be worth specializing.
10391     if (order == memory_order_relaxed) {
10392       __ ldadd(size, incr, prev, addr);
10393     } else {
10394       __ ldaddal(size, incr, prev, addr);
10395       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10396     }
10397     if (size == Assembler::xword) {
10398       __ mov(r0, prev);
10399     } else {
10400       __ movw(r0, prev);
10401     }
10402     __ ret(lr);
10403   }
10404 
10405   void gen_swpal_entry(Assembler::operand_size size) {
10406     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10407     __ swpal(size, incr, prev, addr);
10408     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10409     if (size == Assembler::xword) {
10410       __ mov(r0, prev);
10411     } else {
10412       __ movw(r0, prev);
10413     }
10414     __ ret(lr);
10415   }
10416 
10417   void generate_atomic_entry_points() {
10418     if (! UseLSE) {
10419       return;
10420     }
10421     __ align(CodeEntryAlignment);
10422     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
10423     StubCodeMark mark(this, stub_id);
10424     address first_entry = __ pc();
10425 
10426     // ADD, memory_order_conservative
10427     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10428     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10429     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10430     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10431 
10432     // ADD, memory_order_relaxed
10433     AtomicStubMark mark_fetch_add_4_relaxed
10434       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10435     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10436     AtomicStubMark mark_fetch_add_8_relaxed
10437       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10438     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10439 
10440     // XCHG, memory_order_conservative
10441     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10442     gen_swpal_entry(Assembler::word);
10443     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10444     gen_swpal_entry(Assembler::xword);
10445 
10446     // CAS, memory_order_conservative
10447     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10448     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10449     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10450     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10451     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10452     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10453 
10454     // CAS, memory_order_relaxed
10455     AtomicStubMark mark_cmpxchg_1_relaxed
10456       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10457     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10458     AtomicStubMark mark_cmpxchg_4_relaxed
10459       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10460     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10461     AtomicStubMark mark_cmpxchg_8_relaxed
10462       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10463     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10464 
10465     AtomicStubMark mark_cmpxchg_4_release
10466       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10467     gen_cas_entry(MacroAssembler::word, memory_order_release);
10468     AtomicStubMark mark_cmpxchg_8_release
10469       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10470     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10471 
10472     AtomicStubMark mark_cmpxchg_4_seq_cst
10473       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10474     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10475     AtomicStubMark mark_cmpxchg_8_seq_cst
10476       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10477     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10478 
10479     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10480   }
10481 #endif // LINUX
10482 
10483   static void save_return_registers(MacroAssembler* masm) {
10484     if (InlineTypeReturnedAsFields) {
10485       masm->push(RegSet::range(r0, r7), sp);
10486       masm->sub(sp, sp, 4 * wordSize);
10487       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10488       masm->sub(sp, sp, 4 * wordSize);
10489       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10490     } else {
10491       masm->fmovd(rscratch1, v0);
10492       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10493     }
10494   }
10495 
10496   static void restore_return_registers(MacroAssembler* masm) {
10497     if (InlineTypeReturnedAsFields) {
10498       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10499       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10500       masm->pop(RegSet::range(r0, r7), sp);
10501     } else {
10502       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10503       masm->fmovd(v0, rscratch1);
10504     }
10505   }
10506 
10507   address generate_cont_thaw(Continuation::thaw_kind kind) {
10508     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10509     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10510 
10511     address start = __ pc();
10512 
10513     if (return_barrier) {
10514       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10515       __ mov(sp, rscratch1);
10516     }
10517     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10518 
10519     if (return_barrier) {
10520       // preserve possible return value from a method returning to the return barrier
10521       save_return_registers(_masm);
10522     }
10523 
10524     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10525     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10526     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10527 
10528     if (return_barrier) {
10529       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10530       restore_return_registers(_masm);
10531     }
10532     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10533 
10534 
10535     Label thaw_success;
10536     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10537     __ cbnz(rscratch2, thaw_success);
10538     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10539     __ br(rscratch1);
10540     __ bind(thaw_success);
10541 
10542     // make room for the thawed frames
10543     __ sub(rscratch1, sp, rscratch2);
10544     __ andr(rscratch1, rscratch1, -16); // align
10545     __ mov(sp, rscratch1);
10546 
10547     if (return_barrier) {
10548       // save original return value -- again
10549       save_return_registers(_masm);
10550     }
10551 
10552     // If we want, we can templatize thaw by kind, and have three different entries
10553     __ movw(c_rarg1, (uint32_t)kind);
10554 
10555     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10556     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10557 
10558     if (return_barrier) {
10559       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10560       restore_return_registers(_masm);
10561     } else {
10562       __ mov(r0, zr); // return 0 (success) from doYield
10563     }
10564 
    // we're now on the yield frame (which is at a higher address, because sp has been pushed down)
10566     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10567     __ mov(rfp, sp);
10568 
10569     if (return_barrier_exception) {
10570       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10571       __ authenticate_return_address(c_rarg1);
10572       __ verify_oop(r0);
10573       // save return value containing the exception oop in callee-saved R19
10574       __ mov(r19, r0);
10575 
10576       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10577 
10578       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10579       // __ reinitialize_ptrue();
10580 
10581       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10582 
10583       __ mov(r1, r0); // the exception handler
10584       __ mov(r0, r19); // restore return value containing the exception oop
10585       __ verify_oop(r0);
10586 
10587       __ leave();
10588       __ mov(r3, lr);
10589       __ br(r1); // the exception handler
10590     } else {
10591       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10592       __ leave();
10593       __ ret(lr);
10594     }
10595 
10596     return start;
10597   }
10598 
10599   address generate_cont_thaw() {
10600     if (!Continuations::enabled()) return nullptr;
10601 
10602     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
10603     StubCodeMark mark(this, stub_id);
10604     address start = __ pc();
10605     generate_cont_thaw(Continuation::thaw_top);
10606     return start;
10607   }
10608 
10609   address generate_cont_returnBarrier() {
10610     if (!Continuations::enabled()) return nullptr;
10611 
10612     // TODO: will probably need multiple return barriers depending on return type
10613     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
10614     StubCodeMark mark(this, stub_id);
10615     address start = __ pc();
10616 
10617     generate_cont_thaw(Continuation::thaw_return_barrier);
10618 
10619     return start;
10620   }
10621 
10622   address generate_cont_returnBarrier_exception() {
10623     if (!Continuations::enabled()) return nullptr;
10624 
10625     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
10626     StubCodeMark mark(this, stub_id);
10627     address start = __ pc();
10628 
10629     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10630 
10631     return start;
10632   }
10633 
10634   address generate_cont_preempt_stub() {
10635     if (!Continuations::enabled()) return nullptr;
10636     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
10637     StubCodeMark mark(this, stub_id);
10638     address start = __ pc();
10639 
10640     __ reset_last_Java_frame(true);
10641 
10642     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10643     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10644     __ mov(sp, rscratch2);
10645 
10646     Label preemption_cancelled;
10647     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10648     __ cbnz(rscratch1, preemption_cancelled);
10649 
10650     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10651     SharedRuntime::continuation_enter_cleanup(_masm);
10652     __ leave();
10653     __ ret(lr);
10654 
10655     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10656     __ bind(preemption_cancelled);
10657     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10658     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10659     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10660     __ ldr(rscratch1, Address(rscratch1));
10661     __ br(rscratch1);
10662 
10663     return start;
10664   }
10665 
10666   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10667   // are represented as long[5], with BITS_PER_LIMB = 26.
10668   // Pack five 26-bit limbs into three 64-bit registers.
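  // With limbs L0..L4 (26 bits each) the packed layout is:
  //   dest0 = L0 | L1 << 26 | L2 << 52   (only the low 12 bits of L2 fit)
  //   dest1 = L2 >> 12 | L3 << 14 | L4 << 40
  //   dest2 = L4 >> 24                   (the top 2 bits)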
10669   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10670     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10671     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10672     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10673     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10674 
10675     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10676     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10677     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10678     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10679 
10680     if (dest2->is_valid()) {
10681       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10682     } else {
10683 #ifdef ASSERT
10684       Label OK;
10685       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10686       __ br(__ EQ, OK);
10687       __ stop("high bits of Poly1305 integer should be zero");
10688       __ should_not_reach_here();
10689       __ bind(OK);
10690 #endif
10691     }
10692   }
10693 
10694   // As above, but return only a 128-bit integer, packed into two
10695   // 64-bit registers.
10696   void pack_26(Register dest0, Register dest1, Register src) {
10697     pack_26(dest0, dest1, noreg, src);
10698   }
10699 
10700   // Multiply and multiply-accumulate unsigned 64-bit registers.
10701   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10702     __ mul(prod_lo, n, m);
10703     __ umulh(prod_hi, n, m);
10704   }
10705   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
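    // NB: clobbers rscratch1 and rscratch2, which hold the partial product.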
10706     wide_mul(rscratch1, rscratch2, n, m);
10707     __ adds(sum_lo, sum_lo, rscratch1);
10708     __ adc(sum_hi, sum_hi, rscratch2);
10709   }
10710 
10711   // Poly1305, RFC 7539
10712 
10713   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10714   // description of the tricks used to simplify and accelerate this
10715   // computation.
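  //
  // In outline, for each 16-byte block m the code below computes (a sketch of
  // the arithmetic, not the register-level detail):
  //   acc = ((acc + m + 2^128) * r) mod (2^130 - 5)
  // with the final accumulator written back to acc_start as five 26-bit limbs.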
10716 
10717   address generate_poly1305_processBlocks() {
10718     __ align(CodeEntryAlignment);
10719     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
10720     StubCodeMark mark(this, stub_id);
10721     address start = __ pc();
10722     Label here;
10723     __ enter();
10724     RegSet callee_saved = RegSet::range(r19, r28);
10725     __ push(callee_saved, sp);
10726 
10727     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10728 
10729     // Arguments
10730     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10731 
10732     // R_n is the 128-bit randomly-generated key, packed into two
10733     // registers.  The caller passes this key to us as long[5], with
10734     // BITS_PER_LIMB = 26.
10735     const Register R_0 = *++regs, R_1 = *++regs;
10736     pack_26(R_0, R_1, r_start);
10737 
10738     // RR_n is (R_n >> 2) * 5
10739     const Register RR_0 = *++regs, RR_1 = *++regs;
10740     __ lsr(RR_0, R_0, 2);
10741     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10742     __ lsr(RR_1, R_1, 2);
10743     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
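    // The RR_n trick relies on 2^130 == 5 (mod 2^130 - 5) together with the
    // Poly1305 clamping, which clears the low two bits of R_1: partial
    // products that would land at weight 2^128 or above are instead added in
    // against RR_n at a weight 2^130 times lower. The low two bits of R_0 get
    // special treatment below (the andr/mul into U_2).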
10744 
10745     // U_n is the current checksum
10746     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10747     pack_26(U_0, U_1, U_2, acc_start);
10748 
10749     static constexpr int BLOCK_LENGTH = 16;
10750     Label DONE, LOOP;
10751 
10752     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10753     __ br(Assembler::LT, DONE); {
10754       __ bind(LOOP);
10755 
10756       // S_n is to be the sum of U_n and the next block of data
10757       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10758       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10759       __ adds(S_0, U_0, S_0);
10760       __ adcs(S_1, U_1, S_1);
10761       __ adc(S_2, U_2, zr);
10762       __ add(S_2, S_2, 1);
10763 
10764       const Register U_0HI = *++regs, U_1HI = *++regs;
10765 
10766       // NB: this logic depends on some of the special properties of
10767       // Poly1305 keys. In particular, because we know that the top
10768       // four bits of R_0 and R_1 are zero, we can add together
10769       // partial products without any risk of needing to propagate a
10770       // carry out.
10771       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10772       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10773       __ andr(U_2, R_0, 3);
10774       __ mul(U_2, S_2, U_2);
10775 
10776       // Recycle registers S_0, S_1, S_2
10777       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10778 
10779       // Partial reduction mod 2**130 - 5
10780       __ adds(U_1, U_0HI, U_1);
10781       __ adc(U_2, U_1HI, U_2);
10782       // Sum now in U_2:U_1:U_0.
10783       // Dead: U_0HI, U_1HI.
10784       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10785 
10786       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10787 
10788       // First, U_2:U_1:U_0 += (U_2 >> 2)
10789       __ lsr(rscratch1, U_2, 2);
10790       __ andr(U_2, U_2, (u8)3);
10791       __ adds(U_0, U_0, rscratch1);
10792       __ adcs(U_1, U_1, zr);
10793       __ adc(U_2, U_2, zr);
10794       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10795       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10796       __ adcs(U_1, U_1, zr);
10797       __ adc(U_2, U_2, zr);
10798 
10799       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10800       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10801       __ br(~ Assembler::LT, LOOP);
10802     }
10803 
10804     // Further reduce modulo 2^130 - 5
10805     __ lsr(rscratch1, U_2, 2);
10806     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10807     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10808     __ adcs(U_1, U_1, zr);
10809     __ andr(U_2, U_2, (u1)3);
10810     __ adc(U_2, U_2, zr);
10811 
10812     // Unpack the sum into five 26-bit limbs and write to memory.
10813     __ ubfiz(rscratch1, U_0, 0, 26);
10814     __ ubfx(rscratch2, U_0, 26, 26);
10815     __ stp(rscratch1, rscratch2, Address(acc_start));
10816     __ ubfx(rscratch1, U_0, 52, 12);
10817     __ bfi(rscratch1, U_1, 12, 14);
10818     __ ubfx(rscratch2, U_1, 14, 26);
10819     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10820     __ ubfx(rscratch1, U_1, 40, 24);
10821     __ bfi(rscratch1, U_2, 24, 3);
10822     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10823 
10824     __ bind(DONE);
10825     __ pop(callee_saved, sp);
10826     __ leave();
10827     __ ret(lr);
10828 
10829     return start;
10830   }
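  // In C-like pseudocode, approximately (U here is an arbitrary-precision
  // integer rather than the packed U_2:U_1:U_0 used above; unpack_26 and
  // next_16_byte_block_le are shorthand for the final store sequence and
  // the little-endian block load, not real helpers):
  //
  //   R = pack_26(r_start);                        // 128-bit key
  //   U = pack_26(acc_start);                      // current accumulator
  //   while (length >= BLOCK_LENGTH) {
  //     S = U + next_16_byte_block_le + (1 << 128);  // pad bit, RFC 7539
  //     U = (S * R) mod (2^130 - 5);    // only partially reduced in the stub
  //     length -= BLOCK_LENGTH;
  //   }
  //   unpack_26(acc_start, U);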
10831 
10832   // exception handler for upcall stubs
10833   address generate_upcall_stub_exception_handler() {
10834     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
10835     StubCodeMark mark(this, stub_id);
10836     address start = __ pc();
10837 
10838     // Native caller has no idea how to handle exceptions,
10839     // so we just crash here. Up to callee to catch exceptions.
10840     __ verify_oop(r0);
10841     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10842     __ blr(rscratch1);
10843     __ should_not_reach_here();
10844 
10845     return start;
10846   }
10847 
10848   // load Method* target of MethodHandle
10849   // j_rarg0 = jobject receiver
10850   // rmethod = result
10851   address generate_upcall_stub_load_target() {
10852     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
10853     StubCodeMark mark(this, stub_id);
10854     address start = __ pc();
10855 
10856     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10857     // Load target method from receiver
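    // In effect: rmethod = receiver.form.vmentry.method.vmtarget
    // (MethodHandle -> LambdaForm -> MemberName -> ResolvedMethodName -> Method*)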
10858     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10859     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10860     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10861     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10862                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10863                       noreg, noreg);
10864     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10865 
10866     __ ret(lr);
10867 
10868     return start;
10869   }
10870 
10871 #undef __
10872 #define __ masm->
10873 
10874   class MontgomeryMultiplyGenerator : public MacroAssembler {
10875 
10876     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10877       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10878 
10879     RegSet _toSave;
10880     bool _squaring;
10881 
10882   public:
10883     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10884       : MacroAssembler(as->code()), _squaring(squaring) {
10885 
10886       // Register allocation
10887 
10888       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10889       Pa_base = *regs;       // Argument registers
10890       if (squaring)
10891         Pb_base = Pa_base;
10892       else
10893         Pb_base = *++regs;
10894       Pn_base = *++regs;
10895       Rlen= *++regs;
10896       inv = *++regs;
10897       Pm_base = *++regs;
10898 
10899                           // Working registers:
10900       Ra =  *++regs;        // The current digit of a, b, n, and m.
10901       Rb =  *++regs;
10902       Rm =  *++regs;
10903       Rn =  *++regs;
10904 
10905       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10906       Pb =  *++regs;
10907       Pm =  *++regs;
10908       Pn =  *++regs;
10909 
10910       t0 =  *++regs;        // Three registers which form a
10911       t1 =  *++regs;        // triple-precision accumulator.
10912       t2 =  *++regs;
10913 
10914       Ri =  *++regs;        // Inner and outer loop indexes.
10915       Rj =  *++regs;
10916 
10917       Rhi_ab = *++regs;     // Product registers: low and high parts
10918       Rlo_ab = *++regs;     // of a*b and m*n.
10919       Rhi_mn = *++regs;
10920       Rlo_mn = *++regs;
10921 
10922       // r19 and up are callee-saved.
10923       _toSave = RegSet::range(r19, *regs) + Pm_base;
10924     }
10925 
10926   private:
10927     void save_regs() {
10928       push(_toSave, sp);
10929     }
10930 
10931     void restore_regs() {
10932       pop(_toSave, sp);
10933     }
10934 
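    // Invoke `block` exactly `count` times, unrolled two copies per
    // loop iteration; an odd count branches straight to the second
    // copy.  Clobbers `count`.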
10935     template <typename T>
10936     void unroll_2(Register count, T block) {
10937       Label loop, end, odd;
10938       tbnz(count, 0, odd);
10939       cbz(count, end);
10940       align(16);
10941       bind(loop);
10942       (this->*block)();
10943       bind(odd);
10944       (this->*block)();
10945       subs(count, count, 2);
10946       br(Assembler::GT, loop);
10947       bind(end);
10948     }
10949 
10950     template <typename T>
10951     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10952       Label loop, end, odd;
10953       tbnz(count, 0, odd);
10954       cbz(count, end);
10955       align(16);
10956       bind(loop);
10957       (this->*block)(d, s, tmp);
10958       bind(odd);
10959       (this->*block)(d, s, tmp);
10960       subs(count, count, 2);
10961       br(Assembler::GT, loop);
10962       bind(end);
10963     }
10964 
10965     void pre1(RegisterOrConstant i) {
10966       block_comment("pre1");
10967       // Pa = Pa_base;
10968       // Pb = Pb_base + i;
10969       // Pm = Pm_base;
10970       // Pn = Pn_base + i;
10971       // Ra = *Pa;
10972       // Rb = *Pb;
10973       // Rm = *Pm;
10974       // Rn = *Pn;
10975       ldr(Ra, Address(Pa_base));
10976       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10977       ldr(Rm, Address(Pm_base));
10978       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10979       lea(Pa, Address(Pa_base));
10980       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10981       lea(Pm, Address(Pm_base));
10982       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10983 
10984       // Zero the m*n result.
10985       mov(Rhi_mn, zr);
10986       mov(Rlo_mn, zr);
10987     }
10988 
10989     // The core multiply-accumulate step of a Montgomery
10990     // multiplication.  The idea is to schedule operations as a
10991     // pipeline so that instructions with long latencies (loads and
10992     // multiplies) have time to complete before their results are
10993     // used.  This most benefits in-order implementations of the
10994     // architecture but out-of-order ones also benefit.
10995     void step() {
10996       block_comment("step");
10997       // MACC(Ra, Rb, t0, t1, t2);
10998       // Ra = *++Pa;
10999       // Rb = *--Pb;
11000       umulh(Rhi_ab, Ra, Rb);
11001       mul(Rlo_ab, Ra, Rb);
11002       ldr(Ra, pre(Pa, wordSize));
11003       ldr(Rb, pre(Pb, -wordSize));
11004       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11005                                        // previous iteration.
11006       // MACC(Rm, Rn, t0, t1, t2);
11007       // Rm = *++Pm;
11008       // Rn = *--Pn;
11009       umulh(Rhi_mn, Rm, Rn);
11010       mul(Rlo_mn, Rm, Rn);
11011       ldr(Rm, pre(Pm, wordSize));
11012       ldr(Rn, pre(Pn, -wordSize));
11013       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11014     }
11015 
11016     void post1() {
11017       block_comment("post1");
11018 
11019       // MACC(Ra, Rb, t0, t1, t2);
11020       // Ra = *++Pa;
11021       // Rb = *--Pb;
11022       umulh(Rhi_ab, Ra, Rb);
11023       mul(Rlo_ab, Ra, Rb);
11024       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11025       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11026 
11027       // *Pm = Rm = t0 * inv;
11028       mul(Rm, t0, inv);
11029       str(Rm, Address(Pm));
11030 
11031       // MACC(Rm, Rn, t0, t1, t2);
11032       // t0 = t1; t1 = t2; t2 = 0;
11033       umulh(Rhi_mn, Rm, Rn);
11034 
11035 #ifndef PRODUCT
11036       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11037       {
11038         mul(Rlo_mn, Rm, Rn);
11039         add(Rlo_mn, t0, Rlo_mn);
11040         Label ok;
11041         cbz(Rlo_mn, ok); {
11042           stop("broken Montgomery multiply");
11043         } bind(ok);
11044       }
11045 #endif
11046       // We have very carefully set things up so that
11047       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11048       // the lower half of Rm * Rn because we know the result already:
11049       // it must be -t0.  t0 + (-t0) must generate a carry iff
11050       // t0 != 0.  So, rather than do a mul and an adds we just set
11051       // the carry flag iff t0 is nonzero.
11052       //
11053       // mul(Rlo_mn, Rm, Rn);
11054       // adds(zr, t0, Rlo_mn);
11055       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11056       adcs(t0, t1, Rhi_mn);
11057       adc(t1, t2, zr);
11058       mov(t2, zr);
11059     }
11060 
11061     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11062       block_comment("pre2");
11063       // Pa = Pa_base + i-len;
11064       // Pb = Pb_base + len;
11065       // Pm = Pm_base + i-len;
11066       // Pn = Pn_base + len;
11067 
11068       if (i.is_register()) {
11069         sub(Rj, i.as_register(), len);
11070       } else {
11071         mov(Rj, i.as_constant());
11072         sub(Rj, Rj, len);
11073       }
11074       // Rj == i-len
11075 
11076       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11077       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11078       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11079       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11080 
11081       // Ra = *++Pa;
11082       // Rb = *--Pb;
11083       // Rm = *++Pm;
11084       // Rn = *--Pn;
11085       ldr(Ra, pre(Pa, wordSize));
11086       ldr(Rb, pre(Pb, -wordSize));
11087       ldr(Rm, pre(Pm, wordSize));
11088       ldr(Rn, pre(Pn, -wordSize));
11089 
11090       mov(Rhi_mn, zr);
11091       mov(Rlo_mn, zr);
11092     }
11093 
11094     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11095       block_comment("post2");
11096       if (i.is_constant()) {
11097         mov(Rj, i.as_constant()-len.as_constant());
11098       } else {
11099         sub(Rj, i.as_register(), len);
11100       }
11101 
11102       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11103 
11104       // As soon as we know the least significant digit of our result,
11105       // store it.
11106       // Pm_base[i-len] = t0;
11107       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11108 
11109       // t0 = t1; t1 = t2; t2 = 0;
11110       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11111       adc(t1, t2, zr);
11112       mov(t2, zr);
11113     }
11114 
11115     // A carry in t0 after Montgomery multiplication means that we
11116     // should subtract multiples of n from our result in m.  We'll
11117     // keep doing that until there is no carry.
11118     void normalize(RegisterOrConstant len) {
11119       block_comment("normalize");
11120       // while (t0)
11121       //   t0 = sub(Pm_base, Pn_base, t0, len);
11122       Label loop, post, again;
11123       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11124       cbz(t0, post); {
11125         bind(again); {
11126           mov(i, zr);
11127           mov(cnt, len);
11128           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11129           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11130           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11131           align(16);
11132           bind(loop); {
11133             sbcs(Rm, Rm, Rn);
11134             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11135             add(i, i, 1);
11136             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11137             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11138             sub(cnt, cnt, 1);
11139           } cbnz(cnt, loop);
11140           sbc(t0, t0, zr);
11141         } cbnz(t0, again);
11142       } bind(post);
11143     }
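    // In C, approximately (an illustrative definition of the `sub`
    // helper referenced above and in the sketches further down; it is
    // not a real function in this file):
    //
    //   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong x = Pm[i], y = Pn[i] + borrow;
    //       borrow = (y < borrow) || (x < y);  // borrow out of the subtraction
    //       Pm[i] = x - y;
    //     }
    //     return t0 - borrow;
    //   }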
11144 
11145     // Move memory at s to d, reversing words.
11146     //    Increments d to end of copied memory
11147     //    Destroys tmp1, tmp2
11148     //    Preserves len
11149     //    Leaves s pointing to the address which was in d at start
11150     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11151       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11152       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11153 
11154       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11155       mov(tmp1, len);
11156       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11157       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11158     }
11159     // where
11160     void reverse1(Register d, Register s, Register tmp) {
11161       ldr(tmp, pre(s, -wordSize));
11162       ror(tmp, tmp, 32);
11163       str(tmp, post(d, wordSize));
11164     }
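    // In C, approximately (a sketch; len counts 64-bit words, and each
    // word has its 32-bit halves swapped as it is copied):
    //
    //   julong *p = s + len, *d0 = d;
    //   for (int i = 0; i < len; i++) {
    //     julong x = *--p;
    //     *d++ = (x << 32) | (x >> 32);   // ror #32
    //   }
    //   s = d0;                           // s ends up where d started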
11165 
11166     void step_squaring() {
11167       // An extra ACC
11168       step();
11169       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11170     }
11171 
11172     void last_squaring(RegisterOrConstant i) {
11173       Label dont;
11174       // if ((i & 1) == 0) {
11175       tbnz(i.as_register(), 0, dont); {
11176         // MACC(Ra, Rb, t0, t1, t2);
11177         // Ra = *++Pa;
11178         // Rb = *--Pb;
11179         umulh(Rhi_ab, Ra, Rb);
11180         mul(Rlo_ab, Ra, Rb);
11181         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11182       } bind(dont);
11183     }
11184 
11185     void extra_step_squaring() {
11186       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11187 
11188       // MACC(Rm, Rn, t0, t1, t2);
11189       // Rm = *++Pm;
11190       // Rn = *--Pn;
11191       umulh(Rhi_mn, Rm, Rn);
11192       mul(Rlo_mn, Rm, Rn);
11193       ldr(Rm, pre(Pm, wordSize));
11194       ldr(Rn, pre(Pn, -wordSize));
11195     }
11196 
11197     void post1_squaring() {
11198       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11199 
11200       // *Pm = Rm = t0 * inv;
11201       mul(Rm, t0, inv);
11202       str(Rm, Address(Pm));
11203 
11204       // MACC(Rm, Rn, t0, t1, t2);
11205       // t0 = t1; t1 = t2; t2 = 0;
11206       umulh(Rhi_mn, Rm, Rn);
11207 
11208 #ifndef PRODUCT
11209       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11210       {
11211         mul(Rlo_mn, Rm, Rn);
11212         add(Rlo_mn, t0, Rlo_mn);
11213         Label ok;
11214         cbz(Rlo_mn, ok); {
11215           stop("broken Montgomery multiply");
11216         } bind(ok);
11217       }
11218 #endif
11219       // We have very carefully set things up so that
11220       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11221       // the lower half of Rm * Rn because we know the result already:
11222       // it must be -t0.  t0 + (-t0) must generate a carry iff
11223       // t0 != 0.  So, rather than do a mul and an adds we just set
11224       // the carry flag iff t0 is nonzero.
11225       //
11226       // mul(Rlo_mn, Rm, Rn);
11227       // adds(zr, t0, Rlo_mn);
11228       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11229       adcs(t0, t1, Rhi_mn);
11230       adc(t1, t2, zr);
11231       mov(t2, zr);
11232     }
11233 
11234     void acc(Register Rhi, Register Rlo,
11235              Register t0, Register t1, Register t2) {
11236       adds(t0, t0, Rlo);
11237       adcs(t1, t1, Rhi);
11238       adc(t2, t2, zr);
11239     }
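    // In the commented pseudocode throughout this class, MACC(a, b, t0, t1, t2)
    // stands for the triple-precision accumulation t2:t1:t0 += a * b
    // (128-bit product), and MACC2 accumulates that product twice;
    // acc() above is the final add of an already-computed umulh/mul
    // pair into t2:t1:t0.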
11240 
11241   public:
11242     /**
11243      * Fast Montgomery multiplication.  The derivation of the
11244      * algorithm is in A Cryptographic Library for the Motorola
11245      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11246      *
11247      * Arguments:
11248      *
11249      * Inputs for multiplication:
11250      *   c_rarg0   - int array elements a
11251      *   c_rarg1   - int array elements b
11252      *   c_rarg2   - int array elements n (the modulus)
11253      *   c_rarg3   - int length
11254      *   c_rarg4   - int inv
11255      *   c_rarg5   - int array elements m (the result)
11256      *
11257      * Inputs for squaring:
11258      *   c_rarg0   - int array elements a
11259      *   c_rarg1   - int array elements n (the modulus)
11260      *   c_rarg2   - int length
11261      *   c_rarg3   - int inv
11262      *   c_rarg4   - int array elements m (the result)
11263      *
11264      */
11265     address generate_multiply() {
11266       Label argh, nothing;
11267       bind(argh);
11268       stop("MontgomeryMultiply total_allocation must be <= 8192");
11269 
11270       align(CodeEntryAlignment);
11271       address entry = pc();
11272 
11273       cbzw(Rlen, nothing);
11274 
11275       enter();
11276 
11277       // Make room.
11278       cmpw(Rlen, 512);
11279       br(Assembler::HI, argh);
11280       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11281       andr(sp, Ra, -2 * wordSize);
11282 
11283       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11284 
11285       {
11286         // Copy input args, reversing as we go.  We use Ra as a
11287         // temporary variable.
11288         reverse(Ra, Pa_base, Rlen, t0, t1);
11289         if (!_squaring)
11290           reverse(Ra, Pb_base, Rlen, t0, t1);
11291         reverse(Ra, Pn_base, Rlen, t0, t1);
11292       }
11293 
11294       // Push all callee-saved registers and also Pm_base, which we'll
11295       // need at the end.
11296       save_regs();
11297 
11298 #ifndef PRODUCT
11299       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11300       {
11301         ldr(Rn, Address(Pn_base, 0));
11302         mul(Rlo_mn, Rn, inv);
11303         subs(zr, Rlo_mn, -1);
11304         Label ok;
11305         br(EQ, ok); {
11306           stop("broken inverse in Montgomery multiply");
11307         } bind(ok);
11308       }
11309 #endif
11310 
11311       mov(Pm_base, Ra);
11312 
11313       mov(t0, zr);
11314       mov(t1, zr);
11315       mov(t2, zr);
11316 
11317       block_comment("for (int i = 0; i < len; i++) {");
11318       mov(Ri, zr); {
11319         Label loop, end;
11320         cmpw(Ri, Rlen);
11321         br(Assembler::GE, end);
11322 
11323         bind(loop);
11324         pre1(Ri);
11325 
11326         block_comment("  for (j = i; j; j--) {"); {
11327           movw(Rj, Ri);
11328           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11329         } block_comment("  } // j");
11330 
11331         post1();
11332         addw(Ri, Ri, 1);
11333         cmpw(Ri, Rlen);
11334         br(Assembler::LT, loop);
11335         bind(end);
11336         block_comment("} // i");
11337       }
11338 
11339       block_comment("for (int i = len; i < 2*len; i++) {");
11340       mov(Ri, Rlen); {
11341         Label loop, end;
11342         cmpw(Ri, Rlen, Assembler::LSL, 1);
11343         br(Assembler::GE, end);
11344 
11345         bind(loop);
11346         pre2(Ri, Rlen);
11347 
11348         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11349           lslw(Rj, Rlen, 1);
11350           subw(Rj, Rj, Ri);
11351           subw(Rj, Rj, 1);
11352           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11353         } block_comment("  } // j");
11354 
11355         post2(Ri, Rlen);
11356         addw(Ri, Ri, 1);
11357         cmpw(Ri, Rlen, Assembler::LSL, 1);
11358         br(Assembler::LT, loop);
11359         bind(end);
11360       }
11361       block_comment("} // i");
11362 
11363       normalize(Rlen);
11364 
11365       mov(Ra, Pm_base);  // Save Pm_base in Ra
11366       restore_regs();  // Restore caller's Pm_base
11367 
11368       // Copy our result into caller's Pm_base
11369       reverse(Pm_base, Ra, Rlen, t0, t1);
11370 
11371       leave();
11372       bind(nothing);
11373       ret(lr);
11374 
11375       return entry;
11376     }
11377     // In C, approximately:
11378 
11379     // void
11380     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11381     //                     julong Pn_base[], julong Pm_base[],
11382     //                     julong inv, int len) {
11383     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11384     //   julong *Pa, *Pb, *Pn, *Pm;
11385     //   julong Ra, Rb, Rn, Rm;
11386 
11387     //   int i;
11388 
11389     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11390 
11391     //   for (i = 0; i < len; i++) {
11392     //     int j;
11393 
11394     //     Pa = Pa_base;
11395     //     Pb = Pb_base + i;
11396     //     Pm = Pm_base;
11397     //     Pn = Pn_base + i;
11398 
11399     //     Ra = *Pa;
11400     //     Rb = *Pb;
11401     //     Rm = *Pm;
11402     //     Rn = *Pn;
11403 
11404     //     int iters = i;
11405     //     for (j = 0; iters--; j++) {
11406     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11407     //       MACC(Ra, Rb, t0, t1, t2);
11408     //       Ra = *++Pa;
11409     //       Rb = *--Pb;
11410     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11411     //       MACC(Rm, Rn, t0, t1, t2);
11412     //       Rm = *++Pm;
11413     //       Rn = *--Pn;
11414     //     }
11415 
11416     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11417     //     MACC(Ra, Rb, t0, t1, t2);
11418     //     *Pm = Rm = t0 * inv;
11419     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11420     //     MACC(Rm, Rn, t0, t1, t2);
11421 
11422     //     assert(t0 == 0, "broken Montgomery multiply");
11423 
11424     //     t0 = t1; t1 = t2; t2 = 0;
11425     //   }
11426 
11427     //   for (i = len; i < 2*len; i++) {
11428     //     int j;
11429 
11430     //     Pa = Pa_base + i-len;
11431     //     Pb = Pb_base + len;
11432     //     Pm = Pm_base + i-len;
11433     //     Pn = Pn_base + len;
11434 
11435     //     Ra = *++Pa;
11436     //     Rb = *--Pb;
11437     //     Rm = *++Pm;
11438     //     Rn = *--Pn;
11439 
11440     //     int iters = len*2-i-1;
11441     //     for (j = i-len+1; iters--; j++) {
11442     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11443     //       MACC(Ra, Rb, t0, t1, t2);
11444     //       Ra = *++Pa;
11445     //       Rb = *--Pb;
11446     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11447     //       MACC(Rm, Rn, t0, t1, t2);
11448     //       Rm = *++Pm;
11449     //       Rn = *--Pn;
11450     //     }
11451 
11452     //     Pm_base[i-len] = t0;
11453     //     t0 = t1; t1 = t2; t2 = 0;
11454     //   }
11455 
11456     //   while (t0)
11457     //     t0 = sub(Pm_base, Pn_base, t0, len);
11458     // }
11459 
11460     /**
11461      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11462      * multiplies than Montgomery multiplication so it should be up to
11463      * 25% faster.  However, its loop control is more complex and it
11464      * may actually run slower on some machines.
11465      *
11466      * Arguments:
11467      *
11468      * Inputs:
11469      *   c_rarg0   - int array elements a
11470      *   c_rarg1   - int array elements n (the modulus)
11471      *   c_rarg2   - int length
11472      *   c_rarg3   - int inv
11473      *   c_rarg4   - int array elements m (the result)
11474      *
11475      */
11476     address generate_square() {
11477       Label argh;
11478       bind(argh);
11479       stop("MontgomeryMultiply total_allocation must be <= 8192");
11480 
11481       align(CodeEntryAlignment);
11482       address entry = pc();
11483 
11484       enter();
11485 
11486       // Make room.
11487       cmpw(Rlen, 512);
11488       br(Assembler::HI, argh);
11489       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11490       andr(sp, Ra, -2 * wordSize);
11491 
11492       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11493 
11494       {
11495         // Copy input args, reversing as we go.  We use Ra as a
11496         // temporary variable.
11497         reverse(Ra, Pa_base, Rlen, t0, t1);
11498         reverse(Ra, Pn_base, Rlen, t0, t1);
11499       }
11500 
11501       // Push all callee-saved registers and also Pm_base, which we'll
11502       // need at the end.
11503       save_regs();
11504 
11505       mov(Pm_base, Ra);
11506 
11507       mov(t0, zr);
11508       mov(t1, zr);
11509       mov(t2, zr);
11510 
11511       block_comment("for (int i = 0; i < len; i++) {");
11512       mov(Ri, zr); {
11513         Label loop, end;
11514         bind(loop);
11515         cmp(Ri, Rlen);
11516         br(Assembler::GE, end);
11517 
11518         pre1(Ri);
11519 
11520         block_comment("for (j = (i+1)/2; j; j--) {"); {
11521           add(Rj, Ri, 1);
11522           lsr(Rj, Rj, 1);
11523           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11524         } block_comment("  } // j");
11525 
11526         last_squaring(Ri);
11527 
11528         block_comment("  for (j = i/2; j; j--) {"); {
11529           lsr(Rj, Ri, 1);
11530           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11531         } block_comment("  } // j");
11532 
11533         post1_squaring();
11534         add(Ri, Ri, 1);
11535         cmp(Ri, Rlen);
11536         br(Assembler::LT, loop);
11537 
11538         bind(end);
11539         block_comment("} // i");
11540       }
11541 
11542       block_comment("for (int i = len; i < 2*len; i++) {");
11543       mov(Ri, Rlen); {
11544         Label loop, end;
11545         bind(loop);
11546         cmp(Ri, Rlen, Assembler::LSL, 1);
11547         br(Assembler::GE, end);
11548 
11549         pre2(Ri, Rlen);
11550 
11551         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11552           lsl(Rj, Rlen, 1);
11553           sub(Rj, Rj, Ri);
11554           sub(Rj, Rj, 1);
11555           lsr(Rj, Rj, 1);
11556           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11557         } block_comment("  } // j");
11558 
11559         last_squaring(Ri);
11560 
11561         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11562           lsl(Rj, Rlen, 1);
11563           sub(Rj, Rj, Ri);
11564           lsr(Rj, Rj, 1);
11565           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11566         } block_comment("  } // j");
11567 
11568         post2(Ri, Rlen);
11569         add(Ri, Ri, 1);
11570         cmp(Ri, Rlen, Assembler::LSL, 1);
11571 
11572         br(Assembler::LT, loop);
11573         bind(end);
11574         block_comment("} // i");
11575       }
11576 
11577       normalize(Rlen);
11578 
11579       mov(Ra, Pm_base);  // Save Pm_base in Ra
11580       restore_regs();  // Restore caller's Pm_base
11581 
11582       // Copy our result into caller's Pm_base
11583       reverse(Pm_base, Ra, Rlen, t0, t1);
11584 
11585       leave();
11586       ret(lr);
11587 
11588       return entry;
11589     }
11590     // In C, approximately:
11591 
11592     // void
11593     // montgomery_square(julong Pa_base[], julong Pn_base[],
11594     //                   julong Pm_base[], julong inv, int len) {
11595     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11596     //   julong *Pa, *Pb, *Pn, *Pm;
11597     //   julong Ra, Rb, Rn, Rm;
11598 
11599     //   int i;
11600 
11601     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11602 
11603     //   for (i = 0; i < len; i++) {
11604     //     int j;
11605 
11606     //     Pa = Pa_base;
11607     //     Pb = Pa_base + i;
11608     //     Pm = Pm_base;
11609     //     Pn = Pn_base + i;
11610 
11611     //     Ra = *Pa;
11612     //     Rb = *Pb;
11613     //     Rm = *Pm;
11614     //     Rn = *Pn;
11615 
11616     //     int iters = (i+1)/2;
11617     //     for (j = 0; iters--; j++) {
11618     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11619     //       MACC2(Ra, Rb, t0, t1, t2);
11620     //       Ra = *++Pa;
11621     //       Rb = *--Pb;
11622     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11623     //       MACC(Rm, Rn, t0, t1, t2);
11624     //       Rm = *++Pm;
11625     //       Rn = *--Pn;
11626     //     }
11627     //     if ((i & 1) == 0) {
11628     //       assert(Ra == Pa_base[j], "must be");
11629     //       MACC(Ra, Ra, t0, t1, t2);
11630     //     }
11631     //     iters = i/2;
11632     //     assert(iters == i-j, "must be");
11633     //     for (; iters--; j++) {
11634     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11635     //       MACC(Rm, Rn, t0, t1, t2);
11636     //       Rm = *++Pm;
11637     //       Rn = *--Pn;
11638     //     }
11639 
11640     //     *Pm = Rm = t0 * inv;
11641     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11642     //     MACC(Rm, Rn, t0, t1, t2);
11643 
11644     //     assert(t0 == 0, "broken Montgomery multiply");
11645 
11646     //     t0 = t1; t1 = t2; t2 = 0;
11647     //   }
11648 
11649     //   for (i = len; i < 2*len; i++) {
11650     //     int start = i-len+1;
11651     //     int end = start + (len - start)/2;
11652     //     int j;
11653 
11654     //     Pa = Pa_base + i-len;
11655     //     Pb = Pa_base + len;
11656     //     Pm = Pm_base + i-len;
11657     //     Pn = Pn_base + len;
11658 
11659     //     Ra = *++Pa;
11660     //     Rb = *--Pb;
11661     //     Rm = *++Pm;
11662     //     Rn = *--Pn;
11663 
11664     //     int iters = (2*len-i-1)/2;
11665     //     assert(iters == end-start, "must be");
11666     //     for (j = start; iters--; j++) {
11667     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11668     //       MACC2(Ra, Rb, t0, t1, t2);
11669     //       Ra = *++Pa;
11670     //       Rb = *--Pb;
11671     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11672     //       MACC(Rm, Rn, t0, t1, t2);
11673     //       Rm = *++Pm;
11674     //       Rn = *--Pn;
11675     //     }
11676     //     if ((i & 1) == 0) {
11677     //       assert(Ra == Pa_base[j], "must be");
11678     //       MACC(Ra, Ra, t0, t1, t2);
11679     //     }
11680     //     iters =  (2*len-i)/2;
11681     //     assert(iters == len-j, "must be");
11682     //     for (; iters--; j++) {
11683     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11684     //       MACC(Rm, Rn, t0, t1, t2);
11685     //       Rm = *++Pm;
11686     //       Rn = *--Pn;
11687     //     }
11688     //     Pm_base[i-len] = t0;
11689     //     t0 = t1; t1 = t2; t2 = 0;
11690     //   }
11691 
11692     //   while (t0)
11693     //     t0 = sub(Pm_base, Pn_base, t0, len);
11694     // }
11695   };
11696 
11697   // Called from the interpreter or compiled code either to load the
11698   // multiple returned values of an inline type instance being returned
11699   // into registers, or to store the returned values into a newly
11700   // allocated inline type instance.
11701   address generate_return_value_stub(address destination, const char* name, bool has_res) {
11702     // We need to save all registers the calling convention may use so
11703     // that the runtime call can read or update those registers. This
11704     // needs to be in sync with SharedRuntime::java_return_convention().
11705     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
11706     enum layout {
11707       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
11708       j_rarg6_off, j_rarg6_2,
11709       j_rarg5_off, j_rarg5_2,
11710       j_rarg4_off, j_rarg4_2,
11711       j_rarg3_off, j_rarg3_2,
11712       j_rarg2_off, j_rarg2_2,
11713       j_rarg1_off, j_rarg1_2,
11714       j_rarg0_off, j_rarg0_2,
11715 
11716       j_farg7_off, j_farg7_2,
11717       j_farg6_off, j_farg6_2,
11718       j_farg5_off, j_farg5_2,
11719       j_farg4_off, j_farg4_2,
11720       j_farg3_off, j_farg3_2,
11721       j_farg2_off, j_farg2_2,
11722       j_farg1_off, j_farg1_2,
11723       j_farg0_off, j_farg0_2,
11724 
11725       rfp_off, rfp_off2,
11726       return_off, return_off2,
11727 
11728       framesize // inclusive of return address
11729     };
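    // framesize is counted in 32-bit slots: 16 argument registers * 2
    // slots each, plus rfp and the return address, gives 36 slots
    // (144 bytes), which is already 16-byte aligned.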
11730 
11731     CodeBuffer code(name, 512, 64);
11732     MacroAssembler* masm = new MacroAssembler(&code);
11733 
11734     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11735     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11736     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11737     int frame_size_in_words = frame_size_in_bytes / wordSize;
11738 
11739     OopMapSet* oop_maps = new OopMapSet();
11740     OopMap* map = new OopMap(frame_size_in_slots, 0);
11741 
11742     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11743     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11744     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11745     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11746     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11747     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11748     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11749     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11750 
11751     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11752     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11753     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11754     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11755     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11756     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11757     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11758     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11759 
11760     address start = __ pc();
11761 
11762     __ enter(); // Save FP and LR before call
11763 
11764     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11765     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11766     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11767     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11768 
11769     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11770     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11771     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11772     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11773 
11774     int frame_complete = __ offset();
11775 
11776     // Set up last_Java_sp and last_Java_fp
11777     address the_pc = __ pc();
11778     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11779 
11780     // Call runtime
11781     __ mov(c_rarg1, r0);
11782     __ mov(c_rarg0, rthread);
11783 
11784     __ mov(rscratch1, destination);
11785     __ blr(rscratch1);
11786 
11787     oop_maps->add_gc_map(the_pc - start, map);
11788 
11789     __ reset_last_Java_frame(false);
11790 
11791     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11792     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11793     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11794     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11795 
11796     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11797     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11798     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11799     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11800 
11801     __ leave();
11802 
11803     // check for pending exceptions
11804     Label pending;
11805     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11806     __ cbnz(rscratch1, pending);
11807 
11808     if (has_res) {
11809       __ get_vm_result_oop(r0, rthread);
11810     }
11811 
11812     __ ret(lr);
11813 
11814     __ bind(pending);
11815     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11816 
11817     // -------------
11818     // make sure all code is generated
11819     masm->flush();
11820 
11821     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11822     return stub->entry_point();
11823   }
11824 
11825   // Initialization
11826   void generate_preuniverse_stubs() {
11827     // preuniverse stubs are not needed for aarch64
11828   }
11829 
11830   void generate_initial_stubs() {
11831     // Generate the initial stubs and initialize the entry points
11832 
11833     // Entry points that exist on all platforms. Note: this is code
11834     // that could be shared among different platforms - however the
11835     // benefit seems to be smaller than the disadvantage of having a
11836     // much more complicated generator structure. See also the comment
11837     // in stubRoutines.hpp.
11838 
11839     StubRoutines::_forward_exception_entry = generate_forward_exception();
11840 
11841     StubRoutines::_call_stub_entry =
11842       generate_call_stub(StubRoutines::_call_stub_return_address);
11843 
11844     // Referenced by megamorphic calls.
11845     StubRoutines::_catch_exception_entry = generate_catch_exception();
11846 
11847     // Initialize table for copy memory (arraycopy) check.
11848     if (UnsafeMemoryAccess::_table == nullptr) {
11849       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11850     }
11851 
11852     if (UseCRC32Intrinsics) {
11853       // Set the table address before generating the stubs that use it
11854       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
11855       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11856     }
11857 
11858     if (UseCRC32CIntrinsics) {
11859       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11860     }
11861 
11862     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11863       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11864     }
11865 
11866     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11867       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11868     }
11869 
11870     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11871         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11872       StubRoutines::_hf2f = generate_float16ToFloat();
11873       StubRoutines::_f2hf = generate_floatToFloat16();
11874     }
11875 
11876     if (InlineTypeReturnedAsFields) {
11877       StubRoutines::_load_inline_type_fields_in_regs =
11878          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11879       StubRoutines::_store_inline_type_fields_to_buf =
11880          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11881     }
11882 
11883   }
11884 
11885   void generate_continuation_stubs() {
11886     // Continuation stubs:
11887     StubRoutines::_cont_thaw          = generate_cont_thaw();
11888     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11889     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11890     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11891   }
11892 
11893   void generate_final_stubs() {
11894     // support for verify_oop (must happen after universe_init)
11895     if (VerifyOops) {
11896       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11897     }
11898 
11899     // arraycopy stubs used by compilers
11900     generate_arraycopy_stubs();
11901 
11902     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11903 
11904     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11905 
11906     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11907     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11908 
11909 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11910 
11911     generate_atomic_entry_points();
11912 
11913 #endif // LINUX
11914 
11915 #ifdef COMPILER2
11916     if (UseSecondarySupersTable) {
11917       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11918       if (! InlineSecondarySupersTest) {
11919         generate_lookup_secondary_supers_table_stub();
11920       }
11921     }
11922 #endif
11923 
11924     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11925 
11926     StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
11927   }
11928 
11929   void generate_compiler_stubs() {
11930 #if COMPILER2_OR_JVMCI
11931 
11932     if (UseSVE == 0) {
11933       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
11934     }
11935 
11936     // array equals stub for large arrays.
11937     if (!UseSimpleArrayEquals) {
11938       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11939     }
11940 
11941     // arrays_hashcode stubs for large arrays.
11942     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11943     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11944     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11945     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11946     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11947 
11948     // byte_array_inflate stub for large arrays.
11949     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11950 
11951     // countPositives stub for large arrays.
11952     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11953 
11954     generate_compare_long_strings();
11955 
11956     generate_string_indexof_stubs();
11957 
11958 #ifdef COMPILER2
11959     if (UseMultiplyToLenIntrinsic) {
11960       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11961     }
11962 
11963     if (UseSquareToLenIntrinsic) {
11964       StubRoutines::_squareToLen = generate_squareToLen();
11965     }
11966 
11967     if (UseMulAddIntrinsic) {
11968       StubRoutines::_mulAdd = generate_mulAdd();
11969     }
11970 
11971     if (UseSIMDForBigIntegerShiftIntrinsics) {
11972       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11973       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11974     }
11975 
11976     if (UseMontgomeryMultiplyIntrinsic) {
11977       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
11978       StubCodeMark mark(this, stub_id);
11979       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11980       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11981     }
11982 
11983     if (UseMontgomerySquareIntrinsic) {
11984       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
11985       StubCodeMark mark(this, stub_id);
11986       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11987       // We use generate_multiply() rather than generate_square()
11988       // because it's faster for the sizes of modulus we care about.
11989       StubRoutines::_montgomerySquare = g.generate_multiply();
11990     }
11991 
11992 #endif // COMPILER2
11993 
11994     if (UseChaCha20Intrinsics) {
11995       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11996     }
11997 
11998     if (UseKyberIntrinsics) {
11999       StubRoutines::_kyberNtt = generate_kyberNtt();
12000       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12001       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12002       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12003       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12004       StubRoutines::_kyber12To16 = generate_kyber12To16();
12005       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12006     }
12007 
12008     if (UseDilithiumIntrinsics) {
12009       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12010       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12011       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12012       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12013       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12014     }
12015 
12016     if (UseBASE64Intrinsics) {
12017         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12018         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12019     }
12020 
12021     // data cache line writeback
12022     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12023     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12024 
12025     if (UseAESIntrinsics) {
12026       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12027       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12028       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12029       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12030       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12031     }
12032     if (UseGHASHIntrinsics) {
12033       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12034       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12035     }
12036     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12037       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12038     }
12039 
12040     if (UseMD5Intrinsics) {
12041       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
12042       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
12043     }
12044     if (UseSHA1Intrinsics) {
12045       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
12046       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
12047     }
12048     if (UseSHA256Intrinsics) {
12049       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
12050       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
12051     }
12052     if (UseSHA512Intrinsics) {
12053       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
12054       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
12055     }
12056     if (UseSHA3Intrinsics) {
12057 
12058       StubRoutines::_double_keccak         = generate_double_keccak();
12059       if (UseSIMDForSHA3Intrinsic) {
12060         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
12061         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
12062       } else {
12063         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompress_id);
12064         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubGenStubId::sha3_implCompressMB_id);
12065       }
12066     }
12067 
12068     if (UsePoly1305Intrinsics) {
12069       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12070     }
12071 
12072     // generate Adler32 intrinsics code
12073     if (UseAdler32Intrinsics) {
12074       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12075     }
12076 
12077 #endif // COMPILER2_OR_JVMCI
12078   }
12079 
12080  public:
12081   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
12082     switch(blob_id) {
12083     case preuniverse_id:
12084       generate_preuniverse_stubs();
12085       break;
12086     case initial_id:
12087       generate_initial_stubs();
12088       break;
12089      case continuation_id:
12090       generate_continuation_stubs();
12091       break;
12092     case compiler_id:
12093       generate_compiler_stubs();
12094       break;
12095     case final_id:
12096       generate_final_stubs();
12097       break;
12098     default:
12099       fatal("unexpected blob id: %d", blob_id);
12100       break;
12101     };
12102   }
12103 }; // end class declaration
12104 
12105 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
12106   StubGenerator g(code, blob_id);
12107 }
12108 
12109 
12110 #if defined (LINUX)
12111 
12112 // Define pointers to atomic stubs and initialize them to point to the
12113 // code in atomic_aarch64.S.
12114 
12115 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12116   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12117     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12118   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12119     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
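
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;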
12120 
12121 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12122 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12123 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12124 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12125 DEFAULT_ATOMIC_OP(xchg, 4, )
12126 DEFAULT_ATOMIC_OP(xchg, 8, )
12127 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12128 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12129 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12130 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12131 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12132 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12133 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12134 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12135 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12136 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12137 
12138 #undef DEFAULT_ATOMIC_OP
12139 
12140 #endif // LINUX