1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "code/aotCodeCache.hpp"
   31 #include "compiler/oopMap.hpp"
   32 #include "gc/shared/barrierSet.hpp"
   33 #include "gc/shared/barrierSetAssembler.hpp"
   34 #include "gc/shared/gc_globals.hpp"
   35 #include "gc/shared/tlab_globals.hpp"
   36 #include "interpreter/interpreter.hpp"
   37 #include "memory/universe.hpp"
   38 #include "nativeInst_aarch64.hpp"
   39 #include "oops/instanceOop.hpp"
   40 #include "oops/method.hpp"
   41 #include "oops/objArrayKlass.hpp"
   42 #include "oops/oop.inline.hpp"
   43 #include "prims/methodHandles.hpp"
   44 #include "prims/upcallLinker.hpp"
   45 #include "runtime/arguments.hpp"
   46 #include "runtime/atomic.hpp"
   47 #include "runtime/continuation.hpp"
   48 #include "runtime/continuationEntry.inline.hpp"
   49 #include "runtime/frame.inline.hpp"
   50 #include "runtime/handles.inline.hpp"
   51 #include "runtime/javaThread.hpp"
   52 #include "runtime/sharedRuntime.hpp"
   53 #include "runtime/stubCodeGenerator.hpp"
   54 #include "runtime/stubRoutines.hpp"
   55 #include "utilities/align.hpp"
   56 #include "utilities/checkedCast.hpp"
   57 #include "utilities/debug.hpp"
   58 #include "utilities/globalDefinitions.hpp"
   59 #include "utilities/intpow.hpp"
   60 #include "utilities/powerOfTwo.hpp"
   61 #ifdef COMPILER2
   62 #include "opto/runtime.hpp"
   63 #endif
   64 #if INCLUDE_ZGC
   65 #include "gc/z/zThreadLocalData.hpp"
   66 #endif
   67 
   68 // Declaration and definition of StubGenerator (no .hpp file).
   69 // For a more detailed description of the stub routine structure
   70 // see the comment in stubRoutines.hpp
   71 
   72 #undef __
   73 #define __ _masm->
   74 
   75 #ifdef PRODUCT
   76 #define BLOCK_COMMENT(str) /* nothing */
   77 #else
   78 #define BLOCK_COMMENT(str) __ block_comment(str)
   79 #endif
   80 
   81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   82 
   83 // Stub Code definitions
   84 
   85 class StubGenerator: public StubCodeGenerator {
   86  private:
   87 
   88 #ifdef PRODUCT
   89 #define inc_counter_np(counter) ((void)0)
   90 #else
   91   void inc_counter_np_(uint& counter) {
   92     __ incrementw(ExternalAddress((address)&counter));
   93   }
   94 #define inc_counter_np(counter) \
   95   BLOCK_COMMENT("inc_counter " #counter); \
   96   inc_counter_np_(counter);
   97 #endif
   98 
   99   // Call stubs are used to call Java from C
  100   //
  101   // Arguments:
  102   //    c_rarg0:   call wrapper address                   address
  103   //    c_rarg1:   result                                 address
  104   //    c_rarg2:   result type                            BasicType
  105   //    c_rarg3:   method                                 Method*
  106   //    c_rarg4:   (interpreter) entry point              address
  107   //    c_rarg5:   parameters                             intptr_t*
  108   //    c_rarg6:   parameter size (in words)              int
  109   //    c_rarg7:   thread                                 Thread*
  110   //
  111   // There is no return from the stub itself as any Java result
  112   // is written to result
  113   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp
  // (r31) into fp.
  117   //
  118   // we save r0-r7, which accounts for all the c arguments.
  119   //
  120   // TODO: strictly do we need to save them all? they are treated as
  121   // volatile by C so could we omit saving the ones we are going to
  122   // place in global registers (thread? method?) or those we only use
  123   // during setup of the Java call?
  124   //
  125   // we don't need to save r8 which C uses as an indirect result location
  126   // return register.
  127   //
  128   // we don't need to save r9-r15 which both C and Java treat as
  129   // volatile
  130   //
  131   // we don't need to save r16-18 because Java does not use them
  132   //
  133   // we save r19-r28 which Java uses as scratch registers and C
  134   // expects to be callee-save
  135   //
  136   // we save the bottom 64 bits of each value stored in v8-v15; it is
  137   // the responsibility of the caller to preserve larger values.
  138   //
  139   // so the stub frame looks like this when we enter Java code
  140   //
  141   //     [ return_from_Java     ] <--- sp
  142   //     [ argument word n      ]
  143   //      ...
  144   // -29 [ argument word 1      ]
  145   // -28 [ saved Floating-point Control Register ]
  146   // -26 [ saved v15            ] <--- sp_after_call
  147   // -25 [ saved v14            ]
  148   // -24 [ saved v13            ]
  149   // -23 [ saved v12            ]
  150   // -22 [ saved v11            ]
  151   // -21 [ saved v10            ]
  152   // -20 [ saved v9             ]
  153   // -19 [ saved v8             ]
  154   // -18 [ saved r28            ]
  155   // -17 [ saved r27            ]
  156   // -16 [ saved r26            ]
  157   // -15 [ saved r25            ]
  158   // -14 [ saved r24            ]
  159   // -13 [ saved r23            ]
  160   // -12 [ saved r22            ]
  161   // -11 [ saved r21            ]
  162   // -10 [ saved r20            ]
  163   //  -9 [ saved r19            ]
  164   //  -8 [ call wrapper    (r0) ]
  165   //  -7 [ result          (r1) ]
  166   //  -6 [ result type     (r2) ]
  167   //  -5 [ method          (r3) ]
  168   //  -4 [ entry point     (r4) ]
  169   //  -3 [ parameters      (r5) ]
  170   //  -2 [ parameter size  (r6) ]
  171   //  -1 [ thread (r7)          ]
  172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  173   //   1 [ saved lr       (r30) ]
  174 
  175   // Call stub stack layout word offsets from fp
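  // n.b. registers are saved in pairs with stp/stpd, so only the
  // offset of the first (lower) slot of each pair is named below;
  // its partner lives one word up (e.g. r19 at r20_off + 1 and the
  // parameters pointer (r5) at entry_point_off + 1).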
  176   enum call_stub_layout {
  177     sp_after_call_off  = -28,
  178 
  179     fpcr_off           = sp_after_call_off,
  180     d15_off            = -26,
  181     d13_off            = -24,
  182     d11_off            = -22,
  183     d9_off             = -20,
  184 
  185     r28_off            = -18,
  186     r26_off            = -16,
  187     r24_off            = -14,
  188     r22_off            = -12,
  189     r20_off            = -10,
  190     call_wrapper_off   =  -8,
  191     result_off         =  -7,
  192     result_type_off    =  -6,
  193     method_off         =  -5,
  194     entry_point_off    =  -4,
  195     parameter_size_off =  -2,
  196     thread_off         =  -1,
  197     fp_f               =   0,
  198     retaddr_off        =   1,
  199   };
  200 
  201   address generate_call_stub(address& return_address) {
  202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  204            "adjust this code");
  205 
  206     StubId stub_id = StubId::stubgen_call_stub_id;
  207     StubCodeMark mark(this, stub_id);
  208     address start = __ pc();
  209 
  210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  211 
  212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  214     const Address result        (rfp, result_off         * wordSize);
  215     const Address result_type   (rfp, result_type_off    * wordSize);
  216     const Address method        (rfp, method_off         * wordSize);
  217     const Address entry_point   (rfp, entry_point_off    * wordSize);
  218     const Address parameter_size(rfp, parameter_size_off * wordSize);
  219 
  220     const Address thread        (rfp, thread_off         * wordSize);
  221 
  222     const Address d15_save      (rfp, d15_off * wordSize);
  223     const Address d13_save      (rfp, d13_off * wordSize);
  224     const Address d11_save      (rfp, d11_off * wordSize);
  225     const Address d9_save       (rfp, d9_off * wordSize);
  226 
  227     const Address r28_save      (rfp, r28_off * wordSize);
  228     const Address r26_save      (rfp, r26_off * wordSize);
  229     const Address r24_save      (rfp, r24_off * wordSize);
  230     const Address r22_save      (rfp, r22_off * wordSize);
  231     const Address r20_save      (rfp, r20_off * wordSize);
  232 
  233     // stub code
  234 
  235     address aarch64_entry = __ pc();
  236 
  237     // set up frame and move sp to end of save area
  238     __ enter();
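    // enter() saved the caller's fp and lr (slots 0 and 1 in the
    // layout above) and pointed fp at the saved fp slot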
  239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  240 
  241     // save register parameters and Java scratch/global registers
  242     // n.b. we save thread even though it gets installed in
  243     // rthread because we want to sanity check rthread later
  244     __ str(c_rarg7,  thread);
  245     __ strw(c_rarg6, parameter_size);
  246     __ stp(c_rarg4, c_rarg5,  entry_point);
  247     __ stp(c_rarg2, c_rarg3,  result_type);
  248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  249 
  250     __ stp(r20, r19,   r20_save);
  251     __ stp(r22, r21,   r22_save);
  252     __ stp(r24, r23,   r24_save);
  253     __ stp(r26, r25,   r26_save);
  254     __ stp(r28, r27,   r28_save);
  255 
  256     __ stpd(v9,  v8,   d9_save);
  257     __ stpd(v11, v10,  d11_save);
  258     __ stpd(v13, v12,  d13_save);
  259     __ stpd(v15, v14,  d15_save);
  260 
  261     __ get_fpcr(rscratch1);
  262     __ str(rscratch1, fpcr_save);
  263     // Set FPCR to the state we need. We do want Round to Nearest. We
  264     // don't want non-IEEE rounding modes or floating-point traps.
  265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  267     __ set_fpcr(rscratch1);
  268 
    // install Java thread in global register now that we have saved
    // whatever value it held
  271     __ mov(rthread, c_rarg7);
  272     // And method
  273     __ mov(rmethod, c_rarg3);
  274 
  275     // set up the heapbase register
  276     __ reinit_heapbase();
  277 
  278 #ifdef ASSERT
  279     // make sure we have no pending exceptions
  280     {
  281       Label L;
  282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  283       __ cmp(rscratch1, (u1)NULL_WORD);
  284       __ br(Assembler::EQ, L);
  285       __ stop("StubRoutines::call_stub: entered with pending exception");
  286       __ BIND(L);
  287     }
  288 #endif
  289     // pass parameters if any
  290     __ mov(esp, sp);
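    // reserve room below sp for the parameter words and keep the
    // machine sp 16-byte aligned as AArch64 requires, hence the
    // mask with -2 * wordSize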
  291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  292     __ andr(sp, rscratch1, -2 * wordSize);
  293 
  294     BLOCK_COMMENT("pass parameters if any");
  295     Label parameters_done;
  296     // parameter count is still in c_rarg6
  297     // and parameter pointer identifying param 1 is in c_rarg5
  298     __ cbzw(c_rarg6, parameters_done);
  299 
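    // push each of the c_rarg6 parameter words, reading them
    // upwards from the array at c_rarg5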
  300     address loop = __ pc();
  301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  302     __ subsw(c_rarg6, c_rarg6, 1);
  303     __ push(rscratch1);
  304     __ br(Assembler::GT, loop);
  305 
  306     __ BIND(parameters_done);
  307 
    // call Java entry -- passing Method* and current sp
  309     //      rmethod: Method*
  310     //      r19_sender_sp: sender sp
  311     BLOCK_COMMENT("call Java function");
  312     __ mov(r19_sender_sp, sp);
  313     __ blr(c_rarg4);
  314 
  315     // we do this here because the notify will already have been done
  316     // if we get to the next instruction via an exception
  317     //
  318     // n.b. adding this instruction here affects the calculation of
  319     // whether or not a routine returns to the call stub (used when
  320     // doing stack walks) since the normal test is to check the return
  321     // pc against the address saved below. so we may need to allow for
  322     // this extra instruction in the check.
  323 
  324     // save current address for use by exception handling code
  325 
  326     return_address = __ pc();
  327 
  328     // store result depending on type (everything that is not
  329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  330     // n.b. this assumes Java returns an integral result in r0
  331     // and a floating result in j_farg0
  332     __ ldr(j_rarg2, result);
  333     Label is_long, is_float, is_double, exit;
  334     __ ldr(j_rarg1, result_type);
  335     __ cmp(j_rarg1, (u1)T_OBJECT);
  336     __ br(Assembler::EQ, is_long);
  337     __ cmp(j_rarg1, (u1)T_LONG);
  338     __ br(Assembler::EQ, is_long);
  339     __ cmp(j_rarg1, (u1)T_FLOAT);
  340     __ br(Assembler::EQ, is_float);
  341     __ cmp(j_rarg1, (u1)T_DOUBLE);
  342     __ br(Assembler::EQ, is_double);
  343 
  344     // handle T_INT case
  345     __ strw(r0, Address(j_rarg2));
  346 
  347     __ BIND(exit);
  348 
  349     // pop parameters
  350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  351 
  352 #ifdef ASSERT
  353     // verify that threads correspond
  354     {
  355       Label L, S;
  356       __ ldr(rscratch1, thread);
  357       __ cmp(rthread, rscratch1);
  358       __ br(Assembler::NE, S);
  359       __ get_thread(rscratch1);
  360       __ cmp(rthread, rscratch1);
  361       __ br(Assembler::EQ, L);
  362       __ BIND(S);
  363       __ stop("StubRoutines::call_stub: threads must correspond");
  364       __ BIND(L);
  365     }
  366 #endif
  367 
  368     __ pop_cont_fastpath(rthread);
  369 
  370     // restore callee-save registers
  371     __ ldpd(v15, v14,  d15_save);
  372     __ ldpd(v13, v12,  d13_save);
  373     __ ldpd(v11, v10,  d11_save);
  374     __ ldpd(v9,  v8,   d9_save);
  375 
  376     __ ldp(r28, r27,   r28_save);
  377     __ ldp(r26, r25,   r26_save);
  378     __ ldp(r24, r23,   r24_save);
  379     __ ldp(r22, r21,   r22_save);
  380     __ ldp(r20, r19,   r20_save);
  381 
  382     // restore fpcr
  383     __ ldr(rscratch1,  fpcr_save);
  384     __ set_fpcr(rscratch1);
  385 
  386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  387     __ ldrw(c_rarg2, result_type);
  388     __ ldr(c_rarg3,  method);
  389     __ ldp(c_rarg4, c_rarg5,  entry_point);
  390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  391 
  392     // leave frame and return to caller
  393     __ leave();
  394     __ ret(lr);
  395 
  396     // handle return types different from T_INT
  397 
  398     __ BIND(is_long);
  399     __ str(r0, Address(j_rarg2, 0));
  400     __ br(Assembler::AL, exit);
  401 
  402     __ BIND(is_float);
  403     __ strs(j_farg0, Address(j_rarg2, 0));
  404     __ br(Assembler::AL, exit);
  405 
  406     __ BIND(is_double);
  407     __ strd(j_farg0, Address(j_rarg2, 0));
  408     __ br(Assembler::AL, exit);
  409 
  410     return start;
  411   }
  412 
  413   // Return point for a Java call if there's an exception thrown in
  414   // Java code.  The exception is caught and transformed into a
  415   // pending exception stored in JavaThread that can be tested from
  416   // within the VM.
  417   //
  418   // Note: Usually the parameters are removed by the callee. In case
  419   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
  422   //
  423   // r0: exception oop
  424 
  425   address generate_catch_exception() {
  426     StubId stub_id = StubId::stubgen_catch_exception_id;
  427     StubCodeMark mark(this, stub_id);
  428     address start = __ pc();
  429 
  430     // same as in generate_call_stub():
  431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  432     const Address thread        (rfp, thread_off         * wordSize);
  433 
  434 #ifdef ASSERT
  435     // verify that threads correspond
  436     {
  437       Label L, S;
  438       __ ldr(rscratch1, thread);
  439       __ cmp(rthread, rscratch1);
  440       __ br(Assembler::NE, S);
  441       __ get_thread(rscratch1);
  442       __ cmp(rthread, rscratch1);
  443       __ br(Assembler::EQ, L);
  444       __ bind(S);
  445       __ stop("StubRoutines::catch_exception: threads must correspond");
  446       __ bind(L);
  447     }
  448 #endif
  449 
  450     // set pending exception
  451     __ verify_oop(r0);
  452 
  453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  454     __ mov(rscratch1, (address)__FILE__);
  455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  456     __ movw(rscratch1, (int)__LINE__);
  457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  458 
  459     // complete return to VM
  460     assert(StubRoutines::_call_stub_return_address != nullptr,
  461            "_call_stub_return_address must have been generated before");
  462     __ b(StubRoutines::_call_stub_return_address);
  463 
  464     return start;
  465   }
  466 
  467   // Continuation point for runtime calls returning with a pending
  468   // exception.  The pending exception check happened in the runtime
  469   // or native call stub.  The pending exception in Thread is
  470   // converted into a Java-level exception.
  471   //
  472   // Contract with Java-level exception handlers:
  473   // r0: exception
  474   // r3: throwing pc
  475   //
  476   // NOTE: At entry of this stub, exception-pc must be in LR !!
  477 
  478   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
  480 
  481   address generate_forward_exception() {
  482     StubId stub_id = StubId::stubgen_forward_exception_id;
  483     StubCodeMark mark(this, stub_id);
  484     address start = __ pc();
  485 
  486     // Upon entry, LR points to the return address returning into
  487     // Java (interpreted or compiled) code; i.e., the return address
  488     // becomes the throwing pc.
  489     //
  490     // Arguments pushed before the runtime call are still on the stack
  491     // but the exception handler will reset the stack pointer ->
  492     // ignore them.  A potential result in registers can be ignored as
  493     // well.
  494 
  495 #ifdef ASSERT
  496     // make sure this code is only executed if there is a pending exception
  497     {
  498       Label L;
  499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  500       __ cbnz(rscratch1, L);
  501       __ stop("StubRoutines::forward exception: no pending exception (1)");
  502       __ bind(L);
  503     }
  504 #endif
  505 
  506     // compute exception handler into r19
  507 
  508     // call the VM to find the handler address associated with the
  509     // caller address. pass thread in r0 and caller pc (ret address)
  510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  511     // the stack.
  512     __ mov(c_rarg1, lr);
  513     // lr will be trashed by the VM call so we move it to R19
  514     // (callee-saved) because we also need to pass it to the handler
  515     // returned by this call.
  516     __ mov(r19, lr);
  517     BLOCK_COMMENT("call exception_handler_for_return_address");
  518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  519                          SharedRuntime::exception_handler_for_return_address),
  520                     rthread, c_rarg1);
  521     // Reinitialize the ptrue predicate register, in case the external runtime
  522     // call clobbers ptrue reg, as we may return to SVE compiled code.
  523     __ reinitialize_ptrue();
  524 
    // we should not really care that lr no longer holds the throwing
    // pc. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
  529     // the PC for the frame above the handler belongs to a compiled
  530     // Java method. So, we restore lr here to satisfy that assert.
  531     __ mov(lr, r19);
  532     // setup r0 & r3 & clear pending exception
  533     __ mov(r3, r19);
  534     __ mov(r19, r0);
  535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  537 
  538 #ifdef ASSERT
  539     // make sure exception is set
  540     {
  541       Label L;
  542       __ cbnz(r0, L);
  543       __ stop("StubRoutines::forward exception: no pending exception (2)");
  544       __ bind(L);
  545     }
  546 #endif
  547 
  548     // continue at exception handler
  549     // r0: exception
  550     // r3: throwing pc
  551     // r19: exception handler
  552     __ verify_oop(r0);
  553     __ br(r19);
  554 
  555     return start;
  556   }
  557 
  558   // Non-destructive plausibility checks for oops
  559   //
  560   // Arguments:
  561   //    r0: oop to verify
  562   //    rscratch1: error message
  563   //
  564   // Stack after saving c_rarg3:
  565   //    [tos + 0]: saved c_rarg3
  566   //    [tos + 1]: saved c_rarg2
  567   //    [tos + 2]: saved lr
  568   //    [tos + 3]: saved rscratch2
  569   //    [tos + 4]: saved r0
  570   //    [tos + 5]: saved rscratch1
  571   address generate_verify_oop() {
  572     StubId stub_id = StubId::stubgen_verify_oop_id;
  573     StubCodeMark mark(this, stub_id);
  574     address start = __ pc();
  575 
  576     Label exit, error;
  577 
  578     // save c_rarg2 and c_rarg3
  579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  580 
  581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  583     __ ldr(c_rarg3, Address(c_rarg2));
  584     __ add(c_rarg3, c_rarg3, 1);
  585     __ str(c_rarg3, Address(c_rarg2));
  586 
  587     // object is in r0
  588     // make sure object is 'reasonable'
  589     __ cbz(r0, exit); // if obj is null it is OK
  590 
  591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  593 
  594     // return if everything seems ok
  595     __ bind(exit);
  596 
  597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  598     __ ret(lr);
  599 
  600     // handle errors
  601     __ bind(error);
  602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  603 
  604     __ push(RegSet::range(r0, r29), sp);
  605     // debug(char* msg, int64_t pc, int64_t regs[])
  606     __ mov(c_rarg0, rscratch1);      // pass address of error message
  607     __ mov(c_rarg1, lr);             // pass return address
  608     __ mov(c_rarg2, sp);             // pass address of regs on stack
  609 #ifndef PRODUCT
  610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  611 #endif
  612     BLOCK_COMMENT("call MacroAssembler::debug");
  613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  614     __ blr(rscratch1);
  615     __ hlt(0);
  616 
  617     return start;
  618   }
  619 
  620   // Generate indices for iota vector.
  621   address generate_iota_indices(StubId stub_id) {
  622     __ align(CodeEntryAlignment);
  623     StubCodeMark mark(this, stub_id);
  624     address start = __ pc();
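    // each 16-byte row below is one vector constant holding
    // ascending lane indices (0, 1, 2, ...) for the element size
    // named in the comment; the last two rows hold the same
    // sequences as float and double values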
  625     // B
  626     __ emit_data64(0x0706050403020100, relocInfo::none);
  627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  628     // H
  629     __ emit_data64(0x0003000200010000, relocInfo::none);
  630     __ emit_data64(0x0007000600050004, relocInfo::none);
  631     // S
  632     __ emit_data64(0x0000000100000000, relocInfo::none);
  633     __ emit_data64(0x0000000300000002, relocInfo::none);
  634     // D
  635     __ emit_data64(0x0000000000000000, relocInfo::none);
  636     __ emit_data64(0x0000000000000001, relocInfo::none);
  637     // S - FP
  638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  640     // D - FP
  641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  643     return start;
  644   }
  645 
  646   // The inner part of zero_words().  This is the bulk operation,
  647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  648   // caller is responsible for zeroing the last few words.
  649   //
  650   // Inputs:
  651   // r10: the HeapWord-aligned base address of an array to zero.
  652   // r11: the count in HeapWords, r11 > 0.
  653   //
  654   // Returns r10 and r11, adjusted for the caller to clear.
  655   // r10: the base address of the tail of words left to clear.
  656   // r11: the number of words in the tail.
  657   //      r11 < MacroAssembler::zero_words_block_size.
  658 
  659   address generate_zero_blocks() {
  660     Label done;
  661     Label base_aligned;
  662 
  663     Register base = r10, cnt = r11;
  664 
  665     __ align(CodeEntryAlignment);
  666     StubId stub_id = StubId::stubgen_zero_blocks_id;
  667     StubCodeMark mark(this, stub_id);
  668     address start = __ pc();
  669 
  670     if (UseBlockZeroing) {
  671       int zva_length = VM_Version::zva_length();
  672 
  673       // Ensure ZVA length can be divided by 16. This is required by
  674       // the subsequent operations.
  675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  676 
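      // if base is only 8-byte aligned, zero a single word to bring
      // it to a 16-byte boundary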
  677       __ tbz(base, 3, base_aligned);
  678       __ str(zr, Address(__ post(base, 8)));
  679       __ sub(cnt, cnt, 1);
  680       __ bind(base_aligned);
  681 
  682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  683       // alignment.
  684       Label small;
  685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
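      // zva_length and BlockZeroingLowLimit are byte counts while
      // cnt is in words, hence the >> 3 in the comparison below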
  686       __ subs(rscratch1, cnt, low_limit >> 3);
  687       __ br(Assembler::LT, small);
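      // n.b. zero_dcache_blocks is expected to zero whole cache
      // lines with DC ZVA and leave base/cnt describing any tail it
      // does not cover; the stp loop below picks that up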
  688       __ zero_dcache_blocks(base, cnt);
  689       __ bind(small);
  690     }
  691 
  692     {
  693       // Number of stp instructions we'll unroll
  694       const int unroll =
  695         MacroAssembler::zero_words_block_size / 2;
  696       // Clear the remaining blocks.
  697       Label loop;
  698       __ subs(cnt, cnt, unroll * 2);
  699       __ br(Assembler::LT, done);
  700       __ bind(loop);
  701       for (int i = 0; i < unroll; i++)
  702         __ stp(zr, zr, __ post(base, 16));
  703       __ subs(cnt, cnt, unroll * 2);
  704       __ br(Assembler::GE, loop);
  705       __ bind(done);
  706       __ add(cnt, cnt, unroll * 2);
  707     }
  708 
  709     __ ret(lr);
  710 
  711     return start;
  712   }
  713 
  714 
  715   typedef enum {
  716     copy_forwards = 1,
  717     copy_backwards = -1
  718   } copy_direction;
  719 
  720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  721   // for arraycopy stubs.
  722   class ArrayCopyBarrierSetHelper : StackObj {
  723     BarrierSetAssembler* _bs_asm;
  724     MacroAssembler* _masm;
  725     DecoratorSet _decorators;
  726     BasicType _type;
  727     Register _gct1;
  728     Register _gct2;
  729     Register _gct3;
  730     FloatRegister _gcvt1;
  731     FloatRegister _gcvt2;
  732     FloatRegister _gcvt3;
  733 
  734   public:
  735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  736                               DecoratorSet decorators,
  737                               BasicType type,
  738                               Register gct1,
  739                               Register gct2,
  740                               Register gct3,
  741                               FloatRegister gcvt1,
  742                               FloatRegister gcvt2,
  743                               FloatRegister gcvt3)
  744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  745         _masm(masm),
  746         _decorators(decorators),
  747         _type(type),
  748         _gct1(gct1),
  749         _gct2(gct2),
  750         _gct3(gct3),
  751         _gcvt1(gcvt1),
  752         _gcvt2(gcvt2),
  753         _gcvt3(gcvt3) {
  754     }
  755 
  756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  758                             dst1, dst2, src,
  759                             _gct1, _gct2, _gcvt1);
  760     }
  761 
  762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  764                              dst, src1, src2,
  765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  766     }
  767 
  768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  770                             dst1, dst2, src,
  771                             _gct1);
  772     }
  773 
  774     void copy_store_at_16(Address dst, Register src1, Register src2) {
  775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  776                              dst, src1, src2,
  777                              _gct1, _gct2, _gct3);
  778     }
  779 
  780     void copy_load_at_8(Register dst, Address src) {
  781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  782                             dst, noreg, src,
  783                             _gct1);
  784     }
  785 
  786     void copy_store_at_8(Address dst, Register src) {
  787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  788                              dst, src, noreg,
  789                              _gct1, _gct2, _gct3);
  790     }
  791   };
  792 
  793   // Bulk copy of blocks of 8 words.
  794   //
  795   // count is a count of words.
  796   //
  797   // Precondition: count >= 8
  798   //
  799   // Postconditions:
  800   //
  801   // The least significant bit of count contains the remaining count
  802   // of words to copy.  The rest of count is trash.
  803   //
  804   // s and d are adjusted to point to the remaining words to copy
  805   //
  806   void generate_copy_longs(StubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  807     BasicType type;
  808     copy_direction direction;
  809 
  810     switch (stub_id) {
  811     case StubId::stubgen_copy_byte_f_id:
  812       direction = copy_forwards;
  813       type = T_BYTE;
  814       break;
  815     case StubId::stubgen_copy_byte_b_id:
  816       direction = copy_backwards;
  817       type = T_BYTE;
  818       break;
  819     case StubId::stubgen_copy_oop_f_id:
  820       direction = copy_forwards;
  821       type = T_OBJECT;
  822       break;
  823     case StubId::stubgen_copy_oop_b_id:
  824       direction = copy_backwards;
  825       type = T_OBJECT;
  826       break;
  827     case StubId::stubgen_copy_oop_uninit_f_id:
  828       direction = copy_forwards;
  829       type = T_OBJECT;
  830       break;
  831     case StubId::stubgen_copy_oop_uninit_b_id:
  832       direction = copy_backwards;
  833       type = T_OBJECT;
  834       break;
  835     default:
  836       ShouldNotReachHere();
  837     }
  838 
  839     int unit = wordSize * direction;
  840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  841 
  842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  844     const Register stride = r14;
  845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  848 
  849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  850     assert_different_registers(s, d, count, rscratch1, rscratch2);
  851 
  852     Label again, drain;
  853 
  854     __ align(CodeEntryAlignment);
  855 
  856     StubCodeMark mark(this, stub_id);
  857 
  858     __ bind(start);
  859 
  860     Label unaligned_copy_long;
  861     if (AvoidUnalignedAccesses) {
  862       __ tbnz(d, 3, unaligned_copy_long);
  863     }
  864 
  865     if (direction == copy_forwards) {
  866       __ sub(s, s, bias);
  867       __ sub(d, d, bias);
  868     }
  869 
  870 #ifdef ASSERT
  871     // Make sure we are never given < 8 words
  872     {
  873       Label L;
  874       __ cmp(count, (u1)8);
  875       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
  877       __ bind(L);
  878     }
  879 #endif
  880 
  881     // Fill 8 registers
  882     if (UseSIMDForMemoryOps) {
  883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  885     } else {
  886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  890     }
  891 
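    // count is reduced by 16 up front: 8 for the words just loaded
    // and 8 for the words the drain code stores after the loop; the
    // loop itself runs while at least 8 more words remain to load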
  892     __ subs(count, count, 16);
  893     __ br(Assembler::LO, drain);
  894 
  895     int prefetch = PrefetchCopyIntervalInBytes;
  896     bool use_stride = false;
  897     if (direction == copy_backwards) {
  898        use_stride = prefetch > 256;
  899        prefetch = -prefetch;
  900        if (use_stride) __ mov(stride, prefetch);
  901     }
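    // n.b. for a backwards copy the prefetch offset is negative;
    // when it would not fit the signed 9-bit unscaled immediate form
    // (i.e. prefetch > 256 bytes) we keep it in the stride register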
  902 
  903     __ bind(again);
  904 
  905     if (PrefetchCopyIntervalInBytes > 0)
  906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  907 
  908     if (UseSIMDForMemoryOps) {
  909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  913     } else {
  914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  922     }
  923 
  924     __ subs(count, count, 8);
  925     __ br(Assembler::HS, again);
  926 
  927     // Drain
  928     __ bind(drain);
  929     if (UseSIMDForMemoryOps) {
  930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  932     } else {
  933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  937     }
  938 
  939     {
  940       Label L1, L2;
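      // bits 2 and 1 of count tell whether a 4 word and/or a 2 word
      // subblock remains to be copied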
  941       __ tbz(count, exact_log2(4), L1);
  942       if (UseSIMDForMemoryOps) {
  943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  945       } else {
  946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  950       }
  951       __ bind(L1);
  952 
  953       if (direction == copy_forwards) {
  954         __ add(s, s, bias);
  955         __ add(d, d, bias);
  956       }
  957 
  958       __ tbz(count, 1, L2);
  959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  961       __ bind(L2);
  962     }
  963 
  964     __ ret(lr);
  965 
  966     if (AvoidUnalignedAccesses) {
  967       Label drain, again;
  968       // Register order for storing. Order is different for backward copy.
  969 
  970       __ bind(unaligned_copy_long);
  971 
  972       // source address is even aligned, target odd aligned
  973       //
  974       // when forward copying word pairs we read long pairs at offsets
  975       // {0, 2, 4, 6} (in long words). when backwards copying we read
  976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  977       // address by -2 in the forwards case so we can compute the
  978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  979       // or -1.
  980       //
  981       // when forward copying we need to store 1 word, 3 pairs and
  982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
  985       //
      // When backwards copying we need to store 1 word, 3 pairs and
  987       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  988       // offsets {1, 3, 5, 7, 8} * unit.
  989 
  990       if (direction == copy_forwards) {
  991         __ sub(s, s, 16);
  992         __ sub(d, d, 8);
  993       }
  994 
  995       // Fill 8 registers
  996       //
  997       // for forwards copy s was offset by -16 from the original input
  998       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
 1000       // and so on for each successive 64 byte block when s is updated
 1001       //
 1002       // t0 at offset 0,  t1 at offset 8
 1003       // t2 at offset 16, t3 at offset 24
 1004       // t4 at offset 32, t5 at offset 40
 1005       // t6 at offset 48, t7 at offset 56
 1006 
 1007       // for backwards copy s was not offset so the register contents
 1008       // are at these offsets into the preceding 64 byte block
 1009       // relative to that original input and so on for each successive
 1010       // preceding 64 byte block when s is updated. this explains the
 1011       // slightly counter-intuitive looking pattern of register usage
 1012       // in the stp instructions for backwards copy.
 1013       //
 1014       // t0 at offset -16, t1 at offset -8
 1015       // t2 at offset -32, t3 at offset -24
 1016       // t4 at offset -48, t5 at offset -40
 1017       // t6 at offset -64, t7 at offset -56
 1018 
 1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1023 
 1024       __ subs(count, count, 16);
 1025       __ br(Assembler::LO, drain);
 1026 
 1027       int prefetch = PrefetchCopyIntervalInBytes;
 1028       bool use_stride = false;
 1029       if (direction == copy_backwards) {
 1030          use_stride = prefetch > 256;
 1031          prefetch = -prefetch;
 1032          if (use_stride) __ mov(stride, prefetch);
 1033       }
 1034 
 1035       __ bind(again);
 1036 
 1037       if (PrefetchCopyIntervalInBytes > 0)
 1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1039 
 1040       if (direction == copy_forwards) {
 1041        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
 1043        // offsets
 1044        //
 1045        // t0 at offset 0
 1046        // t1 at offset 8,  t2 at offset 16
 1047        // t3 at offset 24, t4 at offset 32
 1048        // t5 at offset 40, t6 at offset 48
 1049        // t7 at offset 56
 1050 
 1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1060       } else {
 1061        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
 1063        // offsets
 1064        //
 1065        // t1 at offset -8
 1066        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 1068        // t7 at offset -56, t4 at offset -48
 1069        //                   t6 at offset -64
 1070        //
 1071        // note that this matches the offsets previously noted for the
 1072        // loads
 1073 
 1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1083       }
 1084 
 1085       __ subs(count, count, 8);
 1086       __ br(Assembler::HS, again);
 1087 
 1088       // Drain
 1089       //
 1090       // this uses the same pattern of offsets and register arguments
 1091       // as above
 1092       __ bind(drain);
 1093       if (direction == copy_forwards) {
 1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1099       } else {
 1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1105       }
 1106       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
 1108       // bits 2 and 1 in the count are the tell-tale for whether we
 1109       // have each such subblock
 1110       {
 1111         Label L1, L2;
 1112         __ tbz(count, exact_log2(4), L1);
 1113        // this is the same as above but copying only 4 longs hence
 1114        // with only one intervening stp between the str instructions
 1115        // but note that the offsets and registers still follow the
 1116        // same pattern
 1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1119         if (direction == copy_forwards) {
 1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1123         } else {
 1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1127         }
 1128         __ bind(L1);
 1129 
 1130         __ tbz(count, 1, L2);
 1131        // this is the same as above but copying only 2 longs hence
 1132        // there is no intervening stp between the str instructions
 1133        // but note that the offset and register patterns are still
 1134        // the same
 1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1136         if (direction == copy_forwards) {
 1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1139         } else {
 1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1142         }
 1143         __ bind(L2);
 1144 
 1145        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
 1147 
 1148        if (direction == copy_forwards) {
 1149          __ add(s, s, 16);
 1150          __ add(d, d, 8);
 1151        }
 1152 
 1153       }
 1154 
 1155       __ ret(lr);
    }
 1157   }
 1158 
 1159   // Small copy: less than 16 bytes.
 1160   //
 1161   // NB: Ignores all of the bits of count which represent more than 15
 1162   // bytes, so a caller doesn't have to mask them.
 1163 
 1164   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1165     bool is_backwards = step < 0;
 1166     size_t granularity = g_uabs(step);
 1167     int direction = is_backwards ? -1 : 1;
 1168 
 1169     Label Lword, Lint, Lshort, Lbyte;
 1170 
 1171     assert(granularity
 1172            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1173 
 1174     const Register t0 = r3;
 1175     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1176     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1177 
 1178     // ??? I don't know if this bit-test-and-branch is the right thing
 1179     // to do.  It does a lot of jumping, resulting in several
 1180     // mispredicted branches.  It might make more sense to do this
 1181     // with something like Duff's device with a single computed branch.
 1182 
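    // count is in units of the copy granularity: testing bit
    // (3 - log2(granularity)) tells whether a full 8-byte word
    // remains, and the lower bit positions likewise select an int,
    // short and byte tail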
 1183     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1184     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1185     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1186     __ bind(Lword);
 1187 
 1188     if (granularity <= sizeof (jint)) {
 1189       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1190       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1191       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1192       __ bind(Lint);
 1193     }
 1194 
 1195     if (granularity <= sizeof (jshort)) {
 1196       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1197       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1198       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1199       __ bind(Lshort);
 1200     }
 1201 
 1202     if (granularity <= sizeof (jbyte)) {
 1203       __ tbz(count, 0, Lbyte);
 1204       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1205       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1206       __ bind(Lbyte);
 1207     }
 1208   }
 1209 
 1210   Label copy_f, copy_b;
 1211   Label copy_obj_f, copy_obj_b;
 1212   Label copy_obj_uninit_f, copy_obj_uninit_b;
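  // entry points for the bulk 8-word copy loops emitted by
  // generate_copy_longs; copy_memory branches and links to them once
  // the source is 2-word aligned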
 1213 
 1214   // All-singing all-dancing memory copy.
 1215   //
 1216   // Copy count units of memory from s to d.  The size of a unit is
 1217   // step, which can be positive or negative depending on the direction
 1218   // of copy.  If is_aligned is false, we align the source address.
 1219   //
 1220 
 1221   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1222                    Register s, Register d, Register count, int step) {
 1223     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1224     bool is_backwards = step < 0;
 1225     unsigned int granularity = g_uabs(step);
 1226     const Register t0 = r3, t1 = r4;
 1227 
 1228     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
 1229     // load all the data before writing anything
 1230     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1231     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1232     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1233     const Register send = r17, dend = r16;
 1234     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1235     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1236     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1237 
 1238     if (PrefetchCopyIntervalInBytes > 0)
 1239       __ prfm(Address(s, 0), PLDL1KEEP);
 1240     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1241     __ br(Assembler::HI, copy_big);
 1242 
 1243     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1244     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
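    // send and dend point just past the last source/destination
    // element; the small cases below copy from both the front (s/d)
    // and the back (send/dend) so a pair of possibly overlapping
    // accesses covers any length in range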
 1245 
 1246     __ cmp(count, u1(16/granularity));
 1247     __ br(Assembler::LS, copy16);
 1248 
 1249     __ cmp(count, u1(64/granularity));
 1250     __ br(Assembler::HI, copy80);
 1251 
 1252     __ cmp(count, u1(32/granularity));
 1253     __ br(Assembler::LS, copy32);
 1254 
 1255     // 33..64 bytes
 1256     if (UseSIMDForMemoryOps) {
 1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1258       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1259       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1260       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1261     } else {
 1262       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1263       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1264       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1265       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1266 
 1267       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1268       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1269       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1270       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1271     }
 1272     __ b(finish);
 1273 
 1274     // 17..32 bytes
 1275     __ bind(copy32);
 1276     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1277     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1278 
 1279     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1280     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1281     __ b(finish);
 1282 
 1283     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
 1285     __ bind(copy80);
 1286     if (UseSIMDForMemoryOps) {
 1287       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1288       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1289       // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
 1294       // For such cases using the pair of ldp/stp instead of the third pair of
 1295       // ldpq/stpq fixes the performance issue.
 1296       if (granularity < sizeof (jint)) {
 1297         Label copy96;
 1298         __ cmp(count, u1(80/granularity));
 1299         __ br(Assembler::HI, copy96);
 1300         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1301 
 1302         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1303         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1304 
 1305         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1306         __ b(finish);
 1307 
 1308         __ bind(copy96);
 1309       }
 1310       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1311 
 1312       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1313       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1314 
 1315       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1316     } else {
 1317       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1318       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1319       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1320       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1321       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1322 
 1323       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1324       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1325       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1326       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1327       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1328     }
 1329     __ b(finish);
 1330 
 1331     // 0..16 bytes
 1332     __ bind(copy16);
 1333     __ cmp(count, u1(8/granularity));
 1334     __ br(Assembler::LO, copy8);
 1335 
 1336     // 8..16 bytes
 1337     bs.copy_load_at_8(t0, Address(s, 0));
 1338     bs.copy_load_at_8(t1, Address(send, -8));
 1339     bs.copy_store_at_8(Address(d, 0), t0);
 1340     bs.copy_store_at_8(Address(dend, -8), t1);
 1341     __ b(finish);
 1342 
 1343     if (granularity < 8) {
 1344       // 4..7 bytes
 1345       __ bind(copy8);
 1346       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1347       __ ldrw(t0, Address(s, 0));
 1348       __ ldrw(t1, Address(send, -4));
 1349       __ strw(t0, Address(d, 0));
 1350       __ strw(t1, Address(dend, -4));
 1351       __ b(finish);
 1352       if (granularity < 4) {
 1353         // 0..3 bytes
 1354         __ bind(copy4);
 1355         __ cbz(count, finish); // get rid of 0 case
 1356         if (granularity == 2) {
 1357           __ ldrh(t0, Address(s, 0));
 1358           __ strh(t0, Address(d, 0));
 1359         } else { // granularity == 1
 1360           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1361           // the first and last byte.
 1362           // Handle the 3 byte case by loading and storing base + count/2
 1363           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
 1365           // byte 3 times.
 1366           __ lsr(count, count, 1);
 1367           __ ldrb(t0, Address(s, 0));
 1368           __ ldrb(t1, Address(send, -1));
 1369           __ ldrb(t2, Address(s, count));
 1370           __ strb(t0, Address(d, 0));
 1371           __ strb(t1, Address(dend, -1));
 1372           __ strb(t2, Address(d, count));
 1373         }
 1374         __ b(finish);
 1375       }
 1376     }
 1377 
 1378     __ bind(copy_big);
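    // a backwards copy is passed the start addresses; step s and d
    // past the end of the data so the copy can proceed downwards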
 1379     if (is_backwards) {
 1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1382     }
 1383 
    // Now that we've got the small cases out of the way we can align the
    // source address on a 2-word boundary.
 1386 
 1387     // Here we will materialize a count in r15, which is used by copy_memory_small
 1388     //  and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
 1389     //  Up until here, we have used t9, which aliases r15, but from here on, that register
 1390     //  cannot be used as a temp register, as it contains the count.
 1391 
 1392     Label aligned;
 1393 
 1394     if (is_aligned) {
 1395       // We may have to adjust by 1 word to get s 2-word-aligned.
 1396       __ tbz(s, exact_log2(wordSize), aligned);
 1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1399       __ sub(count, count, wordSize/granularity);
 1400     } else {
 1401       if (is_backwards) {
 1402         __ andr(r15, s, 2 * wordSize - 1);
 1403       } else {
 1404         __ neg(r15, s);
 1405         __ andr(r15, r15, 2 * wordSize - 1);
 1406       }
 1407       // r15 is the byte adjustment needed to align s.
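            // For a forward copy r15 = (-s) & 15, for a backward copy r15 = s & 15;
            // e.g. a forward byte copy from s == 0x1003 gives r15 == 13, so the small
            // copy below advances s to the 16-byte boundary 0x1010.  When granularity
            // is greater than 1, r15 is scaled from bytes to elements first.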
 1408       __ cbz(r15, aligned);
 1409       int shift = exact_log2(granularity);
 1410       if (shift > 0) {
 1411         __ lsr(r15, r15, shift);
 1412       }
 1413       __ sub(count, count, r15);
 1414 
 1415 #if 0
 1416       // ?? This code is only correct for a disjoint copy.  It may or
 1417       // may not make sense to use it in that case.
 1418 
 1419       // Copy the first pair; s and d may not be aligned.
 1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1422 
 1423       // Align s and d, adjust count
 1424       if (is_backwards) {
 1425         __ sub(s, s, r15);
 1426         __ sub(d, d, r15);
 1427       } else {
 1428         __ add(s, s, r15);
 1429         __ add(d, d, r15);
 1430       }
 1431 #else
 1432       copy_memory_small(decorators, type, s, d, r15, step);
 1433 #endif
 1434     }
 1435 
 1436     __ bind(aligned);
 1437 
 1438     // s is now 2-word-aligned.
 1439 
 1440     // We have a count of units and some trailing bytes. Adjust the
 1441     // count and do a bulk copy of words. If the shift is zero
 1442     // perform a move instead to benefit from zero latency moves.
 1443     int shift = exact_log2(wordSize/granularity);
 1444     if (shift > 0) {
 1445       __ lsr(r15, count, shift);
 1446     } else {
 1447       __ mov(r15, count);
 1448     }
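          // r15 now holds the element count scaled down to a count of 8-byte words
          // for the bulk copy stubs; any elements that do not fill a whole word are
          // finished by the trailing copy_memory_small call below.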
 1449     if (direction == copy_forwards) {
 1450       if (type != T_OBJECT) {
 1451         __ bl(copy_f);
 1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1453         __ bl(copy_obj_uninit_f);
 1454       } else {
 1455         __ bl(copy_obj_f);
 1456       }
 1457     } else {
 1458       if (type != T_OBJECT) {
 1459         __ bl(copy_b);
 1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1461         __ bl(copy_obj_uninit_b);
 1462       } else {
 1463         __ bl(copy_obj_b);
 1464       }
 1465     }
 1466 
 1467     // And the tail.
 1468     copy_memory_small(decorators, type, s, d, count, step);
 1469 
 1470     if (granularity >= 8) __ bind(copy8);
 1471     if (granularity >= 4) __ bind(copy4);
 1472     __ bind(finish);
 1473   }
 1474 
 1475 
 1476   void clobber_registers() {
 1477 #ifdef ASSERT
 1478     RegSet clobbered
 1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1483       __ mov(*it, rscratch1);
 1484     }
 1485 #endif
 1486 
 1487   }
 1488 
 1489   // Scan over array at a for count oops, verifying each one.
 1490   // Preserves a and count, clobbers rscratch1, rscratch2 and temp.
 1491   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1492     Label loop, end;
 1493     __ mov(rscratch1, a);
 1494     __ mov(rscratch2, zr);
 1495     __ bind(loop);
 1496     __ cmp(rscratch2, count);
 1497     __ br(Assembler::HS, end);
 1498     if (size == wordSize) {
 1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1500       __ verify_oop(temp);
 1501     } else {
 1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1503       __ decode_heap_oop(temp); // calls verify_oop
 1504     }
 1505     __ add(rscratch2, rscratch2, 1);
 1506     __ b(loop);
 1507     __ bind(end);
 1508   }
 1509 
 1510   // Arguments:
 1511   //   stub_id - is used to name the stub and identify all details of
 1512   //             how to perform the copy.
 1513   //
 1514   //   entry - is assigned to the stub's post push entry point unless
 1515   //           it is null
 1516   //
 1517   // Inputs:
 1518   //   c_rarg0   - source array address
 1519   //   c_rarg1   - destination array address
 1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1521   //
 1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1523   // the hardware handle it.  The two dwords within qwords that span
 1524   // cache line boundaries will still be loaded and stored atomically.
 1525   //
 1526   // Side Effects: entry is set to the (post push) entry point so it
 1527   //               can be used by the corresponding conjoint copy
 1528   //               method
 1529   //
 1530   address generate_disjoint_copy(StubId stub_id, address *entry) {
 1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1532     RegSet saved_reg = RegSet::of(s, d, count);
 1533     int size;
 1534     bool aligned;
 1535     bool is_oop;
 1536     bool dest_uninitialized;
 1537     switch (stub_id) {
 1538     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1539       size = sizeof(jbyte);
 1540       aligned = false;
 1541       is_oop = false;
 1542       dest_uninitialized = false;
 1543       break;
 1544     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1545       size = sizeof(jbyte);
 1546       aligned = true;
 1547       is_oop = false;
 1548       dest_uninitialized = false;
 1549       break;
 1550     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1551       size = sizeof(jshort);
 1552       aligned = false;
 1553       is_oop = false;
 1554       dest_uninitialized = false;
 1555       break;
 1556     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1557       size = sizeof(jshort);
 1558       aligned = true;
 1559       is_oop = false;
 1560       dest_uninitialized = false;
 1561       break;
 1562     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1563       size = sizeof(jint);
 1564       aligned = false;
 1565       is_oop = false;
 1566       dest_uninitialized = false;
 1567       break;
 1568     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1569       size = sizeof(jint);
 1570       aligned = true;
 1571       is_oop = false;
 1572       dest_uninitialized = false;
 1573       break;
 1574     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1575       // since this is always aligned we can (should!) use the same
 1576       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1577       ShouldNotReachHere();
 1578       break;
 1579     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1580       size = sizeof(jlong);
 1581       aligned = true;
 1582       is_oop = false;
 1583       dest_uninitialized = false;
 1584       break;
 1585     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1587       aligned = !UseCompressedOops;
 1588       is_oop = true;
 1589       dest_uninitialized = false;
 1590       break;
 1591     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1593       aligned = !UseCompressedOops;
 1594       is_oop = true;
 1595       dest_uninitialized = false;
 1596       break;
 1597     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1599       aligned = !UseCompressedOops;
 1600       is_oop = true;
 1601       dest_uninitialized = true;
 1602       break;
 1603     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1605       aligned = !UseCompressedOops;
 1606       is_oop = true;
 1607       dest_uninitialized = true;
 1608       break;
 1609     default:
 1610       ShouldNotReachHere();
 1611       break;
 1612     }
 1613 
 1614     __ align(CodeEntryAlignment);
 1615     StubCodeMark mark(this, stub_id);
 1616     address start = __ pc();
 1617     __ enter();
 1618 
 1619     if (entry != nullptr) {
 1620       *entry = __ pc();
 1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1622       BLOCK_COMMENT("Entry:");
 1623     }
 1624 
 1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1626     if (dest_uninitialized) {
 1627       decorators |= IS_DEST_UNINITIALIZED;
 1628     }
 1629     if (aligned) {
 1630       decorators |= ARRAYCOPY_ALIGNED;
 1631     }
 1632 
 1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1635 
 1636     if (is_oop) {
 1637       // save regs before copy_memory
 1638       __ push(RegSet::of(d, count), sp);
 1639     }
 1640     {
 1641       // UnsafeMemoryAccess page error: continue after unsafe access
 1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1645     }
 1646 
 1647     if (is_oop) {
 1648       __ pop(RegSet::of(d, count), sp);
 1649       if (VerifyOops)
 1650         verify_oop_array(size, d, count, r16);
 1651     }
 1652 
 1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1654 
 1655     __ leave();
 1656     __ mov(r0, zr); // return 0
 1657     __ ret(lr);
 1658     return start;
 1659   }
 1660 
 1661   // Arguments:
 1662   //   stub_id - is used to name the stub and identify all details of
 1663   //             how to perform the copy.
 1664   //
 1665   //   nooverlap_target - identifies the (post push) entry for the
 1666   //             corresponding disjoint copy routine which can be
 1667   //             jumped to if the ranges do not actually overlap
 1668   //
 1669   //   entry - is assigned to the stub's post push entry point unless
 1670   //           it is null
 1671   //
 1672   //
 1673   // Inputs:
 1674   //   c_rarg0   - source array address
 1675   //   c_rarg1   - destination array address
 1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1677   //
 1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1679   // the hardware handle it.  The two dwords within qwords that span
 1680   // cache line boundaries will still be loaded and stored atomically.
 1681   //
 1682   // Side Effects:
 1683   //   entry is set to the no-overlap entry point so it can be used by
 1684   //   some other conjoint copy method
 1685   //
 1686   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *entry) {
 1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1688     RegSet saved_regs = RegSet::of(s, d, count);
 1689     int size;
 1690     bool aligned;
 1691     bool is_oop;
 1692     bool dest_uninitialized;
 1693     switch (stub_id) {
 1694     case StubId::stubgen_jbyte_arraycopy_id:
 1695       size = sizeof(jbyte);
 1696       aligned = false;
 1697       is_oop = false;
 1698       dest_uninitialized = false;
 1699       break;
 1700     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1701       size = sizeof(jbyte);
 1702       aligned = true;
 1703       is_oop = false;
 1704       dest_uninitialized = false;
 1705       break;
 1706     case StubId::stubgen_jshort_arraycopy_id:
 1707       size = sizeof(jshort);
 1708       aligned = false;
 1709       is_oop = false;
 1710       dest_uninitialized = false;
 1711       break;
 1712     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1713       size = sizeof(jshort);
 1714       aligned = true;
 1715       is_oop = false;
 1716       dest_uninitialized = false;
 1717       break;
 1718     case StubId::stubgen_jint_arraycopy_id:
 1719       size = sizeof(jint);
 1720       aligned = false;
 1721       is_oop = false;
 1722       dest_uninitialized = false;
 1723       break;
 1724     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1725       size = sizeof(jint);
 1726       aligned = true;
 1727       is_oop = false;
 1728       dest_uninitialized = false;
 1729       break;
 1730     case StubId::stubgen_jlong_arraycopy_id:
 1731       // since this is always aligned we can (should!) use the same
 1732       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1733       ShouldNotReachHere();
 1734       break;
 1735     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1736       size = sizeof(jlong);
 1737       aligned = true;
 1738       is_oop = false;
 1739       dest_uninitialized = false;
 1740       break;
 1741     case StubId::stubgen_oop_arraycopy_id:
 1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1743       aligned = !UseCompressedOops;
 1744       is_oop = true;
 1745       dest_uninitialized = false;
 1746       break;
 1747     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1749       aligned = !UseCompressedOops;
 1750       is_oop = true;
 1751       dest_uninitialized = false;
 1752       break;
 1753     case StubId::stubgen_oop_arraycopy_uninit_id:
 1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1755       aligned = !UseCompressedOops;
 1756       is_oop = true;
 1757       dest_uninitialized = true;
 1758       break;
 1759     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1761       aligned = !UseCompressedOops;
 1762       is_oop = true;
 1763       dest_uninitialized = true;
 1764       break;
 1765     default:
 1766       ShouldNotReachHere();
 1767     }
 1768 
 1769     StubCodeMark mark(this, stub_id);
 1770     address start = __ pc();
 1771     __ enter();
 1772 
 1773     if (entry != nullptr) {
 1774       *entry = __ pc();
 1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1776       BLOCK_COMMENT("Entry:");
 1777     }
 1778 
 1779     // use fwd copy when (d-s) above_equal (count*size)
 1780     __ sub(rscratch1, d, s);
 1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1782     __ br(Assembler::HS, nooverlap_target);
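          // The subtraction is unsigned: d - s wraps to a large value when d < s,
          // so the forward (disjoint) stub is taken both when d precedes s and when
          // d starts at or beyond s + count*size.  Only when d lands inside the
          // source range [s, s + count*size) do we fall through to the backward copy.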
 1783 
 1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1785     if (dest_uninitialized) {
 1786       decorators |= IS_DEST_UNINITIALIZED;
 1787     }
 1788     if (aligned) {
 1789       decorators |= ARRAYCOPY_ALIGNED;
 1790     }
 1791 
 1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1794 
 1795     if (is_oop) {
 1796       // save regs before copy_memory
 1797       __ push(RegSet::of(d, count), sp);
 1798     }
 1799     {
 1800       // UnsafeMemoryAccess page error: continue after unsafe access
 1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1804     }
 1805     if (is_oop) {
 1806       __ pop(RegSet::of(d, count), sp);
 1807       if (VerifyOops)
 1808         verify_oop_array(size, d, count, r16);
 1809     }
 1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1811     __ leave();
 1812     __ mov(r0, zr); // return 0
 1813     __ ret(lr);
 1814     return start;
 1815   }
 1816 
 1817   // Helper for generating a dynamic type check.
 1818   // Smashes rscratch1, rscratch2.
 1819   void generate_type_check(Register sub_klass,
 1820                            Register super_check_offset,
 1821                            Register super_klass,
 1822                            Register temp1,
 1823                            Register temp2,
 1824                            Register result,
 1825                            Label& L_success) {
 1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1827 
 1828     BLOCK_COMMENT("type_check:");
 1829 
 1830     Label L_miss;
 1831 
 1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1833                                      super_check_offset);
 1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1835 
 1836     // Fall through on failure!
 1837     __ BIND(L_miss);
 1838   }
 1839 
 1840   //
 1841   //  Generate checkcasting array copy stub
 1842   //
 1843   //  Input:
 1844   //    c_rarg0   - source array address
 1845   //    c_rarg1   - destination array address
 1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1847   //    c_rarg3   - size_t ckoff (super_check_offset)
 1848   //    c_rarg4   - oop ckval (super_klass)
 1849   //
 1850   //  Output:
 1851   //    r0 ==  0  -  success
 1852   //    r0 == -1^K - failure, where K is partial transfer count
 1853   //
 1854   address generate_checkcast_copy(StubId stub_id, address *entry) {
 1855     bool dest_uninitialized;
 1856     switch (stub_id) {
 1857     case StubId::stubgen_checkcast_arraycopy_id:
 1858       dest_uninitialized = false;
 1859       break;
 1860     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1861       dest_uninitialized = true;
 1862       break;
 1863     default:
 1864       ShouldNotReachHere();
 1865     }
 1866 
 1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1868 
 1869     // Input registers (after setup_arg_regs)
 1870     const Register from        = c_rarg0;   // source array address
 1871     const Register to          = c_rarg1;   // destination array address
 1872     const Register count       = c_rarg2;   // elements count
 1873     const Register ckoff       = c_rarg3;   // super_check_offset
 1874     const Register ckval       = c_rarg4;   // super_klass
 1875 
 1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1877     RegSet wb_post_saved_regs = RegSet::of(count);
 1878 
 1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1880     const Register copied_oop  = r22;       // actual oop copied
 1881     const Register count_save  = r21;       // orig elements count
 1882     const Register start_to    = r20;       // destination array start address
 1883     const Register r19_klass   = r19;       // oop._klass
 1884 
 1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1887 
 1888     //---------------------------------------------------------------
 1889     // Assembler stub will be used for this call to arraycopy
 1890     // if the two arrays are subtypes of Object[] but the
 1891     // destination array type is not equal to or a supertype
 1892     // of the source type.  Each element must be separately
 1893     // checked.
 1894 
 1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1896                                copied_oop, r19_klass, count_save);
 1897 
 1898     __ align(CodeEntryAlignment);
 1899     StubCodeMark mark(this, stub_id);
 1900     address start = __ pc();
 1901 
 1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1903 
 1904 #ifdef ASSERT
 1905     // caller guarantees that the arrays really are different
 1906     // otherwise, we would have to make conjoint checks
 1907     { Label L;
 1908       __ b(L);                  // conjoint check not yet implemented
 1909       __ stop("checkcast_copy within a single array");
 1910       __ bind(L);
 1911     }
 1912 #endif //ASSERT
 1913 
 1914     // Caller of this entry point must set up the argument registers.
 1915     if (entry != nullptr) {
 1916       *entry = __ pc();
 1917       BLOCK_COMMENT("Entry:");
 1918     }
 1919 
 1920      // Empty array:  Nothing to do.
 1921     __ cbz(count, L_done);
 1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1923 
 1924 #ifdef ASSERT
 1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1926     // The ckoff and ckval must be mutually consistent,
 1927     // even though caller generates both.
 1928     { Label L;
 1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1930       __ ldrw(start_to, Address(ckval, sco_offset));
 1931       __ cmpw(ckoff, start_to);
 1932       __ br(Assembler::EQ, L);
 1933       __ stop("super_check_offset inconsistent");
 1934       __ bind(L);
 1935     }
 1936 #endif //ASSERT
 1937 
 1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1939     bool is_oop = true;
 1940     int element_size = UseCompressedOops ? 4 : 8;
 1941     if (dest_uninitialized) {
 1942       decorators |= IS_DEST_UNINITIALIZED;
 1943     }
 1944 
 1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1947 
 1948     // save the original count
 1949     __ mov(count_save, count);
 1950 
 1951     // Copy from low to high addresses
 1952     __ mov(start_to, to);              // Save destination array start address
 1953     __ b(L_load_element);
 1954 
 1955     // ======== begin loop ========
 1956     // (Loop is rotated; its entry is L_load_element.)
 1957     // Loop control:
 1958     //   for (; count != 0; count--) {
 1959     //     copied_oop = load_heap_oop(from++);
 1960     //     ... generate_type_check ...;
 1961     //     store_heap_oop(to++, copied_oop);
 1962     //   }
 1963     __ align(OptoLoopAlignment);
 1964 
 1965     __ BIND(L_store_element);
 1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1967                       __ post(to, element_size), copied_oop, noreg,
 1968                       gct1, gct2, gct3);
 1969     __ sub(count, count, 1);
 1970     __ cbz(count, L_do_card_marks);
 1971 
 1972     // ======== loop entry is here ========
 1973     __ BIND(L_load_element);
 1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1975                      copied_oop, noreg, __ post(from, element_size),
 1976                      gct1);
 1977     __ cbz(copied_oop, L_store_element);
 1978 
 1979     __ load_klass(r19_klass, copied_oop); // query the object klass
 1980 
 1981     BLOCK_COMMENT("type_check:");
 1982     generate_type_check(/*sub_klass*/r19_klass,
 1983                         /*super_check_offset*/ckoff,
 1984                         /*super_klass*/ckval,
 1985                         /*r_array_base*/gct1,
 1986                         /*temp2*/gct2,
 1987                         /*result*/r10, L_store_element);
 1988 
 1989     // Fall through on failure!
 1990 
 1991     // ======== end loop ========
 1992 
 1993     // It was a real error; we must depend on the caller to finish the job.
 1994     // Register count = remaining oops, count_orig = total oops.
 1995     // Emit GC store barriers for the oops we have copied and report
 1996     // their number to the caller.
 1997 
 1998     __ subs(count, count_save, count);     // K = partially copied oop count
 1999     __ eon(count, count, zr);              // report (-1^K) to caller
 2000     __ br(Assembler::EQ, L_done_pop);
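          // eon with zr is a bitwise NOT, so count now holds -1^K as documented
          // above.  The flags are still those of the subs: EQ means K == 0, i.e.
          // no oops were stored, so the card-marking epilogue can be skipped.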
 2001 
 2002     __ BIND(L_do_card_marks);
 2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2004 
 2005     __ bind(L_done_pop);
 2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2008 
 2009     __ bind(L_done);
 2010     __ mov(r0, count);
 2011     __ leave();
 2012     __ ret(lr);
 2013 
 2014     return start;
 2015   }
 2016 
 2017   // Perform range checks on the proposed arraycopy.
 2018   // Kills temp, but nothing else.
 2019   // Also, clean the sign bits of src_pos and dst_pos.
 2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2021                               Register src_pos, // source position (c_rarg1)
 2022                               Register dst,     // destination array oop (c_rarg2)
 2023                               Register dst_pos, // destination position (c_rarg3)
 2024                               Register length,
 2025                               Register temp,
 2026                               Label& L_failed) {
 2027     BLOCK_COMMENT("arraycopy_range_checks:");
 2028 
 2029     assert_different_registers(rscratch1, temp);
 2030 
 2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2033     __ addw(temp, length, src_pos);
 2034     __ cmpw(temp, rscratch1);
 2035     __ br(Assembler::HI, L_failed);
 2036 
 2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2039     __ addw(temp, length, dst_pos);
 2040     __ cmpw(temp, rscratch1);
 2041     __ br(Assembler::HI, L_failed);
 2042 
 2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2044     __ movw(src_pos, src_pos);
 2045     __ movw(dst_pos, dst_pos);
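          // On AArch64 writing a W register zeroes bits 63:32 of the underlying
          // X register, so these two moves zero-extend the 32-bit positions for
          // the 64-bit address arithmetic that follows.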
 2046 
 2047     BLOCK_COMMENT("arraycopy_range_checks done");
 2048   }
 2049 
 2050   // These stubs get called from some dumb test routine.
 2051   // I'll write them properly when they're called from
 2052   // something that's actually doing something.
 2053   static void fake_arraycopy_stub(address src, address dst, int count) {
 2054     assert(count == 0, "huh?");
 2055   }
 2056 
 2057 
 2058   //
 2059   //  Generate 'unsafe' array copy stub
 2060   //  Though just as safe as the other stubs, it takes an unscaled
 2061   //  size_t argument instead of an element count.
 2062   //
 2063   //  Input:
 2064   //    c_rarg0   - source array address
 2065   //    c_rarg1   - destination array address
 2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2067   //
 2068   // Examines the alignment of the operands and dispatches
 2069   // to a long, int, short, or byte copy loop.
 2070   //
 2071   address generate_unsafe_copy(address byte_copy_entry,
 2072                                address short_copy_entry,
 2073                                address int_copy_entry,
 2074                                address long_copy_entry) {
 2075     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2076 
 2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2079 
 2080     __ align(CodeEntryAlignment);
 2081     StubCodeMark mark(this, stub_id);
 2082     address start = __ pc();
 2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2084 
 2085     // bump this on entry, not on exit:
 2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2087 
 2088     __ orr(rscratch1, s, d);
 2089     __ orr(rscratch1, rscratch1, count);
 2090 
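          // rscratch1 = s | d | count: a low-order bit is clear only if it is clear
          // in all three values, so testing the low 3 bits checks that source,
          // destination and byte count are all 8-byte aligned, the low 2 bits check
          // 4-byte alignment, and bit 0 separates 2-byte from byte copies.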
 2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2092     __ cbz(rscratch1, L_long_aligned);
 2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2094     __ cbz(rscratch1, L_int_aligned);
 2095     __ tbz(rscratch1, 0, L_short_aligned);
 2096     __ b(RuntimeAddress(byte_copy_entry));
 2097 
 2098     __ BIND(L_short_aligned);
 2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2100     __ b(RuntimeAddress(short_copy_entry));
 2101     __ BIND(L_int_aligned);
 2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2103     __ b(RuntimeAddress(int_copy_entry));
 2104     __ BIND(L_long_aligned);
 2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2106     __ b(RuntimeAddress(long_copy_entry));
 2107 
 2108     return start;
 2109   }
 2110 
 2111   //
 2112   //  Generate generic array copy stubs
 2113   //
 2114   //  Input:
 2115   //    c_rarg0    -  src oop
 2116   //    c_rarg1    -  src_pos (32-bits)
 2117   //    c_rarg2    -  dst oop
 2118   //    c_rarg3    -  dst_pos (32-bits)
 2119   //    c_rarg4    -  element count (32-bits)
 2120   //
 2121   //  Output:
 2122   //    r0 ==  0  -  success
 2123   //    r0 == -1^K - failure, where K is partial transfer count
 2124   //
 2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2126                                 address int_copy_entry, address oop_copy_entry,
 2127                                 address long_copy_entry, address checkcast_copy_entry) {
 2128     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2129 
 2130     Label L_failed, L_objArray;
 2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2132 
 2133     // Input registers
 2134     const Register src        = c_rarg0;  // source array oop
 2135     const Register src_pos    = c_rarg1;  // source position
 2136     const Register dst        = c_rarg2;  // destination array oop
 2137     const Register dst_pos    = c_rarg3;  // destination position
 2138     const Register length     = c_rarg4;
 2139 
 2140 
 2141     // Registers used as temps
 2142     const Register dst_klass  = c_rarg5;
 2143 
 2144     __ align(CodeEntryAlignment);
 2145 
 2146     StubCodeMark mark(this, stub_id);
 2147 
 2148     address start = __ pc();
 2149 
 2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2151 
 2152     // bump this on entry, not on exit:
 2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2154 
 2155     //-----------------------------------------------------------------------
 2156     // Assembler stub will be used for this call to arraycopy
 2157     // if the following conditions are met:
 2158     //
 2159     // (1) src and dst must not be null.
 2160     // (2) src_pos must not be negative.
 2161     // (3) dst_pos must not be negative.
 2162     // (4) length  must not be negative.
 2163     // (5) src klass and dst klass should be the same and not null.
 2164     // (6) src and dst should be arrays.
 2165     // (7) src_pos + length must not exceed length of src.
 2166     // (8) dst_pos + length must not exceed length of dst.
 2167     //
 2168 
 2169     //  if (src == nullptr) return -1;
 2170     __ cbz(src, L_failed);
 2171 
 2172     //  if (src_pos < 0) return -1;
 2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2174 
 2175     //  if (dst == nullptr) return -1;
 2176     __ cbz(dst, L_failed);
 2177 
 2178     //  if (dst_pos < 0) return -1;
 2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2180 
 2181     // registers used as temp
 2182     const Register scratch_length    = r16; // elements count to copy
 2183     const Register scratch_src_klass = r17; // array klass
 2184     const Register lh                = r15; // layout helper
 2185 
 2186     //  if (length < 0) return -1;
 2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     __ load_klass(scratch_src_klass, src);
 2191 #ifdef ASSERT
 2192     //  assert(src->klass() != nullptr);
 2193     {
 2194       BLOCK_COMMENT("assert klasses not null {");
 2195       Label L1, L2;
 2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2197       __ bind(L1);
 2198       __ stop("broken null klass");
 2199       __ bind(L2);
 2200       __ load_klass(rscratch1, dst);
 2201       __ cbz(rscratch1, L1);     // this would be broken also
 2202       BLOCK_COMMENT("} assert klasses not null done");
 2203     }
 2204 #endif
 2205 
 2206     // Load layout helper (32-bits)
 2207     //
 2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2209     // 32        30    24            16              8     2                 0
 2210     //
 2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2212     //
 2213 
 2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2215 
 2216     // Handle objArrays completely differently...
 2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2219     __ movw(rscratch1, objArray_lh);
 2220     __ eorw(rscratch2, lh, rscratch1);
 2221     __ cbzw(rscratch2, L_objArray);
 2222 
 2223     //  if (src->klass() != dst->klass()) return -1;
 2224     __ load_klass(rscratch2, dst);
 2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2226     __ cbnz(rscratch2, L_failed);
 2227 
 2228     //  if (!src->is_Array()) return -1;
 2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2230 
 2231     // At this point, it is known to be a typeArray (array_tag 0x3).
 2232 #ifdef ASSERT
 2233     {
 2234       BLOCK_COMMENT("assert primitive array {");
 2235       Label L;
 2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2237       __ cmpw(lh, rscratch2);
 2238       __ br(Assembler::GE, L);
 2239       __ stop("must be a primitive array");
 2240       __ bind(L);
 2241       BLOCK_COMMENT("} assert primitive array done");
 2242     }
 2243 #endif
 2244 
 2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2246                            rscratch2, L_failed);
 2247 
 2248     // TypeArrayKlass
 2249     //
 2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2252     //
 2253 
 2254     const Register rscratch1_offset = rscratch1;    // array offset
 2255     const Register r15_elsize = lh; // element size
 2256 
 2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2259     __ add(src, src, rscratch1_offset);           // src array offset
 2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2261     BLOCK_COMMENT("choose copy loop based on element size");
 2262 
 2263     // next registers should be set before the jump to corresponding stub
 2264     const Register from     = c_rarg0;  // source array address
 2265     const Register to       = c_rarg1;  // destination array address
 2266     const Register count    = c_rarg2;  // elements count
 2267 
 2268     // 'from', 'to' and 'count' must be set in this order: they alias 'src',
 2269     // 'src_pos' and 'dst', so any other order would clobber an input before it is used.
 2270 
 2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2272 
 2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2274     // size in bytes).  We do a simple bitwise binary search.
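          // elsize bit 1 selects between {byte, short} and {int, long}, and bit 0
          // then picks within each pair: 00 -> byte, 01 -> short, 10 -> int, 11 -> long.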
 2275   __ BIND(L_copy_bytes);
 2276     __ tbnz(r15_elsize, 1, L_copy_ints);
 2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2278     __ lea(from, Address(src, src_pos));// src_addr
 2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2280     __ movw(count, scratch_length); // length
 2281     __ b(RuntimeAddress(byte_copy_entry));
 2282 
 2283   __ BIND(L_copy_shorts);
 2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2286     __ movw(count, scratch_length); // length
 2287     __ b(RuntimeAddress(short_copy_entry));
 2288 
 2289   __ BIND(L_copy_ints);
 2290     __ tbnz(r15_elsize, 0, L_copy_longs);
 2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2293     __ movw(count, scratch_length); // length
 2294     __ b(RuntimeAddress(int_copy_entry));
 2295 
 2296   __ BIND(L_copy_longs);
 2297 #ifdef ASSERT
 2298     {
 2299       BLOCK_COMMENT("assert long copy {");
 2300       Label L;
 2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2302       __ cmpw(r15_elsize, LogBytesPerLong);
 2303       __ br(Assembler::EQ, L);
 2304       __ stop("must be long copy, but elsize is wrong");
 2305       __ bind(L);
 2306       BLOCK_COMMENT("} assert long copy done");
 2307     }
 2308 #endif
 2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2311     __ movw(count, scratch_length); // length
 2312     __ b(RuntimeAddress(long_copy_entry));
 2313 
 2314     // ObjArrayKlass
 2315   __ BIND(L_objArray);
 2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2317 
 2318     Label L_plain_copy, L_checkcast_copy;
 2319     //  test array classes for subtyping
 2320     __ load_klass(r15, dst);
 2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2322     __ br(Assembler::NE, L_checkcast_copy);
 2323 
 2324     // Identically typed arrays can be copied without element-wise checks.
 2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2326                            rscratch2, L_failed);
 2327 
 2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2332     __ movw(count, scratch_length); // length
 2333   __ BIND(L_plain_copy);
 2334     __ b(RuntimeAddress(oop_copy_entry));
 2335 
 2336   __ BIND(L_checkcast_copy);
 2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2338     {
 2339       // Before looking at dst.length, make sure dst is also an objArray.
 2340       __ ldrw(rscratch1, Address(r15, lh_offset));
 2341       __ movw(rscratch2, objArray_lh);
 2342       __ eorw(rscratch1, rscratch1, rscratch2);
 2343       __ cbnzw(rscratch1, L_failed);
 2344 
 2345       // It is safe to examine both src.length and dst.length.
 2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                              r15, L_failed);
 2348 
 2349       __ load_klass(dst_klass, dst); // reload
 2350 
 2351       // Marshal the base address arguments now, freeing registers.
 2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2356       __ movw(count, length);           // length (reloaded)
 2357       Register sco_temp = c_rarg3;      // this register is free now
 2358       assert_different_registers(from, to, count, sco_temp,
 2359                                  dst_klass, scratch_src_klass);
 2360       // assert_clean_int(count, sco_temp);
 2361 
 2362       // Generate the type check.
 2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2365 
 2366       // Smashes rscratch1, rscratch2
 2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2368                           L_plain_copy);
 2369 
 2370       // Fetch destination element klass from the ObjArrayKlass header.
 2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2374 
 2375       // the checkcast_copy loop needs two extra arguments:
 2376       assert(c_rarg3 == sco_temp, "#3 already in place");
 2377       // Set up arguments for checkcast_copy_entry.
 2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2379       __ b(RuntimeAddress(checkcast_copy_entry));
 2380     }
 2381 
 2382   __ BIND(L_failed);
 2383     __ mov(r0, -1);
 2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2385     __ ret(lr);
 2386 
 2387     return start;
 2388   }
 2389 
 2390   //
 2391   // Generate stub for array fill. If "aligned" is true, the
 2392   // "to" address is assumed to be heapword aligned.
 2393   //
 2394   // Arguments for generated stub:
 2395   //   to:    c_rarg0
 2396   //   value: c_rarg1
 2397   //   count: c_rarg2 treated as signed
 2398   //
 2399   address generate_fill(StubId stub_id) {
 2400     BasicType t;
 2401     bool aligned;
 2402 
 2403     switch (stub_id) {
 2404     case StubId::stubgen_jbyte_fill_id:
 2405       t = T_BYTE;
 2406       aligned = false;
 2407       break;
 2408     case StubId::stubgen_jshort_fill_id:
 2409       t = T_SHORT;
 2410       aligned = false;
 2411       break;
 2412     case StubId::stubgen_jint_fill_id:
 2413       t = T_INT;
 2414       aligned = false;
 2415       break;
 2416     case StubId::stubgen_arrayof_jbyte_fill_id:
 2417       t = T_BYTE;
 2418       aligned = true;
 2419       break;
 2420     case StubId::stubgen_arrayof_jshort_fill_id:
 2421       t = T_SHORT;
 2422       aligned = true;
 2423       break;
 2424     case StubId::stubgen_arrayof_jint_fill_id:
 2425       t = T_INT;
 2426       aligned = true;
 2427       break;
 2428     default:
 2429       ShouldNotReachHere();
 2430     };
 2431 
 2432     __ align(CodeEntryAlignment);
 2433     StubCodeMark mark(this, stub_id);
 2434     address start = __ pc();
 2435 
 2436     BLOCK_COMMENT("Entry:");
 2437 
 2438     const Register to        = c_rarg0;  // destination array address
 2439     const Register value     = c_rarg1;  // value
 2440     const Register count     = c_rarg2;  // elements count
 2441 
 2442     const Register bz_base = r10;        // base for block_zero routine
 2443     const Register cnt_words = r11;      // temp register
 2444 
 2445     __ enter();
 2446 
 2447     Label L_fill_elements, L_exit1;
 2448 
 2449     int shift = -1;
 2450     switch (t) {
 2451       case T_BYTE:
 2452         shift = 0;
 2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2456         __ br(Assembler::LO, L_fill_elements);
 2457         break;
 2458       case T_SHORT:
 2459         shift = 1;
 2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2462         __ br(Assembler::LO, L_fill_elements);
 2463         break;
 2464       case T_INT:
 2465         shift = 2;
 2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2467         __ br(Assembler::LO, L_fill_elements);
 2468         break;
 2469       default: ShouldNotReachHere();
 2470     }
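          // At this point value holds the fill pattern replicated across the low
          // 32 bits (e.g. a byte fill of 0xAB has become 0xABABABAB); it is widened
          // to 64 bits below, just before the bulk word fill.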
 2471 
 2472     // Align source address at 8 bytes address boundary.
 2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2474     if (!aligned) {
 2475       switch (t) {
 2476         case T_BYTE:
 2477           // One byte misalignment happens only for byte arrays.
 2478           __ tbz(to, 0, L_skip_align1);
 2479           __ strb(value, Address(__ post(to, 1)));
 2480           __ subw(count, count, 1);
 2481           __ bind(L_skip_align1);
 2482           // Fallthrough
 2483         case T_SHORT:
 2484           // Two bytes misalignment happens only for byte and short (char) arrays.
 2485           __ tbz(to, 1, L_skip_align2);
 2486           __ strh(value, Address(__ post(to, 2)));
 2487           __ subw(count, count, 2 >> shift);
 2488           __ bind(L_skip_align2);
 2489           // Fallthrough
 2490         case T_INT:
 2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2492           __ tbz(to, 2, L_skip_align4);
 2493           __ strw(value, Address(__ post(to, 4)));
 2494           __ subw(count, count, 4 >> shift);
 2495           __ bind(L_skip_align4);
 2496           break;
 2497         default: ShouldNotReachHere();
 2498       }
 2499     }
 2500 
 2501     //
 2502     //  Fill large chunks
 2503     //
 2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
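          // cnt_words = count >> (3 - shift) is the number of 8-byte words to fill;
          // the subtraction leaves in count only the elements that do not fill a
          // whole word, which are handled by the tail code below.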
 2507     if (UseBlockZeroing) {
 2508       Label non_block_zeroing, rest;
 2509       // If the fill value is zero we can use the fast zero_words().
 2510       __ cbnz(value, non_block_zeroing);
 2511       __ mov(bz_base, to);
 2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2513       address tpc = __ zero_words(bz_base, cnt_words);
 2514       if (tpc == nullptr) {
 2515         fatal("CodeCache is full at generate_fill");
 2516       }
 2517       __ b(rest);
 2518       __ bind(non_block_zeroing);
 2519       __ fill_words(to, cnt_words, value);
 2520       __ bind(rest);
 2521     } else {
 2522       __ fill_words(to, cnt_words, value);
 2523     }
 2524 
 2525     // Remaining count is less than 8 bytes. Fill it by a single store.
 2526     // Note that the total length is no less than 8 bytes.
 2527     if (t == T_BYTE || t == T_SHORT) {
 2528       Label L_exit1;
 2529       __ cbzw(count, L_exit1);
 2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2531       __ str(value, Address(to, -8));    // overwrite some elements
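            // Safe because the total length is at least 8 bytes (checked at entry),
            // and every byte of the overlap holds the same fill value.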
 2532       __ bind(L_exit1);
 2533       __ leave();
 2534       __ ret(lr);
 2535     }
 2536 
 2537     // Handle fills of less than 8 bytes.
 2538     Label L_fill_2, L_fill_4, L_exit2;
 2539     __ bind(L_fill_elements);
 2540     switch (t) {
 2541       case T_BYTE:
 2542         __ tbz(count, 0, L_fill_2);
 2543         __ strb(value, Address(__ post(to, 1)));
 2544         __ bind(L_fill_2);
 2545         __ tbz(count, 1, L_fill_4);
 2546         __ strh(value, Address(__ post(to, 2)));
 2547         __ bind(L_fill_4);
 2548         __ tbz(count, 2, L_exit2);
 2549         __ strw(value, Address(to));
 2550         break;
 2551       case T_SHORT:
 2552         __ tbz(count, 0, L_fill_4);
 2553         __ strh(value, Address(__ post(to, 2)));
 2554         __ bind(L_fill_4);
 2555         __ tbz(count, 1, L_exit2);
 2556         __ strw(value, Address(to));
 2557         break;
 2558       case T_INT:
 2559         __ cbzw(count, L_exit2);
 2560         __ strw(value, Address(to));
 2561         break;
 2562       default: ShouldNotReachHere();
 2563     }
 2564     __ bind(L_exit2);
 2565     __ leave();
 2566     __ ret(lr);
 2567     return start;
 2568   }
 2569 
 2570   address generate_unsafecopy_common_error_exit() {
 2571     address start_pc = __ pc();
 2572     __ leave();
 2573     __ mov(r0, 0);
 2574     __ ret(lr);
 2575     return start_pc;
 2576   }
 2577 
 2578   //
 2579   //  Generate 'unsafe' set memory stub
 2580   //  Though just as safe as the other stubs, it takes an unscaled
 2581   //  size_t (# bytes) argument instead of an element count.
 2582   //
 2583   //  This fill operation is atomicity preserving: as long as the
 2584   //  address supplied is sufficiently aligned, all writes of up to 64
 2585   //  bits in size are single-copy atomic.
 2586   //
 2587   //  Input:
 2588   //    c_rarg0   - destination array address
 2589   //    c_rarg1   - byte count (size_t)
 2590   //    c_rarg2   - byte value
 2591   //
 2592   address generate_unsafe_setmemory() {
 2593     __ align(CodeEntryAlignment);
 2594     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2595     address start = __ pc();
 2596 
 2597     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2598     Label tail;
 2599 
 2600     UnsafeMemoryAccessMark umam(this, true, false);
 2601 
 2602     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2603 
 2604     __ dup(v0, __ T16B, value);
 2605 
 2606     if (AvoidUnalignedAccesses) {
 2607       __ cmp(count, (u1)16);
 2608       __ br(__ LO, tail);
 2609 
 2610       __ mov(rscratch1, 16);
 2611       __ andr(rscratch2, dest, 15);
 2612       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2613       __ strq(v0, Address(dest));
 2614       __ sub(count, count, rscratch1);
 2615       __ add(dest, dest, rscratch1);
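            // The (possibly unaligned) strq above already stored 16 bytes at the
            // original dest; advancing dest by rscratch1 (1..16 bytes) commits the
            // bytes up to the next 16-byte boundary, and any bytes written beyond it
            // are simply rewritten by the aligned stores that follow.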
 2616     }
 2617 
 2618     __ subs(count, count, (u1)64);
 2619     __ br(__ LO, tail);
 2620     {
 2621       Label again;
 2622       __ bind(again);
 2623       __ stpq(v0, v0, Address(dest));
 2624       __ stpq(v0, v0, Address(dest, 32));
 2625 
 2626       __ subs(count, count, 64);
 2627       __ add(dest, dest, 64);
 2628       __ br(__ HS, again);
 2629     }
 2630 
 2631     __ bind(tail);
 2632     // The count of bytes is off by 64, but we don't need to correct
 2633     // it because we're only going to use the least-significant few
 2634     // count bits from here on.
 2635     // __ add(count, count, 64);
 2636 
 2637     {
 2638       Label dont;
 2639       __ tbz(count, exact_log2(32), dont);
 2640       __ stpq(v0, v0, __ post(dest, 32));
 2641       __ bind(dont);
 2642     }
 2643     {
 2644       Label dont;
 2645       __ tbz(count, exact_log2(16), dont);
 2646       __ strq(v0, __ post(dest, 16));
 2647       __ bind(dont);
 2648     }
 2649     {
 2650       Label dont;
 2651       __ tbz(count, exact_log2(8), dont);
 2652       __ strd(v0, __ post(dest, 8));
 2653       __ bind(dont);
 2654     }
 2655 
 2656     Label finished;
 2657     __ tst(count, 7);
 2658     __ br(__ EQ, finished);
 2659 
 2660     {
 2661       Label dont;
 2662       __ tbz(count, exact_log2(4), dont);
 2663       __ strs(v0, __ post(dest, 4));
 2664       __ bind(dont);
 2665     }
 2666     {
 2667       Label dont;
 2668       __ tbz(count, exact_log2(2), dont);
 2669       __ bfi(value, value, 8, 8);
 2670       __ strh(value, __ post(dest, 2));
 2671       __ bind(dont);
 2672     }
 2673     {
 2674       Label dont;
 2675       __ tbz(count, exact_log2(1), dont);
 2676       __ strb(value, Address(dest));
 2677       __ bind(dont);
 2678     }
 2679 
 2680     __ bind(finished);
 2681     __ leave();
 2682     __ ret(lr);
 2683 
 2684     return start;
 2685   }
 2686 
 2687   address generate_data_cache_writeback() {
 2688     const Register line        = c_rarg0;  // address of line to write back
 2689 
 2690     __ align(CodeEntryAlignment);
 2691 
 2692     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2693     StubCodeMark mark(this, stub_id);
 2694 
 2695     address start = __ pc();
 2696     __ enter();
 2697     __ cache_wb(Address(line, 0));
 2698     __ leave();
 2699     __ ret(lr);
 2700 
 2701     return start;
 2702   }
 2703 
 2704   address generate_data_cache_writeback_sync() {
 2705     const Register is_pre     = c_rarg0;  // pre or post sync
 2706 
 2707     __ align(CodeEntryAlignment);
 2708 
 2709     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2710     StubCodeMark mark(this, stub_id);
 2711 
 2712     // pre wbsync is a no-op
 2713     // post wbsync translates to an sfence
 2714 
 2715     Label skip;
 2716     address start = __ pc();
 2717     __ enter();
 2718     __ cbnz(is_pre, skip);
 2719     __ cache_wbsync(false);
 2720     __ bind(skip);
 2721     __ leave();
 2722     __ ret(lr);
 2723 
 2724     return start;
 2725   }
 2726 
 2727   void generate_arraycopy_stubs() {
 2728     address entry;
 2729     address entry_jbyte_arraycopy;
 2730     address entry_jshort_arraycopy;
 2731     address entry_jint_arraycopy;
 2732     address entry_oop_arraycopy;
 2733     address entry_jlong_arraycopy;
 2734     address entry_checkcast_arraycopy;
 2735 
 2736     // generate the common exit first so later stubs can rely on it if
 2737     // they want an UnsafeMemoryAccess exit non-local to the stub
 2738     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2739     // register the stub as the default exit with class UnsafeMemoryAccess
 2740     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2741 
 2742     generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2743     generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2744 
 2745     generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2746     generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2747 
 2748     generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2749     generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2750 
 2751     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2752 
 2753     //*** jbyte
 2754     // Always need aligned and unaligned versions
 2755     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &entry);
 2756     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2757     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2758     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, entry, nullptr);
 2759 
 2760     //*** jshort
 2761     // Always need aligned and unaligned versions
 2762     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &entry);
 2763     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2764     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &entry);
 2765     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, entry, nullptr);
 2766 
 2767     //*** jint
 2768     // Aligned versions
 2769     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &entry);
 2770     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2771     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2772     // entry_jint_arraycopy always points to the unaligned version
 2773     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &entry);
 2774     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2775 
 2776     //*** jlong
 2777     // It is always aligned
 2778     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &entry);
 2779     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2780     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2781     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2782 
 2783     //*** oops
 2784     {
 2785       // With compressed oops we need unaligned versions; notice that
 2786       // we overwrite entry_oop_arraycopy.
 2787       bool aligned = !UseCompressedOops;
 2788 
 2789       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2790         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &entry);
 2791       StubRoutines::_arrayof_oop_arraycopy
 2792         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2793       // Aligned versions without pre-barriers
 2794       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2795         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2796       StubRoutines::_arrayof_oop_arraycopy_uninit
 2797         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2798     }
 2799 
 2800     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2801     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2802     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2803     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2804 
 2805     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2806     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2807 
 2808     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2809                                                               entry_jshort_arraycopy,
 2810                                                               entry_jint_arraycopy,
 2811                                                               entry_jlong_arraycopy);
 2812 
 2813     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2814                                                                entry_jshort_arraycopy,
 2815                                                                entry_jint_arraycopy,
 2816                                                                entry_oop_arraycopy,
 2817                                                                entry_jlong_arraycopy,
 2818                                                                entry_checkcast_arraycopy);
 2819 
 2820     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2821     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2822     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2823     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2824     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2825     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2826   }
 2827 
 2828   void generate_math_stubs() { Unimplemented(); }
 2829 
 2830   // Arguments:
 2831   //
 2832   // Inputs:
 2833   //   c_rarg0   - source byte array address
 2834   //   c_rarg1   - destination byte array address
 2835   //   c_rarg2   - K (key) in little endian int array
 2836   //
 2837   address generate_aescrypt_encryptBlock() {
          assert(UseAES, "need AES cryptographic extension support");
 2838     __ align(CodeEntryAlignment);
 2839     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2840     StubCodeMark mark(this, stub_id);
 2841 
 2842     const Register from        = c_rarg0;  // source array address
 2843     const Register to          = c_rarg1;  // destination array address
 2844     const Register key         = c_rarg2;  // key array address
 2845     const Register keylen      = rscratch1;
 2846 
 2847     address start = __ pc();
 2848     __ enter();
 2849 
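          // Compute #rounds for AES based on the length of the key array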
 2850     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2851 
 2852     __ aesenc_loadkeys(key, keylen);
 2853     __ aesecb_encrypt(from, to, keylen);
 2854 
 2855     __ mov(r0, 0);
 2856 
 2857     __ leave();
 2858     __ ret(lr);
 2859 
 2860     return start;
 2861   }
 2862 
 2863   // Arguments:
 2864   //
 2865   // Inputs:
 2866   //   c_rarg0   - source byte array address
 2867   //   c_rarg1   - destination byte array address
 2868   //   c_rarg2   - K (key) in little endian int array
 2869   //
 2870   address generate_aescrypt_decryptBlock() {
 2871     assert(UseAES, "need AES cryptographic extension support");
 2872     __ align(CodeEntryAlignment);
 2873     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2874     StubCodeMark mark(this, stub_id);
 2875     Label L_doLast;
 2876 
 2877     const Register from        = c_rarg0;  // source array address
 2878     const Register to          = c_rarg1;  // destination array address
 2879     const Register key         = c_rarg2;  // key array address
 2880     const Register keylen      = rscratch1;
 2881 
 2882     address start = __ pc();
 2883     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2884 
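          // Compute #rounds for AES based on the length of the key array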
 2885     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2886 
 2887     __ aesecb_decrypt(from, to, key, keylen);
 2888 
 2889     __ mov(r0, 0);
 2890 
 2891     __ leave();
 2892     __ ret(lr);
 2893 
 2894     return start;
 2895   }
 2896 
 2897   // Arguments:
 2898   //
 2899   // Inputs:
 2900   //   c_rarg0   - source byte array address
 2901   //   c_rarg1   - destination byte array address
 2902   //   c_rarg2   - K (key) in little endian int array
 2903   //   c_rarg3   - r vector byte array address
 2904   //   c_rarg4   - input length
 2905   //
 2906   // Output:
 2907   //   r0        - input length
 2908   //
 2909   address generate_cipherBlockChaining_encryptAESCrypt() {
 2910     assert(UseAES, "need AES cryptographic extension support");
 2911     __ align(CodeEntryAlignment);
 2912     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2913     StubCodeMark mark(this, stub_id);
 2914 
 2915     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2916 
 2917     const Register from        = c_rarg0;  // source array address
 2918     const Register to          = c_rarg1;  // destination array address
 2919     const Register key         = c_rarg2;  // key array address
 2920     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array;
 2921                                            // on exit it holds the last encrypted block
 2922     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2923     const Register keylen      = rscratch1;
 2924 
 2925     address start = __ pc();
 2926 
 2927       __ enter();
 2928 
 2929       __ movw(rscratch2, len_reg);
 2930 
 2931       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2932 
 2933       __ ld1(v0, __ T16B, rvec);
 2934 
 2935       __ cmpw(keylen, 52);
 2936       __ br(Assembler::CC, L_loadkeys_44);
 2937       __ br(Assembler::EQ, L_loadkeys_52);
 2938 
 2939       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2940       __ rev32(v17, __ T16B, v17);
 2941       __ rev32(v18, __ T16B, v18);
 2942     __ BIND(L_loadkeys_52);
 2943       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2944       __ rev32(v19, __ T16B, v19);
 2945       __ rev32(v20, __ T16B, v20);
 2946     __ BIND(L_loadkeys_44);
 2947       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2948       __ rev32(v21, __ T16B, v21);
 2949       __ rev32(v22, __ T16B, v22);
 2950       __ rev32(v23, __ T16B, v23);
 2951       __ rev32(v24, __ T16B, v24);
 2952       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2953       __ rev32(v25, __ T16B, v25);
 2954       __ rev32(v26, __ T16B, v26);
 2955       __ rev32(v27, __ T16B, v27);
 2956       __ rev32(v28, __ T16B, v28);
 2957       __ ld1(v29, v30, v31, __ T16B, key);
 2958       __ rev32(v29, __ T16B, v29);
 2959       __ rev32(v30, __ T16B, v30);
 2960       __ rev32(v31, __ T16B, v31);
 2961 
 2962     __ BIND(L_aes_loop);
 2963       __ ld1(v1, __ T16B, __ post(from, 16));
 2964       __ eor(v0, __ T16B, v0, v1);
 2965 
 2966       __ br(Assembler::CC, L_rounds_44);
 2967       __ br(Assembler::EQ, L_rounds_52);
 2968 
 2969       __ aese(v0, v17); __ aesmc(v0, v0);
 2970       __ aese(v0, v18); __ aesmc(v0, v0);
 2971     __ BIND(L_rounds_52);
 2972       __ aese(v0, v19); __ aesmc(v0, v0);
 2973       __ aese(v0, v20); __ aesmc(v0, v0);
 2974     __ BIND(L_rounds_44);
 2975       __ aese(v0, v21); __ aesmc(v0, v0);
 2976       __ aese(v0, v22); __ aesmc(v0, v0);
 2977       __ aese(v0, v23); __ aesmc(v0, v0);
 2978       __ aese(v0, v24); __ aesmc(v0, v0);
 2979       __ aese(v0, v25); __ aesmc(v0, v0);
 2980       __ aese(v0, v26); __ aesmc(v0, v0);
 2981       __ aese(v0, v27); __ aesmc(v0, v0);
 2982       __ aese(v0, v28); __ aesmc(v0, v0);
 2983       __ aese(v0, v29); __ aesmc(v0, v0);
 2984       __ aese(v0, v30);
 2985       __ eor(v0, __ T16B, v0, v31);
 2986 
 2987       __ st1(v0, __ T16B, __ post(to, 16));
 2988 
 2989       __ subw(len_reg, len_reg, 16);
 2990       __ cbnzw(len_reg, L_aes_loop);
 2991 
 2992       __ st1(v0, __ T16B, rvec);
 2993 
 2994       __ mov(r0, rscratch2);
 2995 
 2996       __ leave();
 2997       __ ret(lr);
 2998 
 2999       return start;
 3000   }
 3001 
 3002   // Arguments:
 3003   //
 3004   // Inputs:
 3005   //   c_rarg0   - source byte array address
 3006   //   c_rarg1   - destination byte array address
 3007   //   c_rarg2   - K (key) in little endian int array
 3008   //   c_rarg3   - r vector byte array address
 3009   //   c_rarg4   - input length
 3010   //
 3011   // Output:
 3012   //   r0        - input length
 3013   //
 3014   address generate_cipherBlockChaining_decryptAESCrypt() {
 3015     assert(UseAES, "need AES cryptographic extension support");
 3016     __ align(CodeEntryAlignment);
 3017     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3018     StubCodeMark mark(this, stub_id);
 3019 
 3020     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3021 
 3022     const Register from        = c_rarg0;  // source array address
 3023     const Register to          = c_rarg1;  // destination array address
 3024     const Register key         = c_rarg2;  // key array address
 3025     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array;
 3026                                            // on exit it holds the last ciphertext block processed
 3027     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3028     const Register keylen      = rscratch1;
 3029 
 3030     address start = __ pc();
 3031 
 3032       __ enter();
 3033 
 3034       __ movw(rscratch2, len_reg);
 3035 
 3036       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3037 
 3038       __ ld1(v2, __ T16B, rvec);
 3039 
 3040       __ ld1(v31, __ T16B, __ post(key, 16));
 3041       __ rev32(v31, __ T16B, v31);
 3042 
 3043       __ cmpw(keylen, 52);
 3044       __ br(Assembler::CC, L_loadkeys_44);
 3045       __ br(Assembler::EQ, L_loadkeys_52);
 3046 
 3047       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3048       __ rev32(v17, __ T16B, v17);
 3049       __ rev32(v18, __ T16B, v18);
 3050     __ BIND(L_loadkeys_52);
 3051       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3052       __ rev32(v19, __ T16B, v19);
 3053       __ rev32(v20, __ T16B, v20);
 3054     __ BIND(L_loadkeys_44);
 3055       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3056       __ rev32(v21, __ T16B, v21);
 3057       __ rev32(v22, __ T16B, v22);
 3058       __ rev32(v23, __ T16B, v23);
 3059       __ rev32(v24, __ T16B, v24);
 3060       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3061       __ rev32(v25, __ T16B, v25);
 3062       __ rev32(v26, __ T16B, v26);
 3063       __ rev32(v27, __ T16B, v27);
 3064       __ rev32(v28, __ T16B, v28);
 3065       __ ld1(v29, v30, __ T16B, key);
 3066       __ rev32(v29, __ T16B, v29);
 3067       __ rev32(v30, __ T16B, v30);
 3068 
 3069     __ BIND(L_aes_loop);
 3070       __ ld1(v0, __ T16B, __ post(from, 16));
 3071       __ orr(v1, __ T16B, v0, v0);   // keep a copy of the ciphertext block for chaining
 3072 
 3073       __ br(Assembler::CC, L_rounds_44);
 3074       __ br(Assembler::EQ, L_rounds_52);
 3075 
 3076       __ aesd(v0, v17); __ aesimc(v0, v0);
 3077       __ aesd(v0, v18); __ aesimc(v0, v0);
 3078     __ BIND(L_rounds_52);
 3079       __ aesd(v0, v19); __ aesimc(v0, v0);
 3080       __ aesd(v0, v20); __ aesimc(v0, v0);
 3081     __ BIND(L_rounds_44);
 3082       __ aesd(v0, v21); __ aesimc(v0, v0);
 3083       __ aesd(v0, v22); __ aesimc(v0, v0);
 3084       __ aesd(v0, v23); __ aesimc(v0, v0);
 3085       __ aesd(v0, v24); __ aesimc(v0, v0);
 3086       __ aesd(v0, v25); __ aesimc(v0, v0);
 3087       __ aesd(v0, v26); __ aesimc(v0, v0);
 3088       __ aesd(v0, v27); __ aesimc(v0, v0);
 3089       __ aesd(v0, v28); __ aesimc(v0, v0);
 3090       __ aesd(v0, v29); __ aesimc(v0, v0);
 3091       __ aesd(v0, v30);
 3092       __ eor(v0, __ T16B, v0, v31);
 3093       __ eor(v0, __ T16B, v0, v2);
 3094 
 3095       __ st1(v0, __ T16B, __ post(to, 16));
 3096       __ orr(v2, __ T16B, v1, v1);   // the saved ciphertext becomes the next chaining value
 3097 
 3098       __ subw(len_reg, len_reg, 16);
 3099       __ cbnzw(len_reg, L_aes_loop);
 3100 
 3101       __ st1(v2, __ T16B, rvec);
 3102 
 3103       __ mov(r0, rscratch2);
 3104 
 3105       __ leave();
 3106       __ ret(lr);
 3107 
 3108     return start;
 3109   }
 3110 
 3111   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3112   // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are preserved.
 3113   // The least-significant 64-bit word is held in the upper dword of each vector;
 3114   // the lower dword of inc must be zero.
 3115   // Output: result
 3116   void be_add_128_64(FloatRegister result, FloatRegister in,
 3117                      FloatRegister inc, FloatRegister tmp) {
 3118     assert_different_registers(result, tmp, inc);
 3119 
 3120     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3121                                            // input
 3122     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3123     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3124                                            // MSD == 0 (must be!) to LSD
 3125     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3126   }
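
        // Illustration of be_add_128_64 (not generated code): adding inc == 1 to the
        // 128-bit value 0x00000000_00000001_FFFFFFFF_FFFFFFFF first wraps the
        // least-significant dword (held in the upper lane) to zero; the unsigned
        // compare then leaves an all-ones (-1) mask in that lane, ext moves the mask
        // to the lane holding the most-significant dword, and subtracting -1 there
        // applies the carry, giving 0x00000000_00000002_00000000_00000000.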
 3127 
 3128   // CTR AES crypt.
 3129   // Arguments:
 3130   //
 3131   // Inputs:
 3132   //   c_rarg0   - source byte array address
 3133   //   c_rarg1   - destination byte array address
 3134   //   c_rarg2   - K (key) in little endian int array
 3135   //   c_rarg3   - counter vector byte array address
 3136   //   c_rarg4   - input length
 3137   //   c_rarg5   - saved encryptedCounter start
 3138   //   c_rarg6   - saved used length
 3139   //
 3140   // Output:
 3141   //   r0       - input length
 3142   //
 3143   address generate_counterMode_AESCrypt() {
 3144     const Register in = c_rarg0;
 3145     const Register out = c_rarg1;
 3146     const Register key = c_rarg2;
 3147     const Register counter = c_rarg3;
 3148     const Register saved_len = c_rarg4, len = r10;
 3149     const Register saved_encrypted_ctr = c_rarg5;
 3150     const Register used_ptr = c_rarg6, used = r12;
 3151 
 3152     const Register offset = r7;
 3153     const Register keylen = r11;
 3154 
 3155     const unsigned char block_size = 16;
 3156     const int bulk_width = 4;
 3157     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3158     // performance with larger data sizes, but it also means that the
 3159     // fast path isn't used until you have at least 8 blocks, and up
 3160     // to 127 bytes of data will be processed on the slow path. For
 3161     // that reason, and also so as not to blow away too much icache, 4
 3162     // blocks seems like a sensible compromise.
 3163 
 3164     // Algorithm:
 3165     //
 3166     //    if (len == 0) {
 3167     //        goto DONE;
 3168     //    }
 3169     //    int result = len;
 3170     //    do {
 3171     //        if (used >= blockSize) {
 3172     //            if (len >= bulk_width * blockSize) {
 3173     //                CTR_large_block();
 3174     //                if (len == 0)
 3175     //                    goto DONE;
 3176     //            }
 3177     //            for (;;) {
 3178     //                16ByteVector v0 = counter;
 3179     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3180     //                used = 0;
 3181     //                if (len < blockSize)
 3182     //                    break;    /* goto NEXT */
 3183     //                16ByteVector v1 = load16Bytes(in, offset);
 3184     //                v1 = v1 ^ encryptedCounter;
 3185     //                store16Bytes(v1, out, offset);
 3186     //                used = blockSize;
 3187     //                offset += blockSize;
 3188     //                len -= blockSize;
 3189     //                if (len == 0)
 3190     //                    goto DONE;
 3191     //            }
 3192     //        }
 3193     //      NEXT:
 3194     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3195     //        len--;
 3196     //    } while (len != 0);
 3197     //  DONE:
 3198     //    return result;
 3199     //
 3200     // CTR_large_block()
 3201     //    Wide bulk encryption of whole blocks.
 3202 
 3203     __ align(CodeEntryAlignment);
 3204     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3205     StubCodeMark mark(this, stub_id);
 3206     const address start = __ pc();
 3207     __ enter();
 3208 
 3209     Label DONE, CTR_large_block, large_block_return;
 3210     __ ldrw(used, Address(used_ptr));
 3211     __ cbzw(saved_len, DONE);
 3212 
 3213     __ mov(len, saved_len);
 3214     __ mov(offset, 0);
 3215 
 3216     // Compute #rounds for AES based on the length of the key array
 3217     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3218 
 3219     __ aesenc_loadkeys(key, keylen);
 3220 
 3221     {
 3222       Label L_CTR_loop, NEXT;
 3223 
 3224       __ bind(L_CTR_loop);
 3225 
 3226       __ cmp(used, block_size);
 3227       __ br(__ LO, NEXT);
 3228 
 3229       // Maybe we have a lot of data
 3230       __ subsw(rscratch1, len, bulk_width * block_size);
 3231       __ br(__ HS, CTR_large_block);
 3232       __ BIND(large_block_return);
 3233       __ cbzw(len, DONE);
 3234 
 3235       // Setup the counter
 3236       __ movi(v4, __ T4S, 0);
 3237       __ movi(v5, __ T4S, 1);
 3238       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3239 
 3240       // 128-bit big-endian increment
 3241       __ ld1(v0, __ T16B, counter);
 3242       __ rev64(v16, __ T16B, v0);
 3243       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3244       __ rev64(v16, __ T16B, v16);
 3245       __ st1(v16, __ T16B, counter);
 3246       // Previous counter value is in v0
 3247       // v4 contains { 0, 1 }
 3248 
 3249       {
 3250         // We have fewer than bulk_width blocks of data left. Encrypt
 3251         // them one by one until there is less than a full block
 3252         // remaining, being careful to save both the encrypted counter
 3253         // and the counter.
 3254 
 3255         Label inner_loop;
 3256         __ bind(inner_loop);
 3257         // Counter to encrypt is in v0
 3258         __ aesecb_encrypt(noreg, noreg, keylen);
 3259         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3260 
 3261         // Do we have a remaining full block?
 3262 
 3263         __ mov(used, 0);
 3264         __ cmp(len, block_size);
 3265         __ br(__ LO, NEXT);
 3266 
 3267         // Yes, we have a full block
 3268         __ ldrq(v1, Address(in, offset));
 3269         __ eor(v1, __ T16B, v1, v0);
 3270         __ strq(v1, Address(out, offset));
 3271         __ mov(used, block_size);
 3272         __ add(offset, offset, block_size);
 3273 
 3274         __ subw(len, len, block_size);
 3275         __ cbzw(len, DONE);
 3276 
 3277         // Increment the counter, store it back
 3278         __ orr(v0, __ T16B, v16, v16);
 3279         __ rev64(v16, __ T16B, v16);
 3280         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3281         __ rev64(v16, __ T16B, v16);
 3282         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3283 
 3284         __ b(inner_loop);
 3285       }
 3286 
 3287       __ BIND(NEXT);
 3288 
 3289       // Encrypt a single byte, and loop.
 3290       // We expect this to be a rare event.
 3291       __ ldrb(rscratch1, Address(in, offset));
 3292       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3293       __ eor(rscratch1, rscratch1, rscratch2);
 3294       __ strb(rscratch1, Address(out, offset));
 3295       __ add(offset, offset, 1);
 3296       __ add(used, used, 1);
 3297       __ subw(len, len, 1);
 3298       __ cbnzw(len, L_CTR_loop);
 3299     }
 3300 
 3301     __ bind(DONE);
 3302     __ strw(used, Address(used_ptr));
 3303     __ mov(r0, saved_len);
 3304 
 3305     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3306     __ ret(lr);
 3307 
 3308     // Bulk encryption
 3309 
 3310     __ BIND(CTR_large_block);
 3311     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3312 
 3313     if (bulk_width == 8) {
 3314       __ sub(sp, sp, 4 * 16);
 3315       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3316     }
 3317     __ sub(sp, sp, 4 * 16);
 3318     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3319     RegSet saved_regs = (RegSet::of(in, out, offset)
 3320                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3321     __ push(saved_regs, sp);
 3322     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3323     __ add(in, in, offset);
 3324     __ add(out, out, offset);
 3325 
 3326     // Keys should already be loaded into the correct registers
 3327 
 3328     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3329     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3330 
 3331     // AES/CTR loop
 3332     {
 3333       Label L_CTR_loop;
 3334       __ BIND(L_CTR_loop);
 3335 
 3336       // Setup the counters
 3337       __ movi(v8, __ T4S, 0);
 3338       __ movi(v9, __ T4S, 1);
 3339       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3340 
 3341       for (int i = 0; i < bulk_width; i++) {
 3342         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3343         __ rev64(v0_ofs, __ T16B, v16);
 3344         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3345       }
 3346 
 3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3348 
 3349       // Encrypt the counters
 3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3351 
 3352       if (bulk_width == 8) {
 3353         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3354       }
 3355 
 3356       // XOR the encrypted counters with the inputs
 3357       for (int i = 0; i < bulk_width; i++) {
 3358         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3359         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3360         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3361       }
 3362 
 3363       // Write the encrypted data
 3364       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3365       if (bulk_width == 8) {
 3366         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3367       }
 3368 
 3369       __ subw(len, len, 16 * bulk_width);
 3370       __ cbnzw(len, L_CTR_loop);
 3371     }
 3372 
 3373     // Save the counter back where it goes
 3374     __ rev64(v16, __ T16B, v16);
 3375     __ st1(v16, __ T16B, counter);
 3376 
 3377     __ pop(saved_regs, sp);
 3378 
 3379     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3380     if (bulk_width == 8) {
 3381       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3382     }
 3383 
 3384     __ andr(rscratch1, len, -16 * bulk_width);
 3385     __ sub(len, len, rscratch1);
 3386     __ add(offset, offset, rscratch1);
 3387     __ mov(used, 16);
 3388     __ strw(used, Address(used_ptr));
 3389     __ b(large_block_return);
 3390 
 3391     return start;
 3392   }
 3393 
 3394   // Vector AES Galois Counter Mode implementation. Parameters:
 3395   //
 3396   // in = c_rarg0
 3397   // len = c_rarg1
 3398   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3399   // out = c_rarg3
 3400   // key = c_rarg4
 3401   // state = c_rarg5 - GHASH.state
 3402   // subkeyHtbl = c_rarg6 - powers of H
 3403   // counter = c_rarg7 - 16 bytes of CTR
 3404   // return - number of processed bytes
 3405   address generate_galoisCounterMode_AESCrypt() {
 3406     address ghash_polynomial = __ pc();
 3407     __ emit_int64(0x87);  // The low-order bits of the field
 3408                           // polynomial (i.e. p = z^7+z^2+z+1)
 3409                           // repeated in the low and high parts of a
 3410                           // 128-bit vector
 3411     __ emit_int64(0x87);
 3412 
 3413     __ align(CodeEntryAlignment);
 3414     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3415     StubCodeMark mark(this, stub_id);
 3416     address start = __ pc();
 3417     __ enter();
 3418 
 3419     const Register in = c_rarg0;
 3420     const Register len = c_rarg1;
 3421     const Register ct = c_rarg2;
 3422     const Register out = c_rarg3;
 3424 
 3425     const Register key = c_rarg4;
 3426     const Register state = c_rarg5;
 3427 
 3428     const Register subkeyHtbl = c_rarg6;
 3429 
 3430     const Register counter = c_rarg7;    // 16-byte counter; updated with the incremented value at the end
 3431 
 3432     const Register keylen = r10;
 3433     // Save state before entering routine
 3434     __ sub(sp, sp, 4 * 16);
 3435     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3436     __ sub(sp, sp, 4 * 16);
 3437     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3438 
 3440     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3441     __ str(len, __ pre(sp, -2 * wordSize));
 3442 
 3443     Label DONE;
 3444     __ cbz(len, DONE);
 3445 
 3446     // Compute #rounds for AES based on the length of the key array
 3447     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3448 
 3449     __ aesenc_loadkeys(key, keylen);
 3450     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3451     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3452 
 3453     // AES/CTR loop
 3454     {
 3455       Label L_CTR_loop;
 3456       __ BIND(L_CTR_loop);
 3457 
 3458       // Setup the counters
 3459       __ movi(v8, __ T4S, 0);
 3460       __ movi(v9, __ T4S, 1);
 3461       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3462 
 3463       assert(v0->encoding() < v8->encoding(), "");
 3464       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3465         FloatRegister f = as_FloatRegister(i);
 3466         __ rev32(f, __ T16B, v16);
 3467         __ addv(v16, __ T4S, v16, v8);
 3468       }
 3469 
 3470       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3471 
 3472       // Encrypt the counters
 3473       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3474 
 3475       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3476 
 3477       // XOR the encrypted counters with the inputs
 3478       for (int i = 0; i < 8; i++) {
 3479         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3480         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3481         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3482       }
 3483       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3484       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3485 
 3486       __ subw(len, len, 16 * 8);
 3487       __ cbnzw(len, L_CTR_loop);
 3488     }
 3489 
 3490     __ rev32(v16, __ T16B, v16);
 3491     __ st1(v16, __ T16B, counter);
 3492 
 3493     __ ldr(len, Address(sp));
 3494     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3495 
 3496     // GHASH/CTR loop
 3497     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3498                                 len, /*unrolls*/4);
 3499 
 3500 #ifdef ASSERT
 3501     { Label L;
 3502       __ cmp(len, (unsigned char)0);
 3503       __ br(Assembler::EQ, L);
 3504       __ stop("stubGenerator: abort");
 3505       __ bind(L);
 3506     }
 3507 #endif
 3508 
 3509     __ bind(DONE);
 3510     // Return the number of bytes processed
 3511     __ ldr(r0, __ post(sp, 2 * wordSize));
 3512 
 3513     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3514     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3515 
 3516     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3517     __ ret(lr);
 3518     return start;
 3519   }
 3520 
 3521   class Cached64Bytes {
 3522   private:
 3523     MacroAssembler *_masm;
 3524     Register _regs[8];
 3525 
 3526   public:
 3527     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3528       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3529       auto it = rs.begin();
 3530       for (auto &r: _regs) {
 3531         r = *it;
 3532         ++it;
 3533       }
 3534     }
 3535 
 3536     void gen_loads(Register base) {
 3537       for (int i = 0; i < 8; i += 2) {
 3538         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3539       }
 3540     }
 3541 
 3542     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3543     void extract_u32(Register dest, int i) {
 3544       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3545     }
 3546   };
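
        // For example (given the register assignment above), extract_u32(dest, 5)
        // copies bits [32, 64) of _regs[2], i.e. the sixth little-endian 32-bit word
        // of the cached 64-byte block, into dest.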
 3547 
 3548   // Utility routines for md5.
 3549   // Clobbers r10 and r11.
 3550   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3551               int k, int s, int t) {
 3552     Register rscratch3 = r10;
 3553     Register rscratch4 = r11;
 3554 
 3555     __ eorw(rscratch3, r3, r4);
 3556     __ movw(rscratch2, t);
 3557     __ andw(rscratch3, rscratch3, r2);
 3558     __ addw(rscratch4, r1, rscratch2);
 3559     reg_cache.extract_u32(rscratch1, k);
 3560     __ eorw(rscratch3, rscratch3, r4);
 3561     __ addw(rscratch4, rscratch4, rscratch1);
 3562     __ addw(rscratch3, rscratch3, rscratch4);
 3563     __ rorw(rscratch2, rscratch3, 32 - s);
 3564     __ addw(r1, rscratch2, r2);
 3565   }
 3566 
 3567   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3568               int k, int s, int t) {
 3569     Register rscratch3 = r10;
 3570     Register rscratch4 = r11;
 3571 
 3572     reg_cache.extract_u32(rscratch1, k);
 3573     __ movw(rscratch2, t);
 3574     __ addw(rscratch4, r1, rscratch2);
 3575     __ addw(rscratch4, rscratch4, rscratch1);
 3576     __ bicw(rscratch2, r3, r4);
 3577     __ andw(rscratch3, r2, r4);
 3578     __ addw(rscratch2, rscratch2, rscratch4);
 3579     __ addw(rscratch2, rscratch2, rscratch3);
 3580     __ rorw(rscratch2, rscratch2, 32 - s);
 3581     __ addw(r1, rscratch2, r2);
 3582   }
 3583 
 3584   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3585               int k, int s, int t) {
 3586     Register rscratch3 = r10;
 3587     Register rscratch4 = r11;
 3588 
 3589     __ eorw(rscratch3, r3, r4);
 3590     __ movw(rscratch2, t);
 3591     __ addw(rscratch4, r1, rscratch2);
 3592     reg_cache.extract_u32(rscratch1, k);
 3593     __ eorw(rscratch3, rscratch3, r2);
 3594     __ addw(rscratch4, rscratch4, rscratch1);
 3595     __ addw(rscratch3, rscratch3, rscratch4);
 3596     __ rorw(rscratch2, rscratch3, 32 - s);
 3597     __ addw(r1, rscratch2, r2);
 3598   }
 3599 
 3600   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3601               int k, int s, int t) {
 3602     Register rscratch3 = r10;
 3603     Register rscratch4 = r11;
 3604 
 3605     __ movw(rscratch3, t);
 3606     __ ornw(rscratch2, r2, r4);
 3607     __ addw(rscratch4, r1, rscratch3);
 3608     reg_cache.extract_u32(rscratch1, k);
 3609     __ eorw(rscratch3, rscratch2, r3);
 3610     __ addw(rscratch4, rscratch4, rscratch1);
 3611     __ addw(rscratch3, rscratch3, rscratch4);
 3612     __ rorw(rscratch2, rscratch3, 32 - s);
 3613     __ addw(r1, rscratch2, r2);
 3614   }
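
        // For reference, these four helpers implement the MD5 round operations from
        // RFC 1321, e.g. FF(a,b,c,d,x[k],s,t) computes a = b + rotl32(a + F(b,c,d) + x[k] + t, s)
        // with F(b,c,d) = (b & c) | (~b & d) (evaluated here as d ^ (b & (c ^ d))),
        // G(b,c,d) = (b & d) | (c & ~d), H(b,c,d) = b ^ c ^ d and I(b,c,d) = c ^ (b | ~d).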
 3615 
 3616   // Arguments:
 3617   //
 3618   // Inputs:
 3619   //   c_rarg0   - byte[]  source+offset
 3620   //   c_rarg1   - int[]   SHA.state
 3621   //   c_rarg2   - int     offset
 3622   //   c_rarg3   - int     limit
 3623   //
 3624   address generate_md5_implCompress(StubId stub_id) {
 3625     bool multi_block;
 3626     switch (stub_id) {
 3627     case StubId::stubgen_md5_implCompress_id:
 3628       multi_block = false;
 3629       break;
 3630     case StubId::stubgen_md5_implCompressMB_id:
 3631       multi_block = true;
 3632       break;
 3633     default:
 3634       ShouldNotReachHere();
 3635     }
 3636     __ align(CodeEntryAlignment);
 3637 
 3638     StubCodeMark mark(this, stub_id);
 3639     address start = __ pc();
 3640 
 3641     Register buf       = c_rarg0;
 3642     Register state     = c_rarg1;
 3643     Register ofs       = c_rarg2;
 3644     Register limit     = c_rarg3;
 3645     Register a         = r4;
 3646     Register b         = r5;
 3647     Register c         = r6;
 3648     Register d         = r7;
 3649     Register rscratch3 = r10;
 3650     Register rscratch4 = r11;
 3651 
 3652     Register state_regs[2] = { r12, r13 };
 3653     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3654     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3655 
 3656     __ push(saved_regs, sp);
 3657 
 3658     __ ldp(state_regs[0], state_regs[1], Address(state));
 3659     __ ubfx(a, state_regs[0],  0, 32);
 3660     __ ubfx(b, state_regs[0], 32, 32);
 3661     __ ubfx(c, state_regs[1],  0, 32);
 3662     __ ubfx(d, state_regs[1], 32, 32);
 3663 
 3664     Label md5_loop;
 3665     __ BIND(md5_loop);
 3666 
 3667     reg_cache.gen_loads(buf);
 3668 
 3669     // Round 1
 3670     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3671     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3672     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3673     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3674     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3675     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3676     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3677     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3678     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3679     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3680     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3681     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3682     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3683     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3684     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3685     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3686 
 3687     // Round 2
 3688     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3689     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3690     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3691     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3692     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3693     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3694     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3695     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3696     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3697     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3698     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3699     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3700     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3701     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3702     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3703     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3704 
 3705     // Round 3
 3706     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3707     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3708     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3709     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3710     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3711     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3712     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3713     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3714     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3715     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3716     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3717     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3718     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3719     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3720     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3721     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3722 
 3723     // Round 4
 3724     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3725     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3726     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3727     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3728     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3729     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3730     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3731     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3732     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3733     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3734     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3735     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3736     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3737     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3738     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3739     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3740 
 3741     __ addw(a, state_regs[0], a);
 3742     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3743     __ addw(b, rscratch2, b);
 3744     __ addw(c, state_regs[1], c);
 3745     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3746     __ addw(d, rscratch4, d);
 3747 
 3748     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3749     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3750 
 3751     if (multi_block) {
 3752       __ add(buf, buf, 64);
 3753       __ add(ofs, ofs, 64);
 3754       __ cmp(ofs, limit);
 3755       __ br(Assembler::LE, md5_loop);
 3756       __ mov(c_rarg0, ofs); // return ofs
 3757     }
 3758 
 3759     // write hash values back in the correct order
 3760     __ stp(state_regs[0], state_regs[1], Address(state));
 3761 
 3762     __ pop(saved_regs, sp);
 3763 
 3764     __ ret(lr);
 3765 
 3766     return start;
 3767   }
 3768 
 3769   // Arguments:
 3770   //
 3771   // Inputs:
 3772   //   c_rarg0   - byte[]  source+offset
 3773   //   c_rarg1   - int[]   SHA.state
 3774   //   c_rarg2   - int     offset
 3775   //   c_rarg3   - int     limit
 3776   //
 3777   address generate_sha1_implCompress(StubId stub_id) {
 3778     bool multi_block;
 3779     switch (stub_id) {
 3780     case StubId::stubgen_sha1_implCompress_id:
 3781       multi_block = false;
 3782       break;
 3783     case StubId::stubgen_sha1_implCompressMB_id:
 3784       multi_block = true;
 3785       break;
 3786     default:
 3787       ShouldNotReachHere();
 3788     }
 3789 
 3790     __ align(CodeEntryAlignment);
 3791 
 3792     StubCodeMark mark(this, stub_id);
 3793     address start = __ pc();
 3794 
 3795     Register buf   = c_rarg0;
 3796     Register state = c_rarg1;
 3797     Register ofs   = c_rarg2;
 3798     Register limit = c_rarg3;
 3799 
 3800     Label keys;
 3801     Label sha1_loop;
 3802 
 3803     // load the keys into v0..v3
 3804     __ adr(rscratch1, keys);
 3805     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3806     // load the 5-word state into v6, v7
 3807     __ ldrq(v6, Address(state, 0));
 3808     __ ldrs(v7, Address(state, 16));
 3809 
 3810 
 3811     __ BIND(sha1_loop);
 3812     // load 64 bytes of data into v16..v19
 3813     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3814     __ rev32(v16, __ T16B, v16);
 3815     __ rev32(v17, __ T16B, v17);
 3816     __ rev32(v18, __ T16B, v18);
 3817     __ rev32(v19, __ T16B, v19);
 3818 
 3819     // do the sha1
 3820     __ addv(v4, __ T4S, v16, v0);
 3821     __ orr(v20, __ T16B, v6, v6);
 3822 
 3823     FloatRegister d0 = v16;
 3824     FloatRegister d1 = v17;
 3825     FloatRegister d2 = v18;
 3826     FloatRegister d3 = v19;
 3827 
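          // Each iteration below covers four of the 80 SHA-1 rounds: sha1c (Ch) for
          // rounds 0-19, sha1p (Parity) for rounds 20-39 and 60-79, and sha1m (Maj)
          // for rounds 40-59, with sha1su0/sha1su1 extending the message schedule.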
 3828     for (int round = 0; round < 20; round++) {
 3829       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3830       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3831       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3832       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3833       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3834 
 3835       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3836       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3837       __ sha1h(tmp2, __ T4S, v20);
 3838       if (round < 5)
 3839         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3840       else if (round < 10 || round >= 15)
 3841         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3842       else
 3843         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3844       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3845 
 3846       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3847     }
 3848 
 3849     __ addv(v7, __ T2S, v7, v21);
 3850     __ addv(v6, __ T4S, v6, v20);
 3851 
 3852     if (multi_block) {
 3853       __ add(ofs, ofs, 64);
 3854       __ cmp(ofs, limit);
 3855       __ br(Assembler::LE, sha1_loop);
 3856       __ mov(c_rarg0, ofs); // return ofs
 3857     }
 3858 
 3859     __ strq(v6, Address(state, 0));
 3860     __ strs(v7, Address(state, 16));
 3861 
 3862     __ ret(lr);
 3863 
 3864     __ bind(keys);
 3865     __ emit_int32(0x5a827999);
 3866     __ emit_int32(0x6ed9eba1);
 3867     __ emit_int32(0x8f1bbcdc);
 3868     __ emit_int32(0xca62c1d6);
 3869 
 3870     return start;
 3871   }
 3872 
 3873 
 3874   // Arguments:
 3875   //
 3876   // Inputs:
 3877   //   c_rarg0   - byte[]  source+offset
 3878   //   c_rarg1   - int[]   SHA.state
 3879   //   c_rarg2   - int     offset
 3880   //   c_rarg3   - int     limit
 3881   //
 3882   address generate_sha256_implCompress(StubId stub_id) {
 3883     bool multi_block;
 3884     switch (stub_id) {
 3885     case StubId::stubgen_sha256_implCompress_id:
 3886       multi_block = false;
 3887       break;
 3888     case StubId::stubgen_sha256_implCompressMB_id:
 3889       multi_block = true;
 3890       break;
 3891     default:
 3892       ShouldNotReachHere();
 3893     }
 3894 
 3895     static const uint32_t round_consts[64] = {
 3896       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3897       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3898       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3899       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3900       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3901       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3902       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3903       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3904       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3905       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3906       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3907       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3908       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3909       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3910       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3911       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3912     };
 3913 
 3914     __ align(CodeEntryAlignment);
 3915 
 3916     StubCodeMark mark(this, stub_id);
 3917     address start = __ pc();
 3918 
 3919     Register buf   = c_rarg0;
 3920     Register state = c_rarg1;
 3921     Register ofs   = c_rarg2;
 3922     Register limit = c_rarg3;
 3923 
 3924     Label sha256_loop;
 3925 
 3926     __ stpd(v8, v9, __ pre(sp, -32));
 3927     __ stpd(v10, v11, Address(sp, 16));
 3928 
 3929     // dga == v0
 3930     // dgb == v1
 3931     // dg0 == v2
 3932     // dg1 == v3
 3933     // dg2 == v4
 3934     // t0 == v6
 3935     // t1 == v7
 3936 
 3937     // load 16 keys to v16..v31
 3938     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3939     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3940     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3941     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3942     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3943 
 3944     // load 8 words (256 bits) state
 3945     __ ldpq(v0, v1, state);
 3946 
 3947     __ BIND(sha256_loop);
 3948     // load 64 bytes of data into v8..v11
 3949     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3950     __ rev32(v8, __ T16B, v8);
 3951     __ rev32(v9, __ T16B, v9);
 3952     __ rev32(v10, __ T16B, v10);
 3953     __ rev32(v11, __ T16B, v11);
 3954 
 3955     __ addv(v6, __ T4S, v8, v16);
 3956     __ orr(v2, __ T16B, v0, v0);
 3957     __ orr(v3, __ T16B, v1, v1);
 3958 
 3959     FloatRegister d0 = v8;
 3960     FloatRegister d1 = v9;
 3961     FloatRegister d2 = v10;
 3962     FloatRegister d3 = v11;
 3963 
 3964 
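          // Each iteration below covers four of the 64 SHA-256 rounds via the
          // sha256h/sha256h2 pair; sha256su0/sha256su1 extend the message schedule
          // during the first 12 iterations.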
 3965     for (int round = 0; round < 16; round++) {
 3966       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3967       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3968       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3969       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3970 
 3971       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3972       __ orr(v4, __ T16B, v2, v2);
 3973       if (round < 15)
 3974         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3975       __ sha256h(v2, __ T4S, v3, tmp2);
 3976       __ sha256h2(v3, __ T4S, v4, tmp2);
 3977       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3978 
 3979       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3980     }
 3981 
 3982     __ addv(v0, __ T4S, v0, v2);
 3983     __ addv(v1, __ T4S, v1, v3);
 3984 
 3985     if (multi_block) {
 3986       __ add(ofs, ofs, 64);
 3987       __ cmp(ofs, limit);
 3988       __ br(Assembler::LE, sha256_loop);
 3989       __ mov(c_rarg0, ofs); // return ofs
 3990     }
 3991 
 3992     __ ldpd(v10, v11, Address(sp, 16));
 3993     __ ldpd(v8, v9, __ post(sp, 32));
 3994 
 3995     __ stpq(v0, v1, state);
 3996 
 3997     __ ret(lr);
 3998 
 3999     return start;
 4000   }
 4001 
 4002   // Double rounds for sha512.
 4003   void sha512_dround(int dr,
 4004                      FloatRegister vi0, FloatRegister vi1,
 4005                      FloatRegister vi2, FloatRegister vi3,
 4006                      FloatRegister vi4, FloatRegister vrc0,
 4007                      FloatRegister vrc1, FloatRegister vin0,
 4008                      FloatRegister vin1, FloatRegister vin2,
 4009                      FloatRegister vin3, FloatRegister vin4) {
 4010       if (dr < 36) {
 4011         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4012       }
 4013       __ addv(v5, __ T2D, vrc0, vin0);
 4014       __ ext(v6, __ T16B, vi2, vi3, 8);
 4015       __ ext(v5, __ T16B, v5, v5, 8);
 4016       __ ext(v7, __ T16B, vi1, vi2, 8);
 4017       __ addv(vi3, __ T2D, vi3, v5);
 4018       if (dr < 32) {
 4019         __ ext(v5, __ T16B, vin3, vin4, 8);
 4020         __ sha512su0(vin0, __ T2D, vin1);
 4021       }
 4022       __ sha512h(vi3, __ T2D, v6, v7);
 4023       if (dr < 32) {
 4024         __ sha512su1(vin0, __ T2D, vin2, v5);
 4025       }
 4026       __ addv(vi4, __ T2D, vi1, vi3);
 4027       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4028   }
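
        // Each sha512_dround call performs two of the 80 SHA-512 rounds, so the 40
        // calls per block in generate_sha512_implCompress below cover all of them;
        // the first 32 calls also extend the message schedule (sha512su0/sha512su1),
        // and round-constant loads stop once the final pair has been fetched (dr >= 36).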
 4029 
 4030   // Arguments:
 4031   //
 4032   // Inputs:
 4033   //   c_rarg0   - byte[]  source+offset
 4034   //   c_rarg1   - int[]   SHA.state
 4035   //   c_rarg2   - int     offset
 4036   //   c_rarg3   - int     limit
 4037   //
 4038   address generate_sha512_implCompress(StubId stub_id) {
 4039     bool multi_block;
 4040     switch (stub_id) {
 4041     case StubId::stubgen_sha512_implCompress_id:
 4042       multi_block = false;
 4043       break;
 4044     case StubId::stubgen_sha512_implCompressMB_id:
 4045       multi_block = true;
 4046       break;
 4047     default:
 4048       ShouldNotReachHere();
 4049     }
 4050 
 4051     static const uint64_t round_consts[80] = {
 4052       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4053       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4054       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4055       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4056       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4057       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4058       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4059       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4060       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4061       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4062       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4063       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4064       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4065       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4066       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4067       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4068       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4069       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4070       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4071       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4072       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4073       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4074       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4075       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4076       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4077       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4078       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4079     };
 4080 
 4081     __ align(CodeEntryAlignment);
 4082 
 4083     StubCodeMark mark(this, stub_id);
 4084     address start = __ pc();
 4085 
 4086     Register buf   = c_rarg0;
 4087     Register state = c_rarg1;
 4088     Register ofs   = c_rarg2;
 4089     Register limit = c_rarg3;
 4090 
 4091     __ stpd(v8, v9, __ pre(sp, -64));
 4092     __ stpd(v10, v11, Address(sp, 16));
 4093     __ stpd(v12, v13, Address(sp, 32));
 4094     __ stpd(v14, v15, Address(sp, 48));
 4095 
 4096     Label sha512_loop;
 4097 
 4098     // load state
 4099     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4100 
 4101     // load first 4 round constants
 4102     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4103     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4104 
 4105     __ BIND(sha512_loop);
 4106     // load 128B of data into v12..v19
 4107     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4108     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4109     __ rev64(v12, __ T16B, v12);
 4110     __ rev64(v13, __ T16B, v13);
 4111     __ rev64(v14, __ T16B, v14);
 4112     __ rev64(v15, __ T16B, v15);
 4113     __ rev64(v16, __ T16B, v16);
 4114     __ rev64(v17, __ T16B, v17);
 4115     __ rev64(v18, __ T16B, v18);
 4116     __ rev64(v19, __ T16B, v19);
 4117 
 4118     __ mov(rscratch2, rscratch1);
 4119 
 4120     __ mov(v0, __ T16B, v8);
 4121     __ mov(v1, __ T16B, v9);
 4122     __ mov(v2, __ T16B, v10);
 4123     __ mov(v3, __ T16B, v11);
 4124 
 4125     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4126     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4127     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4128     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4129     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4130     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4131     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4132     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4133     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4134     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4135     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4136     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4137     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4138     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4139     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4140     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4141     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4142     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4143     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4144     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4145     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4146     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4147     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4148     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4149     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4150     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4151     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4152     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4153     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4154     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4155     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4156     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4157     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4158     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4159     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4160     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4161     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4162     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4163     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4164     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4165 
 4166     __ addv(v8, __ T2D, v8, v0);
 4167     __ addv(v9, __ T2D, v9, v1);
 4168     __ addv(v10, __ T2D, v10, v2);
 4169     __ addv(v11, __ T2D, v11, v3);
 4170 
 4171     if (multi_block) {
 4172       __ add(ofs, ofs, 128);
 4173       __ cmp(ofs, limit);
 4174       __ br(Assembler::LE, sha512_loop);
 4175       __ mov(c_rarg0, ofs); // return ofs
 4176     }
 4177 
 4178     __ st1(v8, v9, v10, v11, __ T2D, state);
 4179 
 4180     __ ldpd(v14, v15, Address(sp, 48));
 4181     __ ldpd(v12, v13, Address(sp, 32));
 4182     __ ldpd(v10, v11, Address(sp, 16));
 4183     __ ldpd(v8, v9, __ post(sp, 64));
 4184 
 4185     __ ret(lr);
 4186 
 4187     return start;
 4188   }
 4189 
 4190   // Execute one round of Keccak on two computations in parallel.
 4191   // One of the states should be loaded into the lower halves of
 4192   // the vector registers v0-v24, the other should be loaded into
 4193   // the upper halves of those registers. The ld1r instruction loads
 4194   // the round constant into both halves of register v31.
 4195   // Intermediate results c0...c5 and d0...d5 are computed
 4196   // in registers v25...v30.
 4197   // All vector instructions that are used operate on both register
 4198   // halves in parallel.
 4199   // If only a single computation is needed, one can load just the lower halves.
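        // For reference, one scalar Keccak-f[1600] round over the flattened
        // 5x5 lane state a[0..24] (index x + 5*y, matching the register
        // comments below) is, in outline:
        //   theta: c[x]  = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
        //          d[x]  = c[(x+4)%5] ^ rol(c[(x+1)%5], 1)
        //          a[i] ^= d[i%5]                       for all 25 lanes
        //   rho/pi: each lane is rotated by its fixed offset and moved to
        //          its new position
        //   chi:   a[x+5*y] ^= ~a[(x+1)%5 + 5*y] & a[(x+2)%5 + 5*y]
        //   iota:  a[0] ^= round_constant[i]
        // The eor3/rax1/xar/bcax sequences below implement these steps for
        // two states at once, one per 64-bit half of each vector register.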
 4200   void keccak_round(Register rscratch1) {
 4201   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4202   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4203   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4204   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4205   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4206   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4207   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4208   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4209   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4210   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4211 
 4212   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4213   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4214   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4215   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4216   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4217 
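        // n.b. xar(d, n, m, r) computes d = ror64(n ^ m, r), so a left
        // rotation by k bits is written below as a right rotation by 64 - k.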
 4218   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4219   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4220   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4221   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4222   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4223   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4224   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4225   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4226   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4227   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4228   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4229   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4230   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4231   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4232   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4233   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4234   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4235   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4236   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4237   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4238   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4239   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4240   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4241   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4242   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4243 
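        // n.b. bcax(d, n, m, a) computes d = n ^ (m & ~a), i.e. the chi
        // step x ^ (~y & z) with y passed as the last operand.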
 4244   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4245   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4246   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4247   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4248   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4249 
 4250   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4251 
 4252   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4253   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4254   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4255   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4256   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4257 
 4258   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4259   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4260   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4261   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4262   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4263 
 4264   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4265   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4266   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4267   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4268   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4269 
 4270   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4271   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4272   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4273   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4274   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4275 
 4276   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4277   }
 4278 
 4279   // Arguments:
 4280   //
 4281   // Inputs:
 4282   //   c_rarg0   - byte[]  source+offset
 4283   //   c_rarg1   - byte[]  SHA.state
 4284   //   c_rarg2   - int     block_size
 4285   //   c_rarg3   - int     offset
 4286   //   c_rarg4   - int     limit
 4287   //
 4288   address generate_sha3_implCompress(StubId stub_id) {
 4289     bool multi_block;
 4290     switch (stub_id) {
 4291     case StubId::stubgen_sha3_implCompress_id:
 4292       multi_block = false;
 4293       break;
 4294     case StubId::stubgen_sha3_implCompressMB_id:
 4295       multi_block = true;
 4296       break;
 4297     default:
 4298       ShouldNotReachHere();
 4299     }
 4300 
 4301     static const uint64_t round_consts[24] = {
 4302       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4303       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4304       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4305       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4306       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4307       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4308       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4309       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4310     };
 4311 
 4312     __ align(CodeEntryAlignment);
 4313 
 4314     StubCodeMark mark(this, stub_id);
 4315     address start = __ pc();
 4316 
 4317     Register buf           = c_rarg0;
 4318     Register state         = c_rarg1;
 4319     Register block_size    = c_rarg2;
 4320     Register ofs           = c_rarg3;
 4321     Register limit         = c_rarg4;
 4322 
 4323     Label sha3_loop, rounds24_loop;
 4324     Label sha3_512_or_sha3_384, shake128;
 4325 
 4326     __ stpd(v8, v9, __ pre(sp, -64));
 4327     __ stpd(v10, v11, Address(sp, 16));
 4328     __ stpd(v12, v13, Address(sp, 32));
 4329     __ stpd(v14, v15, Address(sp, 48));
 4330 
 4331     // load state
 4332     __ add(rscratch1, state, 32);
 4333     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4334     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4335     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4336     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4337     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4338     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4339     __ ld1(v24, __ T1D, rscratch1);
 4340 
 4341     __ BIND(sha3_loop);
 4342 
 4343     // 24 keccak rounds
 4344     __ movw(rscratch2, 24);
 4345 
 4346     // load round_constants base
 4347     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4348 
 4349     // load input
 4350     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4351     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4352     __ eor(v0, __ T8B, v0, v25);
 4353     __ eor(v1, __ T8B, v1, v26);
 4354     __ eor(v2, __ T8B, v2, v27);
 4355     __ eor(v3, __ T8B, v3, v28);
 4356     __ eor(v4, __ T8B, v4, v29);
 4357     __ eor(v5, __ T8B, v5, v30);
 4358     __ eor(v6, __ T8B, v6, v31);
 4359 
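          // The block size is the SHA3/SHAKE rate in bytes and identifies
          // the variant being computed:
          //   72  (0b0100_1000)  SHA3-512
          //   104 (0b0110_1000)  SHA3-384
          //   136 (0b1000_1000)  SHA3-256 / SHAKE256
          //   144 (0b1001_0000)  SHA3-224
          //   168 (0b1010_1000)  SHAKE128
          // so bits 7, 5 and 4 are enough to tell the variants apart, which
          // is what the tbz/andw/tbnz tests below rely on.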
 4360     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4361     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4362 
 4363     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4364     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4365     __ eor(v7, __ T8B, v7, v25);
 4366     __ eor(v8, __ T8B, v8, v26);
 4367     __ eor(v9, __ T8B, v9, v27);
 4368     __ eor(v10, __ T8B, v10, v28);
 4369     __ eor(v11, __ T8B, v11, v29);
 4370     __ eor(v12, __ T8B, v12, v30);
 4371     __ eor(v13, __ T8B, v13, v31);
 4372 
 4373     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4374     __ eor(v14, __ T8B, v14, v25);
 4375     __ eor(v15, __ T8B, v15, v26);
 4376     __ eor(v16, __ T8B, v16, v27);
 4377 
 4378     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4379     __ andw(c_rarg5, block_size, 48);
 4380     __ cbzw(c_rarg5, rounds24_loop);
 4381 
 4382     __ tbnz(block_size, 5, shake128);
 4383     // block_size == 144, bit5 == 0, SHA3-224
 4384     __ ldrd(v28, __ post(buf, 8));
 4385     __ eor(v17, __ T8B, v17, v28);
 4386     __ b(rounds24_loop);
 4387 
 4388     __ BIND(shake128);
 4389     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4390     __ eor(v17, __ T8B, v17, v28);
 4391     __ eor(v18, __ T8B, v18, v29);
 4392     __ eor(v19, __ T8B, v19, v30);
 4393     __ eor(v20, __ T8B, v20, v31);
 4394     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4395 
 4396     __ BIND(sha3_512_or_sha3_384);
 4397     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4398     __ eor(v7, __ T8B, v7, v25);
 4399     __ eor(v8, __ T8B, v8, v26);
 4400     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4401 
 4402     // SHA3-384
 4403     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4404     __ eor(v9,  __ T8B, v9,  v27);
 4405     __ eor(v10, __ T8B, v10, v28);
 4406     __ eor(v11, __ T8B, v11, v29);
 4407     __ eor(v12, __ T8B, v12, v30);
 4408 
 4409     __ BIND(rounds24_loop);
 4410     __ subw(rscratch2, rscratch2, 1);
 4411 
 4412     keccak_round(rscratch1);
 4413 
 4414     __ cbnzw(rscratch2, rounds24_loop);
 4415 
 4416     if (multi_block) {
 4417       __ add(ofs, ofs, block_size);
 4418       __ cmp(ofs, limit);
 4419       __ br(Assembler::LE, sha3_loop);
 4420       __ mov(c_rarg0, ofs); // return ofs
 4421     }
 4422 
 4423     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4424     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4425     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4426     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4427     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4428     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4429     __ st1(v24, __ T1D, state);
 4430 
 4431     // restore callee-saved registers
 4432     __ ldpd(v14, v15, Address(sp, 48));
 4433     __ ldpd(v12, v13, Address(sp, 32));
 4434     __ ldpd(v10, v11, Address(sp, 16));
 4435     __ ldpd(v8, v9, __ post(sp, 64));
 4436 
 4437     __ ret(lr);
 4438 
 4439     return start;
 4440   }
 4441 
 4442   // Inputs:
 4443   //   c_rarg0   - long[]  state0
 4444   //   c_rarg1   - long[]  state1
 4445   address generate_double_keccak() {
 4446     static const uint64_t round_consts[24] = {
 4447       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4448       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4449       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4450       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4451       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4452       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4453       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4454       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4455     };
 4456 
 4457     // Implements the double_keccak() method of the
 4458     // sun.security.provider.SHA3Parallel class
 4459     __ align(CodeEntryAlignment);
 4460     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4461     address start = __ pc();
 4462     __ enter();
 4463 
 4464     Register state0        = c_rarg0;
 4465     Register state1        = c_rarg1;
 4466 
 4467     Label rounds24_loop;
 4468 
 4469     // save callee-saved registers
 4470     __ stpd(v8, v9, __ pre(sp, -64));
 4471     __ stpd(v10, v11, Address(sp, 16));
 4472     __ stpd(v12, v13, Address(sp, 32));
 4473     __ stpd(v14, v15, Address(sp, 48));
 4474 
 4475     // load states
 4476     __ add(rscratch1, state0, 32);
 4477     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4478     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4479     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4480     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4481     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4482     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4483     __ ld1(v24, __ D, 0, rscratch1);
 4484     __ add(rscratch1, state1, 32);
 4485     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4486     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4487     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4488     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4489     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4490     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4491     __ ld1(v24, __ D, 1, rscratch1);
 4492 
 4493     // 24 keccak rounds
 4494     __ movw(rscratch2, 24);
 4495 
 4496     // load round_constants base
 4497     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4498 
 4499     __ BIND(rounds24_loop);
 4500     __ subw(rscratch2, rscratch2, 1);
 4501     keccak_round(rscratch1);
 4502     __ cbnzw(rscratch2, rounds24_loop);
 4503 
 4504     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4505     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4506     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4507     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4508     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4509     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4510     __ st1(v24, __ D, 0, state0);
 4511     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4512     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4513     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4514     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4515     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4516     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4517     __ st1(v24, __ D, 1, state1);
 4518 
 4519     // restore callee-saved vector registers
 4520     __ ldpd(v14, v15, Address(sp, 48));
 4521     __ ldpd(v12, v13, Address(sp, 32));
 4522     __ ldpd(v10, v11, Address(sp, 16));
 4523     __ ldpd(v8, v9, __ post(sp, 64));
 4524 
 4525     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4526     __ mov(r0, zr); // return 0
 4527     __ ret(lr);
 4528 
 4529     return start;
 4530   }
 4531 
 4532   // ChaCha20 block function.  This version parallelizes the 32-bit
 4533   // state elements on each of 16 vectors, producing 4 blocks of
 4534   // keystream at a time.
 4535   //
 4536   // state (int[16]) = c_rarg0
 4537   // keystream (byte[256]) = c_rarg1
 4538   // return - number of bytes of produced keystream (always 256)
 4539   //
 4540   // This implementation takes each 32-bit integer from the state
 4541   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4542   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4543   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4544   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4545   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4546   // operations represented by one QUARTERROUND function, we stack all
 4547   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4548   // and then do the same for the second set of 4 quarter rounds.  This removes
 4549   // some latency that would otherwise be incurred by waiting for an add to
 4550   // complete before performing an xor (which depends on the result of the
 4551   // add), etc. An adjustment happens between the first and second groups of 4
 4552   // quarter rounds, but this is done only in the inputs to the macro functions
 4553   // that generate the assembly instructions - these adjustments themselves are
 4554   // not part of the resulting assembly.
 4555   // The 4 registers v0-v3 are used during the quarter round operations as
 4556   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4557   // registers become the vectors involved in adding the start state back onto
 4558   // the post-QR working state.  After the adds are complete, each of the 16
 4559   // vectors writes its first lane back to the keystream buffer, followed
 4560   // by the second lane from all vectors and so on.
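        // For reference, one RFC 7539 quarter round QUARTERROUND(a, b, c, d)
        // performs:
        //   a += b;  d ^= a;  d <<<= 16;
        //   c += d;  b ^= c;  b <<<= 12;
        //   a += b;  d ^= a;  d <<<= 8;
        //   c += d;  b ^= c;  b <<<= 7;
        // The cc20_qr_* bundles below apply each of these steps to four
        // quarter rounds at once, one per register in the a/b/c/d sets.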
 4561   address generate_chacha20Block_blockpar() {
 4562     Label L_twoRounds, L_cc20_const;
 4563     // The constant data is broken into two 128-bit segments to be loaded
 4564     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4565     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4566     // The second 128-bit segment is a table constant used for 8-bit left rotations.
 4567     __ BIND(L_cc20_const);
 4568     __ emit_int64(0x0000000100000000UL);
 4569     __ emit_int64(0x0000000300000002UL);
 4570     __ emit_int64(0x0605040702010003UL);
 4571     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4572 
 4573     __ align(CodeEntryAlignment);
 4574     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4575     StubCodeMark mark(this, stub_id);
 4576     address start = __ pc();
 4577     __ enter();
 4578 
 4579     int i, j;
 4580     const Register state = c_rarg0;
 4581     const Register keystream = c_rarg1;
 4582     const Register loopCtr = r10;
 4583     const Register tmpAddr = r11;
 4584     const FloatRegister ctrAddOverlay = v28;
 4585     const FloatRegister lrot8Tbl = v29;
 4586 
 4587     // Organize SIMD registers in an array that facilitates
 4588     // putting repetitive opcodes into loop structures.  It is
 4589     // important that each grouping of 4 registers is monotonically
 4590     // increasing to support the requirements of multi-register
 4591     // instructions (e.g. ld4r, st4, etc.)
 4592     const FloatRegister workSt[16] = {
 4593          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4594         v20, v21, v22, v23, v24, v25, v26, v27
 4595     };
 4596 
 4597     // Pull in constant data.  The first 16 bytes are the add overlay
 4598     // which is applied to the vector holding the counter (state[12]).
 4599     // The second 16 bytes is the index register for the 8-bit left
 4600     // rotation tbl instruction.
 4601     __ adr(tmpAddr, L_cc20_const);
 4602     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4603 
 4604     // Load from memory and interlace across 16 SIMD registers,
 4605     // with each word from memory being broadcast to all lanes of
 4606     // each successive SIMD register.
 4607     //      Addr(0) -> All lanes in workSt[i]
 4608     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4609     __ mov(tmpAddr, state);
 4610     for (i = 0; i < 16; i += 4) {
 4611       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4612           __ post(tmpAddr, 16));
 4613     }
 4614     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4615 
 4616     // Before entering the loop, create 5 4-register arrays.  These
 4617     // will hold the 4 registers that represent the a/b/c/d fields
 4618     // in the quarter round operation.  For instance the "b" field
 4619     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4620     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4621     // since it is part of a diagonal organization.  The aSet and scratch
 4622     // register sets are defined at declaration time because they do not change
 4623     // organization at any point during the 20-round processing.
 4624     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4625     FloatRegister bSet[4];
 4626     FloatRegister cSet[4];
 4627     FloatRegister dSet[4];
 4628     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4629 
 4630     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4631     __ mov(loopCtr, 10);
 4632     __ BIND(L_twoRounds);
 4633 
 4634     // Set to columnar organization and do the following 4 quarter-rounds:
 4635     // QUARTERROUND(0, 4, 8, 12)
 4636     // QUARTERROUND(1, 5, 9, 13)
 4637     // QUARTERROUND(2, 6, 10, 14)
 4638     // QUARTERROUND(3, 7, 11, 15)
 4639     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4640     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4641     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4642 
 4643     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4644     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4645     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4646 
 4647     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4648     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4649     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4650 
 4651     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4652     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4653     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4654 
 4655     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4656     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4657     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4658 
 4659     // Set to diagonal organization and do the next 4 quarter-rounds:
 4660     // QUARTERROUND(0, 5, 10, 15)
 4661     // QUARTERROUND(1, 6, 11, 12)
 4662     // QUARTERROUND(2, 7, 8, 13)
 4663     // QUARTERROUND(3, 4, 9, 14)
 4664     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4665     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4666     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4667 
 4668     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4669     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4670     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4671 
 4672     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4673     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4674     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4675 
 4676     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4677     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4678     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4679 
 4680     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4681     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4682     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4683 
 4684     // Decrement and iterate
 4685     __ sub(loopCtr, loopCtr, 1);
 4686     __ cbnz(loopCtr, L_twoRounds);
 4687 
 4688     __ mov(tmpAddr, state);
 4689 
 4690     // Add the starting state back to the post-loop keystream
 4691     // state.  We read/interlace the state array from memory into
 4692     // 4 registers similar to what we did in the beginning.  Then
 4693     // add the counter overlay onto workSt[12] at the end.
 4694     for (i = 0; i < 16; i += 4) {
 4695       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4696       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4697       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4698       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4699       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4700     }
 4701     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4702 
 4703     // Write working state into the keystream buffer.  This is accomplished
 4704     // by taking the lane "i" from each of the four vectors and writing
 4705     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4706     // repeating with the next 4 vectors until all 16 vectors have been used.
 4707     // Then move to the next lane and repeat the process until all lanes have
 4708     // been written.
 4709     for (i = 0; i < 4; i++) {
 4710       for (j = 0; j < 16; j += 4) {
 4711         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4712             __ post(keystream, 16));
 4713       }
 4714     }
 4715 
 4716     __ mov(r0, 256);             // Return length of output keystream
 4717     __ leave();
 4718     __ ret(lr);
 4719 
 4720     return start;
 4721   }
 4722 
 4723   // Helpers to schedule parallel operation bundles across vector
 4724   // register sequences of size 2, 4 or 8.
 4725 
 4726   // Implement various primitive computations across vector sequences
 4727 
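        // For example, if the output VSeq<4> starts at v0 and the two input
        // sequences start at v4 and v8 (all with the default stride of one
        // register), vs_addv(out, __ T8H, in1, in2) simply unrolls to four
        // independent instructions:
        //   addv v0, T8H, v4, v8
        //   addv v1, T8H, v5, v9
        //   addv v2, T8H, v6, v10
        //   addv v3, T8H, v7, v11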
 4728   template<int N>
 4729   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4730                const VSeq<N>& v1, const VSeq<N>& v2) {
 4731     // output must not be constant
 4732     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4733     // output cannot overwrite pending inputs
 4734     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4735     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4736     for (int i = 0; i < N; i++) {
 4737       __ addv(v[i], T, v1[i], v2[i]);
 4738     }
 4739   }
 4740 
 4741   template<int N>
 4742   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4743                const VSeq<N>& v1, const VSeq<N>& v2) {
 4744     // output must not be constant
 4745     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4746     // output cannot overwrite pending inputs
 4747     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4748     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4749     for (int i = 0; i < N; i++) {
 4750       __ subv(v[i], T, v1[i], v2[i]);
 4751     }
 4752   }
 4753 
 4754   template<int N>
 4755   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4756                const VSeq<N>& v1, const VSeq<N>& v2) {
 4757     // output must not be constant
 4758     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4759     // output cannot overwrite pending inputs
 4760     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4761     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4762     for (int i = 0; i < N; i++) {
 4763       __ mulv(v[i], T, v1[i], v2[i]);
 4764     }
 4765   }
 4766 
 4767   template<int N>
 4768   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4769     // output must not be constant
 4770     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4771     // output cannot overwrite pending inputs
 4772     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4773     for (int i = 0; i < N; i++) {
 4774       __ negr(v[i], T, v1[i]);
 4775     }
 4776   }
 4777 
 4778   template<int N>
 4779   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4780                const VSeq<N>& v1, int shift) {
 4781     // output must not be constant
 4782     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4783     // output cannot overwrite pending inputs
 4784     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4785     for (int i = 0; i < N; i++) {
 4786       __ sshr(v[i], T, v1[i], shift);
 4787     }
 4788   }
 4789 
 4790   template<int N>
 4791   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4792     // output must not be constant
 4793     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4794     // output cannot overwrite pending inputs
 4795     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4796     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4797     for (int i = 0; i < N; i++) {
 4798       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4799     }
 4800   }
 4801 
 4802   template<int N>
 4803   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4804     // output must not be constant
 4805     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4806     // output cannot overwrite pending inputs
 4807     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4808     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4809     for (int i = 0; i < N; i++) {
 4810       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4811     }
 4812   }
 4813 
 4814   template<int N>
 4815   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4816     // output must not be constant
 4817     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4818     // output cannot overwrite pending inputs
 4819     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4820     for (int i = 0; i < N; i++) {
 4821       __ notr(v[i], __ T16B, v1[i]);
 4822     }
 4823   }
 4824 
 4825   template<int N>
 4826   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4827     // output must not be constant
 4828     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4829     // output cannot overwrite pending inputs
 4830     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4831     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4832     for (int i = 0; i < N; i++) {
 4833       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4834     }
 4835   }
 4836 
 4837   template<int N>
 4838   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4839     // output must not be constant
 4840     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4841     // output cannot overwrite pending inputs
 4842     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4843     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4844     for (int i = 0; i < N; i++) {
 4845       __ mlsv(v[i], T, v1[i], v2[i]);
 4846     }
 4847   }
 4848 
 4849   // load N/2 successive pairs of quadword values from memory in order
 4850   // into N successive vector registers of the sequence via the
 4851   // address supplied in base.
 4852   template<int N>
 4853   void vs_ldpq(const VSeq<N>& v, Register base) {
 4854     for (int i = 0; i < N; i += 2) {
 4855       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4856     }
 4857   }
 4858 
 4859   // load N/2 successive pairs of quadword values from memory in order
 4860   // into N vector registers of the sequence via the address supplied
 4861   // in base using post-increment addressing
 4862   template<int N>
 4863   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4864     static_assert((N & 1) == 0, "sequence length must be even");
 4865     for (int i = 0; i < N; i += 2) {
 4866       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4867     }
 4868   }
 4869 
 4870   // store N successive vector registers of the sequence into N/2
 4871   // successive pairs of quadword memory locations via the address
 4872   // supplied in base using post-increment addressing
 4873   template<int N>
 4874   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4875     static_assert((N & 1) == 0, "sequence length must be even");
 4876     for (int i = 0; i < N; i += 2) {
 4877       __ stpq(v[i], v[i+1], __ post(base, 32));
 4878     }
 4879   }
 4880 
 4881   // load N/2 pairs of quadword values from memory de-interleaved into
 4882   // N vector registers 2 at a time via the address supplied in base
 4883   // using post-increment addressing.
 4884   template<int N>
 4885   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4886     static_assert((N & 1) == 0, "sequence length must be even");
 4887     for (int i = 0; i < N; i += 2) {
 4888       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4889     }
 4890   }
 4891 
 4892   // store N vector registers interleaved into N/2 pairs of quadword
 4893   // memory locations via the address supplied in base using
 4894   // post-increment addressing.
 4895   template<int N>
 4896   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4897     static_assert((N & 1) == 0, "sequence length must be even");
 4898     for (int i = 0; i < N; i += 2) {
 4899       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4900     }
 4901   }
 4902 
 4903   // load N quadword values from memory de-interleaved into N vector
 4904   // registers 3 elements at a time via the address supplied in base.
 4905   template<int N>
 4906   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4907     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4908     for (int i = 0; i < N; i += 3) {
 4909       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4910     }
 4911   }
 4912 
 4913   // load N quadword values from memory de-interleaved into N vector
 4914   // registers 3 elements at a time via the address supplied in base
 4915   // using post-increment addressing.
 4916   template<int N>
 4917   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4918     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4919     for (int i = 0; i < N; i += 3) {
 4920       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4921     }
 4922   }
 4923 
 4924   // load N/2 pairs of quadword values from memory into N vector
 4925   // registers via the address supplied in base with each pair indexed
 4926   // using the start offset plus the corresponding entry in the
 4927   // offsets array
 4928   template<int N>
 4929   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4930     for (int i = 0; i < N/2; i++) {
 4931       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4932     }
 4933   }
 4934 
 4935   // store N vector registers into N/2 pairs of quadword memory
 4936   // locations via the address supplied in base with each pair indexed
 4937   // using the start offset plus the corresponding entry in the
 4938   // offsets array
 4939   template<int N>
 4940   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4941     for (int i = 0; i < N/2; i++) {
 4942       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4943     }
 4944   }
 4945 
 4946   // load N single quadword values from memory into N vector registers
 4947   // via the address supplied in base with each value indexed using
 4948   // the start offset plus the corresponding entry in the offsets
 4949   // array
 4950   template<int N>
 4951   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4952                       int start, int (&offsets)[N]) {
 4953     for (int i = 0; i < N; i++) {
 4954       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4955     }
 4956   }
 4957 
 4958   // store N vector registers into N single quadword memory locations
 4959   // via the address supplied in base with each value indexed using
 4960   // the start offset plus the corresponding entry in the offsets
 4961   // array
 4962   template<int N>
 4963   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4964                       int start, int (&offsets)[N]) {
 4965     for (int i = 0; i < N; i++) {
 4966       __ str(v[i], T, Address(base, start + offsets[i]));
 4967     }
 4968   }
 4969 
 4970   // load N/2 pairs of quadword values from memory de-interleaved into
 4971   // N vector registers 2 at a time via the address supplied in base
 4972   // with each pair indexed using the start offset plus the
 4973   // corresponding entry in the offsets array
 4974   template<int N>
 4975   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4976                       Register tmp, int start, int (&offsets)[N/2]) {
 4977     for (int i = 0; i < N/2; i++) {
 4978       __ add(tmp, base, start + offsets[i]);
 4979       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4980     }
 4981   }
 4982 
 4983   // store N vector registers 2 at a time interleaved into N/2 pairs
 4984   // of quadword memory locations via the address supplied in base
 4985   // with each pair indexed using the start offset plus the
 4986   // corresponding entry in the offsets array
 4987   template<int N>
 4988   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4989                       Register tmp, int start, int (&offsets)[N/2]) {
 4990     for (int i = 0; i < N/2; i++) {
 4991       __ add(tmp, base, start + offsets[i]);
 4992       __ st2(v[2*i], v[2*i+1], T, tmp);
 4993     }
 4994   }
 4995 
 4996   // Helper routines for various flavours of Montgomery multiply
 4997 
 4998   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 4999   // multiplications in parallel
 5000   //
 5001 
 5002   // See the montMul() method of the sun.security.provider.ML_DSA
 5003   // class.
 5004   //
 5005   // Computes 4x4S results or 4x8H results
 5006   //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 5007   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5008   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5009   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5010   // Outputs: va - 4x4S or 4x8H vector register sequences
 5011   // vb, vc, vtmp and vq must all be disjoint
 5012   // va must be disjoint from all other inputs/temps or must equal vc
 5013   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5014   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
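        // In outline (assuming the usual signed Montgomery reduction with
        // R = 2^MONT_R_BITS and q * q_inv == 1 mod R):
        //   aHigh = (2 * b * c) >> MONT_R_BITS      // sqdmulh
        //   aLow  = (b * c) mod R                   // mulv
        //   m     = (aLow * q_inv) mod R            // mulv
        //   n     = (2 * m * q) >> MONT_R_BITS      // sqdmulh
        //   a     = (aHigh - n) / 2                 // shsubv
        // Since b * c - m * q is a multiple of R the low halves cancel, so
        // a == b * c * R^-1 (mod q), i.e. a Montgomery product.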
 5015   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5016                    Assembler::SIMD_Arrangement T,
 5017                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5018     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5019     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5020     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5021     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5022 
 5023     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5024     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5025 
 5026     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5027 
 5028     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5029     assert(vs_disjoint(va, vb), "va and vb overlap");
 5030     assert(vs_disjoint(va, vq), "va and vq overlap");
 5031     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5032     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5033 
 5034     // schedule 4 streams of instructions across the vector sequences
 5035     for (int i = 0; i < 4; i++) {
 5036       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5037       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5038     }
 5039 
 5040     for (int i = 0; i < 4; i++) {
 5041       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5042     }
 5043 
 5044     for (int i = 0; i < 4; i++) {
 5045       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5046     }
 5047 
 5048     for (int i = 0; i < 4; i++) {
 5049       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5050     }
 5051   }
 5052 
 5053   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5054   // multiplications in parallel
 5055   //
 5056 
 5057   // See the montMul() method of the sun.security.provider.ML_DSA
 5058   // class.
 5059   //
 5060   // Computes 2x4S results or 2x8H results
 5061   //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
 5062   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5063   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5064   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5065   // Outputs: va - 2x4S or 2x8H vector register sequences
 5066   // vb, vc, vtmp and vq must all be disjoint
 5067   // va must be disjoint from all other inputs/temps or must equal vc
 5068   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5069   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5070   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5071                    Assembler::SIMD_Arrangement T,
 5072                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5073     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5074     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5075     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5076     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5077 
 5078     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5079     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5080 
 5081     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5082 
 5083     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5084     assert(vs_disjoint(va, vb), "va and vb overlap");
 5085     assert(vs_disjoint(va, vq), "va and vq overlap");
 5086     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5087     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5088 
 5089     // schedule 2 streams of instructions across the vector sequences
 5090     for (int i = 0; i < 2; i++) {
 5091       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5092       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5093     }
 5094 
 5095     for (int i = 0; i < 2; i++) {
 5096       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5097     }
 5098 
 5099     for (int i = 0; i < 2; i++) {
 5100       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5101     }
 5102 
 5103     for (int i = 0; i < 2; i++) {
 5104       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5105     }
 5106   }
 5107 
 5108   // Perform 16 16-bit Montgomery multiplications in parallel.
 5109   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5110                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5111     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5112     // It will assert that the register use is valid
 5113     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5114   }
 5115 
 5116   // Perform 32 16-bit Montgomery multiplications in parallel.
 5117   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5118                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5119     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5120     // It will assert that the register use is valid
 5121     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5122   }
 5123 
 5124   // Perform 64 16-bit Montgomery multiplications in parallel.
 5125   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5126                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5127     // Schedule two successive 4x8H multiplies via the montmul helper
 5128     // on the front and back halves of va, vb and vc. The helper will
 5129     // assert that the register use has no overlap conflicts on each
 5130     // individual call but we also need to ensure that the necessary
 5131     // disjoint/equality constraints are met across both calls.
 5132 
 5133     // vb, vc, vtmp and vq must be disjoint. va must either be
 5134     // disjoint from all other registers or equal vc
 5135 
 5136     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5137     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5138     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5139 
 5140     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5141     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5142 
 5143     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5144 
 5145     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5146     assert(vs_disjoint(va, vb), "va and vb overlap");
 5147     assert(vs_disjoint(va, vq), "va and vq overlap");
 5148     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5149 
 5150     // we multiply the front and back halves of each sequence 4 at a
 5151     // time because
 5152     //
 5153     // 1) we are currently only able to get 4-way instruction
 5154     // parallelism at best
 5155     //
 5156     // 2) we need registers for the constants in vq and temporary
 5157     // scratch registers to hold intermediate results so vtmp can only
 5158     // be a VSeq<4> which means we only have 4 scratch slots
 5159 
 5160     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5161     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5162   }
 5163 
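        // Cooley-Tukey style butterfly on two VSeq<4> coefficient blocks:
        //   (a0, a1) <- (a0 + zeta * a1, a0 - zeta * a1)
        // where the zeta factors are supplied in vc and the products are
        // Montgomery-reduced.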
 5164   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5165                                const VSeq<4>& vc,
 5166                                const VSeq<4>& vtmp,
 5167                                const VSeq<2>& vq) {
 5168     // compute a = montmul(a1, c)
 5169     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5170     // output a1 = a0 - a
 5171     vs_subv(va1, __ T8H, va0, vc);
 5172     //    and a0 = a0 + a
 5173     vs_addv(va0, __ T8H, va0, vc);
 5174   }
 5175 
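        // Gentleman-Sande style butterfly on two VSeq<4> coefficient blocks:
        //   (a0, a1) <- (a0 + a1, zeta * (a0 - a1))
        // where the zeta factors are supplied in vb; this is the inverse
        // counterpart of the butterfly above.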
 5176   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5177                                const VSeq<4>& vb,
 5178                                const VSeq<4>& vtmp1,
 5179                                const VSeq<4>& vtmp2,
 5180                                const VSeq<2>& vq) {
 5181     // compute c = a0 - a1
 5182     vs_subv(vtmp1, __ T8H, va0, va1);
 5183     // output a0 = a0 + a1
 5184     vs_addv(va0, __ T8H, va0, va1);
 5185     // output a1 = b montmul c
 5186     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5187   }
 5188 
 5189   void load64shorts(const VSeq<8>& v, Register shorts) {
 5190     vs_ldpq_post(v, shorts);
 5191   }
 5192 
 5193   void load32shorts(const VSeq<4>& v, Register shorts) {
 5194     vs_ldpq_post(v, shorts);
 5195   }
 5196 
 5197   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5198     vs_stpq_post(v, tmpAddr);
 5199   }
 5200 
 5201   // Kyber NTT function.
 5202   // Implements
 5203   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5204   //
 5205   // coeffs (short[256]) = c_rarg0
 5206   // ntt_zetas (short[256]) = c_rarg1
 5207   address generate_kyberNtt() {
 5208 
 5209     __ align(CodeEntryAlignment);
 5210     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5211     StubCodeMark mark(this, stub_id);
 5212     address start = __ pc();
 5213     __ enter();
 5214 
 5215     const Register coeffs = c_rarg0;
 5216     const Register zetas = c_rarg1;
 5217 
 5218     const Register kyberConsts = r10;
 5219     const Register tmpAddr = r11;
 5220 
 5221     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5222     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5223     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5224 
 5225     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5226     // load the montmul constants
 5227     vs_ldpq(vq, kyberConsts);
 5228 
 5229     // Each level corresponds to an iteration of the outermost loop of the
 5230     // Java method seilerNTT(int[] coeffs). There are some differences
 5231     // from what is done in the seilerNTT() method, though:
 5232     // 1. The computation uses 16-bit signed values; we do not convert them
 5233     // to ints here.
 5234     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5235     // this array for each level, which makes it easier to fill up the vector
 5236     // registers.
 5237     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5238     // multiplications (that way there should not be any overflow during the
 5239     // inverse NTT computation); here we use R = 2^16 so that we can use the
 5240     // 16-bit arithmetic in the vector unit.
 5241     //
 5242     // On each level, we fill up the vector registers in such a way that the
 5243     // array elements that need to be multiplied by the zetas go into one
 5244     // set of vector registers while the corresponding ones that don't need to
 5245     // be multiplied, go into another set.
 5246     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5247     // registers interleaving the steps of 4 identical computations,
 5248     // each done on 8 16-bit values per register.
 5249 
 5250     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5251     // to the zetas occur in discrete blocks whose size is some multiple
 5252     // of 32.
 5253 
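          // In scalar terms each level performs, for every (lower, upper)
          // pair of coefficient blocks and its zeta:
          //   t             = montmul(zeta, coeffs[upper]);
          //   coeffs[upper] = coeffs[lower] - t;
          //   coeffs[lower] = coeffs[lower] + t;
          // Below, the upper block is loaded into vs1 and multiplied by the
          // zetas into vs2, then the lower block is reloaded into vs1 and the
          // add/sub pair produces the two outputs (vs1 and vs3).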
 5254     // level 0
 5255     __ add(tmpAddr, coeffs, 256);
 5256     load64shorts(vs1, tmpAddr);
 5257     load64shorts(vs2, zetas);
 5258     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5259     __ add(tmpAddr, coeffs, 0);
 5260     load64shorts(vs1, tmpAddr);
 5261     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5262     vs_addv(vs1, __ T8H, vs1, vs2);
 5263     __ add(tmpAddr, coeffs, 0);
 5264     vs_stpq_post(vs1, tmpAddr);
 5265     __ add(tmpAddr, coeffs, 256);
 5266     vs_stpq_post(vs3, tmpAddr);
 5267     // restore montmul constants
 5268     vs_ldpq(vq, kyberConsts);
 5269     load64shorts(vs1, tmpAddr);
 5270     load64shorts(vs2, zetas);
 5271     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5272     __ add(tmpAddr, coeffs, 128);
 5273     load64shorts(vs1, tmpAddr);
 5274     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5275     vs_addv(vs1, __ T8H, vs1, vs2);
 5276     __ add(tmpAddr, coeffs, 128);
 5277     store64shorts(vs1, tmpAddr);
 5278     __ add(tmpAddr, coeffs, 384);
 5279     store64shorts(vs3, tmpAddr);
 5280 
 5281     // level 1
 5282     // restore montmul constants
 5283     vs_ldpq(vq, kyberConsts);
 5284     __ add(tmpAddr, coeffs, 128);
 5285     load64shorts(vs1, tmpAddr);
 5286     load64shorts(vs2, zetas);
 5287     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5288     __ add(tmpAddr, coeffs, 0);
 5289     load64shorts(vs1, tmpAddr);
 5290     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5291     vs_addv(vs1, __ T8H, vs1, vs2);
 5292     __ add(tmpAddr, coeffs, 0);
 5293     store64shorts(vs1, tmpAddr);
 5294     store64shorts(vs3, tmpAddr);
 5295     vs_ldpq(vq, kyberConsts);
 5296     __ add(tmpAddr, coeffs, 384);
 5297     load64shorts(vs1, tmpAddr);
 5298     load64shorts(vs2, zetas);
 5299     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5300     __ add(tmpAddr, coeffs, 256);
 5301     load64shorts(vs1, tmpAddr);
 5302     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5303     vs_addv(vs1, __ T8H, vs1, vs2);
 5304     __ add(tmpAddr, coeffs, 256);
 5305     store64shorts(vs1, tmpAddr);
 5306     store64shorts(vs3, tmpAddr);
 5307 
 5308     // level 2
 5309     vs_ldpq(vq, kyberConsts);
 5310     int offsets1[4] = { 0, 32, 128, 160 };
 5311     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5312     load64shorts(vs2, zetas);
 5313     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5314     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5315     // kyber_subv_addv64();
 5316     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5317     vs_addv(vs1, __ T8H, vs1, vs2);
 5318     __ add(tmpAddr, coeffs, 0);
 5319     vs_stpq_post(vs_front(vs1), tmpAddr);
 5320     vs_stpq_post(vs_front(vs3), tmpAddr);
 5321     vs_stpq_post(vs_back(vs1), tmpAddr);
 5322     vs_stpq_post(vs_back(vs3), tmpAddr);
 5323     vs_ldpq(vq, kyberConsts);
 5324     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5325     load64shorts(vs2, zetas);
 5326     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5327     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5328     // kyber_subv_addv64();
 5329     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5330     vs_addv(vs1, __ T8H, vs1, vs2);
 5331     __ add(tmpAddr, coeffs, 256);
 5332     vs_stpq_post(vs_front(vs1), tmpAddr);
 5333     vs_stpq_post(vs_front(vs3), tmpAddr);
 5334     vs_stpq_post(vs_back(vs1), tmpAddr);
 5335     vs_stpq_post(vs_back(vs3), tmpAddr);
 5336 
 5337     // level 3
 5338     vs_ldpq(vq, kyberConsts);
 5339     int offsets2[4] = { 0, 64, 128, 192 };
 5340     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5341     load64shorts(vs2, zetas);
 5342     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5343     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5344     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5345     vs_addv(vs1, __ T8H, vs1, vs2);
 5346     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5347     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5348 
 5349     vs_ldpq(vq, kyberConsts);
 5350     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5351     load64shorts(vs2, zetas);
 5352     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5353     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5354     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5355     vs_addv(vs1, __ T8H, vs1, vs2);
 5356     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5357     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5358 
 5359     // level 4
 5360     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5361     // so they are loaded using an ldr at 8 distinct offsets.
 5362 
 5363     vs_ldpq(vq, kyberConsts);
 5364     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5365     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5366     load64shorts(vs2, zetas);
 5367     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5368     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5369     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5370     vs_addv(vs1, __ T8H, vs1, vs2);
 5371     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5372     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5373 
 5374     vs_ldpq(vq, kyberConsts);
 5375     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5376     load64shorts(vs2, zetas);
 5377     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5378     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5379     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5380     vs_addv(vs1, __ T8H, vs1, vs2);
 5381     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5382     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5383 
 5384     // level 5
  5385     // At level 5 related coefficients occur in discrete blocks of size 8 so
  5386     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
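           // n.b. the ld2 de-interleaves alternate 8 byte blocks of the input
           // into the even and odd registers of vs1 so that the montmul inputs
           // and the add/sub inputs land in separate register sequences.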
 5387 
 5388     vs_ldpq(vq, kyberConsts);
 5389     int offsets4[4] = { 0, 32, 64, 96 };
 5390     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5391     load32shorts(vs_front(vs2), zetas);
 5392     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5393     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5394     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5395     load32shorts(vs_front(vs2), zetas);
 5396     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5397     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5398     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5399     load32shorts(vs_front(vs2), zetas);
 5400     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5401     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5402 
 5403     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5404     load32shorts(vs_front(vs2), zetas);
 5405     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5406     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5407 
 5408     // level 6
  5409     // At level 6 related coefficients occur in discrete blocks of size 4 so
  5410     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5411 
 5412     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5413     load32shorts(vs_front(vs2), zetas);
 5414     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5415     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5416     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5418     load32shorts(vs_front(vs2), zetas);
 5419     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5420     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5421 
 5422     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5423     load32shorts(vs_front(vs2), zetas);
 5424     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5425     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5426 
 5427     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5428     load32shorts(vs_front(vs2), zetas);
 5429     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5430     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5431 
 5432     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5433     __ mov(r0, zr); // return 0
 5434     __ ret(lr);
 5435 
 5436     return start;
 5437   }
 5438 
 5439   // Kyber Inverse NTT function
 5440   // Implements
 5441   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5442   //
 5443   // coeffs (short[256]) = c_rarg0
 5444   // ntt_zetas (short[256]) = c_rarg1
 5445   address generate_kyberInverseNtt() {
 5446 
 5447     __ align(CodeEntryAlignment);
 5448     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5449     StubCodeMark mark(this, stub_id);
 5450     address start = __ pc();
 5451     __ enter();
 5452 
 5453     const Register coeffs = c_rarg0;
 5454     const Register zetas = c_rarg1;
 5455 
 5456     const Register kyberConsts = r10;
 5457     const Register tmpAddr = r11;
 5458     const Register tmpAddr2 = c_rarg2;
 5459 
 5460     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5461     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5462     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5463 
 5464     __ lea(kyberConsts,
 5465              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5466 
 5467     // level 0
  5468     // At level 0 related coefficients occur in discrete blocks of size 4 so
  5469     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5470 
 5471     vs_ldpq(vq, kyberConsts);
 5472     int offsets4[4] = { 0, 32, 64, 96 };
 5473     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5474     load32shorts(vs_front(vs2), zetas);
 5475     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5476                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5477     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5478     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5479     load32shorts(vs_front(vs2), zetas);
 5480     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5481                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5482     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5483     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5484     load32shorts(vs_front(vs2), zetas);
 5485     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5486                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5487     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5488     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5489     load32shorts(vs_front(vs2), zetas);
 5490     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5491                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5492     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5493 
 5494     // level 1
  5495     // At level 1 related coefficients occur in discrete blocks of size 8 so
  5496     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5497 
 5498     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5499     load32shorts(vs_front(vs2), zetas);
 5500     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5501                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5502     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5503     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5504     load32shorts(vs_front(vs2), zetas);
 5505     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5506                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5507     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5508 
 5509     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5510     load32shorts(vs_front(vs2), zetas);
 5511     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5512                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5513     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5514     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5515     load32shorts(vs_front(vs2), zetas);
 5516     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5517                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5518     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5519 
 5520     // level 2
  5521     // At level 2 coefficients occur in 8 discrete blocks of size 16
  5522     // so they are loaded using an ldr at 8 distinct offsets.
 5523 
 5524     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5525     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5526     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5527     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5528     vs_subv(vs1, __ T8H, vs1, vs2);
 5529     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5530     load64shorts(vs2, zetas);
 5531     vs_ldpq(vq, kyberConsts);
 5532     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5533     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5534 
 5535     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5536     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5537     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5538     vs_subv(vs1, __ T8H, vs1, vs2);
 5539     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5540     load64shorts(vs2, zetas);
 5541     vs_ldpq(vq, kyberConsts);
 5542     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5543     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5544 
 5545     // Barrett reduction at indexes where overflow may happen
 5546 
 5547     // load q and the multiplier for the Barrett reduction
 5548     __ add(tmpAddr, kyberConsts, 16);
 5549     vs_ldpq(vq, tmpAddr);
 5550 
 5551     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5552     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5553     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
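           // As in the kyberBarrettReduce stub below, each reduction computes
           // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 via sqdmulh and sshr
           // and then vs1 <- vs1 - vs2 * kyber_q via mls.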
 5554     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5555     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5556     vs_sshr(vs2, __ T8H, vs2, 11);
 5557     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5558     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5559     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5560     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5561     vs_sshr(vs2, __ T8H, vs2, 11);
 5562     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5563     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5564 
 5565     // level 3
 5566     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5567     // some multiple of 32 so can be loaded using ldpq and suitable indexes.
 5568 
 5569     int offsets2[4] = { 0, 64, 128, 192 };
 5570     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5571     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5572     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5573     vs_subv(vs1, __ T8H, vs1, vs2);
 5574     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5575     load64shorts(vs2, zetas);
 5576     vs_ldpq(vq, kyberConsts);
 5577     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5578     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5579 
 5580     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5581     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5582     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5583     vs_subv(vs1, __ T8H, vs1, vs2);
 5584     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5585     load64shorts(vs2, zetas);
 5586     vs_ldpq(vq, kyberConsts);
 5587     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5588     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5589 
 5590     // level 4
 5591 
 5592     int offsets1[4] = { 0, 32, 128, 160 };
 5593     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5594     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5595     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5596     vs_subv(vs1, __ T8H, vs1, vs2);
 5597     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5598     load64shorts(vs2, zetas);
 5599     vs_ldpq(vq, kyberConsts);
 5600     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5601     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5602 
 5603     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5604     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5605     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5606     vs_subv(vs1, __ T8H, vs1, vs2);
 5607     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5608     load64shorts(vs2, zetas);
 5609     vs_ldpq(vq, kyberConsts);
 5610     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5611     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5612 
 5613     // level 5
 5614 
 5615     __ add(tmpAddr, coeffs, 0);
 5616     load64shorts(vs1, tmpAddr);
 5617     __ add(tmpAddr, coeffs, 128);
 5618     load64shorts(vs2, tmpAddr);
 5619     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5620     vs_subv(vs1, __ T8H, vs1, vs2);
 5621     __ add(tmpAddr, coeffs, 0);
 5622     store64shorts(vs3, tmpAddr);
 5623     load64shorts(vs2, zetas);
 5624     vs_ldpq(vq, kyberConsts);
 5625     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5626     __ add(tmpAddr, coeffs, 128);
 5627     store64shorts(vs2, tmpAddr);
 5628 
 5629     load64shorts(vs1, tmpAddr);
 5630     __ add(tmpAddr, coeffs, 384);
 5631     load64shorts(vs2, tmpAddr);
 5632     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5633     vs_subv(vs1, __ T8H, vs1, vs2);
 5634     __ add(tmpAddr, coeffs, 256);
 5635     store64shorts(vs3, tmpAddr);
 5636     load64shorts(vs2, zetas);
 5637     vs_ldpq(vq, kyberConsts);
 5638     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5639     __ add(tmpAddr, coeffs, 384);
 5640     store64shorts(vs2, tmpAddr);
 5641 
 5642     // Barrett reduction at indexes where overflow may happen
 5643 
 5644     // load q and the multiplier for the Barrett reduction
 5645     __ add(tmpAddr, kyberConsts, 16);
 5646     vs_ldpq(vq, tmpAddr);
 5647 
 5648     int offsets0[2] = { 0, 256 };
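           // Barrett reduce the 16 coefficients at the start of each
           // 128-coefficient half of the array using the same
           // sqdmulh/sshr/mls sequence as at level 2 above.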
 5649     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5650     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5651     vs_sshr(vs2, __ T8H, vs2, 11);
 5652     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5653     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5654 
 5655     // level 6
 5656 
 5657     __ add(tmpAddr, coeffs, 0);
 5658     load64shorts(vs1, tmpAddr);
 5659     __ add(tmpAddr, coeffs, 256);
 5660     load64shorts(vs2, tmpAddr);
 5661     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5662     vs_subv(vs1, __ T8H, vs1, vs2);
 5663     __ add(tmpAddr, coeffs, 0);
 5664     store64shorts(vs3, tmpAddr);
 5665     load64shorts(vs2, zetas);
 5666     vs_ldpq(vq, kyberConsts);
 5667     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5668     __ add(tmpAddr, coeffs, 256);
 5669     store64shorts(vs2, tmpAddr);
 5670 
 5671     __ add(tmpAddr, coeffs, 128);
 5672     load64shorts(vs1, tmpAddr);
 5673     __ add(tmpAddr, coeffs, 384);
 5674     load64shorts(vs2, tmpAddr);
 5675     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5676     vs_subv(vs1, __ T8H, vs1, vs2);
 5677     __ add(tmpAddr, coeffs, 128);
 5678     store64shorts(vs3, tmpAddr);
 5679     load64shorts(vs2, zetas);
 5680     vs_ldpq(vq, kyberConsts);
 5681     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5682     __ add(tmpAddr, coeffs, 384);
 5683     store64shorts(vs2, tmpAddr);
 5684 
 5685     // multiply by 2^-n
 5686 
 5687     // load toMont(2^-n mod q)
 5688     __ add(tmpAddr, kyberConsts, 48);
 5689     __ ldr(v29, __ Q, tmpAddr);
 5690 
 5691     vs_ldpq(vq, kyberConsts);
 5692     __ add(tmpAddr, coeffs, 0);
 5693     load64shorts(vs1, tmpAddr);
 5694     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5695     __ add(tmpAddr, coeffs, 0);
 5696     store64shorts(vs2, tmpAddr);
 5697 
  5698     // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5699     load64shorts(vs1, tmpAddr);
 5700     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5701     __ add(tmpAddr, coeffs, 128);
 5702     store64shorts(vs2, tmpAddr);
 5703 
 5704     // now tmpAddr contains coeffs + 256
 5705     load64shorts(vs1, tmpAddr);
 5706     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5707     __ add(tmpAddr, coeffs, 256);
 5708     store64shorts(vs2, tmpAddr);
 5709 
 5710     // now tmpAddr contains coeffs + 384
 5711     load64shorts(vs1, tmpAddr);
 5712     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5713     __ add(tmpAddr, coeffs, 384);
 5714     store64shorts(vs2, tmpAddr);
 5715 
 5716     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5717     __ mov(r0, zr); // return 0
 5718     __ ret(lr);
 5719 
 5720     return start;
 5721   }
 5722 
 5723   // Kyber multiply polynomials in the NTT domain.
 5724   // Implements
 5725   // static int implKyberNttMult(
 5726   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5727   //
 5728   // result (short[256]) = c_rarg0
 5729   // ntta (short[256]) = c_rarg1
 5730   // nttb (short[256]) = c_rarg2
 5731   // zetas (short[128]) = c_rarg3
 5732   address generate_kyberNttMult() {
 5733 
 5734     __ align(CodeEntryAlignment);
 5735     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5736     StubCodeMark mark(this, stub_id);
 5737     address start = __ pc();
 5738     __ enter();
 5739 
 5740     const Register result = c_rarg0;
 5741     const Register ntta = c_rarg1;
 5742     const Register nttb = c_rarg2;
 5743     const Register zetas = c_rarg3;
 5744 
 5745     const Register kyberConsts = r10;
 5746     const Register limit = r11;
 5747 
 5748     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5749     VSeq<4> vs3(16), vs4(20);
 5750     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5751     VSeq<2> vz(28);          // pair of zetas
 5752     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
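           // n.b. the zero step in vc's constructor means all four of its
           // entries name v27, broadcasting the montRSquareModQ constant
           // loaded below.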
 5753 
 5754     __ lea(kyberConsts,
 5755              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5756 
 5757     Label kyberNttMult_loop;
 5758 
 5759     __ add(limit, result, 512);
 5760 
 5761     // load q and qinv
 5762     vs_ldpq(vq, kyberConsts);
 5763 
 5764     // load R^2 mod q (to convert back from Montgomery representation)
 5765     __ add(kyberConsts, kyberConsts, 64);
 5766     __ ldr(v27, __ Q, kyberConsts);
 5767 
 5768     __ BIND(kyberNttMult_loop);
 5769 
 5770     // load 16 zetas
 5771     vs_ldpq_post(vz, zetas);
 5772 
 5773     // load 2 sets of 32 coefficients from the two input arrays
 5774     // interleaved as shorts. i.e. pairs of shorts adjacent in memory
 5775     // are striped across pairs of vector registers
 5776     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5777     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5778     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5779     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5780 
 5781     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5782     // i.e. montmul the first and second halves of vs1 in order and
 5783     // then with one sequence reversed storing the two results in vs3
 5784     //
 5785     // vs3[0] <- montmul(a0, b0)
 5786     // vs3[1] <- montmul(a1, b1)
 5787     // vs3[2] <- montmul(a0, b1)
 5788     // vs3[3] <- montmul(a1, b0)
 5789     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5790     kyber_montmul16(vs_back(vs3),
 5791                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5792 
 5793     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5794     // i.e. montmul the first and second halves of vs4 in order and
 5795     // then with one sequence reversed storing the two results in vs1
 5796     //
 5797     // vs1[0] <- montmul(a2, b2)
 5798     // vs1[1] <- montmul(a3, b3)
 5799     // vs1[2] <- montmul(a2, b3)
 5800     // vs1[3] <- montmul(a3, b2)
 5801     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5802     kyber_montmul16(vs_back(vs1),
 5803                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5804 
 5805     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5806     // We can schedule two montmuls at a time if we use a suitable vector
 5807     // sequence <vs3[1], vs1[1]>.
 5808     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5809     VSeq<2> vs5(vs3[1], delta);
 5810 
 5811     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5812     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5813     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5814 
 5815     // add results in pairs storing in vs3
 5816     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5817     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5818     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5819 
 5820     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5821     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5822     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
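           // n.b. taken together the two sums per pair implement the Kyber
           // NTT domain base multiplication (modulo Montgomery scaling)
           //   (a0 + a1*X) * (b0 + b1*X) mod (X^2 - zeta)
           //     = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X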
 5823 
 5824     // vs1 <- montmul(vs3, montRSquareModQ)
 5825     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5826 
 5827     // store back the two pairs of result vectors de-interleaved as 8H elements
 5828     // i.e. storing each pairs of shorts striped across a register pair adjacent
 5829     // in memory
 5830     vs_st2_post(vs1, __ T8H, result);
 5831 
 5832     __ cmp(result, limit);
 5833     __ br(Assembler::NE, kyberNttMult_loop);
 5834 
 5835     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5836     __ mov(r0, zr); // return 0
 5837     __ ret(lr);
 5838 
 5839     return start;
 5840   }
 5841 
 5842   // Kyber add 2 polynomials.
 5843   // Implements
 5844   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5845   //
 5846   // result (short[256]) = c_rarg0
 5847   // a (short[256]) = c_rarg1
 5848   // b (short[256]) = c_rarg2
 5849   address generate_kyberAddPoly_2() {
 5850 
 5851     __ align(CodeEntryAlignment);
 5852     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5853     StubCodeMark mark(this, stub_id);
 5854     address start = __ pc();
 5855     __ enter();
 5856 
 5857     const Register result = c_rarg0;
 5858     const Register a = c_rarg1;
 5859     const Register b = c_rarg2;
 5860 
 5861     const Register kyberConsts = r11;
 5862 
 5863     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5864     // So, we can load, add and store the data in 3 groups of 11,
 5865     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5866     // registers. A further constraint is that the mapping needs
 5867     // to skip callee saves. So, we allocate the register
 5868     // sequences using two 8 sequences, two 2 sequences and two
 5869     // single registers.
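           // The 8 and 2 register sequences are used on every pass of the
           // loop below while the extra single register is only used on the
           // first two passes, giving 11 + 11 + 10 = 32 quadwords in total.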
 5870     VSeq<8> vs1_1(0);
 5871     VSeq<2> vs1_2(16);
 5872     FloatRegister vs1_3 = v28;
 5873     VSeq<8> vs2_1(18);
 5874     VSeq<2> vs2_2(26);
 5875     FloatRegister vs2_3 = v29;
 5876 
 5877     // two constant vector sequences
 5878     VSeq<8> vc_1(31, 0);
 5879     VSeq<2> vc_2(31, 0);
 5880 
 5881     FloatRegister vc_3 = v31;
 5882     __ lea(kyberConsts,
 5883              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5884 
 5885     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5886     for (int i = 0; i < 3; i++) {
 5887       // load 80 or 88 values from a into vs1_1/2/3
 5888       vs_ldpq_post(vs1_1, a);
 5889       vs_ldpq_post(vs1_2, a);
 5890       if (i < 2) {
 5891         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5892       }
 5893       // load 80 or 88 values from b into vs2_1/2/3
 5894       vs_ldpq_post(vs2_1, b);
 5895       vs_ldpq_post(vs2_2, b);
 5896       if (i < 2) {
 5897         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5898       }
 5899       // sum 80 or 88 values across vs1 and vs2 into vs1
 5900       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5901       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5902       if (i < 2) {
 5903         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5904       }
 5905       // add constant to all 80 or 88 results
 5906       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5907       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5908       if (i < 2) {
 5909         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5910       }
 5911       // store 80 or 88 values
 5912       vs_stpq_post(vs1_1, result);
 5913       vs_stpq_post(vs1_2, result);
 5914       if (i < 2) {
 5915         __ str(vs1_3, __ Q, __ post(result, 16));
 5916       }
 5917     }
 5918 
 5919     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5920     __ mov(r0, zr); // return 0
 5921     __ ret(lr);
 5922 
 5923     return start;
 5924   }
 5925 
 5926   // Kyber add 3 polynomials.
 5927   // Implements
 5928   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5929   //
 5930   // result (short[256]) = c_rarg0
 5931   // a (short[256]) = c_rarg1
 5932   // b (short[256]) = c_rarg2
 5933   // c (short[256]) = c_rarg3
 5934   address generate_kyberAddPoly_3() {
 5935 
 5936     __ align(CodeEntryAlignment);
 5937     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5938     StubCodeMark mark(this, stub_id);
 5939     address start = __ pc();
 5940     __ enter();
 5941 
 5942     const Register result = c_rarg0;
 5943     const Register a = c_rarg1;
 5944     const Register b = c_rarg2;
 5945     const Register c = c_rarg3;
 5946 
 5947     const Register kyberConsts = r11;
 5948 
 5949     // As above we sum 256 sets of values in total i.e. 32 x 8H
 5950     // quadwords.  So, we can load, add and store the data in 3
 5951     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 5952     // of 10 or 11 registers. A further constraint is that the
 5953     // mapping needs to skip callee saves. So, we allocate the
 5954     // register sequences using two 8 sequences, two 2 sequences
 5955     // and two single registers.
 5956     VSeq<8> vs1_1(0);
 5957     VSeq<2> vs1_2(16);
 5958     FloatRegister vs1_3 = v28;
 5959     VSeq<8> vs2_1(18);
 5960     VSeq<2> vs2_2(26);
 5961     FloatRegister vs2_3 = v29;
 5962 
 5963     // two constant vector sequences
 5964     VSeq<8> vc_1(31, 0);
 5965     VSeq<2> vc_2(31, 0);
 5966 
 5967     FloatRegister vc_3 = v31;
 5968 
 5969     __ lea(kyberConsts,
 5970              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5971 
 5972     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5973     for (int i = 0; i < 3; i++) {
 5974       // load 80 or 88 values from a into vs1_1/2/3
 5975       vs_ldpq_post(vs1_1, a);
 5976       vs_ldpq_post(vs1_2, a);
 5977       if (i < 2) {
 5978         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5979       }
 5980       // load 80 or 88 values from b into vs2_1/2/3
 5981       vs_ldpq_post(vs2_1, b);
 5982       vs_ldpq_post(vs2_2, b);
 5983       if (i < 2) {
 5984         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5985       }
 5986       // sum 80 or 88 values across vs1 and vs2 into vs1
 5987       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5988       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5989       if (i < 2) {
 5990         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5991       }
 5992       // load 80 or 88 values from c into vs2_1/2/3
 5993       vs_ldpq_post(vs2_1, c);
 5994       vs_ldpq_post(vs2_2, c);
 5995       if (i < 2) {
 5996         __ ldr(vs2_3, __ Q, __ post(c, 16));
 5997       }
 5998       // sum 80 or 88 values across vs1 and vs2 into vs1
 5999       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6000       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6001       if (i < 2) {
 6002         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6003       }
 6004       // add constant to all 80 or 88 results
 6005       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6006       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6007       if (i < 2) {
 6008         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6009       }
 6010       // store 80 or 88 values
 6011       vs_stpq_post(vs1_1, result);
 6012       vs_stpq_post(vs1_2, result);
 6013       if (i < 2) {
 6014         __ str(vs1_3, __ Q, __ post(result, 16));
 6015       }
 6016     }
 6017 
 6018     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6019     __ mov(r0, zr); // return 0
 6020     __ ret(lr);
 6021 
 6022     return start;
 6023   }
 6024 
 6025   // Kyber parse XOF output to polynomial coefficient candidates
 6026   // or decodePoly(12, ...).
 6027   // Implements
 6028   // static int implKyber12To16(
 6029   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6030   //
 6031   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6032   //
 6033   // condensed (byte[]) = c_rarg0
 6034   // condensedIndex = c_rarg1
 6035   // parsed (short[112 or 256]) = c_rarg2
 6036   // parsedLength (112 or 256) = c_rarg3
 6037   address generate_kyber12To16() {
 6038     Label L_F00, L_loop, L_end;
 6039 
 6040     __ BIND(L_F00);
 6041     __ emit_int64(0x0f000f000f000f00);
 6042     __ emit_int64(0x0f000f000f000f00);
 6043 
 6044     __ align(CodeEntryAlignment);
 6045     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6046     StubCodeMark mark(this, stub_id);
 6047     address start = __ pc();
 6048     __ enter();
 6049 
 6050     const Register condensed = c_rarg0;
 6051     const Register condensedOffs = c_rarg1;
 6052     const Register parsed = c_rarg2;
 6053     const Register parsedLength = c_rarg3;
 6054 
 6055     const Register tmpAddr = r11;
 6056 
 6057     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6058     // quadwords so we need a 6 vector sequence for the inputs.
 6059     // Parsing produces 64 shorts, employing two 8 vector
 6060     // sequences to store and combine the intermediate data.
 6061     VSeq<6> vin(24);
 6062     VSeq<8> va(0), vb(16);
 6063 
 6064     __ adr(tmpAddr, L_F00);
 6065     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6066     __ add(condensed, condensed, condensedOffs);
 6067 
 6068     __ BIND(L_loop);
 6069     // load 96 (6 x 16B) byte values
 6070     vs_ld3_post(vin, __ T16B, condensed);
 6071 
 6072     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6073     // holds 48 (16x3) contiguous bytes from memory striped
 6074     // horizontally across each of the 16 byte lanes. Equivalently,
 6075     // that is 16 pairs of 12-bit integers. Likewise the back half
 6076     // holds the next 48 bytes in the same arrangement.
 6077 
 6078     // Each vector in the front half can also be viewed as a vertical
 6079     // strip across the 16 pairs of 12 bit integers. Each byte in
 6080     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6081     // byte in vin[1] stores the high 4 bits of the first int and the
 6082     // low 4 bits of the second int. Each byte in vin[2] stores the
 6083     // high 8 bits of the second int. Likewise the vectors in second
 6084     // half.
 6085 
 6086     // Converting the data to 16-bit shorts requires first of all
 6087     // expanding each of the 6 x 16B vectors into 6 corresponding
 6088     // pairs of 8H vectors. Mask, shift and add operations on the
 6089     // resulting vector pairs can be used to combine 4 and 8 bit
 6090     // parts of related 8H vector elements.
 6091     //
  6092     // The middle vectors (vin[1] and vin[4]) are actually expanded
  6093     // twice, one copy manipulated to provide the high 4 bits
  6094     // belonging to the first short in a pair and another copy
  6095     // manipulated to provide the low 4 bits belonging to the
  6096     // second short in a pair. This is why the vector sequences va
  6097     // and vb used to hold the expanded 8H elements are of length 8.
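           // In scalar terms, for a byte triple (b0, b1, b2) encoding a pair
           // of 12-bit ints (x, y) the combination described above amounts to
           // the identity (shown only as an illustration, not extra code)
           //
           //   x = b0 | ((b1 & 0x0f) << 8)
           //   y = (b1 >> 4) | (b2 << 4)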
 6098 
 6099     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6100     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6101     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6102     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6103     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6104     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6105     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6106     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6107 
 6108     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6109     // and vb[4:5]
 6110     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6111     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6112     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6113     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6114     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6115     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6116 
 6117     // shift lo byte of copy 1 of the middle stripe into the high byte
 6118     __ shl(va[2], __ T8H, va[2], 8);
 6119     __ shl(va[3], __ T8H, va[3], 8);
 6120     __ shl(vb[2], __ T8H, vb[2], 8);
 6121     __ shl(vb[3], __ T8H, vb[3], 8);
 6122 
 6123     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6124     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6125     // are in bit positions [4..11].
 6126     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6127     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6128     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6129     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6130 
 6131     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6132     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6133     // copy2
 6134     __ andr(va[2], __ T16B, va[2], v31);
 6135     __ andr(va[3], __ T16B, va[3], v31);
 6136     __ ushr(va[4], __ T8H, va[4], 4);
 6137     __ ushr(va[5], __ T8H, va[5], 4);
 6138     __ andr(vb[2], __ T16B, vb[2], v31);
 6139     __ andr(vb[3], __ T16B, vb[3], v31);
 6140     __ ushr(vb[4], __ T8H, vb[4], 4);
 6141     __ ushr(vb[5], __ T8H, vb[5], 4);
 6142 
 6143     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6144     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6145     // n.b. the ordering ensures: i) inputs are consumed before they
 6146     // are overwritten ii) the order of 16-bit results across successive
 6147     // pairs of vectors in va and then vb reflects the order of the
 6148     // corresponding 12-bit inputs
 6149     __ addv(va[0], __ T8H, va[0], va[2]);
 6150     __ addv(va[2], __ T8H, va[1], va[3]);
 6151     __ addv(va[1], __ T8H, va[4], va[6]);
 6152     __ addv(va[3], __ T8H, va[5], va[7]);
 6153     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6154     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6155     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6156     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6157 
 6158     // store 64 results interleaved as shorts
 6159     vs_st2_post(vs_front(va), __ T8H, parsed);
 6160     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6161 
 6162     __ sub(parsedLength, parsedLength, 64);
 6163     __ cmp(parsedLength, (u1)64);
 6164     __ br(Assembler::GE, L_loop);
 6165     __ cbz(parsedLength, L_end);
 6166 
  6167     // If anything is left it should be a final 72 bytes of input
  6168     // i.e. a final 48 12-bit values. So we handle this by loading
  6169     // 48 bytes into all 16B lanes of front(vin) and only 24
  6170     // bytes into the lower 8B lanes of back(vin).
 6171     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6172     vs_ld3(vs_back(vin), __ T8B, condensed);
 6173 
 6174     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6175     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6176     // 5 and target element 2 of vb duplicates element 4.
 6177     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6178     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6179     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6180     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6181     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6182     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6183 
 6184     // This time expand just the lower 8 lanes
 6185     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6186     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6187     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6188 
 6189     // shift lo byte of copy 1 of the middle stripe into the high byte
 6190     __ shl(va[2], __ T8H, va[2], 8);
 6191     __ shl(va[3], __ T8H, va[3], 8);
 6192     __ shl(vb[2], __ T8H, vb[2], 8);
 6193 
 6194     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6195     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6196     // int are in bit positions [4..11].
 6197     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6198     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6199     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6200 
 6201     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6202     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6203     // copy2
 6204     __ andr(va[2], __ T16B, va[2], v31);
 6205     __ andr(va[3], __ T16B, va[3], v31);
 6206     __ ushr(va[4], __ T8H, va[4], 4);
 6207     __ ushr(va[5], __ T8H, va[5], 4);
 6208     __ andr(vb[2], __ T16B, vb[2], v31);
 6209     __ ushr(vb[4], __ T8H, vb[4], 4);
  6210 
 6213     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6214     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6215 
 6216     // n.b. ordering ensures: i) inputs are consumed before they are
  6217     // overwritten ii) order of 16-bit results across successive
 6218     // pairs of vectors in va and then lower half of vb reflects order
 6219     // of corresponding 12-bit inputs
 6220     __ addv(va[0], __ T8H, va[0], va[2]);
 6221     __ addv(va[2], __ T8H, va[1], va[3]);
 6222     __ addv(va[1], __ T8H, va[4], va[6]);
 6223     __ addv(va[3], __ T8H, va[5], va[7]);
 6224     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6225     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6226 
 6227     // store 48 results interleaved as shorts
 6228     vs_st2_post(vs_front(va), __ T8H, parsed);
 6229     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6230 
 6231     __ BIND(L_end);
 6232 
 6233     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6234     __ mov(r0, zr); // return 0
 6235     __ ret(lr);
 6236 
 6237     return start;
 6238   }
 6239 
 6240   // Kyber Barrett reduce function.
 6241   // Implements
 6242   // static int implKyberBarrettReduce(short[] coeffs) {}
 6243   //
 6244   // coeffs (short[256]) = c_rarg0
 6245   address generate_kyberBarrettReduce() {
 6246 
 6247     __ align(CodeEntryAlignment);
 6248     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6249     StubCodeMark mark(this, stub_id);
 6250     address start = __ pc();
 6251     __ enter();
 6252 
 6253     const Register coeffs = c_rarg0;
 6254 
 6255     const Register kyberConsts = r10;
 6256     const Register result = r11;
 6257 
 6258     // As above we process 256 sets of values in total i.e. 32 x
 6259     // 8H quadwords. So, we can load, add and store the data in 3
 6260     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6261     // of 10 or 11 registers. A further constraint is that the
 6262     // mapping needs to skip callee saves. So, we allocate the
 6263     // register sequences using two 8 sequences, two 2 sequences
 6264     // and two single registers.
 6265     VSeq<8> vs1_1(0);
 6266     VSeq<2> vs1_2(16);
 6267     FloatRegister vs1_3 = v28;
 6268     VSeq<8> vs2_1(18);
 6269     VSeq<2> vs2_2(26);
 6270     FloatRegister vs2_3 = v29;
 6271 
 6272     // we also need a pair of corresponding constant sequences
 6273 
 6274     VSeq<8> vc1_1(30, 0);
 6275     VSeq<2> vc1_2(30, 0);
 6276     FloatRegister vc1_3 = v30; // for kyber_q
 6277 
 6278     VSeq<8> vc2_1(31, 0);
 6279     VSeq<2> vc2_2(31, 0);
 6280     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6281 
 6282     __ add(result, coeffs, 0);
 6283     __ lea(kyberConsts,
 6284              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6285 
 6286     // load q and the multiplier for the Barrett reduction
 6287     __ add(kyberConsts, kyberConsts, 16);
 6288     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6289 
 6290     for (int i = 0; i < 3; i++) {
 6291       // load 80 or 88 coefficients
 6292       vs_ldpq_post(vs1_1, coeffs);
 6293       vs_ldpq_post(vs1_2, coeffs);
 6294       if (i < 2) {
 6295         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6296       }
 6297 
 6298       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6299       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6300       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6301       if (i < 2) {
 6302         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6303       }
 6304 
 6305       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6306       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6307       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6308       if (i < 2) {
 6309         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6310       }
 6311 
 6312       // vs1 <- vs1 - vs2 * kyber_q
 6313       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6314       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6315       if (i < 2) {
 6316         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6317       }
 6318 
 6319       vs_stpq_post(vs1_1, result);
 6320       vs_stpq_post(vs1_2, result);
 6321       if (i < 2) {
 6322         __ str(vs1_3, __ Q, __ post(result, 16));
 6323       }
 6324     }
 6325 
 6326     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6327     __ mov(r0, zr); // return 0
 6328     __ ret(lr);
 6329 
 6330     return start;
 6331   }
 6332 
 6333 
 6334   // Dilithium-specific montmul helper routines that generate parallel
 6335   // code for, respectively, a single 4x4s vector sequence montmul or
 6336   // two such multiplies in a row.
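         // n.b. in the comments that follow montmul(a, b) denotes a Montgomery
         // multiplication i.e. a * b * R^-1 mod q, where q is the Dilithium
         // modulus and R is the Montgomery radix used by the Java code.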
 6337 
 6338   // Perform 16 32-bit Montgomery multiplications in parallel
 6339   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6340                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6341     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6342     // It will assert that the register use is valid
 6343     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6344   }
 6345 
 6346   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6347   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6348                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6349     // Schedule two successive 4x4S multiplies via the montmul helper
 6350     // on the front and back halves of va, vb and vc. The helper will
 6351     // assert that the register use has no overlap conflicts on each
 6352     // individual call but we also need to ensure that the necessary
 6353     // disjoint/equality constraints are met across both calls.
 6354 
 6355     // vb, vc, vtmp and vq must be disjoint. va must either be
 6356     // disjoint from all other registers or equal vc
 6357 
 6358     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6359     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6360     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6361 
 6362     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6363     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6364 
 6365     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6366 
 6367     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6368     assert(vs_disjoint(va, vb), "va and vb overlap");
 6369     assert(vs_disjoint(va, vq), "va and vq overlap");
 6370     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6371 
 6372     // We multiply the front and back halves of each sequence 4 at a
 6373     // time because
 6374     //
 6375     // 1) we are currently only able to get 4-way instruction
 6376     // parallelism at best
 6377     //
 6378     // 2) we need registers for the constants in vq and temporary
 6379     // scratch registers to hold intermediate results so vtmp can only
 6380     // be a VSeq<4> which means we only have 4 scratch slots.
 6381 
 6382     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6383     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6384   }
 6385 
 6386   // Perform combined montmul then add/sub on 4x4S vectors.
 6387   void dilithium_montmul16_sub_add(
 6388           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6389           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6390     // compute a = montmul(a1, c)
 6391     dilithium_montmul16(vc, va1, vc, vtmp, vq);
  6392     // output a1 = a0 - a
 6393     vs_subv(va1, __ T4S, va0, vc);
 6394     //    and a0 = a0 + a
 6395     vs_addv(va0, __ T4S, va0, vc);
 6396   }
 6397 
  6398   // Perform combined add/sub then montmul on 4x4S vectors.
 6399   void dilithium_sub_add_montmul16(
 6400           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6401           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6402     // compute c = a0 - a1
 6403     vs_subv(vtmp1, __ T4S, va0, va1);
 6404     // output a0 = a0 + a1
 6405     vs_addv(va0, __ T4S, va0, va1);
 6406     // output a1 = b montmul c
 6407     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6408   }
 6409 
 6410   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6411   // in the Java implementation come in sequences of at least 8, so we
 6412   // can use ldpq to collect the corresponding data into pairs of vector
 6413   // registers.
 6414   // We collect the coefficients corresponding to the 'j+l' indexes into
 6415   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6416   // then we do the (Montgomery) multiplications by the zetas in parallel
 6417   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6418   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6419   // v0-v7 and finally save the results back to the coeffs array.
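         // In terms of the register sequences declared below, v0-v7 is vs1,
         // v16-v23 is vs2 and v24-v31 is vs3.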
 6420   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6421     const Register coeffs, const Register zetas) {
 6422     int c1 = 0;
 6423     int c2 = 512;
 6424     int startIncr;
 6425     // don't use callee save registers v8 - v15
 6426     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6427     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6428     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6429     int offsets[4] = { 0, 32, 64, 96 };
 6430 
 6431     for (int level = 0; level < 5; level++) {
 6432       int c1Start = c1;
 6433       int c2Start = c2;
 6434       if (level == 3) {
 6435         offsets[1] = 32;
 6436         offsets[2] = 128;
 6437         offsets[3] = 160;
 6438       } else if (level == 4) {
 6439         offsets[1] = 64;
 6440         offsets[2] = 128;
 6441         offsets[3] = 192;
 6442       }
 6443 
 6444       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6445       // time at 4 different offsets and multiply them in order by the
 6446       // next set of input values. So we employ indexed load and store
 6447       // pair instructions with arrangement 4S.
 6448       for (int i = 0; i < 4; i++) {
 6449         // reload q and qinv
 6450         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6451         // load 8x4S coefficients via second start pos == c2
 6452         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6453         // load next 8x4S inputs == b
 6454         vs_ldpq_post(vs2, zetas);
 6455         // compute a == c2 * b mod MONT_Q
 6456         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6457         // load 8x4s coefficients via first start pos == c1
 6458         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6459         // compute a1 =  c1 + a
 6460         vs_addv(vs3, __ T4S, vs1, vs2);
 6461         // compute a2 =  c1 - a
 6462         vs_subv(vs1, __ T4S, vs1, vs2);
 6463         // output a1 and a2
 6464         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6465         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6466 
 6467         int k = 4 * level + i;
 6468 
 6469         if (k > 7) {
 6470           startIncr = 256;
 6471         } else if (k == 5) {
 6472           startIncr = 384;
 6473         } else {
 6474           startIncr = 128;
 6475         }
 6476 
 6477         c1Start += startIncr;
 6478         c2Start += startIncr;
 6479       }
 6480 
 6481       c2 /= 2;
 6482     }
 6483   }
 6484 
 6485   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6486   // Implements the method
 6487   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  6488   // of the Java class sun.security.provider.ML_DSA
 6489   //
 6490   // coeffs (int[256]) = c_rarg0
 6491   // zetas (int[256]) = c_rarg1
 6492   address generate_dilithiumAlmostNtt() {
 6493 
 6494     __ align(CodeEntryAlignment);
 6495     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6496     StubCodeMark mark(this, stub_id);
 6497     address start = __ pc();
 6498     __ enter();
 6499 
 6500     const Register coeffs = c_rarg0;
 6501     const Register zetas = c_rarg1;
 6502 
 6503     const Register tmpAddr = r9;
 6504     const Register dilithiumConsts = r10;
 6505     const Register result = r11;
 6506     // don't use callee save registers v8 - v15
 6507     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6508     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6509     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6510     int offsets[4] = { 0, 32, 64, 96};
 6511     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6512     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6513     __ add(result, coeffs, 0);
 6514     __ lea(dilithiumConsts,
 6515              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6516 
 6517     // Each level represents one iteration of the outer for loop of the Java version.
 6518 
 6519     // level 0-4
 6520     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6521 
 6522     // level 5
 6523 
 6524     // At level 5 the coefficients we need to combine with the zetas
 6525     // are grouped in memory in blocks of size 4. So, for both sets of
 6526     // coefficients we load 4 adjacent values at 8 different offsets
 6527     // using an indexed ldr with register variant Q and multiply them
 6528     // in sequence order by the next set of inputs. Likewise we store
  6529     // the results using an indexed str with register variant Q.
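           // (offsets1 picks out the coefficients at the 'j+l' indexes, which
           // are the ones multiplied by the zetas, and offsets2 those at the
           // 'j' indexes.)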
 6530     for (int i = 0; i < 1024; i += 256) {
 6531       // reload constants q, qinv each iteration as they get clobbered later
 6532       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6533       // load 32 (8x4S) coefficients via first offsets = c1
 6534       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6535       // load next 32 (8x4S) inputs = b
 6536       vs_ldpq_post(vs2, zetas);
  6537       // a = b montmul c1
 6538       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6539       // load 32 (8x4S) coefficients via second offsets = c2
 6540       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6541       // add/sub with result of multiply
  6542       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
  6543       vs_subv(vs1, __ T4S, vs1, vs2);     // a0 = c2 - a
 6544       // write back new coefficients using same offsets
 6545       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6546       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6547     }
 6548 
 6549     // level 6
 6550     // At level 6 the coefficients we need to combine with the zetas
 6551     // are grouped in memory in pairs, the first two being montmul
 6552     // inputs and the second add/sub inputs. We can still implement
 6553     // the montmul+sub+add using 4-way parallelism but only if we
 6554     // combine the coefficients with the zetas 16 at a time. We load 8
 6555     // adjacent values at 4 different offsets using an ld2 load with
 6556     // arrangement 2D. That interleaves the lower and upper halves of
 6557     // each pair of quadwords into successive vector registers. We
 6558     // then need to montmul the 4 even elements of the coefficients
 6559     // register sequence by the zetas in order and then add/sub the 4
 6560     // odd elements of the coefficients register sequence. We use an
 6561     // equivalent st2 operation to store the results back into memory
 6562     // de-interleaved.
 6563     for (int i = 0; i < 1024; i += 128) {
 6564       // reload constants q, qinv each iteration as they get clobbered later
 6565       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6566       // load interleaved 16 (4x2D) coefficients via offsets
 6567       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6568       // load next 16 (4x4S) inputs
 6569       vs_ldpq_post(vs_front(vs2), zetas);
 6570       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6571       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6572                                   vs_front(vs2), vtmp, vq);
 6573       // store interleaved 16 (4x2D) coefficients via offsets
 6574       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6575     }
 6576 
 6577     // level 7
 6578     // At level 7 the coefficients we need to combine with the zetas
  6579   // occur singly with montmul inputs alternating with add/sub
 6580     // inputs. Once again we can use 4-way parallelism to combine 16
 6581     // zetas at a time. However, we have to load 8 adjacent values at
 6582     // 4 different offsets using an ld2 load with arrangement 4S. That
  6583   // interleaves the odd words of each pair into one
 6584     // coefficients vector register and the even words of the pair
 6585     // into the next register. We then need to montmul the 4 even
 6586     // elements of the coefficients register sequence by the zetas in
 6587     // order and then add/sub the 4 odd elements of the coefficients
 6588     // register sequence. We use an equivalent st2 operation to store
 6589     // the results back into memory de-interleaved.
 6590 
 6591     for (int i = 0; i < 1024; i += 128) {
 6592       // reload constants q, qinv each iteration as they get clobbered later
 6593       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6594       // load interleaved 16 (4x4S) coefficients via offsets
 6595       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6596       // load next 16 (4x4S) inputs
 6597       vs_ldpq_post(vs_front(vs2), zetas);
 6598       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6599       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6600                                   vs_front(vs2), vtmp, vq);
 6601       // store interleaved 16 (4x4S) coefficients via offsets
 6602       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6603     }
 6604     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6605     __ mov(r0, zr); // return 0
 6606     __ ret(lr);
 6607 
 6608     return start;
 6609   }
 6610 
 6611   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6612   // in the Java implementation come in sequences of at least 8, so we
 6613   // can use ldpq to collect the corresponding data into pairs of vector
  6614   // registers.
  6615   // We collect the coefficients that correspond to the 'j's into vs1,
  6616   // the coefficients that correspond to the 'j+l's into vs2, then
 6617   // do the additions into vs3 and the subtractions into vs1 then
 6618   // save the result of the additions, load the zetas into vs2
 6619   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6620   // finally save the results back to the coeffs array
 6621   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6622     const Register coeffs, const Register zetas) {
 6623     int c1 = 0;
 6624     int c2 = 32;
 6625     int startIncr;
 6626     int offsets[4];
 6627     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6628     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6629     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6630 
 6631     offsets[0] = 0;
 6632 
 6633     for (int level = 3; level < 8; level++) {
 6634       int c1Start = c1;
 6635       int c2Start = c2;
 6636       if (level == 3) {
 6637         offsets[1] = 64;
 6638         offsets[2] = 128;
 6639         offsets[3] = 192;
 6640       } else if (level == 4) {
 6641         offsets[1] = 32;
 6642         offsets[2] = 128;
 6643         offsets[3] = 160;
 6644       } else {
 6645         offsets[1] = 32;
 6646         offsets[2] = 64;
 6647         offsets[3] = 96;
 6648       }
 6649 
 6650       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6651       // time at 4 different offsets and multiply them in order by the
 6652       // next set of input values. So we employ indexed load and store
 6653       // pair instructions with arrangement 4S.
 6654       for (int i = 0; i < 4; i++) {
 6655         // load v1 32 (8x4S) coefficients relative to first start index
 6656         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6657         // load v2 32 (8x4S) coefficients relative to second start index
 6658         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
  6659         // a0 = v1 + v2 -- n.b. clobbers vq
 6660         vs_addv(vs3, __ T4S, vs1, vs2);
 6661         // a1 = v1 - v2
 6662         vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
 6664         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6665         // load constants q, qinv each iteration as they get clobbered above
 6666         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6667         // load b next 32 (8x4S) inputs
 6668         vs_ldpq_post(vs2, zetas);
 6669         // a = a1 montmul b
 6670         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6671         // save a relative to second start index
 6672         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6673 
 6674         int k = 4 * level + i;
 6675 
 6676         if (k < 24) {
 6677           startIncr = 256;
 6678         } else if (k == 25) {
 6679           startIncr = 384;
 6680         } else {
 6681           startIncr = 128;
 6682         }
 6683 
 6684         c1Start += startIncr;
 6685         c2Start += startIncr;
 6686       }
 6687 
 6688       c2 *= 2;
 6689     }
 6690   }
 6691 
 6692   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6693   // Implements the method
 6694   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6695   // the sun.security.provider.ML_DSA class.
 6696   //
 6697   // coeffs (int[256]) = c_rarg0
 6698   // zetas (int[256]) = c_rarg1
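  //
  // n.b. "almost" refers to the final mod Q division by 2^256 mentioned
  // above being omitted here; the caller is expected to apply it
  // afterwards as a separate Montgomery multiply by a constant (c.f.
  // generate_dilithiumMontMulByConstant below).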
 6699   address generate_dilithiumAlmostInverseNtt() {
 6700 
 6701     __ align(CodeEntryAlignment);
 6702     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6703     StubCodeMark mark(this, stub_id);
 6704     address start = __ pc();
 6705     __ enter();
 6706 
 6707     const Register coeffs = c_rarg0;
 6708     const Register zetas = c_rarg1;
 6709 
 6710     const Register tmpAddr = r9;
 6711     const Register dilithiumConsts = r10;
 6712     const Register result = r11;
 6713     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6714     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6715     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6716     int offsets[4] = { 0, 32, 64, 96 };
 6717     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6718     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6719 
 6720     __ add(result, coeffs, 0);
 6721     __ lea(dilithiumConsts,
 6722              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6723 
 6724     // Each level represents one iteration of the outer for loop of the Java version
 6725 
 6726     // level 0
 6727     // At level 0 we need to interleave adjacent quartets of
 6728     // coefficients before we multiply and add/sub by the next 16
 6729     // zetas just as we did for level 7 in the multiply code. So we
 6730     // load and store the values using an ld2/st2 with arrangement 4S.
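    //
    // As an illustrative sketch (register numbering notional), an ld2 with
    // arrangement 4S de-interleaves eight adjacent ints c0..c7 across two
    // vector registers lane by lane:
    //
    //   va <- { c0, c2, c4, c6 }   // even-indexed coefficients
    //   vb <- { c1, c3, c5, c7 }   // odd-indexed coefficients
    //
    // which is exactly the even/odd pairing the sub/add + montmul step
    // below needs; the matching st2 re-interleaves the results on store.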
 6731     for (int i = 0; i < 1024; i += 128) {
 6732       // load constants q, qinv
 6733       // n.b. this can be moved out of the loop as they do not get
 6734       // clobbered by first two loops
 6735       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6736       // a0/a1 load interleaved 32 (8x4S) coefficients
 6737       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6738       // b load next 32 (8x4S) inputs
 6739       vs_ldpq_post(vs_front(vs2), zetas);
 6740       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6741       // n.b. second half of vs2 provides temporary register storage
 6742       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6743                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6744       // a0/a1 store interleaved 32 (8x4S) coefficients
 6745       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6746     }
 6747 
 6748     // level 1
 6749     // At level 1 we need to interleave pairs of adjacent pairs of
 6750     // coefficients before we multiply by the next 16 zetas just as we
 6751     // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
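    //
    // Illustrative sketch: with arrangement 2D the ld2 de-interleaves at
    // the granularity of 64-bit lanes, i.e. adjacent pairs of ints,
    // roughly
    //
    //   va <- { (c0,c1), (c4,c5) }
    //   vb <- { (c2,c3), (c6,c7) }
    //
    // so each sub/add + montmul step below operates on pairs of
    // coefficients rather than on single ones.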
 6753     for (int i = 0; i < 1024; i += 128) {
 6754       // a0/a1 load interleaved 32 (8x2D) coefficients
 6755       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6756       // b load next 16 (4x4S) inputs
 6757       vs_ldpq_post(vs_front(vs2), zetas);
 6758       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6759       // n.b. second half of vs2 provides temporary register storage
 6760       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6761                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6762       // a0/a1 store interleaved 32 (8x2D) coefficients
 6763       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6764     }
 6765 
 6766     // level 2
 6767     // At level 2 coefficients come in blocks of 4. So, we load 4
 6768     // adjacent coefficients at 8 distinct offsets for both the first
 6769     // and second coefficient sequences, using an ldr with register
 6770     // variant Q then combine them with next set of 32 zetas. Likewise
 6771     // we store the results using an str with register variant Q.
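    //
    // Illustrative note: no lane interleaving is needed at this level, so
    // a plain ldr/str with register variant Q moves 4 adjacent ints at a
    // time and the two coefficient sequences are kept apart purely by the
    // offsets1/offsets2 tables.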
 6772     for (int i = 0; i < 1024; i += 256) {
 6773       // c0 load 32 (8x4S) coefficients via first offsets
 6774       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6775       // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6777       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6778       vs_addv(vs3, __ T4S, vs1, vs2);
 6779       // c = c0 - c1
 6780       vs_subv(vs1, __ T4S, vs1, vs2);
 6781       // store a0 32 (8x4S) coefficients via first offsets
 6782       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6783       // b load 32 (8x4S) next inputs
 6784       vs_ldpq_post(vs2, zetas);
 6785       // reload constants q, qinv -- they were clobbered earlier
 6786       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6787       // compute a1 = b montmul c
 6788       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6789       // store a1 32 (8x4S) coefficients via second offsets
 6790       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6791     }
 6792 
 6793     // level 3-7
 6794     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6795 
 6796     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6797     __ mov(r0, zr); // return 0
 6798     __ ret(lr);
 6799 
 6800     return start;
 6801   }
 6802 
 6803   // Dilithium multiply polynomials in the NTT domain.
 6804   // Straightforward implementation of the method
 6805   // static int implDilithiumNttMult(
  //              int[] result, int[] ntta, int[] nttb) {} of
 6807   // the sun.security.provider.ML_DSA class.
 6808   //
 6809   // result (int[256]) = c_rarg0
 6810   // poly1 (int[256]) = c_rarg1
 6811   // poly2 (int[256]) = c_rarg2
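  //
  // As a rough scalar sketch (hypothetical Java-style pseudocode, not the
  // exact library source), each iteration computes
  //
  //   result[i] = montMul(montMul(ntta[i], nttb[i]), rSquare)
  //
  // where montMul is a Montgomery multiply mod q; the second multiply by
  // the rSquare constant presumably cancels the Montgomery factors so
  // that the stored value is the plain product mod q.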
 6812   address generate_dilithiumNttMult() {
 6813 
    __ align(CodeEntryAlignment);
 6815     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6816     StubCodeMark mark(this, stub_id);
 6817     address start = __ pc();
 6818     __ enter();
 6819 
 6820     Label L_loop;
 6821 
 6822     const Register result = c_rarg0;
 6823     const Register poly1 = c_rarg1;
 6824     const Register poly2 = c_rarg2;
 6825 
 6826     const Register dilithiumConsts = r10;
 6827     const Register len = r11;
 6828 
 6829     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6831     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6832     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6833 
 6834     __ lea(dilithiumConsts,
 6835              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6836 
 6837     // load constants q, qinv
 6838     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6839     // load constant rSquare into v29
 6840     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6841 
 6842     __ mov(len, zr);
 6843     __ add(len, len, 1024);
 6844 
 6845     __ BIND(L_loop);
 6846 
 6847     // b load 32 (8x4S) next inputs from poly1
 6848     vs_ldpq_post(vs1, poly1);
 6849     // c load 32 (8x4S) next inputs from poly2
 6850     vs_ldpq_post(vs2, poly2);
 6851     // compute a = b montmul c
 6852     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6853     // compute a = rsquare montmul a
 6854     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6855     // save a 32 (8x4S) results
 6856     vs_stpq_post(vs2, result);
 6857 
 6858     __ sub(len, len, 128);
 6859     __ cmp(len, (u1)128);
 6860     __ br(Assembler::GE, L_loop);
 6861 
 6862     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6863     __ mov(r0, zr); // return 0
 6864     __ ret(lr);
 6865 
 6866     return start;
 6867   }
 6868 
  // Dilithium Montgomery multiply an array by a constant.
  // A straightforward implementation of the method
  // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
 6873   //
 6874   // coeffs (int[256]) = c_rarg0
 6875   // constant (int) = c_rarg1
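  //
  // As a rough scalar sketch (hypothetical Java-style pseudocode, not the
  // exact library source), each iteration computes
  //
  //   coeffs[i] = montMul(coeffs[i], constant)
  //
  // i.e. a Montgomery multiply of every coefficient by the single caller
  // supplied constant, which is broadcast across a vector register below.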
 6876   address generate_dilithiumMontMulByConstant() {
 6877 
 6878     __ align(CodeEntryAlignment);
 6879     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6880     StubCodeMark mark(this, stub_id);
 6881     address start = __ pc();
 6882     __ enter();
 6883 
 6884     Label L_loop;
 6885 
 6886     const Register coeffs = c_rarg0;
 6887     const Register constant = c_rarg1;
 6888 
 6889     const Register dilithiumConsts = r10;
 6890     const Register result = r11;
 6891     const Register len = r12;
 6892 
 6893     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6894     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6895     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6896     VSeq<8> vconst(29, 0);             // for montmul by constant
 6897 
 6898     // results track inputs
 6899     __ add(result, coeffs, 0);
 6900     __ lea(dilithiumConsts,
 6901              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6902 
 6903     // load constants q, qinv -- they do not get clobbered by first two loops
 6904     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6905     // copy caller supplied constant across vconst
 6906     __ dup(vconst[0], __ T4S, constant);
 6907     __ mov(len, zr);
 6908     __ add(len, len, 1024);
 6909 
 6910     __ BIND(L_loop);
 6911 
 6912     // load next 32 inputs
 6913     vs_ldpq_post(vs2, coeffs);
 6914     // mont mul by constant
 6915     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6916     // write next 32 results
 6917     vs_stpq_post(vs2, result);
 6918 
 6919     __ sub(len, len, 128);
 6920     __ cmp(len, (u1)128);
 6921     __ br(Assembler::GE, L_loop);
 6922 
 6923     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6924     __ mov(r0, zr); // return 0
 6925     __ ret(lr);
 6926 
 6927     return start;
 6928   }
 6929 
 6930   // Dilithium decompose poly.
 6931   // Implements the method
 6932   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6933   // of the sun.security.provider.ML_DSA class
 6934   //
 6935   // input (int[256]) = c_rarg0
 6936   // lowPart (int[256]) = c_rarg1
 6937   // highPart (int[256]) = c_rarg2
 6938   // twoGamma2  (int) = c_rarg3
 6939   // multiplier (int) = c_rarg4
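  //
  // The vector loop below mirrors, statement for statement, a scalar
  // decomposition of each coefficient rplus into a quotient/remainder
  // pair (roughly rplus = quotient * twoGamma2 + r0 with r0 centered
  // around zero, plus the usual ML-DSA boundary adjustment); the
  // Java-style comment ahead of each vector step gives the scalar
  // statement it implements.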
 6940   address generate_dilithiumDecomposePoly() {
 6941 
 6942     __ align(CodeEntryAlignment);
 6943     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 6944     StubCodeMark mark(this, stub_id);
 6945     address start = __ pc();
 6946     Label L_loop;
 6947 
 6948     const Register input = c_rarg0;
 6949     const Register lowPart = c_rarg1;
 6950     const Register highPart = c_rarg2;
 6951     const Register twoGamma2 = c_rarg3;
 6952     const Register multiplier = c_rarg4;
 6953 
 6954     const Register len = r9;
 6955     const Register dilithiumConsts = r10;
 6956     const Register tmp = r11;
 6957 
 6958     // 6 independent sets of 4x4s values
 6959     VSeq<4> vs1(0), vs2(4), vs3(8);
 6960     VSeq<4> vs4(12), vs5(16), vtmp(20);
 6961 
 6962     // 7 constants for cross-multiplying
 6963     VSeq<4> one(25, 0);
 6964     VSeq<4> qminus1(26, 0);
 6965     VSeq<4> g2(27, 0);
 6966     VSeq<4> twog2(28, 0);
 6967     VSeq<4> mult(29, 0);
 6968     VSeq<4> q(30, 0);
 6969     VSeq<4> qadd(31, 0);
 6970 
 6971     __ enter();
 6972 
 6973     __ lea(dilithiumConsts,
 6974              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6975 
 6976     // save callee-saved registers
 6977     __ stpd(v8, v9, __ pre(sp, -64));
 6978     __ stpd(v10, v11, Address(sp, 16));
 6979     __ stpd(v12, v13, Address(sp, 32));
 6980     __ stpd(v14, v15, Address(sp, 48));
 6981 
 6982     // populate constant registers
 6983     __ mov(tmp, zr);
 6984     __ add(tmp, tmp, 1);
 6985     __ dup(one[0], __ T4S, tmp); // 1
 6986     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 6987     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 6988     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 6989     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 6990     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 6991     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 6992 
 6993     __ mov(len, zr);
 6994     __ add(len, len, 1024);
 6995 
 6996     __ BIND(L_loop);
 6997 
 6998     // load next 4x4S inputs interleaved: rplus --> vs1
 6999     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7000 
 7001     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7002     vs_addv(vtmp, __ T4S, vs1, qadd);
 7003     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7004     vs_mulv(vtmp, __ T4S, vtmp, q);
 7005     vs_subv(vs1, __ T4S, vs1, vtmp);
 7006 
 7007     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7008     vs_sshr(vtmp, __ T4S, vs1, 31);
 7009     vs_andr(vtmp, vtmp, q);
 7010     vs_addv(vs1, __ T4S, vs1, vtmp);
 7011 
 7012     // quotient --> vs2
 7013     // int quotient = (rplus * multiplier) >> 22;
 7014     vs_mulv(vtmp, __ T4S, vs1, mult);
 7015     vs_sshr(vs2, __ T4S, vtmp, 22);
 7016 
 7017     // r0 --> vs3
 7018     // int r0 = rplus - quotient * twoGamma2;
 7019     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7020     vs_subv(vs3, __ T4S, vs1, vtmp);
 7021 
 7022     // mask --> vs4
 7023     // int mask = (twoGamma2 - r0) >> 22;
 7024     vs_subv(vtmp, __ T4S, twog2, vs3);
 7025     vs_sshr(vs4, __ T4S, vtmp, 22);
 7026 
 7027     // r0 -= (mask & twoGamma2);
 7028     vs_andr(vtmp, vs4, twog2);
 7029     vs_subv(vs3, __ T4S, vs3, vtmp);
 7030 
 7031     //  quotient += (mask & 1);
 7032     vs_andr(vtmp, vs4, one);
 7033     vs_addv(vs2, __ T4S, vs2, vtmp);
 7034 
 7035     // mask = (twoGamma2 / 2 - r0) >> 31;
 7036     vs_subv(vtmp, __ T4S, g2, vs3);
 7037     vs_sshr(vs4, __ T4S, vtmp, 31);
 7038 
 7039     // r0 -= (mask & twoGamma2);
 7040     vs_andr(vtmp, vs4, twog2);
 7041     vs_subv(vs3, __ T4S, vs3, vtmp);
 7042 
 7043     // quotient += (mask & 1);
 7044     vs_andr(vtmp, vs4, one);
 7045     vs_addv(vs2, __ T4S, vs2, vtmp);
 7046 
 7047     // r1 --> vs5
 7048     // int r1 = rplus - r0 - (dilithium_q - 1);
 7049     vs_subv(vtmp, __ T4S, vs1, vs3);
 7050     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7051 
 7052     // r1 --> vs1 (overwriting rplus)
 7053     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7054     vs_negr(vtmp, __ T4S, vs5);
 7055     vs_orr(vtmp, vs5, vtmp);
 7056     vs_sshr(vs1, __ T4S, vtmp, 31);
 7057 
 7058     // r0 += ~r1;
 7059     vs_notr(vtmp, vs1);
 7060     vs_addv(vs3, __ T4S, vs3, vtmp);
 7061 
 7062     // r1 = r1 & quotient;
 7063     vs_andr(vs1, vs2, vs1);
 7064 
    // store results interleaved
 7066     // lowPart[m] = r0;
 7067     // highPart[m] = r1;
 7068     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7069     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7070 
 7071     __ sub(len, len, 64);
 7072     __ cmp(len, (u1)64);
 7073     __ br(Assembler::GE, L_loop);
 7074 
 7075     // restore callee-saved vector registers
 7076     __ ldpd(v14, v15, Address(sp, 48));
 7077     __ ldpd(v12, v13, Address(sp, 32));
 7078     __ ldpd(v10, v11, Address(sp, 16));
 7079     __ ldpd(v8, v9, __ post(sp, 64));
 7080 
 7081     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7082     __ mov(r0, zr); // return 0
 7083     __ ret(lr);
 7084 
 7085     return start;
 7086   }
 7087 
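  // Performs the chi step of a Keccak round on one row of five lanes held
  // in general purpose registers. For each lane this computes, in effect,
  //
  //   a[i] ^= ~a[i+1] & a[i+2]   // indices taken mod 5
  //
  // using bic (and-not) plus eor, taking care to capture the original
  // values of neighbouring lanes before they are overwritten.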
 7088   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7089              Register tmp0, Register tmp1, Register tmp2) {
 7090     __ bic(tmp0, a2, a1); // for a0
 7091     __ bic(tmp1, a3, a2); // for a1
 7092     __ bic(tmp2, a4, a3); // for a2
 7093     __ eor(a2, a2, tmp2);
 7094     __ bic(tmp2, a0, a4); // for a3
 7095     __ eor(a3, a3, tmp2);
 7096     __ bic(tmp2, a1, a0); // for a4
 7097     __ eor(a0, a0, tmp0);
 7098     __ eor(a1, a1, tmp1);
 7099     __ eor(a4, a4, tmp2);
 7100   }
 7101 
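  // One full Keccak-f[1600] round on a state held entirely in general
  // purpose registers, following the usual round structure
  //
  //   theta -> rho -> pi -> chi -> iota
  //
  // The eor3/rax1 sequence computes the theta column parities c[x] and
  // offsets d[x] = c[x-1] ^ rotl(c[x+1], 1) and folds them into the
  // state; the long chain of rol instructions implements rho and pi
  // combined; bcax5 applies chi row by row; and the final load/eor with
  // the value at rc applies the iota round constant.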
 7102   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7103                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7104                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7105                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7106                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7107                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7108                         Register tmp0, Register tmp1, Register tmp2) {
 7109     __ eor3(tmp1, a4, a9, a14);
 7110     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7111     __ eor3(tmp2, a1, a6, a11);
 7112     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7113     __ rax1(tmp2, tmp0, tmp1); // d0
 7114     {
 7115 
 7116       Register tmp3, tmp4;
 7117       if (can_use_fp && can_use_r18) {
 7118         tmp3 = rfp;
 7119         tmp4 = r18_tls;
 7120       } else {
 7121         tmp3 = a4;
 7122         tmp4 = a9;
 7123         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7124       }
 7125 
 7126       __ eor3(tmp3, a0, a5, a10);
 7127       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7128       __ eor(a0, a0, tmp2);
 7129       __ eor(a5, a5, tmp2);
 7130       __ eor(a10, a10, tmp2);
 7131       __ eor(a15, a15, tmp2);
 7132       __ eor(a20, a20, tmp2); // d0(tmp2)
 7133       __ eor3(tmp3, a2, a7, a12);
 7134       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7135       __ rax1(tmp3, tmp4, tmp2); // d1
 7136       __ eor(a1, a1, tmp3);
 7137       __ eor(a6, a6, tmp3);
 7138       __ eor(a11, a11, tmp3);
 7139       __ eor(a16, a16, tmp3);
 7140       __ eor(a21, a21, tmp3); // d1(tmp3)
 7141       __ rax1(tmp3, tmp2, tmp0); // d3
 7142       __ eor3(tmp2, a3, a8, a13);
 7143       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7144       __ eor(a3, a3, tmp3);
 7145       __ eor(a8, a8, tmp3);
 7146       __ eor(a13, a13, tmp3);
 7147       __ eor(a18, a18, tmp3);
 7148       __ eor(a23, a23, tmp3);
 7149       __ rax1(tmp2, tmp1, tmp0); // d2
 7150       __ eor(a2, a2, tmp2);
 7151       __ eor(a7, a7, tmp2);
 7152       __ eor(a12, a12, tmp2);
 7153       __ rax1(tmp0, tmp0, tmp4); // d4
 7154       if (!can_use_fp || !can_use_r18) {
 7155         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7156       }
 7157       __ eor(a17, a17, tmp2);
 7158       __ eor(a22, a22, tmp2);
 7159       __ eor(a4, a4, tmp0);
 7160       __ eor(a9, a9, tmp0);
 7161       __ eor(a14, a14, tmp0);
 7162       __ eor(a19, a19, tmp0);
 7163       __ eor(a24, a24, tmp0);
 7164     }
 7165 
 7166     __ rol(tmp0, a10, 3);
 7167     __ rol(a10, a1, 1);
 7168     __ rol(a1, a6, 44);
 7169     __ rol(a6, a9, 20);
 7170     __ rol(a9, a22, 61);
 7171     __ rol(a22, a14, 39);
 7172     __ rol(a14, a20, 18);
 7173     __ rol(a20, a2, 62);
 7174     __ rol(a2, a12, 43);
 7175     __ rol(a12, a13, 25);
    __ rol(a13, a19, 8);
 7177     __ rol(a19, a23, 56);
 7178     __ rol(a23, a15, 41);
 7179     __ rol(a15, a4, 27);
 7180     __ rol(a4, a24, 14);
 7181     __ rol(a24, a21, 2);
 7182     __ rol(a21, a8, 55);
 7183     __ rol(a8, a16, 45);
 7184     __ rol(a16, a5, 36);
 7185     __ rol(a5, a3, 28);
 7186     __ rol(a3, a18, 21);
 7187     __ rol(a18, a17, 15);
 7188     __ rol(a17, a11, 10);
 7189     __ rol(a11, a7, 6);
 7190     __ mov(a7, tmp0);
 7191 
 7192     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7193     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7194     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7195     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7196     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7197 
 7198     __ ldr(tmp1, __ post(rc, 8));
 7199     __ eor(a0, a0, tmp1);
 7200 
 7201   }
 7202 
 7203   // Arguments:
 7204   //
 7205   // Inputs:
 7206   //   c_rarg0   - byte[]  source+offset
 7207   //   c_rarg1   - byte[]  SHA.state
 7208   //   c_rarg2   - int     block_size
 7209   //   c_rarg3   - int     offset
 7210   //   c_rarg4   - int     limit
 7211   //
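  // block_size selects the variant; the rate values assumed by the
  // dispatch below are the standard SHA-3/SHAKE rates in bytes:
  //
  //   72  - SHA3-512
  //   104 - SHA3-384
  //   136 - SHA3-256 or SHAKE256
  //   144 - SHA3-224
  //   168 - SHAKE128
  //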
 7212   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7213     bool multi_block;
 7214     switch (stub_id) {
 7215     case StubId::stubgen_sha3_implCompress_id:
 7216       multi_block = false;
 7217       break;
 7218     case StubId::stubgen_sha3_implCompressMB_id:
 7219       multi_block = true;
 7220       break;
 7221     default:
 7222       ShouldNotReachHere();
 7223     }
 7224 
 7225     static const uint64_t round_consts[24] = {
 7226       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7227       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7228       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7229       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7230       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7231       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7232       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7233       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7234     };
 7235 
 7236     __ align(CodeEntryAlignment);
 7237     StubCodeMark mark(this, stub_id);
 7238     address start = __ pc();
 7239 
 7240     Register buf           = c_rarg0;
 7241     Register state         = c_rarg1;
 7242     Register block_size    = c_rarg2;
 7243     Register ofs           = c_rarg3;
 7244     Register limit         = c_rarg4;
 7245 
    // use r3..r17, r19..r28 to keep a0..a24.
 7247     // a0..a24 are respective locals from SHA3.java
 7248     Register a0 = r25,
 7249              a1 = r26,
 7250              a2 = r27,
 7251              a3 = r3,
 7252              a4 = r4,
 7253              a5 = r5,
 7254              a6 = r6,
 7255              a7 = r7,
 7256              a8 = rscratch1, // r8
 7257              a9 = rscratch2, // r9
 7258              a10 = r10,
 7259              a11 = r11,
 7260              a12 = r12,
 7261              a13 = r13,
 7262              a14 = r14,
 7263              a15 = r15,
 7264              a16 = r16,
 7265              a17 = r17,
 7266              a18 = r28,
 7267              a19 = r19,
 7268              a20 = r20,
 7269              a21 = r21,
 7270              a22 = r22,
 7271              a23 = r23,
 7272              a24 = r24;
 7273 
 7274     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7275 
 7276     Label sha3_loop, rounds24_preloop, loop_body;
 7277     Label sha3_512_or_sha3_384, shake128;
 7278 
 7279     bool can_use_r18 = false;
 7280 #ifndef R18_RESERVED
 7281     can_use_r18 = true;
 7282 #endif
 7283     bool can_use_fp = !PreserveFramePointer;
 7284 
 7285     __ enter();
 7286 
 7287     // save almost all yet unsaved gpr registers on stack
 7288     __ str(block_size, __ pre(sp, -128));
 7289     if (multi_block) {
 7290       __ stpw(ofs, limit, Address(sp, 8));
 7291     }
 7292     // 8 bytes at sp+16 will be used to keep buf
 7293     __ stp(r19, r20, Address(sp, 32));
 7294     __ stp(r21, r22, Address(sp, 48));
 7295     __ stp(r23, r24, Address(sp, 64));
 7296     __ stp(r25, r26, Address(sp, 80));
 7297     __ stp(r27, r28, Address(sp, 96));
 7298     if (can_use_r18 && can_use_fp) {
 7299       __ stp(r18_tls, state, Address(sp, 112));
 7300     } else {
 7301       __ str(state, Address(sp, 112));
 7302     }
 7303 
    // begin sha3 calculations: loading a0..a24 from the state array
 7305     __ ldp(a0, a1, state);
 7306     __ ldp(a2, a3, Address(state, 16));
 7307     __ ldp(a4, a5, Address(state, 32));
 7308     __ ldp(a6, a7, Address(state, 48));
 7309     __ ldp(a8, a9, Address(state, 64));
 7310     __ ldp(a10, a11, Address(state, 80));
 7311     __ ldp(a12, a13, Address(state, 96));
 7312     __ ldp(a14, a15, Address(state, 112));
 7313     __ ldp(a16, a17, Address(state, 128));
 7314     __ ldp(a18, a19, Address(state, 144));
 7315     __ ldp(a20, a21, Address(state, 160));
 7316     __ ldp(a22, a23, Address(state, 176));
 7317     __ ldr(a24, Address(state, 192));
 7318 
 7319     __ BIND(sha3_loop);
 7320 
 7321     // load input
 7322     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7323     __ eor(a0, a0, tmp3);
 7324     __ eor(a1, a1, tmp2);
 7325     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7326     __ eor(a2, a2, tmp3);
 7327     __ eor(a3, a3, tmp2);
 7328     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7329     __ eor(a4, a4, tmp3);
 7330     __ eor(a5, a5, tmp2);
 7331     __ ldr(tmp3, __ post(buf, 8));
 7332     __ eor(a6, a6, tmp3);
 7333 
 7334     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7335     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7336 
 7337     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7338     __ eor(a7, a7, tmp3);
 7339     __ eor(a8, a8, tmp2);
 7340     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7341     __ eor(a9, a9, tmp3);
 7342     __ eor(a10, a10, tmp2);
 7343     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7344     __ eor(a11, a11, tmp3);
 7345     __ eor(a12, a12, tmp2);
 7346     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7347     __ eor(a13, a13, tmp3);
 7348     __ eor(a14, a14, tmp2);
 7349     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7350     __ eor(a15, a15, tmp3);
 7351     __ eor(a16, a16, tmp2);
 7352 
 7353     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7354     __ andw(tmp2, block_size, 48);
 7355     __ cbzw(tmp2, rounds24_preloop);
 7356     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
 7358     __ ldr(tmp3, __ post(buf, 8));
 7359     __ eor(a17, a17, tmp3);
 7360     __ b(rounds24_preloop);
 7361 
 7362     __ BIND(shake128);
 7363     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7364     __ eor(a17, a17, tmp3);
 7365     __ eor(a18, a18, tmp2);
 7366     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7367     __ eor(a19, a19, tmp3);
 7368     __ eor(a20, a20, tmp2);
 7369     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7370 
 7371     __ BIND(sha3_512_or_sha3_384);
 7372     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7373     __ eor(a7, a7, tmp3);
 7374     __ eor(a8, a8, tmp2);
 7375     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7376 
 7377     // SHA3-384
 7378     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7379     __ eor(a9, a9, tmp3);
 7380     __ eor(a10, a10, tmp2);
 7381     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7382     __ eor(a11, a11, tmp3);
 7383     __ eor(a12, a12, tmp2);
 7384 
 7385     __ BIND(rounds24_preloop);
 7386     __ fmovs(v0, 24.0); // float loop counter,
 7387     __ fmovs(v1, 1.0);  // exact representation
 7388 
 7389     __ str(buf, Address(sp, 16));
 7390     __ lea(tmp3, ExternalAddress((address) round_consts));
 7391 
 7392     __ BIND(loop_body);
 7393     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7394                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7395                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7396                      tmp0, tmp1, tmp2);
 7397     __ fsubs(v0, v0, v1);
 7398     __ fcmps(v0, 0.0);
 7399     __ br(__ NE, loop_body);
 7400 
 7401     if (multi_block) {
 7402       __ ldrw(block_size, sp); // block_size
 7403       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7404       __ addw(tmp2, tmp2, block_size);
 7405       __ cmpw(tmp2, tmp1);
 7406       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7407       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7408       __ br(Assembler::LE, sha3_loop);
 7409       __ movw(c_rarg0, tmp2); // return offset
 7410     }
 7411     if (can_use_fp && can_use_r18) {
 7412       __ ldp(r18_tls, state, Address(sp, 112));
 7413     } else {
 7414       __ ldr(state, Address(sp, 112));
 7415     }
 7416     // save calculated sha3 state
 7417     __ stp(a0, a1, Address(state));
 7418     __ stp(a2, a3, Address(state, 16));
 7419     __ stp(a4, a5, Address(state, 32));
 7420     __ stp(a6, a7, Address(state, 48));
 7421     __ stp(a8, a9, Address(state, 64));
 7422     __ stp(a10, a11, Address(state, 80));
 7423     __ stp(a12, a13, Address(state, 96));
 7424     __ stp(a14, a15, Address(state, 112));
 7425     __ stp(a16, a17, Address(state, 128));
 7426     __ stp(a18, a19, Address(state, 144));
 7427     __ stp(a20, a21, Address(state, 160));
 7428     __ stp(a22, a23, Address(state, 176));
 7429     __ str(a24, Address(state, 192));
 7430 
 7431     // restore required registers from stack
 7432     __ ldp(r19, r20, Address(sp, 32));
 7433     __ ldp(r21, r22, Address(sp, 48));
 7434     __ ldp(r23, r24, Address(sp, 64));
 7435     __ ldp(r25, r26, Address(sp, 80));
 7436     __ ldp(r27, r28, Address(sp, 96));
 7437     if (can_use_fp && can_use_r18) {
 7438       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7439     } // else no need to recalculate rfp, since it wasn't changed
 7440 
 7441     __ leave();
 7442 
 7443     __ ret(lr);
 7444 
 7445     return start;
 7446   }
 7447 
 7448   /**
 7449    *  Arguments:
 7450    *
 7451    * Inputs:
 7452    *   c_rarg0   - int crc
 7453    *   c_rarg1   - byte* buf
 7454    *   c_rarg2   - int length
 7455    *
 7456    * Output:
   *       r0   - int crc result
 7458    */
 7459   address generate_updateBytesCRC32() {
 7460     assert(UseCRC32Intrinsics, "what are we doing here?");
 7461 
 7462     __ align(CodeEntryAlignment);
 7463     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7464     StubCodeMark mark(this, stub_id);
 7465 
 7466     address start = __ pc();
 7467 
 7468     const Register crc   = c_rarg0;  // crc
 7469     const Register buf   = c_rarg1;  // source java byte array address
 7470     const Register len   = c_rarg2;  // length
 7471     const Register table0 = c_rarg3; // crc_table address
 7472     const Register table1 = c_rarg4;
 7473     const Register table2 = c_rarg5;
 7474     const Register table3 = c_rarg6;
 7475     const Register tmp3 = c_rarg7;
 7476 
 7477     BLOCK_COMMENT("Entry:");
 7478     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7479 
 7480     __ kernel_crc32(crc, buf, len,
 7481               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7482 
 7483     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7484     __ ret(lr);
 7485 
 7486     return start;
 7487   }
 7488 
 7489   /**
 7490    *  Arguments:
 7491    *
 7492    * Inputs:
 7493    *   c_rarg0   - int crc
 7494    *   c_rarg1   - byte* buf
 7495    *   c_rarg2   - int length
 7496    *   c_rarg3   - int* table
 7497    *
 7498    * Output:
 7499    *       r0   - int crc result
 7500    */
 7501   address generate_updateBytesCRC32C() {
 7502     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7503 
 7504     __ align(CodeEntryAlignment);
 7505     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7506     StubCodeMark mark(this, stub_id);
 7507 
 7508     address start = __ pc();
 7509 
 7510     const Register crc   = c_rarg0;  // crc
 7511     const Register buf   = c_rarg1;  // source java byte array address
 7512     const Register len   = c_rarg2;  // length
 7513     const Register table0 = c_rarg3; // crc_table address
 7514     const Register table1 = c_rarg4;
 7515     const Register table2 = c_rarg5;
 7516     const Register table3 = c_rarg6;
 7517     const Register tmp3 = c_rarg7;
 7518 
 7519     BLOCK_COMMENT("Entry:");
 7520     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7521 
 7522     __ kernel_crc32c(crc, buf, len,
 7523               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7524 
 7525     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7526     __ ret(lr);
 7527 
 7528     return start;
 7529   }
 7530 
 7531   /***
 7532    *  Arguments:
 7533    *
 7534    *  Inputs:
 7535    *   c_rarg0   - int   adler
 7536    *   c_rarg1   - byte* buff
 7537    *   c_rarg2   - int   len
 7538    *
 7539    * Output:
 7540    *   c_rarg0   - int adler result
 7541    */
 7542   address generate_updateBytesAdler32() {
 7543     __ align(CodeEntryAlignment);
 7544     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7545     StubCodeMark mark(this, stub_id);
 7546     address start = __ pc();
 7547 
 7548     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7549 
 7550     // Aliases
 7551     Register adler  = c_rarg0;
 7552     Register s1     = c_rarg0;
 7553     Register s2     = c_rarg3;
 7554     Register buff   = c_rarg1;
 7555     Register len    = c_rarg2;
 7556     Register nmax  = r4;
 7557     Register base  = r5;
 7558     Register count = r6;
 7559     Register temp0 = rscratch1;
 7560     Register temp1 = rscratch2;
 7561     FloatRegister vbytes = v0;
 7562     FloatRegister vs1acc = v1;
 7563     FloatRegister vs2acc = v2;
 7564     FloatRegister vtable = v3;
 7565 
 7566     // Max number of bytes we can process before having to take the mod
 7567     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7568     uint64_t BASE = 0xfff1;
 7569     uint64_t NMAX = 0x15B0;
 7570 
 7571     __ mov(base, BASE);
 7572     __ mov(nmax, NMAX);
 7573 
 7574     // Load accumulation coefficients for the upper 16 bits
 7575     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7576     __ ld1(vtable, __ T16B, Address(temp0));
 7577 
 7578     // s1 is initialized to the lower 16 bits of adler
 7579     // s2 is initialized to the upper 16 bits of adler
 7580     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7581     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7582 
 7583     // The pipelined loop needs at least 16 elements for 1 iteration
 7584     // It does check this, but it is more effective to skip to the cleanup loop
 7585     __ cmp(len, (u1)16);
 7586     __ br(Assembler::HS, L_nmax);
 7587     __ cbz(len, L_combine);
 7588 
 7589     __ bind(L_simple_by1_loop);
 7590     __ ldrb(temp0, Address(__ post(buff, 1)));
 7591     __ add(s1, s1, temp0);
 7592     __ add(s2, s2, s1);
 7593     __ subs(len, len, 1);
 7594     __ br(Assembler::HI, L_simple_by1_loop);
 7595 
 7596     // s1 = s1 % BASE
 7597     __ subs(temp0, s1, base);
 7598     __ csel(s1, temp0, s1, Assembler::HS);
 7599 
 7600     // s2 = s2 % BASE
 7601     __ lsr(temp0, s2, 16);
 7602     __ lsl(temp1, temp0, 4);
 7603     __ sub(temp1, temp1, temp0);
 7604     __ add(s2, temp1, s2, ext::uxth);
 7605 
 7606     __ subs(temp0, s2, base);
 7607     __ csel(s2, temp0, s2, Assembler::HS);
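    // n.b. the s2 reduction above relies on 2^16 mod BASE == 15 (65536 -
    // 65521), so a value x can be partially folded as
    //
    //   x == (x >> 16) * 2^16 + (x & 0xffff)
    //     == (x >> 16) * 15   + (x & 0xffff)   (mod BASE)
    //
    // e.g. x = 0x12345 folds to 15 + 0x2345 = 0x2354 (= 9044 = 0x12345 %
    // BASE); the conditional subtract then finishes the job when the
    // folded value is still >= BASE. When the accumulator can be much
    // larger the fold is applied twice, as in the NMAX loop below.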
 7608 
 7609     __ b(L_combine);
 7610 
 7611     __ bind(L_nmax);
 7612     __ subs(len, len, nmax);
 7613     __ sub(count, nmax, 16);
 7614     __ br(Assembler::LO, L_by16);
 7615 
 7616     __ bind(L_nmax_loop);
 7617 
 7618     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7619                                       vbytes, vs1acc, vs2acc, vtable);
 7620 
 7621     __ subs(count, count, 16);
 7622     __ br(Assembler::HS, L_nmax_loop);
 7623 
 7624     // s1 = s1 % BASE
 7625     __ lsr(temp0, s1, 16);
 7626     __ lsl(temp1, temp0, 4);
 7627     __ sub(temp1, temp1, temp0);
 7628     __ add(temp1, temp1, s1, ext::uxth);
 7629 
 7630     __ lsr(temp0, temp1, 16);
 7631     __ lsl(s1, temp0, 4);
 7632     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7634 
 7635     __ subs(temp0, s1, base);
 7636     __ csel(s1, temp0, s1, Assembler::HS);
 7637 
 7638     // s2 = s2 % BASE
 7639     __ lsr(temp0, s2, 16);
 7640     __ lsl(temp1, temp0, 4);
 7641     __ sub(temp1, temp1, temp0);
 7642     __ add(temp1, temp1, s2, ext::uxth);
 7643 
 7644     __ lsr(temp0, temp1, 16);
 7645     __ lsl(s2, temp0, 4);
 7646     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7648 
 7649     __ subs(temp0, s2, base);
 7650     __ csel(s2, temp0, s2, Assembler::HS);
 7651 
 7652     __ subs(len, len, nmax);
 7653     __ sub(count, nmax, 16);
 7654     __ br(Assembler::HS, L_nmax_loop);
 7655 
 7656     __ bind(L_by16);
 7657     __ adds(len, len, count);
 7658     __ br(Assembler::LO, L_by1);
 7659 
 7660     __ bind(L_by16_loop);
 7661 
 7662     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7663                                       vbytes, vs1acc, vs2acc, vtable);
 7664 
 7665     __ subs(len, len, 16);
 7666     __ br(Assembler::HS, L_by16_loop);
 7667 
 7668     __ bind(L_by1);
 7669     __ adds(len, len, 15);
 7670     __ br(Assembler::LO, L_do_mod);
 7671 
 7672     __ bind(L_by1_loop);
 7673     __ ldrb(temp0, Address(__ post(buff, 1)));
 7674     __ add(s1, temp0, s1);
 7675     __ add(s2, s2, s1);
 7676     __ subs(len, len, 1);
 7677     __ br(Assembler::HS, L_by1_loop);
 7678 
 7679     __ bind(L_do_mod);
 7680     // s1 = s1 % BASE
 7681     __ lsr(temp0, s1, 16);
 7682     __ lsl(temp1, temp0, 4);
 7683     __ sub(temp1, temp1, temp0);
 7684     __ add(temp1, temp1, s1, ext::uxth);
 7685 
 7686     __ lsr(temp0, temp1, 16);
 7687     __ lsl(s1, temp0, 4);
 7688     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
 7690 
 7691     __ subs(temp0, s1, base);
 7692     __ csel(s1, temp0, s1, Assembler::HS);
 7693 
 7694     // s2 = s2 % BASE
 7695     __ lsr(temp0, s2, 16);
 7696     __ lsl(temp1, temp0, 4);
 7697     __ sub(temp1, temp1, temp0);
 7698     __ add(temp1, temp1, s2, ext::uxth);
 7699 
 7700     __ lsr(temp0, temp1, 16);
 7701     __ lsl(s2, temp0, 4);
 7702     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
 7704 
 7705     __ subs(temp0, s2, base);
 7706     __ csel(s2, temp0, s2, Assembler::HS);
 7707 
 7708     // Combine lower bits and higher bits
 7709     __ bind(L_combine);
 7710     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7711 
 7712     __ ret(lr);
 7713 
 7714     return start;
 7715   }
 7716 
 7717   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7718           Register temp0, Register temp1, FloatRegister vbytes,
 7719           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7720     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7721     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7722     // In non-vectorized code, we update s1 and s2 as:
 7723     //   s1 <- s1 + b1
 7724     //   s2 <- s2 + s1
 7725     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
 7727     //   ...
 7728     //   s1 <- s1 + b16
 7729     //   s2 <- s2 + s1
 7730     // Putting above assignments together, we have:
 7731     //   s1_new = s1 + b1 + b2 + ... + b16
 7732     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7733     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7734     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7735     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7736 
 7737     // s2 = s2 + s1 * 16
 7738     __ add(s2, s2, s1, Assembler::LSL, 4);
 7739 
 7740     // vs1acc = b1 + b2 + b3 + ... + b16
 7741     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7742     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7743     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7744     __ uaddlv(vs1acc, __ T16B, vbytes);
 7745     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7746 
 7747     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7748     __ fmovd(temp0, vs1acc);
 7749     __ fmovd(temp1, vs2acc);
 7750     __ add(s1, s1, temp0);
 7751     __ add(s2, s2, temp1);
 7752   }
 7753 
 7754   /**
 7755    *  Arguments:
 7756    *
 7757    *  Input:
 7758    *    c_rarg0   - x address
 7759    *    c_rarg1   - x length
 7760    *    c_rarg2   - y address
 7761    *    c_rarg3   - y length
 7762    *    c_rarg4   - z address
 7763    */
 7764   address generate_multiplyToLen() {
 7765     __ align(CodeEntryAlignment);
 7766     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7767     StubCodeMark mark(this, stub_id);
 7768 
 7769     address start = __ pc();
 7770  
 7771     if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 7772       return start;
 7773     }
 7774     const Register x     = r0;
 7775     const Register xlen  = r1;
 7776     const Register y     = r2;
 7777     const Register ylen  = r3;
 7778     const Register z     = r4;
 7779 
 7780     const Register tmp0  = r5;
 7781     const Register tmp1  = r10;
 7782     const Register tmp2  = r11;
 7783     const Register tmp3  = r12;
 7784     const Register tmp4  = r13;
 7785     const Register tmp5  = r14;
 7786     const Register tmp6  = r15;
 7787     const Register tmp7  = r16;
 7788 
 7789     BLOCK_COMMENT("Entry:");
 7790     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7791     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7792     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7793     __ ret(lr);
 7794 
 7795     AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 7796     return start;
 7797   }
 7798 
 7799   address generate_squareToLen() {
 7800     // squareToLen algorithm for sizes 1..127 described in java code works
 7801     // faster than multiply_to_len on some CPUs and slower on others, but
 7802     // multiply_to_len shows a bit better overall results
 7803     __ align(CodeEntryAlignment);
 7804     StubId stub_id = StubId::stubgen_squareToLen_id;
 7805     StubCodeMark mark(this, stub_id);
 7806     address start = __ pc();
 7807 
 7808     if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 7809       return start;
 7810     }
 7811     const Register x     = r0;
 7812     const Register xlen  = r1;
 7813     const Register z     = r2;
 7814     const Register y     = r4; // == x
 7815     const Register ylen  = r5; // == xlen
 7816 
 7817     const Register tmp0  = r3;
 7818     const Register tmp1  = r10;
 7819     const Register tmp2  = r11;
 7820     const Register tmp3  = r12;
 7821     const Register tmp4  = r13;
 7822     const Register tmp5  = r14;
 7823     const Register tmp6  = r15;
 7824     const Register tmp7  = r16;
 7825 
 7826     RegSet spilled_regs = RegSet::of(y, ylen);
 7827     BLOCK_COMMENT("Entry:");
 7828     __ enter();
 7829     __ push(spilled_regs, sp);
 7830     __ mov(y, x);
 7831     __ mov(ylen, xlen);
 7832     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7833     __ pop(spilled_regs, sp);
 7834     __ leave();
 7835     __ ret(lr);
 7836 
 7837     AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 7838     return start;
 7839   }
 7840 
 7841   address generate_mulAdd() {
 7842     __ align(CodeEntryAlignment);
 7843     StubId stub_id = StubId::stubgen_mulAdd_id;
 7844     StubCodeMark mark(this, stub_id);
 7845 
 7846     address start = __ pc();
 7847 
 7848     if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 7849       return start;
 7850     }
 7851     const Register out     = r0;
 7852     const Register in      = r1;
 7853     const Register offset  = r2;
 7854     const Register len     = r3;
 7855     const Register k       = r4;
 7856 
 7857     BLOCK_COMMENT("Entry:");
 7858     __ enter();
 7859     __ mul_add(out, in, offset, len, k);
 7860     __ leave();
 7861     __ ret(lr);
 7862 
 7863     AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 7864     return start;
 7865   }
 7866 
 7867   // Arguments:
 7868   //
 7869   // Input:
 7870   //   c_rarg0   - newArr address
 7871   //   c_rarg1   - oldArr address
 7872   //   c_rarg2   - newIdx
 7873   //   c_rarg3   - shiftCount
 7874   //   c_rarg4   - numIter
 7875   //
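  // As a rough scalar sketch (hypothetical Java-style pseudocode, not the
  // exact library source), the shift being vectorized is approximately
  //
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
  //                        | (oldArr[i]     <<  (32 - shiftCount));
  //   }
  //
  // i.e. each output word combines bits from two adjacent source words;
  // boundary handling of the end words is elided here.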
 7876   address generate_bigIntegerRightShift() {
 7877     __ align(CodeEntryAlignment);
 7878     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7879     StubCodeMark mark(this, stub_id);
 7880     address start = __ pc();
 7881 
 7882     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7883 
 7884     Register newArr        = c_rarg0;
 7885     Register oldArr        = c_rarg1;
 7886     Register newIdx        = c_rarg2;
 7887     Register shiftCount    = c_rarg3;
 7888     Register numIter       = c_rarg4;
 7889     Register idx           = numIter;
 7890 
 7891     Register newArrCur     = rscratch1;
 7892     Register shiftRevCount = rscratch2;
 7893     Register oldArrCur     = r13;
 7894     Register oldArrNext    = r14;
 7895 
 7896     FloatRegister oldElem0        = v0;
 7897     FloatRegister oldElem1        = v1;
 7898     FloatRegister newElem         = v2;
 7899     FloatRegister shiftVCount     = v3;
 7900     FloatRegister shiftVRevCount  = v4;
 7901 
 7902     __ cbz(idx, Exit);
 7903 
 7904     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7905 
 7906     // left shift count
 7907     __ movw(shiftRevCount, 32);
 7908     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7909 
    // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7911     __ cmp(numIter, (u1)4);
 7912     __ br(Assembler::LT, ShiftThree);
 7913 
 7914     __ dup(shiftVCount,    __ T4S, shiftCount);
 7915     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7916     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7917 
 7918     __ BIND(ShiftSIMDLoop);
 7919 
 7920     // Calculate the load addresses
 7921     __ sub(idx, idx, 4);
 7922     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7923     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7924     __ add(oldArrCur,  oldArrNext, 4);
 7925 
 7926     // Load 4 words and process
 7927     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7928     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7929     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7930     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7931     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7932     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7933 
 7934     __ cmp(idx, (u1)4);
 7935     __ br(Assembler::LT, ShiftTwoLoop);
 7936     __ b(ShiftSIMDLoop);
 7937 
 7938     __ BIND(ShiftTwoLoop);
 7939     __ cbz(idx, Exit);
 7940     __ cmp(idx, (u1)1);
 7941     __ br(Assembler::EQ, ShiftOne);
 7942 
 7943     // Calculate the load addresses
 7944     __ sub(idx, idx, 2);
 7945     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7946     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7947     __ add(oldArrCur,  oldArrNext, 4);
 7948 
 7949     // Load 2 words and process
 7950     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 7951     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 7952     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 7953     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 7954     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 7955     __ st1(newElem,   __ T2S, Address(newArrCur));
 7956     __ b(ShiftTwoLoop);
 7957 
 7958     __ BIND(ShiftThree);
 7959     __ tbz(idx, 1, ShiftOne);
 7960     __ tbz(idx, 0, ShiftTwo);
 7961     __ ldrw(r10,  Address(oldArr, 12));
 7962     __ ldrw(r11,  Address(oldArr, 8));
 7963     __ lsrvw(r10, r10, shiftCount);
 7964     __ lslvw(r11, r11, shiftRevCount);
 7965     __ orrw(r12,  r10, r11);
 7966     __ strw(r12,  Address(newArr, 8));
 7967 
 7968     __ BIND(ShiftTwo);
 7969     __ ldrw(r10,  Address(oldArr, 8));
 7970     __ ldrw(r11,  Address(oldArr, 4));
 7971     __ lsrvw(r10, r10, shiftCount);
 7972     __ lslvw(r11, r11, shiftRevCount);
 7973     __ orrw(r12,  r10, r11);
 7974     __ strw(r12,  Address(newArr, 4));
 7975 
 7976     __ BIND(ShiftOne);
 7977     __ ldrw(r10,  Address(oldArr, 4));
 7978     __ ldrw(r11,  Address(oldArr));
 7979     __ lsrvw(r10, r10, shiftCount);
 7980     __ lslvw(r11, r11, shiftRevCount);
 7981     __ orrw(r12,  r10, r11);
 7982     __ strw(r12,  Address(newArr));
 7983 
 7984     __ BIND(Exit);
 7985     __ ret(lr);
 7986 
 7987     return start;
 7988   }
 7989 
 7990   // Arguments:
 7991   //
 7992   // Input:
 7993   //   c_rarg0   - newArr address
 7994   //   c_rarg1   - oldArr address
 7995   //   c_rarg2   - newIdx
 7996   //   c_rarg3   - shiftCount
 7997   //   c_rarg4   - numIter
 7998   //
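  // As a rough scalar sketch (hypothetical Java-style pseudocode, not the
  // exact library source), the shift being vectorized is approximately
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
  //                        | (oldArr[i + 1] >>> (32 - shiftCount));
  //   }
  //
  // the mirror image of the right-shift worker above, again with boundary
  // handling of the end words elided.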
 7999   address generate_bigIntegerLeftShift() {
 8000     __ align(CodeEntryAlignment);
 8001     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8002     StubCodeMark mark(this, stub_id);
 8003     address start = __ pc();
 8004 
 8005     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8006 
 8007     Register newArr        = c_rarg0;
 8008     Register oldArr        = c_rarg1;
 8009     Register newIdx        = c_rarg2;
 8010     Register shiftCount    = c_rarg3;
 8011     Register numIter       = c_rarg4;
 8012 
 8013     Register shiftRevCount = rscratch1;
 8014     Register oldArrNext    = rscratch2;
 8015 
 8016     FloatRegister oldElem0        = v0;
 8017     FloatRegister oldElem1        = v1;
 8018     FloatRegister newElem         = v2;
 8019     FloatRegister shiftVCount     = v3;
 8020     FloatRegister shiftVRevCount  = v4;
 8021 
 8022     __ cbz(numIter, Exit);
 8023 
 8024     __ add(oldArrNext, oldArr, 4);
 8025     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8026 
 8027     // right shift count
 8028     __ movw(shiftRevCount, 32);
 8029     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8030 
    // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 8032     __ cmp(numIter, (u1)4);
 8033     __ br(Assembler::LT, ShiftThree);
 8034 
 8035     __ dup(shiftVCount,     __ T4S, shiftCount);
 8036     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8037     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8038 
 8039     __ BIND(ShiftSIMDLoop);
 8040 
 8041     // load 4 words and process
 8042     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8043     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8044     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8045     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8046     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8047     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8048     __ sub(numIter,   numIter, 4);
 8049 
 8050     __ cmp(numIter, (u1)4);
 8051     __ br(Assembler::LT, ShiftTwoLoop);
 8052     __ b(ShiftSIMDLoop);
 8053 
 8054     __ BIND(ShiftTwoLoop);
 8055     __ cbz(numIter, Exit);
 8056     __ cmp(numIter, (u1)1);
 8057     __ br(Assembler::EQ, ShiftOne);
 8058 
 8059     // load 2 words and process
 8060     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8061     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8062     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8063     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8064     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8065     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8066     __ sub(numIter,   numIter, 2);
 8067     __ b(ShiftTwoLoop);
 8068 
 8069     __ BIND(ShiftThree);
 8070     __ ldrw(r10,  __ post(oldArr, 4));
 8071     __ ldrw(r11,  __ post(oldArrNext, 4));
 8072     __ lslvw(r10, r10, shiftCount);
 8073     __ lsrvw(r11, r11, shiftRevCount);
 8074     __ orrw(r12,  r10, r11);
 8075     __ strw(r12,  __ post(newArr, 4));
 8076     __ tbz(numIter, 1, Exit);
 8077     __ tbz(numIter, 0, ShiftOne);
 8078 
 8079     __ BIND(ShiftTwo);
 8080     __ ldrw(r10,  __ post(oldArr, 4));
 8081     __ ldrw(r11,  __ post(oldArrNext, 4));
 8082     __ lslvw(r10, r10, shiftCount);
 8083     __ lsrvw(r11, r11, shiftRevCount);
 8084     __ orrw(r12,  r10, r11);
 8085     __ strw(r12,  __ post(newArr, 4));
 8086 
 8087     __ BIND(ShiftOne);
 8088     __ ldrw(r10,  Address(oldArr));
 8089     __ ldrw(r11,  Address(oldArrNext));
 8090     __ lslvw(r10, r10, shiftCount);
 8091     __ lsrvw(r11, r11, shiftRevCount);
 8092     __ orrw(r12,  r10, r11);
 8093     __ strw(r12,  Address(newArr));
 8094 
 8095     __ BIND(Exit);
 8096     __ ret(lr);
 8097 
 8098     return start;
 8099   }
 8100 
 8101   address generate_count_positives(address &count_positives_long) {
 8102     const u1 large_loop_size = 64;
 8103     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8104     int dcache_line = VM_Version::dcache_line_size();
 8105 
 8106     Register ary1 = r1, len = r2, result = r0;
 8107 
 8108     __ align(CodeEntryAlignment);
 8109 
 8110     StubId stub_id = StubId::stubgen_count_positives_id;
 8111     StubCodeMark mark(this, stub_id);
 8112 
 8113     address entry = __ pc();
 8114 
 8115     __ enter();
 8116     // precondition: a copy of len is already in result
 8117     // __ mov(result, len);
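    // n.b. the intrinsic contract, as assumed here, is that result holds
    // the number of leading bytes of ary1 whose top bit is clear (len
    // when every byte qualifies), and that a conservative under-estimate
    // is permitted: the short-length paths below simply select 0 into
    // result as soon as any byte with the top bit set is seen, while the
    // RET_ADJUST* paths correct result when such a byte turns up part-way
    // through an 8/16/64 byte chunk.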
 8118 
 8119   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8120         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8121 
 8122   __ cmp(len, (u1)15);
 8123   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
 8126   __ add(ary1, ary1, len);
 8127   __ subs(len, len, 8);
 8128   __ br(Assembler::GT, LEN_OVER_8);
 8129   __ ldr(rscratch2, Address(ary1, -8));
 8130   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8131   __ lsrv(rscratch2, rscratch2, rscratch1);
 8132   __ tst(rscratch2, UPPER_BIT_MASK);
 8133   __ csel(result, zr, result, Assembler::NE);
 8134   __ leave();
 8135   __ ret(lr);
 8136   __ bind(LEN_OVER_8);
 8137   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 8138   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 8139   __ tst(rscratch2, UPPER_BIT_MASK);
 8140   __ br(Assembler::NE, RET_NO_POP);
 8141   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8142   __ lsrv(rscratch1, rscratch1, rscratch2);
 8143   __ tst(rscratch1, UPPER_BIT_MASK);
 8144   __ bind(RET_NO_POP);
 8145   __ csel(result, zr, result, Assembler::NE);
 8146   __ leave();
 8147   __ ret(lr);
 8148 
 8149   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8150   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8151 
 8152   count_positives_long = __ pc(); // 2nd entry point
 8153 
 8154   __ enter();
 8155 
 8156   __ bind(LEN_OVER_15);
 8157     __ push(spilled_regs, sp);
 8158     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8159     __ cbz(rscratch2, ALIGNED);
 8160     __ ldp(tmp6, tmp1, Address(ary1));
 8161     __ mov(tmp5, 16);
 8162     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 8163     __ add(ary1, ary1, rscratch1);
 8164     __ orr(tmp6, tmp6, tmp1);
 8165     __ tst(tmp6, UPPER_BIT_MASK);
 8166     __ br(Assembler::NE, RET_ADJUST);
 8167     __ sub(len, len, rscratch1);
 8168 
 8169   __ bind(ALIGNED);
 8170     __ cmp(len, large_loop_size);
 8171     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return to handle the
    // situation where an initially aligned large array has negative values at
    // its starting bytes, in which case LARGE_LOOP would do 4 reads instead of
    // 1 (in the worst case), which is slower. Cases with negative bytes further
    // ahead won't be affected much; in fact they'll be faster due to the early
    // loads and the fewer instructions and branches in LARGE_LOOP.
 8178     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8179     __ sub(len, len, 16);
 8180     __ orr(tmp6, tmp6, tmp1);
 8181     __ tst(tmp6, UPPER_BIT_MASK);
 8182     __ br(Assembler::NE, RET_ADJUST_16);
 8183     __ cmp(len, large_loop_size);
 8184     __ br(Assembler::LT, CHECK_16);
 8185 
 8186     if (SoftwarePrefetchHintDistance >= 0
 8187         && SoftwarePrefetchHintDistance >= dcache_line) {
 8188       // initial prefetch
 8189       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8190     }
 8191   __ bind(LARGE_LOOP);
 8192     if (SoftwarePrefetchHintDistance >= 0) {
 8193       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8194     }
 8195     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
 8196     // Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one per ldp)
 8197     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
 8198     // saves 3 instructions per iteration and has fewer branches; however, this
 8199     // approach disables the early return, so all 64 bytes are always loaded and checked.
 8200     __ ldp(tmp2, tmp3, Address(ary1));
 8201     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8202     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8203     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8204     __ add(ary1, ary1, large_loop_size);
 8205     __ sub(len, len, large_loop_size);
 8206     __ orr(tmp2, tmp2, tmp3);
 8207     __ orr(tmp4, tmp4, tmp5);
 8208     __ orr(rscratch1, rscratch1, rscratch2);
 8209     __ orr(tmp6, tmp6, tmp1);
 8210     __ orr(tmp2, tmp2, tmp4);
 8211     __ orr(rscratch1, rscratch1, tmp6);
 8212     __ orr(tmp2, tmp2, rscratch1);
 8213     __ tst(tmp2, UPPER_BIT_MASK);
 8214     __ br(Assembler::NE, RET_ADJUST_LONG);
 8215     __ cmp(len, large_loop_size);
 8216     __ br(Assembler::GE, LARGE_LOOP);
 8217 
 8218   __ bind(CHECK_16); // small 16-byte load pre-loop
 8219     __ cmp(len, (u1)16);
 8220     __ br(Assembler::LT, POST_LOOP16);
 8221 
 8222   __ bind(LOOP16); // small 16-byte load loop
 8223     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8224     __ sub(len, len, 16);
 8225     __ orr(tmp2, tmp2, tmp3);
 8226     __ tst(tmp2, UPPER_BIT_MASK);
 8227     __ br(Assembler::NE, RET_ADJUST_16);
 8228     __ cmp(len, (u1)16);
 8229     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8230 
 8231   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8232     __ cmp(len, (u1)8);
 8233     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8234     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8235     __ tst(tmp3, UPPER_BIT_MASK);
 8236     __ br(Assembler::NE, RET_ADJUST);
 8237     __ sub(len, len, 8);
 8238 
 8239   __ bind(POST_LOOP16_LOAD_TAIL);
 8240     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8241     __ ldr(tmp1, Address(ary1));
 8242     __ mov(tmp2, 64);
 8243     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8244     __ lslv(tmp1, tmp1, tmp4);
 8245     __ tst(tmp1, UPPER_BIT_MASK);
 8246     __ br(Assembler::NE, RET_ADJUST);
 8247     // Fallthrough
 8248 
 8249   __ bind(RET_LEN);
 8250     __ pop(spilled_regs, sp);
 8251     __ leave();
 8252     __ ret(lr);
 8253 
 8254     // The difference result - len is the count of bytes that are
 8255     // guaranteed to be positive.
 8256 
 8257   __ bind(RET_ADJUST_LONG);
 8258     __ add(len, len, (u1)(large_loop_size - 16));
 8259   __ bind(RET_ADJUST_16);
 8260     __ add(len, len, 16);
 8261   __ bind(RET_ADJUST);
 8262     __ pop(spilled_regs, sp);
 8263     __ leave();
 8264     __ sub(result, result, len);
 8265     __ ret(lr);
 8266 
 8267     return entry;
 8268   }
 8269 
 8270   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8271         bool usePrefetch, Label &NOT_EQUAL) {
 8272     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8273         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8274         tmp7 = r12, tmp8 = r13;
 8275     Label LOOP;
 8276 
 8277     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8278     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8279     __ bind(LOOP);
 8280     if (usePrefetch) {
 8281       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8282       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8283     }
 8284     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8285     __ eor(tmp1, tmp1, tmp2);
 8286     __ eor(tmp3, tmp3, tmp4);
 8287     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8288     __ orr(tmp1, tmp1, tmp3);
 8289     __ cbnz(tmp1, NOT_EQUAL);
 8290     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8291     __ eor(tmp5, tmp5, tmp6);
 8292     __ eor(tmp7, tmp7, tmp8);
 8293     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8294     __ orr(tmp5, tmp5, tmp7);
 8295     __ cbnz(tmp5, NOT_EQUAL);
 8296     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8297     __ eor(tmp1, tmp1, tmp2);
 8298     __ eor(tmp3, tmp3, tmp4);
 8299     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8300     __ orr(tmp1, tmp1, tmp3);
 8301     __ cbnz(tmp1, NOT_EQUAL);
 8302     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8303     __ eor(tmp5, tmp5, tmp6);
 8304     __ sub(cnt1, cnt1, 8 * wordSize);
 8305     __ eor(tmp7, tmp7, tmp8);
 8306     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8307     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 8308     // cmp) because subs allows an unlimited range of immediate operands.
 8309     __ subs(tmp6, cnt1, loopThreshold);
 8310     __ orr(tmp5, tmp5, tmp7);
 8311     __ cbnz(tmp5, NOT_EQUAL);
 8312     __ br(__ GE, LOOP);
 8313     // post-loop
 8314     __ eor(tmp1, tmp1, tmp2);
 8315     __ eor(tmp3, tmp3, tmp4);
 8316     __ orr(tmp1, tmp1, tmp3);
 8317     __ sub(cnt1, cnt1, 2 * wordSize);
 8318     __ cbnz(tmp1, NOT_EQUAL);
 8319   }
 8320 
 8321   void generate_large_array_equals_loop_simd(int loopThreshold,
 8322         bool usePrefetch, Label &NOT_EQUAL) {
 8323     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8324         tmp2 = rscratch2;
 8325     Label LOOP;
 8326 
 8327     __ bind(LOOP);
 8328     if (usePrefetch) {
 8329       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8330       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8331     }
 8332     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8333     __ sub(cnt1, cnt1, 8 * wordSize);
 8334     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8335     __ subs(tmp1, cnt1, loopThreshold);
 8336     __ eor(v0, __ T16B, v0, v4);
 8337     __ eor(v1, __ T16B, v1, v5);
 8338     __ eor(v2, __ T16B, v2, v6);
 8339     __ eor(v3, __ T16B, v3, v7);
 8340     __ orr(v0, __ T16B, v0, v1);
 8341     __ orr(v1, __ T16B, v2, v3);
 8342     __ orr(v0, __ T16B, v0, v1);
 8343     __ umov(tmp1, v0, __ D, 0);
 8344     __ umov(tmp2, v0, __ D, 1);
 8345     __ orr(tmp1, tmp1, tmp2);
 8346     __ cbnz(tmp1, NOT_EQUAL);
 8347     __ br(__ GE, LOOP);
 8348   }
 8349 
 8350   // a1 = r1 - array1 address
 8351   // a2 = r2 - array2 address
 8352   // result = r0 - return value. Already contains "false"
 8353   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8354   // r3-r5 are reserved temporary registers
 8355   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
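        // In effect (a rough scalar sketch of the stub's job, not the generated
        // code; it assumes cnt1 counts bytes, the first word is handled by the
        // caller, and the tail is covered by an overlapping final load):
        //
        //   bool large_array_equals(const uint8_t* a1, const uint8_t* a2, size_t cnt1) {
        //     for (size_t i = 0; i < cnt1; i += wordSize) {
        //       if (*(const uint64_t*)(a1 + i) != *(const uint64_t*)(a2 + i)) return false;
        //     }
        //     return true;
        //   }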
 8356   address generate_large_array_equals() {
 8357     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8358         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8359         tmp7 = r12, tmp8 = r13;
 8360     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8361         SMALL_LOOP, POST_LOOP;
 8362     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 8363     // loop threshold chosen so that at least 32 of the prefetched bytes are used
 8364     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8365     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8366     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8367     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8368         tmp5, tmp6, tmp7, tmp8);
 8369 
 8370     __ align(CodeEntryAlignment);
 8371 
 8372     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8373     StubCodeMark mark(this, stub_id);
 8374 
 8375     address entry = __ pc();
 8376     __ enter();
 8377     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8378     // also advance pointers to use post-increment instead of pre-increment
 8379     __ add(a1, a1, wordSize);
 8380     __ add(a2, a2, wordSize);
 8381     if (AvoidUnalignedAccesses) {
 8382       // Both implementations (SIMD/non-SIMD) use relatively large load instructions
 8383       // (ld1/ldp), which incur a huge penalty (up to 2x execution time) on some
 8384       // CPUs when the address is not at least 16-byte aligned. Arrays are
 8385       // currently 8-byte aligned, so we can do one extra 8-byte load if needed to
 8386       // make at least the first address 16-byte aligned.
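            // e.g. if bit 3 of a1 is set, the address is 8 mod 16, so consuming
            // one 8-byte word below brings it to a 16-byte boundary.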
 8387       Label ALIGNED16;
 8388       __ tbz(a1, 3, ALIGNED16);
 8389       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8390       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8391       __ sub(cnt1, cnt1, wordSize);
 8392       __ eor(tmp1, tmp1, tmp2);
 8393       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8394       __ bind(ALIGNED16);
 8395     }
 8396     if (UseSIMDForArrayEquals) {
 8397       if (SoftwarePrefetchHintDistance >= 0) {
 8398         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8399         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8400         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8401             /* prfm = */ true, NOT_EQUAL);
 8402         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8403         __ br(__ LT, TAIL);
 8404       }
 8405       __ bind(NO_PREFETCH_LARGE_LOOP);
 8406       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8407           /* prfm = */ false, NOT_EQUAL);
 8408     } else {
 8409       __ push(spilled_regs, sp);
 8410       if (SoftwarePrefetchHintDistance >= 0) {
 8411         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8412         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8413         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8414             /* prfm = */ true, NOT_EQUAL);
 8415         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8416         __ br(__ LT, TAIL);
 8417       }
 8418       __ bind(NO_PREFETCH_LARGE_LOOP);
 8419       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8420           /* prfm = */ false, NOT_EQUAL);
 8421     }
 8422     __ bind(TAIL);
 8423       __ cbz(cnt1, EQUAL);
 8424       __ subs(cnt1, cnt1, wordSize);
 8425       __ br(__ LE, POST_LOOP);
 8426     __ bind(SMALL_LOOP);
 8427       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8428       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8429       __ subs(cnt1, cnt1, wordSize);
 8430       __ eor(tmp1, tmp1, tmp2);
 8431       __ cbnz(tmp1, NOT_EQUAL);
 8432       __ br(__ GT, SMALL_LOOP);
 8433     __ bind(POST_LOOP);
 8434       __ ldr(tmp1, Address(a1, cnt1));
 8435       __ ldr(tmp2, Address(a2, cnt1));
 8436       __ eor(tmp1, tmp1, tmp2);
 8437       __ cbnz(tmp1, NOT_EQUAL);
 8438     __ bind(EQUAL);
 8439       __ mov(result, true);
 8440     __ bind(NOT_EQUAL);
 8441       if (!UseSIMDForArrayEquals) {
 8442         __ pop(spilled_regs, sp);
 8443       }
 8444     __ bind(NOT_EQUAL_NO_POP);
 8445     __ leave();
 8446     __ ret(lr);
 8447     return entry;
 8448   }
 8449 
 8450   // result = r0 - return value. Contains initial hashcode value on entry.
 8451   // ary = r1 - array address
 8452   // cnt = r2 - elements count
 8453   // Clobbers: v0-v13, rscratch1, rscratch2
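        // Scalar reference for the value computed here (a hedged sketch of the
        // usual polynomial hash, not the generated code):
        //
        //   int h = result;                  // incoming initial hash
        //   for (int i = 0; i < cnt; i++) {
        //     h = 31 * h + a[i];             // a[i] widened according to eltype
        //   }
        //   return h;
        //
        // The stub evaluates the same polynomial with SIMD by keeping several
        // partial accumulators and rescaling them by suitable powers of 31
        // (see vpow/vpowm below).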
 8454   address generate_large_arrays_hashcode(BasicType eltype) {
 8455     const Register result = r0, ary = r1, cnt = r2;
 8456     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8457     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8458     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8459     const FloatRegister vpowm = v13;
 8460 
 8461     ARRAYS_HASHCODE_REGISTERS;
 8462 
 8463     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8464 
 8465     unsigned int vf; // vectorization factor
 8466     bool multiply_by_halves;
 8467     Assembler::SIMD_Arrangement load_arrangement;
 8468     switch (eltype) {
 8469     case T_BOOLEAN:
 8470     case T_BYTE:
 8471       load_arrangement = Assembler::T8B;
 8472       multiply_by_halves = true;
 8473       vf = 8;
 8474       break;
 8475     case T_CHAR:
 8476     case T_SHORT:
 8477       load_arrangement = Assembler::T8H;
 8478       multiply_by_halves = true;
 8479       vf = 8;
 8480       break;
 8481     case T_INT:
 8482       load_arrangement = Assembler::T4S;
 8483       multiply_by_halves = false;
 8484       vf = 4;
 8485       break;
 8486     default:
 8487       ShouldNotReachHere();
 8488     }
 8489 
 8490     // Unroll factor
 8491     const unsigned uf = 4;
 8492 
 8493     // Effective vectorization factor
 8494     const unsigned evf = vf * uf;
 8495 
 8496     __ align(CodeEntryAlignment);
 8497 
 8498     StubId stub_id;
 8499     switch (eltype) {
 8500     case T_BOOLEAN:
 8501       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8502       break;
 8503     case T_BYTE:
 8504       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8505       break;
 8506     case T_CHAR:
 8507       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8508       break;
 8509     case T_SHORT:
 8510       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8511       break;
 8512     case T_INT:
 8513       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8514       break;
 8515     default:
 8516       stub_id = StubId::NO_STUBID;
 8517       ShouldNotReachHere();
 8518     };
 8519 
 8520     StubCodeMark mark(this, stub_id);
 8521 
 8522     address entry = __ pc();
 8523     __ enter();
 8524 
 8525     // Put the 0th-3rd powers of 31 together into a single SIMD register. The register is used in
 8526     // the SMALL and LARGE loops' epilogues. The initialization is hoisted here and the register's
 8527     // value doesn't change throughout either loop.
 8528     __ movw(rscratch1, intpow(31U, 3));
 8529     __ mov(vpow, Assembler::S, 0, rscratch1);
 8530     __ movw(rscratch1, intpow(31U, 2));
 8531     __ mov(vpow, Assembler::S, 1, rscratch1);
 8532     __ movw(rscratch1, intpow(31U, 1));
 8533     __ mov(vpow, Assembler::S, 2, rscratch1);
 8534     __ movw(rscratch1, intpow(31U, 0));
 8535     __ mov(vpow, Assembler::S, 3, rscratch1);
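          // Descriptive note: in the loop epilogues, mulv(..., vpow) followed by
          // addv(...) in effect computes the lane dot product
          // h0 * 31^3 + h1 * 31^2 + h2 * 31 + h3.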
 8536 
 8537     __ mov(vmul0, Assembler::T16B, 0);
 8538     __ mov(vmul0, Assembler::S, 3, result);
 8539 
 8540     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8541     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8542 
 8543     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8544     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8545 
 8546     // SMALL LOOP
 8547     __ bind(SMALL_LOOP);
 8548 
 8549     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8550     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8551     __ subsw(rscratch2, rscratch2, vf);
 8552 
 8553     if (load_arrangement == Assembler::T8B) {
 8554       // Extend 8B to 8H to be able to use vector multiply
 8555       // instructions
 8556       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8557       if (is_signed_subword_type(eltype)) {
 8558         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8559       } else {
 8560         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8561       }
 8562     }
 8563 
 8564     switch (load_arrangement) {
 8565     case Assembler::T4S:
 8566       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8567       break;
 8568     case Assembler::T8B:
 8569     case Assembler::T8H:
 8570       assert(is_subword_type(eltype), "subword type expected");
 8571       if (is_signed_subword_type(eltype)) {
 8572         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8573       } else {
 8574         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8575       }
 8576       break;
 8577     default:
 8578       __ should_not_reach_here();
 8579     }
 8580 
 8581     // Process the upper half of a vector
 8582     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8583       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8584       if (is_signed_subword_type(eltype)) {
 8585         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8586       } else {
 8587         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8588       }
 8589     }
 8590 
 8591     __ br(Assembler::HI, SMALL_LOOP);
 8592 
 8593     // SMALL LOOP'S EPILOGUE
 8594     __ lsr(rscratch2, cnt, exact_log2(evf));
 8595     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8596 
 8597     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8598     __ addv(vmul0, Assembler::T4S, vmul0);
 8599     __ umov(result, vmul0, Assembler::S, 0);
 8600 
 8601     // TAIL
 8602     __ bind(TAIL);
 8603 
 8604     // The andr computes cnt % vf. The subtract, with the index shifted left by 3 (or 4 on
 8605     // Cortex-A53), offsets the target past vf - 1 - (cnt % vf) load + madd pairs, i.e. only
 8605     // cnt % vf load + madd pairs are executed.
 8606     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
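          // Worked example (not generated code): with vf == 8 and cnt % vf == 3,
          // the branch target is BR_BASE - 3 * 8 bytes (or - 3 * 16 on Cortex-A53),
          // so exactly the last 3 of the vf - 1 unrolled load + madd pairs run.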
 8607     __ andr(rscratch2, cnt, vf - 1);
 8608     __ bind(TAIL_SHORTCUT);
 8609     __ adr(rscratch1, BR_BASE);
 8610     // For Cortex-A53 the shift is 4 because 2 nops are generated (4 instructions per pair).
 8611     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8612     __ movw(rscratch2, 0x1f);
 8613     __ br(rscratch1);
 8614 
 8615     for (size_t i = 0; i < vf - 1; ++i) {
 8616       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8617                                    eltype);
 8618       __ maddw(result, result, rscratch2, rscratch1);
 8619       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8620       // Generate 2nd nop to have 4 instructions per iteration.
 8621       if (VM_Version::supports_a53mac()) {
 8622         __ nop();
 8623       }
 8624     }
 8625     __ bind(BR_BASE);
 8626 
 8627     __ leave();
 8628     __ ret(lr);
 8629 
 8630     // LARGE LOOP
 8631     __ bind(LARGE_LOOP_PREHEADER);
 8632 
 8633     __ lsr(rscratch2, cnt, exact_log2(evf));
 8634 
 8635     if (multiply_by_halves) {
 8636       // 31^4 - multiplier between lower and upper parts of a register
 8637       __ movw(rscratch1, intpow(31U, vf / 2));
 8638       __ mov(vpowm, Assembler::S, 1, rscratch1);
 8639       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8640       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8641       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8642     } else {
 8643       // 31^16
 8644       __ movw(rscratch1, intpow(31U, evf));
 8645       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8646     }
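          // Descriptive note: each LARGE_LOOP iteration consumes evf elements, so
          // the accumulators must be rescaled by 31^evf per iteration; for subword
          // types that factor is applied in two steps, 31^(evf - vf/2) before the
          // lower halves are added and 31^(vf/2) before the upper halves, since
          // 31^(evf - vf/2) * 31^(vf/2) == 31^evf.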
 8647 
 8648     __ mov(vmul3, Assembler::T16B, 0);
 8649     __ mov(vmul2, Assembler::T16B, 0);
 8650     __ mov(vmul1, Assembler::T16B, 0);
 8651 
 8652     __ bind(LARGE_LOOP);
 8653 
 8654     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8655     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8656     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8657     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8658 
 8659     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8660            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8661 
 8662     if (load_arrangement == Assembler::T8B) {
 8663       // Extend 8B to 8H to be able to use vector multiply
 8664       // instructions
 8665       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8666       if (is_signed_subword_type(eltype)) {
 8667         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8668         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8669         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8670         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8671       } else {
 8672         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8673         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8674         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8675         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8676       }
 8677     }
 8678 
 8679     switch (load_arrangement) {
 8680     case Assembler::T4S:
 8681       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8682       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8683       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8684       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8685       break;
 8686     case Assembler::T8B:
 8687     case Assembler::T8H:
 8688       assert(is_subword_type(eltype), "subword type expected");
 8689       if (is_signed_subword_type(eltype)) {
 8690         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8691         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8692         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8693         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8694       } else {
 8695         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8696         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8697         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8698         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8699       }
 8700       break;
 8701     default:
 8702       __ should_not_reach_here();
 8703     }
 8704 
 8705     // Process the upper half of a vector
 8706     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8707       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8708       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8709       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8710       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8711       if (is_signed_subword_type(eltype)) {
 8712         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8713         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8714         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8715         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8716       } else {
 8717         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8718         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8719         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8720         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8721       }
 8722     }
 8723 
 8724     __ subsw(rscratch2, rscratch2, 1);
 8725     __ br(Assembler::HI, LARGE_LOOP);
 8726 
 8727     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8728     __ addv(vmul3, Assembler::T4S, vmul3);
 8729     __ umov(result, vmul3, Assembler::S, 0);
 8730 
 8731     __ mov(rscratch2, intpow(31U, vf));
 8732 
 8733     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8734     __ addv(vmul2, Assembler::T4S, vmul2);
 8735     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8736     __ maddw(result, result, rscratch2, rscratch1);
 8737 
 8738     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8739     __ addv(vmul1, Assembler::T4S, vmul1);
 8740     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8741     __ maddw(result, result, rscratch2, rscratch1);
 8742 
 8743     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8744     __ addv(vmul0, Assembler::T4S, vmul0);
 8745     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8746     __ maddw(result, result, rscratch2, rscratch1);
 8747 
 8748     __ andr(rscratch2, cnt, vf - 1);
 8749     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8750 
 8751     __ leave();
 8752     __ ret(lr);
 8753 
 8754     return entry;
 8755   }
 8756 
 8757   address generate_dsin_dcos(bool isCos) {
 8758     __ align(CodeEntryAlignment);
 8759     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8760     StubCodeMark mark(this, stub_id);
 8761     address start = __ pc();
 8762     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8763         (address)StubRoutines::aarch64::_two_over_pi,
 8764         (address)StubRoutines::aarch64::_pio2,
 8765         (address)StubRoutines::aarch64::_dsin_coef,
 8766         (address)StubRoutines::aarch64::_dcos_coef);
 8767     return start;
 8768   }
 8769 
 8770   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
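        // Descriptive note: zip1/zip2 with the zero register widen 8 Latin1 bytes
        // into 8 little-endian UTF-16 code units (byte, 0x00), so both strings can
        // be compared as 64-bit chunks of UTF-16 characters.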
 8771   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8772       Label &DIFF2) {
 8773     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8774     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8775 
 8776     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8777     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8778     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8779     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8780 
 8781     __ fmovd(tmpL, vtmp3);
 8782     __ eor(rscratch2, tmp3, tmpL);
 8783     __ cbnz(rscratch2, DIFF2);
 8784 
 8785     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8786     __ umov(tmpL, vtmp3, __ D, 1);
 8787     __ eor(rscratch2, tmpU, tmpL);
 8788     __ cbnz(rscratch2, DIFF1);
 8789 
 8790     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8791     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8792     __ fmovd(tmpL, vtmp);
 8793     __ eor(rscratch2, tmp3, tmpL);
 8794     __ cbnz(rscratch2, DIFF2);
 8795 
 8796     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8797     __ umov(tmpL, vtmp, __ D, 1);
 8798     __ eor(rscratch2, tmpU, tmpL);
 8799     __ cbnz(rscratch2, DIFF1);
 8800   }
 8801 
 8802   // r0  = result
 8803   // r1  = str1
 8804   // r2  = cnt1
 8805   // r3  = str2
 8806   // r4  = cnt2
 8807   // r10 = tmp1
 8808   // r11 = tmp2
 8809   address generate_compare_long_string_different_encoding(bool isLU) {
 8810     __ align(CodeEntryAlignment);
 8811     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8812     StubCodeMark mark(this, stub_id);
 8813     address entry = __ pc();
 8814     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8815         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8816         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8817     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8818         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8819     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8820     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8821 
 8822     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8823 
 8824     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
 8825     // cnt2 == amount of characters left to compare
 8826     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8827     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8828     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8829     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8830     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8831     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8832     __ eor(rscratch2, tmp1, tmp2);
 8833     __ mov(rscratch1, tmp2);
 8834     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8835     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8836              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8837     __ push(spilled_regs, sp);
 8838     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8839     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8840 
 8841     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8842 
 8843     if (SoftwarePrefetchHintDistance >= 0) {
 8844       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8845       __ br(__ LT, NO_PREFETCH);
 8846       __ bind(LARGE_LOOP_PREFETCH);
 8847         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8848         __ mov(tmp4, 2);
 8849         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8850         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8851           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8852           __ subs(tmp4, tmp4, 1);
 8853           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8854           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8855           __ mov(tmp4, 2);
 8856         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8857           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8858           __ subs(tmp4, tmp4, 1);
 8859           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8860           __ sub(cnt2, cnt2, 64);
 8861           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8862           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8863     }
 8864     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8865     __ bind(NO_PREFETCH);
 8866     __ subs(cnt2, cnt2, 16);
 8867     __ br(__ LT, TAIL);
 8868     __ align(OptoLoopAlignment);
 8869     __ bind(SMALL_LOOP); // smaller loop
 8870       __ subs(cnt2, cnt2, 16);
 8871       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8872       __ br(__ GE, SMALL_LOOP);
 8873       __ cmn(cnt2, (u1)16);
 8874       __ br(__ EQ, LOAD_LAST);
 8875     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8876       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8877       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8878       __ ldr(tmp3, Address(cnt1, -8));
 8879       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8880       __ b(LOAD_LAST);
 8881     __ bind(DIFF2);
 8882       __ mov(tmpU, tmp3);
 8883     __ bind(DIFF1);
 8884       __ pop(spilled_regs, sp);
 8885       __ b(CALCULATE_DIFFERENCE);
 8886     __ bind(LOAD_LAST);
 8887       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
 8888       // No need to load them again.
 8889       __ mov(tmpU, tmp3);
 8890       __ pop(spilled_regs, sp);
 8891 
 8892       // tmp2 points to the address of the last 4 Latin1 characters right now
 8893       __ ldrs(vtmp, Address(tmp2));
 8894       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8895       __ fmovd(tmpL, vtmp);
 8896 
 8897       __ eor(rscratch2, tmpU, tmpL);
 8898       __ cbz(rscratch2, DONE);
 8899 
 8900     // Find the first different characters in the longwords and
 8901     // compute their difference.
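          // Worked example (descriptive only): if the first difference is in char
          // index i of the loaded group, the byte-reversed XOR has between 16*i and
          // 16*i + 15 leading zero bits; andr(..., -16) rounds this down to 16*i,
          // and shifting the original words right by 16*i brings char i into the
          // low half-word extracted by uxthw.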
 8902     __ bind(CALCULATE_DIFFERENCE);
 8903       __ rev(rscratch2, rscratch2);
 8904       __ clz(rscratch2, rscratch2);
 8905       __ andr(rscratch2, rscratch2, -16);
 8906       __ lsrv(tmp1, tmp1, rscratch2);
 8907       __ uxthw(tmp1, tmp1);
 8908       __ lsrv(rscratch1, rscratch1, rscratch2);
 8909       __ uxthw(rscratch1, rscratch1);
 8910       __ subw(result, tmp1, rscratch1);
 8911     __ bind(DONE);
 8912       __ ret(lr);
 8913     return entry;
 8914   }
 8915 
 8916   // r0 = input (float16)
 8917   // v0 = result (float)
 8918   // v1 = temporary float register
 8919   address generate_float16ToFloat() {
 8920     __ align(CodeEntryAlignment);
 8921     StubId stub_id = StubId::stubgen_hf2f_id;
 8922     StubCodeMark mark(this, stub_id);
 8923     address entry = __ pc();
 8924     BLOCK_COMMENT("Entry:");
 8925     __ flt16_to_flt(v0, r0, v1);
 8926     __ ret(lr);
 8927     return entry;
 8928   }
 8929 
 8930   // v0 = input (float)
 8931   // r0 = result (float16)
 8932   // v1 = temporary float register
 8933   address generate_floatToFloat16() {
 8934     __ align(CodeEntryAlignment);
 8935     StubId stub_id = StubId::stubgen_f2hf_id;
 8936     StubCodeMark mark(this, stub_id);
 8937     address entry = __ pc();
 8938     BLOCK_COMMENT("Entry:");
 8939     __ flt_to_flt16(r0, v0, v1);
 8940     __ ret(lr);
 8941     return entry;
 8942   }
 8943 
 8944   address generate_method_entry_barrier() {
 8945     __ align(CodeEntryAlignment);
 8946     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 8947     StubCodeMark mark(this, stub_id);
 8948 
 8949     Label deoptimize_label;
 8950 
 8951     address start = __ pc();
 8952 
 8953     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 8954 
 8955     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 8956       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 8957       // We can get here despite the nmethod being good, if we have not
 8958       // yet applied our cross modification fence (or data fence).
 8959       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 8960       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 8961       __ ldrw(rscratch2, rscratch2);
 8962       __ strw(rscratch2, thread_epoch_addr);
 8963       __ isb();
 8964       __ membar(__ LoadLoad);
 8965     }
 8966 
 8967     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 8968 
 8969     __ enter();
 8970     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 8971 
 8972     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 8973 
 8974     __ push_call_clobbered_registers();
 8975 
 8976     __ mov(c_rarg0, rscratch2);
 8977     __ call_VM_leaf
 8978          (CAST_FROM_FN_PTR
 8979           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 8980 
 8981     __ reset_last_Java_frame(true);
 8982 
 8983     __ mov(rscratch1, r0);
 8984 
 8985     __ pop_call_clobbered_registers();
 8986 
 8987     __ cbnz(rscratch1, deoptimize_label);
 8988 
 8989     __ leave();
 8990     __ ret(lr);
 8991 
 8992     __ BIND(deoptimize_label);
 8993 
 8994     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 8995     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 8996 
 8997     __ mov(sp, rscratch1);
 8998     __ br(rscratch2);
 8999 
 9000     return start;
 9001   }
 9002 
 9003   // r0  = result
 9004   // r1  = str1
 9005   // r2  = cnt1
 9006   // r3  = str2
 9007   // r4  = cnt2
 9008   // r10 = tmp1
 9009   // r11 = tmp2
 9010   address generate_compare_long_string_same_encoding(bool isLL) {
 9011     __ align(CodeEntryAlignment);
 9012     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9013     StubCodeMark mark(this, stub_id);
 9014     address entry = __ pc();
 9015     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9016         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9017 
 9018     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9019 
 9020     // Exit from the large loop when fewer than 64 bytes are left to read or we are
 9021     // about to prefetch memory beyond the array bounds.
 9022     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9023 
 9024     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
 9025     __ eor(rscratch2, tmp1, tmp2);
 9026     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9027 
 9028     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9029     // update pointers, because of previous read
 9030     __ add(str1, str1, wordSize);
 9031     __ add(str2, str2, wordSize);
 9032     if (SoftwarePrefetchHintDistance >= 0) {
 9033       __ align(OptoLoopAlignment);
 9034       __ bind(LARGE_LOOP_PREFETCH);
 9035         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9036         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9037 
 9038         for (int i = 0; i < 4; i++) {
 9039           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9040           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9041           __ cmp(tmp1, tmp2);
 9042           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9043           __ br(Assembler::NE, DIFF);
 9044         }
 9045         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9046         __ add(str1, str1, 64);
 9047         __ add(str2, str2, 64);
 9048         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9049         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9050         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9051     }
 9052 
 9053     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9054     __ br(Assembler::LE, LESS16);
 9055     __ align(OptoLoopAlignment);
 9056     __ bind(LOOP_COMPARE16);
 9057       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9058       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9059       __ cmp(tmp1, tmp2);
 9060       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9061       __ br(Assembler::NE, DIFF);
 9062       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9063       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9064       __ br(Assembler::LT, LESS16);
 9065 
 9066       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9067       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9068       __ cmp(tmp1, tmp2);
 9069       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9070       __ br(Assembler::NE, DIFF);
 9071       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9072       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9073       __ br(Assembler::GE, LOOP_COMPARE16);
 9074       __ cbz(cnt2, LENGTH_DIFF);
 9075 
 9076     __ bind(LESS16);
 9077       // each 8 compare
 9078       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9079       __ br(Assembler::LE, LESS8);
 9080       __ ldr(tmp1, Address(__ post(str1, 8)));
 9081       __ ldr(tmp2, Address(__ post(str2, 8)));
 9082       __ eor(rscratch2, tmp1, tmp2);
 9083       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9084       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9085 
 9086     __ bind(LESS8); // directly load last 8 bytes
 9087       if (!isLL) {
 9088         __ add(cnt2, cnt2, cnt2);
 9089       }
 9090       __ ldr(tmp1, Address(str1, cnt2));
 9091       __ ldr(tmp2, Address(str2, cnt2));
 9092       __ eor(rscratch2, tmp1, tmp2);
 9093       __ cbz(rscratch2, LENGTH_DIFF);
 9094       __ b(CAL_DIFFERENCE);
 9095 
 9096     __ bind(DIFF);
 9097       __ cmp(tmp1, tmp2);
 9098       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9099       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9100       // reuse rscratch2 register for the result of eor instruction
 9101       __ eor(rscratch2, tmp1, tmp2);
 9102 
 9103     __ bind(CAL_DIFFERENCE);
 9104       __ rev(rscratch2, rscratch2);
 9105       __ clz(rscratch2, rscratch2);
 9106       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9107       __ lsrv(tmp1, tmp1, rscratch2);
 9108       __ lsrv(tmp2, tmp2, rscratch2);
 9109       if (isLL) {
 9110         __ uxtbw(tmp1, tmp1);
 9111         __ uxtbw(tmp2, tmp2);
 9112       } else {
 9113         __ uxthw(tmp1, tmp1);
 9114         __ uxthw(tmp2, tmp2);
 9115       }
 9116       __ subw(result, tmp1, tmp2);
 9117 
 9118     __ bind(LENGTH_DIFF);
 9119       __ ret(lr);
 9120     return entry;
 9121   }
 9122 
 9123   enum string_compare_mode {
 9124     LL,
 9125     LU,
 9126     UL,
 9127     UU,
 9128   };
 9129 
 9130   // The following registers are declared in aarch64.ad
 9131   // r0  = result
 9132   // r1  = str1
 9133   // r2  = cnt1
 9134   // r3  = str2
 9135   // r4  = cnt2
 9136   // r10 = tmp1
 9137   // r11 = tmp2
 9138   // z0  = ztmp1
 9139   // z1  = ztmp2
 9140   // p0  = pgtmp1
 9141   // p1  = pgtmp2
 9142   address generate_compare_long_string_sve(string_compare_mode mode) {
 9143     StubId stub_id;
 9144     switch (mode) {
 9145       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9146       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9147       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9148       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9149       default: ShouldNotReachHere();
 9150     }
 9151 
 9152     __ align(CodeEntryAlignment);
 9153     address entry = __ pc();
 9154     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9155              tmp1 = r10, tmp2 = r11;
 9156 
 9157     Label LOOP, DONE, MISMATCH;
 9158     Register vec_len = tmp1;
 9159     Register idx = tmp2;
 9160     // The minimum of the string lengths has been stored in cnt2.
 9161     Register cnt = cnt2;
 9162     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9163     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9164 
 9165 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9166     switch (mode) {                                                            \
 9167       case LL:                                                                 \
 9168         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9169         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9170         break;                                                                 \
 9171       case LU:                                                                 \
 9172         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9173         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9174         break;                                                                 \
 9175       case UL:                                                                 \
 9176         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9177         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9178         break;                                                                 \
 9179       case UU:                                                                 \
 9180         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9181         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9182         break;                                                                 \
 9183       default:                                                                 \
 9184         ShouldNotReachHere();                                                  \
 9185     }
 9186 
 9187     StubCodeMark mark(this, stub_id);
 9188 
 9189     __ mov(idx, 0);
 9190     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9191 
 9192     if (mode == LL) {
 9193       __ sve_cntb(vec_len);
 9194     } else {
 9195       __ sve_cnth(vec_len);
 9196     }
 9197 
 9198     __ sub(rscratch1, cnt, vec_len);
 9199 
 9200     __ bind(LOOP);
 9201 
 9202       // main loop
 9203       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9204       __ add(idx, idx, vec_len);
 9205       // Compare strings.
 9206       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9207       __ br(__ NE, MISMATCH);
 9208       __ cmp(idx, rscratch1);
 9209       __ br(__ LT, LOOP);
 9210 
 9211     // post loop, last iteration
 9212     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9213 
 9214     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9215     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9216     __ br(__ EQ, DONE);
 9217 
 9218     __ bind(MISMATCH);
 9219 
 9220     // Crop the vector to find its location.
 9221     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9222     // Extract the first different characters of each string.
 9223     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9224     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9225 
 9226     // Compute the difference of the first different characters.
 9227     __ sub(result, rscratch1, rscratch2);
 9228 
 9229     __ bind(DONE);
 9230     __ ret(lr);
 9231 #undef LOAD_PAIR
 9232     return entry;
 9233   }
 9234 
 9235   void generate_compare_long_strings() {
 9236     if (UseSVE == 0) {
 9237       StubRoutines::aarch64::_compare_long_string_LL
 9238           = generate_compare_long_string_same_encoding(true);
 9239       StubRoutines::aarch64::_compare_long_string_UU
 9240           = generate_compare_long_string_same_encoding(false);
 9241       StubRoutines::aarch64::_compare_long_string_LU
 9242           = generate_compare_long_string_different_encoding(true);
 9243       StubRoutines::aarch64::_compare_long_string_UL
 9244           = generate_compare_long_string_different_encoding(false);
 9245     } else {
 9246       StubRoutines::aarch64::_compare_long_string_LL
 9247           = generate_compare_long_string_sve(LL);
 9248       StubRoutines::aarch64::_compare_long_string_UU
 9249           = generate_compare_long_string_sve(UU);
 9250       StubRoutines::aarch64::_compare_long_string_LU
 9251           = generate_compare_long_string_sve(LU);
 9252       StubRoutines::aarch64::_compare_long_string_UL
 9253           = generate_compare_long_string_sve(UL);
 9254     }
 9255   }
 9256 
 9257   // R0 = result
 9258   // R1 = str2
 9259   // R2 = cnt1
 9260   // R3 = str1
 9261   // R4 = cnt2
 9262   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9263   //
 9264   // This generic linear code uses a few additional ideas that make it faster:
 9265   // 1) we can safely keep at least the 1st register of the pattern (since length
 9266   // >= 8) in order to skip the initial load (helps on systems with 1 ld pipeline)
 9267   // 2) we can use a "fast" algorithm for finding a single character to search for
 9268   // the first symbol with fewer branches (1 branch per loaded register instead of
 9269   // a branch per symbol); this is where constants like 0x0101...01,
 9270   // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the SWAR sketch below)
 9271   // 3) after loading and analyzing the 1st register of the source string, it can
 9272   // be reused to search for every occurrence of the 1st character, saving a few
 9273   // loads compared with a simpler-but-slower implementation
 9274   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
 9275   // re-initializes and compresses register values, which makes the code larger
 9276   // and a bit less readable; however, most of the extra operations are issued
 9277   // during loads or branches, so the penalty is minimal
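        // SWAR sketch for idea 2) above (a hedged C equivalent of the byte case;
        // the halfword case uses 0x0001...0001, 0x7fff...7fff and 0x8000...8000):
        //
        //   uint64_t x = chunk ^ (first_char * 0x0101010101010101ULL); // match -> zero byte
        //   uint64_t m = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
        //   // m != 0  iff  some byte of chunk equals first_char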
 9278   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9279     StubId stub_id;
 9280     if (str1_isL) {
 9281       if (str2_isL) {
 9282         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9283       } else {
 9284         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9285       }
 9286     } else {
 9287       if (str2_isL) {
 9288         ShouldNotReachHere();
 9289       } else {
 9290         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9291       }
 9292     }
 9293     __ align(CodeEntryAlignment);
 9294     StubCodeMark mark(this, stub_id);
 9295     address entry = __ pc();
 9296 
 9297     int str1_chr_size = str1_isL ? 1 : 2;
 9298     int str2_chr_size = str2_isL ? 1 : 2;
 9299     int str1_chr_shift = str1_isL ? 0 : 1;
 9300     int str2_chr_shift = str2_isL ? 0 : 1;
 9301     bool isL = str1_isL && str2_isL;
 9302     // parameters
 9303     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9304     // temporary registers
 9305     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9306     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9307     // redefinitions
 9308     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9309 
 9310     __ push(spilled_regs, sp);
 9311     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9312         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9313         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9314         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9315         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9316         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9317     // Read whole register from str1. It is safe, because length >=8 here
 9318     __ ldr(ch1, Address(str1));
 9319     // Read whole register from str2. It is safe, because length >=8 here
 9320     __ ldr(ch2, Address(str2));
 9321     __ sub(cnt2, cnt2, cnt1);
 9322     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9323     if (str1_isL != str2_isL) {
 9324       __ eor(v0, __ T16B, v0, v0);
 9325     }
 9326     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9327     __ mul(first, first, tmp1);
 9328     // check if we have less than 1 register to check
 9329     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9330     if (str1_isL != str2_isL) {
 9331       __ fmovd(v1, ch1);
 9332     }
 9333     __ br(__ LE, L_SMALL);
 9334     __ eor(ch2, first, ch2);
 9335     if (str1_isL != str2_isL) {
 9336       __ zip1(v1, __ T16B, v1, v0);
 9337     }
 9338     __ sub(tmp2, ch2, tmp1);
 9339     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9340     __ bics(tmp2, tmp2, ch2);
 9341     if (str1_isL != str2_isL) {
 9342       __ fmovd(ch1, v1);
 9343     }
 9344     __ br(__ NE, L_HAS_ZERO);
 9345     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9346     __ add(result, result, wordSize/str2_chr_size);
 9347     __ add(str2, str2, wordSize);
 9348     __ br(__ LT, L_POST_LOOP);
 9349     __ BIND(L_LOOP);
 9350       __ ldr(ch2, Address(str2));
 9351       __ eor(ch2, first, ch2);
 9352       __ sub(tmp2, ch2, tmp1);
 9353       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9354       __ bics(tmp2, tmp2, ch2);
 9355       __ br(__ NE, L_HAS_ZERO);
 9356     __ BIND(L_LOOP_PROCEED);
 9357       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9358       __ add(str2, str2, wordSize);
 9359       __ add(result, result, wordSize/str2_chr_size);
 9360       __ br(__ GE, L_LOOP);
 9361     __ BIND(L_POST_LOOP);
 9362       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9363       __ br(__ LE, NOMATCH);
 9364       __ ldr(ch2, Address(str2));
 9365       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9366       __ eor(ch2, first, ch2);
 9367       __ sub(tmp2, ch2, tmp1);
 9368       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9369       __ mov(tmp4, -1); // all bits set
 9370       __ b(L_SMALL_PROCEED);
 9371     __ align(OptoLoopAlignment);
 9372     __ BIND(L_SMALL);
 9373       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9374       __ eor(ch2, first, ch2);
 9375       if (str1_isL != str2_isL) {
 9376         __ zip1(v1, __ T16B, v1, v0);
 9377       }
 9378       __ sub(tmp2, ch2, tmp1);
 9379       __ mov(tmp4, -1); // all bits set
 9380       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9381       if (str1_isL != str2_isL) {
 9382         __ fmovd(ch1, v1); // move converted 4 symbols
 9383       }
 9384     __ BIND(L_SMALL_PROCEED);
 9385       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 9386       __ bic(tmp2, tmp2, ch2);
 9387       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9388       __ rbit(tmp2, tmp2);
 9389       __ br(__ EQ, NOMATCH);
 9390     __ BIND(L_SMALL_HAS_ZERO_LOOP);
 9391       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
 9392       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9393       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9394       if (str2_isL) { // LL
 9395         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9396         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9397         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9398         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9399         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9400       } else {
 9401         __ mov(ch2, 0xE); // all bits in byte set except last one
 9402         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9403         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9404         __ lslv(tmp2, tmp2, tmp4);
 9405         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9406         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9407         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9408         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9409       }
 9410       __ cmp(ch1, ch2);
 9411       __ mov(tmp4, wordSize/str2_chr_size);
 9412       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9413     __ BIND(L_SMALL_CMP_LOOP);
 9414       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9415                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9416       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9417                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9418       __ add(tmp4, tmp4, 1);
 9419       __ cmp(tmp4, cnt1);
 9420       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9421       __ cmp(first, ch2);
 9422       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9423     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9424       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9425       __ clz(tmp4, tmp2);
 9426       __ add(result, result, 1); // advance index
 9427       __ add(str2, str2, str2_chr_size); // advance pointer
 9428       __ b(L_SMALL_HAS_ZERO_LOOP);
 9429     __ align(OptoLoopAlignment);
 9430     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9431       __ cmp(first, ch2);
 9432       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9433       __ b(DONE);
 9434     __ align(OptoLoopAlignment);
 9435     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9436       if (str2_isL) { // LL
 9437         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9438         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9439         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9440         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9441         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9442       } else {
 9443         __ mov(ch2, 0xE); // all bits in byte set except last one
 9444         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9445         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9446         __ lslv(tmp2, tmp2, tmp4);
 9447         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9448         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9449         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9450         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9451       }
 9452       __ cmp(ch1, ch2);
 9453       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9454       __ b(DONE);
 9455     __ align(OptoLoopAlignment);
 9456     __ BIND(L_HAS_ZERO);
 9457       __ rbit(tmp2, tmp2);
 9458       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
 9459       // Now compress the counters (cnt2 and cnt1) into one register. This is fine
 9460       // because both counters are 32-bit and are not changed in this loop; just
 9461       // restore them on exit. So cnt1 can be reused in this loop.
 9462       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9463       __ sub(result, result, 1);
 9464     __ BIND(L_HAS_ZERO_LOOP);
 9465       __ mov(cnt1, wordSize/str2_chr_size);
 9466       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9467       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9468       if (str2_isL) {
 9469         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9470         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9471         __ lslv(tmp2, tmp2, tmp4);
 9472         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9473         __ add(tmp4, tmp4, 1);
 9474         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9475         __ lsl(tmp2, tmp2, 1);
 9476         __ mov(tmp4, wordSize/str2_chr_size);
 9477       } else {
 9478         __ mov(ch2, 0xE);
 9479         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9480         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9481         __ lslv(tmp2, tmp2, tmp4);
 9482         __ add(tmp4, tmp4, 1);
 9483         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9484         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9485         __ lsl(tmp2, tmp2, 1);
 9486         __ mov(tmp4, wordSize/str2_chr_size);
 9487         __ sub(str2, str2, str2_chr_size);
 9488       }
 9489       __ cmp(ch1, ch2);
 9490       __ mov(tmp4, wordSize/str2_chr_size);
 9491       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9492     __ BIND(L_CMP_LOOP);
 9493       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9494                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9495       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9496                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9497       __ add(tmp4, tmp4, 1);
 9498       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9499       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9500       __ cmp(cnt1, ch2);
 9501       __ br(__ EQ, L_CMP_LOOP);
 9502     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at the current candidate position
 9504       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9505       __ clz(tmp4, tmp2);
 9506       __ add(str2, str2, str2_chr_size); // advance pointer
 9507       __ b(L_HAS_ZERO_LOOP);
 9508     __ align(OptoLoopAlignment);
 9509     __ BIND(L_CMP_LOOP_LAST_CMP);
 9510       __ cmp(cnt1, ch2);
 9511       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9512       __ b(DONE);
 9513     __ align(OptoLoopAlignment);
 9514     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9515       if (str2_isL) {
 9516         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9517         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9518         __ lslv(tmp2, tmp2, tmp4);
 9519         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9520         __ add(tmp4, tmp4, 1);
 9521         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9522         __ lsl(tmp2, tmp2, 1);
 9523       } else {
 9524         __ mov(ch2, 0xE);
 9525         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9526         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9527         __ lslv(tmp2, tmp2, tmp4);
 9528         __ add(tmp4, tmp4, 1);
 9529         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9530         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9531         __ lsl(tmp2, tmp2, 1);
 9532         __ sub(str2, str2, str2_chr_size);
 9533       }
 9534       __ cmp(ch1, ch2);
 9535       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9536       __ b(DONE);
 9537     __ align(OptoLoopAlignment);
 9538     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the respective high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address and must be advanced
      // to the next octet.
 9549       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9550       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9551       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9552       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9553       __ movw(cnt2, cnt2);
 9554       __ b(L_LOOP_PROCEED);
 9555     __ align(OptoLoopAlignment);
 9556     __ BIND(NOMATCH);
 9557       __ mov(result, -1);
 9558     __ BIND(DONE);
 9559       __ pop(spilled_regs, sp);
 9560       __ ret(lr);
 9561     return entry;
 9562   }
 9563 
 9564   void generate_string_indexof_stubs() {
 9565     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9566     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9567     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9568   }
 9569 
 9570   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9571       FloatRegister src1, FloatRegister src2) {
 9572     Register dst = r1;
 9573     __ zip1(v1, __ T16B, src1, v0);
 9574     __ zip2(v2, __ T16B, src1, v0);
 9575     if (generatePrfm) {
 9576       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9577     }
 9578     __ zip1(v3, __ T16B, src2, v0);
 9579     __ zip2(v4, __ T16B, src2, v0);
 9580     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9581   }
 9582 
 9583   // R0 = src
 9584   // R1 = dst
 9585   // R2 = len
 9586   // R3 = len >> 3
 9587   // V0 = 0
 9588   // v1 = loaded 8 bytes
 9589   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
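  //
  // "Inflate" widens Latin-1 bytes to UTF-16 chars: zip1/zip2 against v0
  // (which the caller is expected to have zeroed, per the register notes
  // above) interleave a zero byte after every source byte, doubling each
  // element to 16 bits. Likewise, v1 arrives pre-loaded with the first
  // 8 source bytes.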
 9590   address generate_large_byte_array_inflate() {
 9591     __ align(CodeEntryAlignment);
 9592     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9593     StubCodeMark mark(this, stub_id);
 9594     address entry = __ pc();
 9595     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9596     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9597     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9598 
 9599     // do one more 8-byte read to have address 16-byte aligned in most cases
 9600     // also use single store instruction
 9601     __ ldrd(v2, __ post(src, 8));
 9602     __ sub(octetCounter, octetCounter, 2);
 9603     __ zip1(v1, __ T16B, v1, v0);
 9604     __ zip1(v2, __ T16B, v2, v0);
 9605     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9606     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9607     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9608     __ br(__ LE, LOOP_START);
 9609     __ b(LOOP_PRFM_START);
 9610     __ bind(LOOP_PRFM);
 9611       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9612     __ bind(LOOP_PRFM_START);
 9613       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9614       __ sub(octetCounter, octetCounter, 8);
 9615       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9616       inflate_and_store_2_fp_registers(true, v3, v4);
 9617       inflate_and_store_2_fp_registers(true, v5, v6);
 9618       __ br(__ GT, LOOP_PRFM);
 9619       __ cmp(octetCounter, (u1)8);
 9620       __ br(__ LT, DONE);
 9621     __ bind(LOOP);
 9622       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9623       __ bind(LOOP_START);
 9624       __ sub(octetCounter, octetCounter, 8);
 9625       __ cmp(octetCounter, (u1)8);
 9626       inflate_and_store_2_fp_registers(false, v3, v4);
 9627       inflate_and_store_2_fp_registers(false, v5, v6);
 9628       __ br(__ GE, LOOP);
 9629     __ bind(DONE);
 9630       __ ret(lr);
 9631     return entry;
 9632   }
 9633 
 9634   /**
 9635    *  Arguments:
 9636    *
 9637    *  Input:
 9638    *  c_rarg0   - current state address
 9639    *  c_rarg1   - H key address
 9640    *  c_rarg2   - data address
 9641    *  c_rarg3   - number of blocks
 9642    *
 9643    *  Output:
 9644    *  Updated state at c_rarg0
 9645    */
 9646   address generate_ghash_processBlocks() {
 9647     // Bafflingly, GCM uses little-endian for the byte order, but
 9648     // big-endian for the bit order.  For example, the polynomial 1 is
 9649     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9650     //
 9651     // So, we must either reverse the bytes in each word and do
 9652     // everything big-endian or reverse the bits in each byte and do
 9653     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9654     // the bits in each byte (we have an instruction, RBIT, to do
 9655     // that) and keep the data in little-endian bit order through the
 9656     // calculation, bit-reversing the inputs and outputs.
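    //
    // For example, the field element 1 is the byte string 80 00 ... 00 in
    // GCM's representation; bit-reversing each byte (RBIT) turns it into
    // 01 00 ... 00, i.e. an ordinary little-endian 1. Likewise the low-order
    // terms of the reduction polynomial, x^7 + x^2 + x + 1, encode as
    // 0x80 | 0x04 | 0x02 | 0x01 = 0x87, which is the constant emitted below.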
 9657 
 9658     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9659     StubCodeMark mark(this, stub_id);
 9660     __ align(wordSize * 2);
 9661     address p = __ pc();
 9662     __ emit_int64(0x87);  // The low-order bits of the field
 9663                           // polynomial (i.e. p = z^7+z^2+z+1)
 9664                           // repeated in the low and high parts of a
 9665                           // 128-bit vector
 9666     __ emit_int64(0x87);
 9667 
 9668     __ align(CodeEntryAlignment);
 9669     address start = __ pc();
 9670 
 9671     Register state   = c_rarg0;
 9672     Register subkeyH = c_rarg1;
 9673     Register data    = c_rarg2;
 9674     Register blocks  = c_rarg3;
 9675 
 9676     FloatRegister vzr = v30;
 9677     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9678 
 9679     __ ldrq(v24, p);    // The field polynomial
 9680 
 9681     __ ldrq(v0, Address(state));
 9682     __ ldrq(v1, Address(subkeyH));
 9683 
 9684     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9685     __ rbit(v0, __ T16B, v0);
 9686     __ rev64(v1, __ T16B, v1);
 9687     __ rbit(v1, __ T16B, v1);
 9688 
 9689     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9690     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9691 
 9692     {
 9693       Label L_ghash_loop;
 9694       __ bind(L_ghash_loop);
 9695 
 9696       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9697                                                  // reversing each byte
 9698       __ rbit(v2, __ T16B, v2);
 9699       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9700 
 9701       // Multiply state in v2 by subkey in v1
 9702       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9703                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9704                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9705       // Reduce v7:v5 by the field polynomial
 9706       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9707 
 9708       __ sub(blocks, blocks, 1);
 9709       __ cbnz(blocks, L_ghash_loop);
 9710     }
 9711 
 9712     // The bit-reversed result is at this point in v0
 9713     __ rev64(v0, __ T16B, v0);
 9714     __ rbit(v0, __ T16B, v0);
 9715 
 9716     __ st1(v0, __ T16B, state);
 9717     __ ret(lr);
 9718 
 9719     return start;
 9720   }
 9721 
 9722   address generate_ghash_processBlocks_wide() {
 9723     address small = generate_ghash_processBlocks();
 9724 
 9725     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9726     StubCodeMark mark(this, stub_id);
 9727     __ align(wordSize * 2);
 9728     address p = __ pc();
 9729     __ emit_int64(0x87);  // The low-order bits of the field
 9730                           // polynomial (i.e. p = z^7+z^2+z+1)
 9731                           // repeated in the low and high parts of a
 9732                           // 128-bit vector
 9733     __ emit_int64(0x87);
 9734 
 9735     __ align(CodeEntryAlignment);
 9736     address start = __ pc();
 9737 
 9738     Register state   = c_rarg0;
 9739     Register subkeyH = c_rarg1;
 9740     Register data    = c_rarg2;
 9741     Register blocks  = c_rarg3;
 9742 
 9743     const int unroll = 4;
 9744 
 9745     __ cmp(blocks, (unsigned char)(unroll * 2));
 9746     __ br(__ LT, small);
 9747 
 9748     if (unroll > 1) {
      // Save the SIMD registers v8..v15 (callee-saved under the AAPCS64) before entering the routine
 9750       __ sub(sp, sp, 4 * 16);
 9751       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9752       __ sub(sp, sp, 4 * 16);
 9753       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9754     }
 9755 
 9756     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 9757 
 9758     if (unroll > 1) {
 9759       // And restore state
 9760       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9761       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9762     }
 9763 
 9764     __ cmp(blocks, (unsigned char)0);
 9765     __ br(__ GT, small);
 9766 
 9767     __ ret(lr);
 9768 
 9769     return start;
 9770   }
 9771 
 9772   void generate_base64_encode_simdround(Register src, Register dst,
 9773         FloatRegister codec, u8 size) {
 9774 
 9775     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9776     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9777     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9778 
 9779     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9780 
 9781     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
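    // Each of in0/in1/in2 now holds byte k (k = 0, 1, 2) of `size` consecutive
    // 3-byte groups (ld3 de-interleaves). The shift/or sequences below build
    // the four 6-bit codec indices per group without needing extra masks:
    //   ind0 = in0 >> 2                        (top 6 bits of byte 0)
    //   ind1 = (in0 & 0x03) << 4 | in1 >> 4    (low 2 of byte 0, top 4 of byte 1)
    //   ind2 = (in1 & 0x0f) << 2 | in2 >> 6    (low 4 of byte 1, top 2 of byte 2)
    //   ind3 = in2 & 0x3f                      (low 6 bits of byte 2)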
 9782 
 9783     __ ushr(ind0, arrangement, in0,  2);
 9784 
 9785     __ ushr(ind1, arrangement, in1,  2);
 9786     __ shl(in0,   arrangement, in0,  6);
 9787     __ orr(ind1,  arrangement, ind1, in0);
 9788     __ ushr(ind1, arrangement, ind1, 2);
 9789 
 9790     __ ushr(ind2, arrangement, in2,  4);
 9791     __ shl(in1,   arrangement, in1,  4);
 9792     __ orr(ind2,  arrangement, in1,  ind2);
 9793     __ ushr(ind2, arrangement, ind2, 2);
 9794 
 9795     __ shl(ind3,  arrangement, in2,  2);
 9796     __ ushr(ind3, arrangement, ind3, 2);
 9797 
 9798     __ tbl(out0,  arrangement, codec,  4, ind0);
 9799     __ tbl(out1,  arrangement, codec,  4, ind1);
 9800     __ tbl(out2,  arrangement, codec,  4, ind2);
 9801     __ tbl(out3,  arrangement, codec,  4, ind3);
 9802 
 9803     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9804   }
 9805 
 9806    /**
 9807    *  Arguments:
 9808    *
 9809    *  Input:
 9810    *  c_rarg0   - src_start
 9811    *  c_rarg1   - src_offset
 9812    *  c_rarg2   - src_length
 9813    *  c_rarg3   - dest_start
 9814    *  c_rarg4   - dest_offset
 9815    *  c_rarg5   - isURL
 9816    *
 9817    */
 9818   address generate_base64_encodeBlock() {
 9819 
 9820     static const char toBase64[64] = {
 9821       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9822       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9823       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9824       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9825       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9826     };
 9827 
 9828     static const char toBase64URL[64] = {
 9829       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9830       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9831       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9832       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9833       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9834     };
 9835 
 9836     __ align(CodeEntryAlignment);
 9837     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9838     StubCodeMark mark(this, stub_id);
 9839     address start = __ pc();
 9840 
 9841     Register src   = c_rarg0;  // source array
 9842     Register soff  = c_rarg1;  // source start offset
 9843     Register send  = c_rarg2;  // source end offset
 9844     Register dst   = c_rarg3;  // dest array
 9845     Register doff  = c_rarg4;  // position for writing to dest array
 9846     Register isURL = c_rarg5;  // Base64 or URL character set
 9847 
 9848     // c_rarg6 and c_rarg7 are free to use as temps
 9849     Register codec  = c_rarg6;
 9850     Register length = c_rarg7;
 9851 
 9852     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9853 
 9854     __ add(src, src, soff);
 9855     __ add(dst, dst, doff);
 9856     __ sub(length, send, soff);
 9857 
 9858     // load the codec base address
 9859     __ lea(codec, ExternalAddress((address) toBase64));
 9860     __ cbz(isURL, ProcessData);
 9861     __ lea(codec, ExternalAddress((address) toBase64URL));
 9862 
 9863     __ BIND(ProcessData);
 9864 
    // too short to form a SIMD loop; fall back to the scalar Process3B path
 9866     __ cmp(length, (u1)24);
 9867     __ br(Assembler::LT, Process3B);
 9868 
 9869     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9870 
 9871     __ BIND(Process48B);
 9872     __ cmp(length, (u1)48);
 9873     __ br(Assembler::LT, Process24B);
 9874     generate_base64_encode_simdround(src, dst, v0, 16);
 9875     __ sub(length, length, 48);
 9876     __ b(Process48B);
 9877 
 9878     __ BIND(Process24B);
 9879     __ cmp(length, (u1)24);
 9880     __ br(Assembler::LT, SIMDExit);
 9881     generate_base64_encode_simdround(src, dst, v0, 8);
 9882     __ sub(length, length, 24);
 9883 
 9884     __ BIND(SIMDExit);
 9885     __ cbz(length, Exit);
 9886 
 9887     __ BIND(Process3B);
 9888     //  3 src bytes, 24 bits
 9889     __ ldrb(r10, __ post(src, 1));
 9890     __ ldrb(r11, __ post(src, 1));
 9891     __ ldrb(r12, __ post(src, 1));
 9892     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9893     __ orrw(r12, r12, r11, Assembler::LSL, 8);
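    // r12 now holds byte0 << 16 | byte1 << 8 | byte2; the extracts below pull
    // out the four 6-bit groups (bits 23:18, 17:12, 11:6 and 5:0) to index the codec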
 9894     // codec index
 9895     __ ubfmw(r15, r12, 18, 23);
 9896     __ ubfmw(r14, r12, 12, 17);
 9897     __ ubfmw(r13, r12, 6,  11);
 9898     __ andw(r12,  r12, 63);
 9899     // get the code based on the codec
 9900     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9901     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9902     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9903     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9904     __ strb(r15, __ post(dst, 1));
 9905     __ strb(r14, __ post(dst, 1));
 9906     __ strb(r13, __ post(dst, 1));
 9907     __ strb(r12, __ post(dst, 1));
 9908     __ sub(length, length, 3);
 9909     __ cbnz(length, Process3B);
 9910 
 9911     __ BIND(Exit);
 9912     __ ret(lr);
 9913 
 9914     return start;
 9915   }
 9916 
 9917   void generate_base64_decode_simdround(Register src, Register dst,
 9918         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9919 
 9920     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9921     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9922 
 9923     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9924     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9925 
 9926     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9927 
 9928     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9929 
 9930     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9931 
 9932     // we need unsigned saturating subtract, to make sure all input values
 9933     // in range [0, 63] will have 0U value in the higher half lookup
 9934     __ uqsubv(decH0, __ T16B, in0, v27);
 9935     __ uqsubv(decH1, __ T16B, in1, v27);
 9936     __ uqsubv(decH2, __ T16B, in2, v27);
 9937     __ uqsubv(decH3, __ T16B, in3, v27);
 9938 
 9939     // lower half lookup
 9940     __ tbl(decL0, arrangement, codecL, 4, in0);
 9941     __ tbl(decL1, arrangement, codecL, 4, in1);
 9942     __ tbl(decL2, arrangement, codecL, 4, in2);
 9943     __ tbl(decL3, arrangement, codecL, 4, in3);
 9944 
 9945     // higher half lookup
 9946     __ tbx(decH0, arrangement, codecH, 4, decH0);
 9947     __ tbx(decH1, arrangement, codecH, 4, decH1);
 9948     __ tbx(decH2, arrangement, codecH, 4, decH2);
 9949     __ tbx(decH3, arrangement, codecH, 4, decH3);
 9950 
 9951     // combine lower and higher
 9952     __ orr(decL0, arrangement, decL0, decH0);
 9953     __ orr(decL1, arrangement, decL1, decH1);
 9954     __ orr(decL2, arrangement, decL2, decH2);
 9955     __ orr(decL3, arrangement, decL3, decH3);
 9956 
 9957     // check illegal inputs, value larger than 63 (maximum of 6 bits)
 9958     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 9959     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 9960     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 9961     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 9962     __ orr(in0, arrangement, decH0, decH1);
 9963     __ orr(in1, arrangement, decH2, decH3);
 9964     __ orr(in2, arrangement, in0,   in1);
 9965     __ umaxv(in3, arrangement, in2);
 9966     __ umov(rscratch2, in3, __ B, 0);
 9967 
 9968     // get the data to output
 9969     __ shl(out0,  arrangement, decL0, 2);
 9970     __ ushr(out1, arrangement, decL1, 4);
 9971     __ orr(out0,  arrangement, out0,  out1);
 9972     __ shl(out1,  arrangement, decL1, 4);
 9973     __ ushr(out2, arrangement, decL2, 2);
 9974     __ orr(out1,  arrangement, out1,  out2);
 9975     __ shl(out2,  arrangement, decL2, 6);
 9976     __ orr(out2,  arrangement, out2,  decL3);
 9977 
 9978     __ cbz(rscratch2, NoIllegalData);
 9979 
 9980     // handle illegal input
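    // (sketch of the strategy) in2 holds one flag byte per 4-character group
    // (0x00 = group fully legal, 0xff = group contains an illegal byte) and
    // out0..out2 hold the three decoded bytes of each group. The StoreLegalData
    // loop below emits the 3 bytes of one group per iteration and shifts
    // r10..r13 right by 8, so it stops exactly at the first bad group and dst
    // ends up just past the last fully decoded group.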
 9981     __ umov(r10, in2, __ D, 0);
 9982     if (size == 16) {
 9983       __ cbnz(r10, ErrorInLowerHalf);
 9984 
 9985       // illegal input is in higher half, store the lower half now.
 9986       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 9987 
 9988       __ umov(r10, in2,  __ D, 1);
 9989       __ umov(r11, out0, __ D, 1);
 9990       __ umov(r12, out1, __ D, 1);
 9991       __ umov(r13, out2, __ D, 1);
 9992       __ b(StoreLegalData);
 9993 
 9994       __ BIND(ErrorInLowerHalf);
 9995     }
 9996     __ umov(r11, out0, __ D, 0);
 9997     __ umov(r12, out1, __ D, 0);
 9998     __ umov(r13, out2, __ D, 0);
 9999 
10000     __ BIND(StoreLegalData);
10001     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10002     __ strb(r11, __ post(dst, 1));
10003     __ strb(r12, __ post(dst, 1));
10004     __ strb(r13, __ post(dst, 1));
10005     __ lsr(r10, r10, 8);
10006     __ lsr(r11, r11, 8);
10007     __ lsr(r12, r12, 8);
10008     __ lsr(r13, r13, 8);
10009     __ b(StoreLegalData);
10010 
10011     __ BIND(NoIllegalData);
10012     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10013   }
10014 
10015 
10016    /**
10017    *  Arguments:
10018    *
10019    *  Input:
10020    *  c_rarg0   - src_start
10021    *  c_rarg1   - src_offset
10022    *  c_rarg2   - src_length
10023    *  c_rarg3   - dest_start
10024    *  c_rarg4   - dest_offset
10025    *  c_rarg5   - isURL
10026    *  c_rarg6   - isMIME
10027    *
10028    */
10029   address generate_base64_decodeBlock() {
10030 
10031     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10032     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10033     // titled "Base64 decoding".
10034 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while
    // fromBase(URL)64ForNoSIMD['='] = 255 here.
10038     static const uint8_t fromBase64ForNoSIMD[256] = {
10039       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10040       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10041       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10042        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10043       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10044        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10045       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10046        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10047       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10048       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10049       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10050       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10051       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10054       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055     };
10056 
10057     static const uint8_t fromBase64URLForNoSIMD[256] = {
10058       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10059       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10060       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10061        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10062       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10063        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10064       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10065        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10066       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10067       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10068       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10069       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10070       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10072       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10073       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10074     };
10075 
    // A legal value of a base64 code is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine their results to get the decoded data. The
    // 1st table vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices leave
    // the destination unchanged. Inputs [64..126] are mapped to table indices
    // [65, 127] in the second lookup. The value at table index 64 is set to 0, so
    // that values already decoded by the 1st lookup are not disturbed.
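    // Worked example: for input 'A' = 65 the first lookup is out of range and
    // yields 0, while the saturating 65 - 63 = 2 indexes the second table half,
    // reading entry 64 + 2 = 66, which is 0 -- the decoded value of 'A'. For
    // input '+' = 43 the first lookup already yields 62; the second index
    // saturates to 0 and entry 64 holds 0, so the OR leaves 62 intact.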
10083     static const uint8_t fromBase64ForSIMD[128] = {
10084       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10085       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10087        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10088         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10089        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10090       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10091        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10092     };
10093 
10094     static const uint8_t fromBase64URLForSIMD[128] = {
10095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10098        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10099         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10100        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10101        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10102        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10103     };
10104 
10105     __ align(CodeEntryAlignment);
10106     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10107     StubCodeMark mark(this, stub_id);
10108     address start = __ pc();
10109 
10110     Register src    = c_rarg0;  // source array
10111     Register soff   = c_rarg1;  // source start offset
10112     Register send   = c_rarg2;  // source end offset
10113     Register dst    = c_rarg3;  // dest array
10114     Register doff   = c_rarg4;  // position for writing to dest array
10115     Register isURL  = c_rarg5;  // Base64 or URL character set
10116     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10117 
10118     Register length = send;    // reuse send as length of source data to process
10119 
10120     Register simd_codec   = c_rarg6;
10121     Register nosimd_codec = c_rarg7;
10122 
10123     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10124 
10125     __ enter();
10126 
10127     __ add(src, src, soff);
10128     __ add(dst, dst, doff);
10129 
10130     __ mov(doff, dst);
10131 
10132     __ sub(length, send, soff);
10133     __ bfm(length, zr, 0, 1);
10134 
10135     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10136     __ cbz(isURL, ProcessData);
10137     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10138 
10139     __ BIND(ProcessData);
10140     __ mov(rscratch1, length);
10141     __ cmp(length, (u1)144); // 144 = 80 + 64
10142     __ br(Assembler::LT, Process4B);
10143 
10144     // In the MIME case, the line length cannot be more than 76
10145     // bytes (see RFC 2045). This is too short a block for SIMD
10146     // to be worthwhile, so we use non-SIMD here.
10147     __ movw(rscratch1, 79);
10148 
10149     __ BIND(Process4B);
10150     __ ldrw(r14, __ post(src, 4));
10151     __ ubfxw(r10, r14, 0,  8);
10152     __ ubfxw(r11, r14, 8,  8);
10153     __ ubfxw(r12, r14, 16, 8);
10154     __ ubfxw(r13, r14, 24, 8);
10155     // get the de-code
10156     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10157     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10158     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10159     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10160     // error detection, 255u indicates an illegal input
10161     __ orrw(r14, r10, r11);
10162     __ orrw(r15, r12, r13);
10163     __ orrw(r14, r14, r15);
10164     __ tbnz(r14, 7, Exit);
10165     // recover the data
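    // r10..r13 hold the four 6-bit values a, b, c, d of one group. The sequence
    // below assembles the three output bytes: a<<2 | b>>4 and (b & 0xf)<<4 | c>>2
    // packed into r14 (byte-swapped with rev16 so strh stores them in order),
    // and (c & 0x3)<<6 | d in r13.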
10166     __ lslw(r14, r10, 10);
10167     __ bfiw(r14, r11, 4, 6);
10168     __ bfmw(r14, r12, 2, 5);
10169     __ rev16w(r14, r14);
10170     __ bfiw(r13, r12, 6, 2);
10171     __ strh(r14, __ post(dst, 2));
10172     __ strb(r13, __ post(dst, 1));
10173     // non-simd loop
10174     __ subsw(rscratch1, rscratch1, 4);
10175     __ br(Assembler::GT, Process4B);
10176 
    // If we arrived here from the 80-byte pre-processing path (rscratch1 was
    // seeded with 79 above), rscratch1 == -1; otherwise, rscratch1 == 0.
10179     __ cbzw(rscratch1, Exit);
10180     __ sub(length, length, 80);
10181 
10182     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10183     __ cbz(isURL, SIMDEnter);
10184     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10185 
10186     __ BIND(SIMDEnter);
10187     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10188     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10189     __ mov(rscratch1, 63);
10190     __ dup(v27, __ T16B, rscratch1);
10191 
10192     __ BIND(Process64B);
10193     __ cmp(length, (u1)64);
10194     __ br(Assembler::LT, Process32B);
10195     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10196     __ sub(length, length, 64);
10197     __ b(Process64B);
10198 
10199     __ BIND(Process32B);
10200     __ cmp(length, (u1)32);
10201     __ br(Assembler::LT, SIMDExit);
10202     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10203     __ sub(length, length, 32);
10204     __ b(Process32B);
10205 
10206     __ BIND(SIMDExit);
10207     __ cbz(length, Exit);
10208     __ movw(rscratch1, length);
10209     __ b(Process4B);
10210 
10211     __ BIND(Exit);
10212     __ sub(c_rarg0, dst, doff);
10213 
10214     __ leave();
10215     __ ret(lr);
10216 
10217     return start;
10218   }
10219 
10220   // Support for spin waits.
10221   address generate_spin_wait() {
10222     __ align(CodeEntryAlignment);
10223     StubId stub_id = StubId::stubgen_spin_wait_id;
10224     StubCodeMark mark(this, stub_id);
10225     address start = __ pc();
10226 
10227     __ spin_wait();
10228     __ ret(lr);
10229 
10230     return start;
10231   }
10232 
10233   void generate_lookup_secondary_supers_table_stub() {
10234     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10235     StubCodeMark mark(this, stub_id);
10236 
10237     const Register
10238       r_super_klass  = r0,
10239       r_array_base   = r1,
10240       r_array_length = r2,
10241       r_array_index  = r3,
10242       r_sub_klass    = r4,
10243       r_bitmap       = rscratch2,
10244       result         = r5;
10245     const FloatRegister
10246       vtemp          = v0;
10247 
10248     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10249       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10250       Label L_success;
10251       __ enter();
10252       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10253                                              r_array_base, r_array_length, r_array_index,
10254                                              vtemp, result, slot,
10255                                              /*stub_is_near*/true);
10256       __ leave();
10257       __ ret(lr);
10258     }
10259   }
10260 
10261   // Slow path implementation for UseSecondarySupersTable.
10262   address generate_lookup_secondary_supers_table_slow_path_stub() {
10263     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10264     StubCodeMark mark(this, stub_id);
10265 
10266     address start = __ pc();
10267     const Register
10268       r_super_klass  = r0,        // argument
10269       r_array_base   = r1,        // argument
10270       temp1          = r2,        // temp
10271       r_array_index  = r3,        // argument
10272       r_bitmap       = rscratch2, // argument
10273       result         = r5;        // argument
10274 
10275     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10276     __ ret(lr);
10277 
10278     return start;
10279   }
10280 
10281 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10282 
10283   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
10284   //
10285   // If LSE is in use, generate LSE versions of all the stubs. The
10286   // non-LSE versions are in atomic_aarch64.S.
10287 
10288   // class AtomicStubMark records the entry point of a stub and the
10289   // stub pointer which will point to it. The stub pointer is set to
10290   // the entry point when ~AtomicStubMark() is called, which must be
10291   // after ICache::invalidate_range. This ensures safe publication of
10292   // the generated code.
10293   class AtomicStubMark {
10294     address _entry_point;
10295     aarch64_atomic_stub_t *_stub;
10296     MacroAssembler *_masm;
10297   public:
10298     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10299       _masm = masm;
10300       __ align(32);
10301       _entry_point = __ pc();
10302       _stub = stub;
10303     }
10304     ~AtomicStubMark() {
10305       *_stub = (aarch64_atomic_stub_t)_entry_point;
10306     }
10307   };
10308 
10309   // NB: For memory_order_conservative we need a trailing membar after
10310   // LSE atomic operations but not a leading membar.
10311   //
10312   // We don't need a leading membar because a clause in the Arm ARM
10313   // says:
10314   //
10315   //   Barrier-ordered-before
10316   //
10317   //   Barrier instructions order prior Memory effects before subsequent
10318   //   Memory effects generated by the same Observer. A read or a write
10319   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
10320   //   Observer if and only if RW1 appears in program order before RW 2
10321   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
10322   //   instruction with both Acquire and Release semantics.
10323   //
10324   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10325   // and Release semantics, therefore we don't need a leading
10326   // barrier. However, there is no corresponding Barrier-ordered-after
10327   // relationship, therefore we need a trailing membar to prevent a
10328   // later store or load from being reordered with the store in an
10329   // atomic instruction.
10330   //
10331   // This was checked by using the herd7 consistency model simulator
10332   // (http://diy.inria.fr/) with this test case:
10333   //
10334   // AArch64 LseCas
10335   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10336   // P0 | P1;
10337   // LDR W4, [X2] | MOV W3, #0;
10338   // DMB LD       | MOV W4, #1;
10339   // LDR W3, [X1] | CASAL W3, W4, [X1];
10340   //              | DMB ISH;
10341   //              | STR W4, [X2];
10342   // exists
10343   // (0:X3=0 /\ 0:X4=1)
10344   //
10345   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10346   // with the store to x in P1. Without the DMB in P1 this may happen.
10347   //
10348   // At the time of writing we don't know of any AArch64 hardware that
10349   // reorders stores in this way, but the Reference Manual permits it.
10350 
10351   void gen_cas_entry(Assembler::operand_size size,
10352                      atomic_memory_order order) {
10353     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10354       exchange_val = c_rarg2;
10355     bool acquire, release;
10356     switch (order) {
10357       case memory_order_relaxed:
10358         acquire = false;
10359         release = false;
10360         break;
10361       case memory_order_release:
10362         acquire = false;
10363         release = true;
10364         break;
10365       default:
10366         acquire = true;
10367         release = true;
10368         break;
10369     }
10370     __ mov(prev, compare_val);
10371     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10372     if (order == memory_order_conservative) {
10373       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10374     }
10375     if (size == Assembler::xword) {
10376       __ mov(r0, prev);
10377     } else {
10378       __ movw(r0, prev);
10379     }
10380     __ ret(lr);
10381   }
10382 
10383   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10384     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10385     // If not relaxed, then default to conservative.  Relaxed is the only
10386     // case we use enough to be worth specializing.
10387     if (order == memory_order_relaxed) {
10388       __ ldadd(size, incr, prev, addr);
10389     } else {
10390       __ ldaddal(size, incr, prev, addr);
10391       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10392     }
10393     if (size == Assembler::xword) {
10394       __ mov(r0, prev);
10395     } else {
10396       __ movw(r0, prev);
10397     }
10398     __ ret(lr);
10399   }
10400 
10401   void gen_swpal_entry(Assembler::operand_size size) {
10402     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10403     __ swpal(size, incr, prev, addr);
10404     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10405     if (size == Assembler::xword) {
10406       __ mov(r0, prev);
10407     } else {
10408       __ movw(r0, prev);
10409     }
10410     __ ret(lr);
10411   }
10412 
10413   void generate_atomic_entry_points() {
10414     if (! UseLSE) {
10415       return;
10416     }
10417     __ align(CodeEntryAlignment);
10418     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10419     StubCodeMark mark(this, stub_id);
10420     address first_entry = __ pc();
10421 
10422     // ADD, memory_order_conservative
10423     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10424     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10425     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10426     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10427 
10428     // ADD, memory_order_relaxed
10429     AtomicStubMark mark_fetch_add_4_relaxed
10430       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10431     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10432     AtomicStubMark mark_fetch_add_8_relaxed
10433       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10434     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10435 
10436     // XCHG, memory_order_conservative
10437     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10438     gen_swpal_entry(Assembler::word);
10439     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10440     gen_swpal_entry(Assembler::xword);
10441 
10442     // CAS, memory_order_conservative
10443     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10444     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10445     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10446     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10447     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10448     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10449 
10450     // CAS, memory_order_relaxed
10451     AtomicStubMark mark_cmpxchg_1_relaxed
10452       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10453     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10454     AtomicStubMark mark_cmpxchg_4_relaxed
10455       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10456     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10457     AtomicStubMark mark_cmpxchg_8_relaxed
10458       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10459     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10460 
10461     AtomicStubMark mark_cmpxchg_4_release
10462       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10463     gen_cas_entry(MacroAssembler::word, memory_order_release);
10464     AtomicStubMark mark_cmpxchg_8_release
10465       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10466     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10467 
10468     AtomicStubMark mark_cmpxchg_4_seq_cst
10469       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10470     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10471     AtomicStubMark mark_cmpxchg_8_seq_cst
10472       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10473     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10474 
10475     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10476   }
10477 #endif // LINUX
10478 
10479   address generate_cont_thaw(Continuation::thaw_kind kind) {
10480     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10481     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10482 
10483     address start = __ pc();
10484 
10485     if (return_barrier) {
10486       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10487       __ mov(sp, rscratch1);
10488     }
10489     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10490 
10491     if (return_barrier) {
10492       // preserve possible return value from a method returning to the return barrier
10493       __ fmovd(rscratch1, v0);
10494       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10495     }
10496 
10497     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10498     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10499     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10500 
10501     if (return_barrier) {
10502       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10503       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10504       __ fmovd(v0, rscratch1);
10505     }
10506     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10507 
10508 
10509     Label thaw_success;
10510     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10511     __ cbnz(rscratch2, thaw_success);
10512     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10513     __ br(rscratch1);
10514     __ bind(thaw_success);
10515 
10516     // make room for the thawed frames
10517     __ sub(rscratch1, sp, rscratch2);
10518     __ andr(rscratch1, rscratch1, -16); // align
10519     __ mov(sp, rscratch1);
10520 
10521     if (return_barrier) {
10522       // save original return value -- again
10523       __ fmovd(rscratch1, v0);
10524       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10525     }
10526 
10527     // If we want, we can templatize thaw by kind, and have three different entries
10528     __ movw(c_rarg1, (uint32_t)kind);
10529 
10530     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10531     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10532 
10533     if (return_barrier) {
10534       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10535       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10536       __ fmovd(v0, rscratch1);
10537     } else {
10538       __ mov(r0, zr); // return 0 (success) from doYield
10539     }
10540 
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10542     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10543     __ mov(rfp, sp);
10544 
10545     if (return_barrier_exception) {
10546       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10547       __ authenticate_return_address(c_rarg1);
10548       __ verify_oop(r0);
10549       // save return value containing the exception oop in callee-saved R19
10550       __ mov(r19, r0);
10551 
10552       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10553 
10554       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10555       // __ reinitialize_ptrue();
10556 
10557       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10558 
10559       __ mov(r1, r0); // the exception handler
10560       __ mov(r0, r19); // restore return value containing the exception oop
10561       __ verify_oop(r0);
10562 
10563       __ leave();
10564       __ mov(r3, lr);
10565       __ br(r1); // the exception handler
10566     } else {
10567       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10568       __ leave();
10569       __ ret(lr);
10570     }
10571 
10572     return start;
10573   }
10574 
10575   address generate_cont_thaw() {
10576     if (!Continuations::enabled()) return nullptr;
10577 
10578     StubId stub_id = StubId::stubgen_cont_thaw_id;
10579     StubCodeMark mark(this, stub_id);
10580     address start = __ pc();
10581     generate_cont_thaw(Continuation::thaw_top);
10582     return start;
10583   }
10584 
10585   address generate_cont_returnBarrier() {
10586     if (!Continuations::enabled()) return nullptr;
10587 
10588     // TODO: will probably need multiple return barriers depending on return type
10589     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10590     StubCodeMark mark(this, stub_id);
10591     address start = __ pc();
10592 
10593     generate_cont_thaw(Continuation::thaw_return_barrier);
10594 
10595     return start;
10596   }
10597 
10598   address generate_cont_returnBarrier_exception() {
10599     if (!Continuations::enabled()) return nullptr;
10600 
10601     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10602     StubCodeMark mark(this, stub_id);
10603     address start = __ pc();
10604 
10605     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10606 
10607     return start;
10608   }
10609 
10610   address generate_cont_preempt_stub() {
10611     if (!Continuations::enabled()) return nullptr;
10612     StubId stub_id = StubId::stubgen_cont_preempt_id;
10613     StubCodeMark mark(this, stub_id);
10614     address start = __ pc();
10615 
10616     __ reset_last_Java_frame(true);
10617 
10618     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10619     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10620     __ mov(sp, rscratch2);
10621 
10622     Label preemption_cancelled;
10623     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10624     __ cbnz(rscratch1, preemption_cancelled);
10625 
10626     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10627     SharedRuntime::continuation_enter_cleanup(_masm);
10628     __ leave();
10629     __ ret(lr);
10630 
10631     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10632     __ bind(preemption_cancelled);
10633     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10634     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10635     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10636     __ ldr(rscratch1, Address(rscratch1));
10637     __ br(rscratch1);
10638 
10639     return start;
10640   }
10641 
10642   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10643   // are represented as long[5], with BITS_PER_LIMB = 26.
10644   // Pack five 26-bit limbs into three 64-bit registers.
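  // Resulting layout (limbs l0..l4, 26 bits each):
  //   dest0 = l0 | l1 << 26 | (l2 & 0xfff) << 52
  //   dest1 = l2 >> 12 | l3 << 14 | (l4 & 0xffffff) << 40
  //   dest2 = l4 >> 24               (top 2 bits of the 130-bit value)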
10645   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10646     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10647     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10648     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10649     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10650 
10651     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10652     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10653     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10654     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10655 
10656     if (dest2->is_valid()) {
10657       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10658     } else {
10659 #ifdef ASSERT
10660       Label OK;
10661       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10662       __ br(__ EQ, OK);
10663       __ stop("high bits of Poly1305 integer should be zero");
10664       __ should_not_reach_here();
10665       __ bind(OK);
10666 #endif
10667     }
10668   }
10669 
10670   // As above, but return only a 128-bit integer, packed into two
10671   // 64-bit registers.
10672   void pack_26(Register dest0, Register dest1, Register src) {
10673     pack_26(dest0, dest1, noreg, src);
10674   }
10675 
10676   // Multiply and multiply-accumulate unsigned 64-bit registers.
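  // wide_mul forms the full 128-bit product (mul gives the low 64 bits, umulh
  // the high 64); wide_madd accumulates such a product into sum_hi:sum_lo with
  // carry. There is no carry out of sum_hi -- callers rely on the operands
  // being small enough (see the key-clamping note in the Poly1305 loop below).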
10677   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10678     __ mul(prod_lo, n, m);
10679     __ umulh(prod_hi, n, m);
10680   }
10681   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10682     wide_mul(rscratch1, rscratch2, n, m);
10683     __ adds(sum_lo, sum_lo, rscratch1);
10684     __ adc(sum_hi, sum_hi, rscratch2);
10685   }
10686 
10687   // Poly1305, RFC 7539
10688 
10689   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10690   // description of the tricks used to simplify and accelerate this
10691   // computation.
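  //
  // Rough sketch of the loop below: the 130-bit accumulator U and the 128-bit
  // key R are kept in 64-bit limbs, and for each 16-byte block B the code
  // computes U = ((U + B + 2^128) * R) mod (2^130 - 5). The precomputed
  // RR_n = (R_n >> 2) * 5 values fold partial products that land at or above
  // 2^130 back into the low limbs, using 2^130 == 5 (mod 2^130 - 5); the low
  // two bits of R_0 are handled separately via the andr(U_2, R_0, 3) term.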
10692 
10693   address generate_poly1305_processBlocks() {
10694     __ align(CodeEntryAlignment);
10695     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10696     StubCodeMark mark(this, stub_id);
10697     address start = __ pc();
10698     Label here;
10699     __ enter();
10700     RegSet callee_saved = RegSet::range(r19, r28);
10701     __ push(callee_saved, sp);
10702 
10703     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10704 
10705     // Arguments
10706     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10707 
10708     // R_n is the 128-bit randomly-generated key, packed into two
10709     // registers.  The caller passes this key to us as long[5], with
10710     // BITS_PER_LIMB = 26.
10711     const Register R_0 = *++regs, R_1 = *++regs;
10712     pack_26(R_0, R_1, r_start);
10713 
10714     // RR_n is (R_n >> 2) * 5
10715     const Register RR_0 = *++regs, RR_1 = *++regs;
10716     __ lsr(RR_0, R_0, 2);
10717     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10718     __ lsr(RR_1, R_1, 2);
10719     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10720 
10721     // U_n is the current checksum
10722     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10723     pack_26(U_0, U_1, U_2, acc_start);
10724 
10725     static constexpr int BLOCK_LENGTH = 16;
10726     Label DONE, LOOP;
10727 
10728     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10729     __ br(Assembler::LT, DONE); {
10730       __ bind(LOOP);
10731 
10732       // S_n is to be the sum of U_n and the next block of data
10733       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10734       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10735       __ adds(S_0, U_0, S_0);
10736       __ adcs(S_1, U_1, S_1);
10737       __ adc(S_2, U_2, zr);
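      // set the 2^128 bit: every full 16-byte block is processed as block + 2^128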
10738       __ add(S_2, S_2, 1);
10739 
10740       const Register U_0HI = *++regs, U_1HI = *++regs;
10741 
10742       // NB: this logic depends on some of the special properties of
10743       // Poly1305 keys. In particular, because we know that the top
10744       // four bits of R_0 and R_1 are zero, we can add together
10745       // partial products without any risk of needing to propagate a
10746       // carry out.
10747       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10748       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10749       __ andr(U_2, R_0, 3);
10750       __ mul(U_2, S_2, U_2);
10751 
10752       // Recycle registers S_0, S_1, S_2
10753       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10754 
10755       // Partial reduction mod 2**130 - 5
10756       __ adds(U_1, U_0HI, U_1);
10757       __ adc(U_2, U_1HI, U_2);
10758       // Sum now in U_2:U_1:U_0.
10759       // Dead: U_0HI, U_1HI.
10760       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10761 
10762       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
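      // (The folding is valid because 2^130 == 5 (mod 2^130 - 5), so
      // the bits of U at and above bit 130 -- that is, U_2 >> 2 -- can
      // be multiplied by 5 and added back into the low bits.)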
10763 
10764       // First, U_2:U_1:U_0 += (U_2 >> 2)
10765       __ lsr(rscratch1, U_2, 2);
10766       __ andr(U_2, U_2, (u8)3);
10767       __ adds(U_0, U_0, rscratch1);
10768       __ adcs(U_1, U_1, zr);
10769       __ adc(U_2, U_2, zr);
10770       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10771       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10772       __ adcs(U_1, U_1, zr);
10773       __ adc(U_2, U_2, zr);
10774 
10775       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10776       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10777       __ br(~ Assembler::LT, LOOP);
10778     }
10779 
10780     // Further reduce modulo 2^130 - 5
10781     __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
10784     __ adcs(U_1, U_1, zr);
10785     __ andr(U_2, U_2, (u1)3);
10786     __ adc(U_2, U_2, zr);
10787 
10788     // Unpack the sum into five 26-bit limbs and write to memory.
10789     __ ubfiz(rscratch1, U_0, 0, 26);
10790     __ ubfx(rscratch2, U_0, 26, 26);
10791     __ stp(rscratch1, rscratch2, Address(acc_start));
10792     __ ubfx(rscratch1, U_0, 52, 12);
10793     __ bfi(rscratch1, U_1, 12, 14);
10794     __ ubfx(rscratch2, U_1, 14, 26);
10795     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10796     __ ubfx(rscratch1, U_1, 40, 24);
10797     __ bfi(rscratch1, U_2, 24, 3);
10798     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
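    // That is, acc[0..4] = { U[25:0], U[51:26], U[77:52], U[103:78], U[129:104] },
    // where U denotes the 130-bit value U_2:U_1:U_0.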
10799 
10800     __ bind(DONE);
10801     __ pop(callee_saved, sp);
10802     __ leave();
10803     __ ret(lr);
10804 
10805     return start;
10806   }
10807 
10808   // exception handler for upcall stubs
10809   address generate_upcall_stub_exception_handler() {
10810     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10811     StubCodeMark mark(this, stub_id);
10812     address start = __ pc();
10813 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10816     __ verify_oop(r0);
10817     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10818     __ blr(rscratch1);
10819     __ should_not_reach_here();
10820 
10821     return start;
10822   }
10823 
10824   // load Method* target of MethodHandle
10825   // j_rarg0 = jobject receiver
10826   // rmethod = result
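  //
  // After resolving the jobject receiver, the chain of loads below
  // amounts to (Java-ish pseudocode, illustrative only):
  //   rmethod = receiver.form.vmentry.method.vmtarget;   // a Method*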
10827   address generate_upcall_stub_load_target() {
10828     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10829     StubCodeMark mark(this, stub_id);
10830     address start = __ pc();
10831 
10832     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
10834     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10835     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10836     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10837     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10838                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10839                       noreg, noreg);
10840     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10841 
10842     __ ret(lr);
10843 
10844     return start;
10845   }
10846 
10847 #undef __
10848 #define __ masm->
10849 
10850   class MontgomeryMultiplyGenerator : public MacroAssembler {
10851 
10852     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10853       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10854 
10855     RegSet _toSave;
10856     bool _squaring;
10857 
10858   public:
10859     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10860       : MacroAssembler(as->code()), _squaring(squaring) {
10861 
10862       // Register allocation
10863 
10864       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10865       Pa_base = *regs;       // Argument registers
10866       if (squaring)
10867         Pb_base = Pa_base;
10868       else
10869         Pb_base = *++regs;
10870       Pn_base = *++regs;
10871       Rlen= *++regs;
10872       inv = *++regs;
10873       Pm_base = *++regs;
10874 
10875                           // Working registers:
10876       Ra =  *++regs;        // The current digit of a, b, n, and m.
10877       Rb =  *++regs;
10878       Rm =  *++regs;
10879       Rn =  *++regs;
10880 
10881       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10882       Pb =  *++regs;
10883       Pm =  *++regs;
10884       Pn =  *++regs;
10885 
10886       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
10888       t2 =  *++regs;
10889 
10890       Ri =  *++regs;        // Inner and outer loop indexes.
10891       Rj =  *++regs;
10892 
10893       Rhi_ab = *++regs;     // Product registers: low and high parts
10894       Rlo_ab = *++regs;     // of a*b and m*n.
10895       Rhi_mn = *++regs;
10896       Rlo_mn = *++regs;
10897 
10898       // r19 and up are callee-saved.
10899       _toSave = RegSet::range(r19, *regs) + Pm_base;
10900     }
10901 
10902   private:
10903     void save_regs() {
10904       push(_toSave, sp);
10905     }
10906 
10907     void restore_regs() {
10908       pop(_toSave, sp);
10909     }
10910 
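    // The two unroll_2 variants below invoke the member function
    // 'block' exactly 'count' times, unrolled two-way; an odd count is
    // handled by entering the unrolled body at its midpoint ('odd') on
    // the first pass.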
10911     template <typename T>
10912     void unroll_2(Register count, T block) {
10913       Label loop, end, odd;
10914       tbnz(count, 0, odd);
10915       cbz(count, end);
10916       align(16);
10917       bind(loop);
10918       (this->*block)();
10919       bind(odd);
10920       (this->*block)();
10921       subs(count, count, 2);
10922       br(Assembler::GT, loop);
10923       bind(end);
10924     }
10925 
10926     template <typename T>
10927     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10928       Label loop, end, odd;
10929       tbnz(count, 0, odd);
10930       cbz(count, end);
10931       align(16);
10932       bind(loop);
10933       (this->*block)(d, s, tmp);
10934       bind(odd);
10935       (this->*block)(d, s, tmp);
10936       subs(count, count, 2);
10937       br(Assembler::GT, loop);
10938       bind(end);
10939     }
10940 
10941     void pre1(RegisterOrConstant i) {
10942       block_comment("pre1");
10943       // Pa = Pa_base;
10944       // Pb = Pb_base + i;
10945       // Pm = Pm_base;
10946       // Pn = Pn_base + i;
10947       // Ra = *Pa;
10948       // Rb = *Pb;
10949       // Rm = *Pm;
10950       // Rn = *Pn;
10951       ldr(Ra, Address(Pa_base));
10952       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10953       ldr(Rm, Address(Pm_base));
10954       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10955       lea(Pa, Address(Pa_base));
10956       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10957       lea(Pm, Address(Pm_base));
10958       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10959 
10960       // Zero the m*n result.
10961       mov(Rhi_mn, zr);
10962       mov(Rlo_mn, zr);
10963     }
10964 
10965     // The core multiply-accumulate step of a Montgomery
10966     // multiplication.  The idea is to schedule operations as a
10967     // pipeline so that instructions with long latencies (loads and
10968     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the
    // architecture the most, but out-of-order ones benefit as well.
10971     void step() {
10972       block_comment("step");
10973       // MACC(Ra, Rb, t0, t1, t2);
10974       // Ra = *++Pa;
10975       // Rb = *--Pb;
10976       umulh(Rhi_ab, Ra, Rb);
10977       mul(Rlo_ab, Ra, Rb);
10978       ldr(Ra, pre(Pa, wordSize));
10979       ldr(Rb, pre(Pb, -wordSize));
10980       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10981                                        // previous iteration.
10982       // MACC(Rm, Rn, t0, t1, t2);
10983       // Rm = *++Pm;
10984       // Rn = *--Pn;
10985       umulh(Rhi_mn, Rm, Rn);
10986       mul(Rlo_mn, Rm, Rn);
10987       ldr(Rm, pre(Pm, wordSize));
10988       ldr(Rn, pre(Pn, -wordSize));
10989       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10990     }
10991 
10992     void post1() {
10993       block_comment("post1");
10994 
10995       // MACC(Ra, Rb, t0, t1, t2);
10996       // Ra = *++Pa;
10997       // Rb = *--Pb;
10998       umulh(Rhi_ab, Ra, Rb);
10999       mul(Rlo_ab, Ra, Rb);
11000       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11001       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11002 
11003       // *Pm = Rm = t0 * inv;
11004       mul(Rm, t0, inv);
11005       str(Rm, Address(Pm));
11006 
11007       // MACC(Rm, Rn, t0, t1, t2);
11008       // t0 = t1; t1 = t2; t2 = 0;
11009       umulh(Rhi_mn, Rm, Rn);
11010 
11011 #ifndef PRODUCT
11012       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11013       {
11014         mul(Rlo_mn, Rm, Rn);
11015         add(Rlo_mn, t0, Rlo_mn);
11016         Label ok;
11017         cbz(Rlo_mn, ok); {
11018           stop("broken Montgomery multiply");
11019         } bind(ok);
11020       }
11021 #endif
11022       // We have very carefully set things up so that
11023       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11024       // the lower half of Rm * Rn because we know the result already:
11025       // it must be -t0.  t0 + (-t0) must generate a carry iff
11026       // t0 != 0.  So, rather than do a mul and an adds we just set
11027       // the carry flag iff t0 is nonzero.
11028       //
11029       // mul(Rlo_mn, Rm, Rn);
11030       // adds(zr, t0, Rlo_mn);
11031       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11032       adcs(t0, t1, Rhi_mn);
11033       adc(t1, t2, zr);
11034       mov(t2, zr);
11035     }
11036 
11037     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11038       block_comment("pre2");
11039       // Pa = Pa_base + i-len;
11040       // Pb = Pb_base + len;
11041       // Pm = Pm_base + i-len;
11042       // Pn = Pn_base + len;
11043 
11044       if (i.is_register()) {
11045         sub(Rj, i.as_register(), len);
11046       } else {
11047         mov(Rj, i.as_constant());
11048         sub(Rj, Rj, len);
11049       }
11050       // Rj == i-len
11051 
11052       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11053       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11054       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11055       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11056 
11057       // Ra = *++Pa;
11058       // Rb = *--Pb;
11059       // Rm = *++Pm;
11060       // Rn = *--Pn;
11061       ldr(Ra, pre(Pa, wordSize));
11062       ldr(Rb, pre(Pb, -wordSize));
11063       ldr(Rm, pre(Pm, wordSize));
11064       ldr(Rn, pre(Pn, -wordSize));
11065 
11066       mov(Rhi_mn, zr);
11067       mov(Rlo_mn, zr);
11068     }
11069 
11070     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11071       block_comment("post2");
11072       if (i.is_constant()) {
11073         mov(Rj, i.as_constant()-len.as_constant());
11074       } else {
11075         sub(Rj, i.as_register(), len);
11076       }
11077 
11078       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11079 
11080       // As soon as we know the least significant digit of our result,
11081       // store it.
11082       // Pm_base[i-len] = t0;
11083       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11084 
11085       // t0 = t1; t1 = t2; t2 = 0;
11086       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11087       adc(t1, t2, zr);
11088       mov(t2, zr);
11089     }
11090 
11091     // A carry in t0 after Montgomery multiplication means that we
11092     // should subtract multiples of n from our result in m.  We'll
11093     // keep doing that until there is no carry.
11094     void normalize(RegisterOrConstant len) {
11095       block_comment("normalize");
11096       // while (t0)
11097       //   t0 = sub(Pm_base, Pn_base, t0, len);
11098       Label loop, post, again;
11099       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11100       cbz(t0, post); {
11101         bind(again); {
11102           mov(i, zr);
11103           mov(cnt, len);
11104           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11105           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11106           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11107           align(16);
11108           bind(loop); {
11109             sbcs(Rm, Rm, Rn);
11110             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11111             add(i, i, 1);
11112             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11113             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11114             sub(cnt, cnt, 1);
11115           } cbnz(cnt, loop);
11116           sbc(t0, t0, zr);
11117         } cbnz(t0, again);
11118       } bind(post);
11119     }
11120 
11121     // Move memory at s to d, reversing words.
11122     //    Increments d to end of copied memory
11123     //    Destroys tmp1, tmp2
11124     //    Preserves len
11125     //    Leaves s pointing to the address which was in d at start
11126     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11127       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11128       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11129 
11130       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11131       mov(tmp1, len);
11132       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11133       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11134     }
11135     // where
11136     void reverse1(Register d, Register s, Register tmp) {
11137       ldr(tmp, pre(s, -wordSize));
11138       ror(tmp, tmp, 32);
11139       str(tmp, post(d, wordSize));
11140     }
11141 
11142     void step_squaring() {
11143       // An extra ACC
11144       step();
11145       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11146     }
11147 
11148     void last_squaring(RegisterOrConstant i) {
11149       Label dont;
11150       // if ((i & 1) == 0) {
11151       tbnz(i.as_register(), 0, dont); {
11152         // MACC(Ra, Rb, t0, t1, t2);
11153         // Ra = *++Pa;
11154         // Rb = *--Pb;
11155         umulh(Rhi_ab, Ra, Rb);
11156         mul(Rlo_ab, Ra, Rb);
11157         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11158       } bind(dont);
11159     }
11160 
11161     void extra_step_squaring() {
11162       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11163 
11164       // MACC(Rm, Rn, t0, t1, t2);
11165       // Rm = *++Pm;
11166       // Rn = *--Pn;
11167       umulh(Rhi_mn, Rm, Rn);
11168       mul(Rlo_mn, Rm, Rn);
11169       ldr(Rm, pre(Pm, wordSize));
11170       ldr(Rn, pre(Pn, -wordSize));
11171     }
11172 
11173     void post1_squaring() {
11174       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11175 
11176       // *Pm = Rm = t0 * inv;
11177       mul(Rm, t0, inv);
11178       str(Rm, Address(Pm));
11179 
11180       // MACC(Rm, Rn, t0, t1, t2);
11181       // t0 = t1; t1 = t2; t2 = 0;
11182       umulh(Rhi_mn, Rm, Rn);
11183 
11184 #ifndef PRODUCT
11185       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11186       {
11187         mul(Rlo_mn, Rm, Rn);
11188         add(Rlo_mn, t0, Rlo_mn);
11189         Label ok;
11190         cbz(Rlo_mn, ok); {
11191           stop("broken Montgomery multiply");
11192         } bind(ok);
11193       }
11194 #endif
11195       // We have very carefully set things up so that
11196       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11197       // the lower half of Rm * Rn because we know the result already:
11198       // it must be -t0.  t0 + (-t0) must generate a carry iff
11199       // t0 != 0.  So, rather than do a mul and an adds we just set
11200       // the carry flag iff t0 is nonzero.
11201       //
11202       // mul(Rlo_mn, Rm, Rn);
11203       // adds(zr, t0, Rlo_mn);
11204       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11205       adcs(t0, t1, Rhi_mn);
11206       adc(t1, t2, zr);
11207       mov(t2, zr);
11208     }
11209 
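    // t2:t1:t0 += Rhi:Rlo -- the accumulate half of the MACC operation
    // used in the C pseudocode below.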
11210     void acc(Register Rhi, Register Rlo,
11211              Register t0, Register t1, Register t2) {
11212       adds(t0, t0, Rlo);
11213       adcs(t1, t1, Rhi);
11214       adc(t2, t2, zr);
11215     }
11216 
11217   public:
11218     /**
11219      * Fast Montgomery multiplication.  The derivation of the
11220      * algorithm is in A Cryptographic Library for the Motorola
11221      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11222      *
11223      * Arguments:
11224      *
11225      * Inputs for multiplication:
11226      *   c_rarg0   - int array elements a
11227      *   c_rarg1   - int array elements b
11228      *   c_rarg2   - int array elements n (the modulus)
11229      *   c_rarg3   - int length
11230      *   c_rarg4   - int inv
11231      *   c_rarg5   - int array elements m (the result)
11232      *
11233      * Inputs for squaring:
11234      *   c_rarg0   - int array elements a
11235      *   c_rarg1   - int array elements n (the modulus)
11236      *   c_rarg2   - int length
11237      *   c_rarg3   - int inv
11238      *   c_rarg4   - int array elements m (the result)
11239      *
11240      */
11241     address generate_multiply() {
11242       Label argh, nothing;
11243       bind(argh);
11244       stop("MontgomeryMultiply total_allocation must be <= 8192");
11245 
11246       align(CodeEntryAlignment);
11247       address entry = pc();
11248 
11249       cbzw(Rlen, nothing);
11250 
11251       enter();
11252 
11253       // Make room.
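      // Reserve space for reversed copies of the a, b and n inputs and
      // for the result m: four arrays of Rlen (32-bit) elements, i.e.
      // at most 4 * 512 * 4 = 8192 bytes, matching the check above.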
11254       cmpw(Rlen, 512);
11255       br(Assembler::HI, argh);
11256       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11257       andr(sp, Ra, -2 * wordSize);
11258 
11259       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11260 
11261       {
11262         // Copy input args, reversing as we go.  We use Ra as a
11263         // temporary variable.
11264         reverse(Ra, Pa_base, Rlen, t0, t1);
11265         if (!_squaring)
11266           reverse(Ra, Pb_base, Rlen, t0, t1);
11267         reverse(Ra, Pn_base, Rlen, t0, t1);
11268       }
11269 
      // Push all callee-saved registers and also Pm_base which we'll
      // need at the end.
11272       save_regs();
11273 
11274 #ifndef PRODUCT
11275       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11276       {
11277         ldr(Rn, Address(Pn_base, 0));
11278         mul(Rlo_mn, Rn, inv);
11279         subs(zr, Rlo_mn, -1);
11280         Label ok;
11281         br(EQ, ok); {
11282           stop("broken inverse in Montgomery multiply");
11283         } bind(ok);
11284       }
11285 #endif
11286 
11287       mov(Pm_base, Ra);
11288 
11289       mov(t0, zr);
11290       mov(t1, zr);
11291       mov(t2, zr);
11292 
11293       block_comment("for (int i = 0; i < len; i++) {");
11294       mov(Ri, zr); {
11295         Label loop, end;
11296         cmpw(Ri, Rlen);
11297         br(Assembler::GE, end);
11298 
11299         bind(loop);
11300         pre1(Ri);
11301 
11302         block_comment("  for (j = i; j; j--) {"); {
11303           movw(Rj, Ri);
11304           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11305         } block_comment("  } // j");
11306 
11307         post1();
11308         addw(Ri, Ri, 1);
11309         cmpw(Ri, Rlen);
11310         br(Assembler::LT, loop);
11311         bind(end);
11312         block_comment("} // i");
11313       }
11314 
11315       block_comment("for (int i = len; i < 2*len; i++) {");
11316       mov(Ri, Rlen); {
11317         Label loop, end;
11318         cmpw(Ri, Rlen, Assembler::LSL, 1);
11319         br(Assembler::GE, end);
11320 
11321         bind(loop);
11322         pre2(Ri, Rlen);
11323 
11324         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11325           lslw(Rj, Rlen, 1);
11326           subw(Rj, Rj, Ri);
11327           subw(Rj, Rj, 1);
11328           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11329         } block_comment("  } // j");
11330 
11331         post2(Ri, Rlen);
11332         addw(Ri, Ri, 1);
11333         cmpw(Ri, Rlen, Assembler::LSL, 1);
11334         br(Assembler::LT, loop);
11335         bind(end);
11336       }
11337       block_comment("} // i");
11338 
11339       normalize(Rlen);
11340 
11341       mov(Ra, Pm_base);  // Save Pm_base in Ra
11342       restore_regs();  // Restore caller's Pm_base
11343 
11344       // Copy our result into caller's Pm_base
11345       reverse(Pm_base, Ra, Rlen, t0, t1);
11346 
11347       leave();
11348       bind(nothing);
11349       ret(lr);
11350 
11351       return entry;
11352     }
11353     // In C, approximately:
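    // (Here MACC(A, B, t0, t1, t2) denotes the multiply-accumulate
    //  t2:t1:t0 += A * B on the triple-precision accumulator, and
    //  MACC2 accumulates the 128-bit product twice.)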
11354 
11355     // void
11356     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11357     //                     julong Pn_base[], julong Pm_base[],
11358     //                     julong inv, int len) {
11359     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11360     //   julong *Pa, *Pb, *Pn, *Pm;
11361     //   julong Ra, Rb, Rn, Rm;
11362 
11363     //   int i;
11364 
11365     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11366 
11367     //   for (i = 0; i < len; i++) {
11368     //     int j;
11369 
11370     //     Pa = Pa_base;
11371     //     Pb = Pb_base + i;
11372     //     Pm = Pm_base;
11373     //     Pn = Pn_base + i;
11374 
11375     //     Ra = *Pa;
11376     //     Rb = *Pb;
11377     //     Rm = *Pm;
11378     //     Rn = *Pn;
11379 
11380     //     int iters = i;
11381     //     for (j = 0; iters--; j++) {
11382     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11383     //       MACC(Ra, Rb, t0, t1, t2);
11384     //       Ra = *++Pa;
11385     //       Rb = *--Pb;
11386     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11387     //       MACC(Rm, Rn, t0, t1, t2);
11388     //       Rm = *++Pm;
11389     //       Rn = *--Pn;
11390     //     }
11391 
11392     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11393     //     MACC(Ra, Rb, t0, t1, t2);
11394     //     *Pm = Rm = t0 * inv;
11395     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11396     //     MACC(Rm, Rn, t0, t1, t2);
11397 
11398     //     assert(t0 == 0, "broken Montgomery multiply");
11399 
11400     //     t0 = t1; t1 = t2; t2 = 0;
11401     //   }
11402 
11403     //   for (i = len; i < 2*len; i++) {
11404     //     int j;
11405 
11406     //     Pa = Pa_base + i-len;
11407     //     Pb = Pb_base + len;
11408     //     Pm = Pm_base + i-len;
11409     //     Pn = Pn_base + len;
11410 
11411     //     Ra = *++Pa;
11412     //     Rb = *--Pb;
11413     //     Rm = *++Pm;
11414     //     Rn = *--Pn;
11415 
11416     //     int iters = len*2-i-1;
11417     //     for (j = i-len+1; iters--; j++) {
11418     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11419     //       MACC(Ra, Rb, t0, t1, t2);
11420     //       Ra = *++Pa;
11421     //       Rb = *--Pb;
11422     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11423     //       MACC(Rm, Rn, t0, t1, t2);
11424     //       Rm = *++Pm;
11425     //       Rn = *--Pn;
11426     //     }
11427 
11428     //     Pm_base[i-len] = t0;
11429     //     t0 = t1; t1 = t2; t2 = 0;
11430     //   }
11431 
11432     //   while (t0)
11433     //     t0 = sub(Pm_base, Pn_base, t0, len);
11434     // }
11435 
11436     /**
11437      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication, so it should be up to
11439      * 25% faster.  However, its loop control is more complex and it
11440      * may actually run slower on some machines.
11441      *
11442      * Arguments:
11443      *
11444      * Inputs:
11445      *   c_rarg0   - int array elements a
11446      *   c_rarg1   - int array elements n (the modulus)
11447      *   c_rarg2   - int length
11448      *   c_rarg3   - int inv
11449      *   c_rarg4   - int array elements m (the result)
11450      *
11451      */
11452     address generate_square() {
11453       Label argh;
11454       bind(argh);
11455       stop("MontgomeryMultiply total_allocation must be <= 8192");
11456 
11457       align(CodeEntryAlignment);
11458       address entry = pc();
11459 
11460       enter();
11461 
11462       // Make room.
11463       cmpw(Rlen, 512);
11464       br(Assembler::HI, argh);
11465       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11466       andr(sp, Ra, -2 * wordSize);
11467 
11468       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11469 
11470       {
11471         // Copy input args, reversing as we go.  We use Ra as a
11472         // temporary variable.
11473         reverse(Ra, Pa_base, Rlen, t0, t1);
11474         reverse(Ra, Pn_base, Rlen, t0, t1);
11475       }
11476 
      // Push all callee-saved registers and also Pm_base which we'll
      // need at the end.
11479       save_regs();
11480 
11481       mov(Pm_base, Ra);
11482 
11483       mov(t0, zr);
11484       mov(t1, zr);
11485       mov(t2, zr);
11486 
11487       block_comment("for (int i = 0; i < len; i++) {");
11488       mov(Ri, zr); {
11489         Label loop, end;
11490         bind(loop);
11491         cmp(Ri, Rlen);
11492         br(Assembler::GE, end);
11493 
11494         pre1(Ri);
11495 
        block_comment("  for (j = (i+1)/2; j; j--) {"); {
11497           add(Rj, Ri, 1);
11498           lsr(Rj, Rj, 1);
11499           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11500         } block_comment("  } // j");
11501 
11502         last_squaring(Ri);
11503 
11504         block_comment("  for (j = i/2; j; j--) {"); {
11505           lsr(Rj, Ri, 1);
11506           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11507         } block_comment("  } // j");
11508 
11509         post1_squaring();
11510         add(Ri, Ri, 1);
11511         cmp(Ri, Rlen);
11512         br(Assembler::LT, loop);
11513 
11514         bind(end);
11515         block_comment("} // i");
11516       }
11517 
11518       block_comment("for (int i = len; i < 2*len; i++) {");
11519       mov(Ri, Rlen); {
11520         Label loop, end;
11521         bind(loop);
11522         cmp(Ri, Rlen, Assembler::LSL, 1);
11523         br(Assembler::GE, end);
11524 
11525         pre2(Ri, Rlen);
11526 
11527         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11528           lsl(Rj, Rlen, 1);
11529           sub(Rj, Rj, Ri);
11530           sub(Rj, Rj, 1);
11531           lsr(Rj, Rj, 1);
11532           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11533         } block_comment("  } // j");
11534 
11535         last_squaring(Ri);
11536 
11537         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11538           lsl(Rj, Rlen, 1);
11539           sub(Rj, Rj, Ri);
11540           lsr(Rj, Rj, 1);
11541           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11542         } block_comment("  } // j");
11543 
11544         post2(Ri, Rlen);
11545         add(Ri, Ri, 1);
11546         cmp(Ri, Rlen, Assembler::LSL, 1);
11547 
11548         br(Assembler::LT, loop);
11549         bind(end);
11550         block_comment("} // i");
11551       }
11552 
11553       normalize(Rlen);
11554 
11555       mov(Ra, Pm_base);  // Save Pm_base in Ra
11556       restore_regs();  // Restore caller's Pm_base
11557 
11558       // Copy our result into caller's Pm_base
11559       reverse(Pm_base, Ra, Rlen, t0, t1);
11560 
11561       leave();
11562       ret(lr);
11563 
11564       return entry;
11565     }
11566     // In C, approximately:
11567 
11568     // void
11569     // montgomery_square(julong Pa_base[], julong Pn_base[],
11570     //                   julong Pm_base[], julong inv, int len) {
11571     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11572     //   julong *Pa, *Pb, *Pn, *Pm;
11573     //   julong Ra, Rb, Rn, Rm;
11574 
11575     //   int i;
11576 
11577     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11578 
11579     //   for (i = 0; i < len; i++) {
11580     //     int j;
11581 
11582     //     Pa = Pa_base;
11583     //     Pb = Pa_base + i;
11584     //     Pm = Pm_base;
11585     //     Pn = Pn_base + i;
11586 
11587     //     Ra = *Pa;
11588     //     Rb = *Pb;
11589     //     Rm = *Pm;
11590     //     Rn = *Pn;
11591 
11592     //     int iters = (i+1)/2;
11593     //     for (j = 0; iters--; j++) {
11594     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11595     //       MACC2(Ra, Rb, t0, t1, t2);
11596     //       Ra = *++Pa;
11597     //       Rb = *--Pb;
11598     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11599     //       MACC(Rm, Rn, t0, t1, t2);
11600     //       Rm = *++Pm;
11601     //       Rn = *--Pn;
11602     //     }
11603     //     if ((i & 1) == 0) {
11604     //       assert(Ra == Pa_base[j], "must be");
11605     //       MACC(Ra, Ra, t0, t1, t2);
11606     //     }
11607     //     iters = i/2;
11608     //     assert(iters == i-j, "must be");
11609     //     for (; iters--; j++) {
11610     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11611     //       MACC(Rm, Rn, t0, t1, t2);
11612     //       Rm = *++Pm;
11613     //       Rn = *--Pn;
11614     //     }
11615 
11616     //     *Pm = Rm = t0 * inv;
11617     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11618     //     MACC(Rm, Rn, t0, t1, t2);
11619 
11620     //     assert(t0 == 0, "broken Montgomery multiply");
11621 
11622     //     t0 = t1; t1 = t2; t2 = 0;
11623     //   }
11624 
11625     //   for (i = len; i < 2*len; i++) {
11626     //     int start = i-len+1;
11627     //     int end = start + (len - start)/2;
11628     //     int j;
11629 
11630     //     Pa = Pa_base + i-len;
11631     //     Pb = Pa_base + len;
11632     //     Pm = Pm_base + i-len;
11633     //     Pn = Pn_base + len;
11634 
11635     //     Ra = *++Pa;
11636     //     Rb = *--Pb;
11637     //     Rm = *++Pm;
11638     //     Rn = *--Pn;
11639 
11640     //     int iters = (2*len-i-1)/2;
11641     //     assert(iters == end-start, "must be");
11642     //     for (j = start; iters--; j++) {
11643     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11644     //       MACC2(Ra, Rb, t0, t1, t2);
11645     //       Ra = *++Pa;
11646     //       Rb = *--Pb;
11647     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11648     //       MACC(Rm, Rn, t0, t1, t2);
11649     //       Rm = *++Pm;
11650     //       Rn = *--Pn;
11651     //     }
11652     //     if ((i & 1) == 0) {
11653     //       assert(Ra == Pa_base[j], "must be");
11654     //       MACC(Ra, Ra, t0, t1, t2);
11655     //     }
11656     //     iters =  (2*len-i)/2;
11657     //     assert(iters == len-j, "must be");
11658     //     for (; iters--; j++) {
11659     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11660     //       MACC(Rm, Rn, t0, t1, t2);
11661     //       Rm = *++Pm;
11662     //       Rn = *--Pn;
11663     //     }
11664     //     Pm_base[i-len] = t0;
11665     //     t0 = t1; t1 = t2; t2 = 0;
11666     //   }
11667 
11668     //   while (t0)
11669     //     t0 = sub(Pm_base, Pn_base, t0, len);
11670     // }
11671   };
11672 
11673   // Initialization
11674   void generate_preuniverse_stubs() {
11675     // preuniverse stubs are not needed for aarch64
11676   }
11677 
11678   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points.
11680 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
11686 
11687     StubRoutines::_forward_exception_entry = generate_forward_exception();
11688 
11689     StubRoutines::_call_stub_entry =
11690       generate_call_stub(StubRoutines::_call_stub_return_address);
11691 
    // This entry is referenced by megamorphic calls.
11693     StubRoutines::_catch_exception_entry = generate_catch_exception();
11694 
11695     // Initialize table for copy memory (arraycopy) check.
11696     if (UnsafeMemoryAccess::_table == nullptr) {
11697       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11698     }
11699 
11700     if (UseCRC32Intrinsics) {
11701       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11702     }
11703 
11704     if (UseCRC32CIntrinsics) {
11705       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11706     }
11707 
11708     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11709       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11710     }
11711 
11712     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11713       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11714     }
11715 
11716     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11717         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11718       StubRoutines::_hf2f = generate_float16ToFloat();
11719       StubRoutines::_f2hf = generate_floatToFloat16();
11720     }
11721   }
11722 
11723   void generate_continuation_stubs() {
11724     // Continuation stubs:
11725     StubRoutines::_cont_thaw          = generate_cont_thaw();
11726     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11727     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11728     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11729   }
11730 
11731   void generate_final_stubs() {
11732     // support for verify_oop (must happen after universe_init)
11733     if (VerifyOops) {
11734       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11735     }
11736 
11737     // arraycopy stubs used by compilers
11738     generate_arraycopy_stubs();
11739 
11740     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11741 
11742     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11743 
11744     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11745     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11746 
11747 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11748 
11749     generate_atomic_entry_points();
11750 
11751 #endif // LINUX
11752 
11753 #ifdef COMPILER2
11754     if (UseSecondarySupersTable) {
11755       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11756       if (! InlineSecondarySupersTest) {
11757         generate_lookup_secondary_supers_table_stub();
11758       }
11759     }
11760 #endif
11761 
11762     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11763 
    StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
11765   }
11766 
11767   void generate_compiler_stubs() {
11768 #if COMPILER2_OR_JVMCI
11769 
11770     if (UseSVE == 0) {
11771       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11772     }
11773 
11774     // array equals stub for large arrays.
11775     if (!UseSimpleArrayEquals) {
11776       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11777     }
11778 
    // arrays_hashcode stubs for large arrays.
11780     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11781     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11782     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11783     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11784     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11785 
11786     // byte_array_inflate stub for large arrays.
11787     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11788 
11789     // countPositives stub for large arrays.
11790     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11791 
11792     generate_compare_long_strings();
11793 
11794     generate_string_indexof_stubs();
11795 
11796 #ifdef COMPILER2
11797     if (UseMultiplyToLenIntrinsic) {
11798       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11799     }
11800 
11801     if (UseSquareToLenIntrinsic) {
11802       StubRoutines::_squareToLen = generate_squareToLen();
11803     }
11804 
11805     if (UseMulAddIntrinsic) {
11806       StubRoutines::_mulAdd = generate_mulAdd();
11807     }
11808 
11809     if (UseSIMDForBigIntegerShiftIntrinsics) {
11810       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11811       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11812     }
11813 
11814     if (UseMontgomeryMultiplyIntrinsic) {
11815       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11816       StubCodeMark mark(this, stub_id);
11817       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11818       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11819     }
11820 
11821     if (UseMontgomerySquareIntrinsic) {
11822       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11823       StubCodeMark mark(this, stub_id);
11824       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11825       // We use generate_multiply() rather than generate_square()
11826       // because it's faster for the sizes of modulus we care about.
11827       StubRoutines::_montgomerySquare = g.generate_multiply();
11828     }
11829 
11830 #endif // COMPILER2
11831 
11832     if (UseChaCha20Intrinsics) {
11833       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11834     }
11835 
11836     if (UseKyberIntrinsics) {
11837       StubRoutines::_kyberNtt = generate_kyberNtt();
11838       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11839       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11840       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11841       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11842       StubRoutines::_kyber12To16 = generate_kyber12To16();
11843       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11844     }
11845 
11846     if (UseDilithiumIntrinsics) {
11847       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11848       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11849       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11850       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11851       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11852     }
11853 
11854     if (UseBASE64Intrinsics) {
11855         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11856         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11857     }
11858 
11859     // data cache line writeback
11860     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11861     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11862 
11863     if (UseAESIntrinsics) {
11864       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11865       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11866       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11867       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11868       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11869     }
11870     if (UseGHASHIntrinsics) {
11871       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11872       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11873     }
11874     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11875       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11876     }
11877 
11878     if (UseMD5Intrinsics) {
11879       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11880       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11881     }
11882     if (UseSHA1Intrinsics) {
11883       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11884       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11885     }
11886     if (UseSHA256Intrinsics) {
11887       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11888       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11889     }
11890     if (UseSHA512Intrinsics) {
11891       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11892       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11893     }
11894     if (UseSHA3Intrinsics) {
11895 
11896       StubRoutines::_double_keccak         = generate_double_keccak();
11897       if (UseSIMDForSHA3Intrinsic) {
11898          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11899          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11900       } else {
11901          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11902          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11903       }
11904     }
11905 
11906     if (UsePoly1305Intrinsics) {
11907       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11908     }
11909 
11910     // generate Adler32 intrinsics code
11911     if (UseAdler32Intrinsics) {
11912       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11913     }
11914 
11915 #endif // COMPILER2_OR_JVMCI
11916   }
11917 
11918  public:
11919   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11920     switch(blob_id) {
11921     case BlobId::stubgen_preuniverse_id:
11922       generate_preuniverse_stubs();
11923       break;
11924     case BlobId::stubgen_initial_id:
11925       generate_initial_stubs();
11926       break;
    case BlobId::stubgen_continuation_id:
11928       generate_continuation_stubs();
11929       break;
11930     case BlobId::stubgen_compiler_id:
11931       generate_compiler_stubs();
11932       break;
11933     case BlobId::stubgen_final_id:
11934       generate_final_stubs();
11935       break;
11936     default:
11937       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11938       break;
11939     };
11940   }
11941 }; // end class declaration
11942 
11943 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11944   StubGenerator g(code, blob_id);
11945 }
11946 
11947 
11948 #if defined (LINUX)
11949 
11950 // Define pointers to atomic stubs and initialize them to point to the
11951 // code in atomic_aarch64.S.
11952 
11953 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
11954   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11955     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
11956   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11957     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
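
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to a
// declaration of aarch64_atomic_fetch_add_4_default_impl (defined in
// atomic_aarch64.S) and a definition of the function pointer
// aarch64_atomic_fetch_add_4_impl, initialized to point at that
// default implementation; generate_atomic_entry_points() may later
// repoint it at generated stub code.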
11958 
11959 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11960 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11961 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11962 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11963 DEFAULT_ATOMIC_OP(xchg, 4, )
11964 DEFAULT_ATOMIC_OP(xchg, 8, )
11965 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11966 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11967 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11968 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11969 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11970 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11971 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11972 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11973 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11974 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11975 
11976 #undef DEFAULT_ATOMIC_OP
11977 
11978 #endif // LINUX