1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "code/aotCodeCache.hpp"
   31 #include "compiler/oopMap.hpp"
   32 #include "gc/shared/barrierSet.hpp"
   33 #include "gc/shared/barrierSetAssembler.hpp"
   34 #include "gc/shared/gc_globals.hpp"
   35 #include "gc/shared/tlab_globals.hpp"
   36 #include "interpreter/interpreter.hpp"
   37 #include "memory/universe.hpp"
   38 #include "nativeInst_aarch64.hpp"
   39 #include "oops/instanceOop.hpp"
   40 #include "oops/method.hpp"
   41 #include "oops/objArrayKlass.hpp"
   42 #include "oops/oop.inline.hpp"
   43 #include "prims/methodHandles.hpp"
   44 #include "prims/upcallLinker.hpp"
   45 #include "runtime/arguments.hpp"
   46 #include "runtime/atomicAccess.hpp"
   47 #include "runtime/continuation.hpp"
   48 #include "runtime/continuationEntry.inline.hpp"
   49 #include "runtime/frame.inline.hpp"
   50 #include "runtime/handles.inline.hpp"
   51 #include "runtime/javaThread.hpp"
   52 #include "runtime/sharedRuntime.hpp"
   53 #include "runtime/stubCodeGenerator.hpp"
   54 #include "runtime/stubRoutines.hpp"
   55 #include "utilities/align.hpp"
   56 #include "utilities/checkedCast.hpp"
   57 #include "utilities/debug.hpp"
   58 #include "utilities/globalDefinitions.hpp"
   59 #include "utilities/intpow.hpp"
   60 #include "utilities/powerOfTwo.hpp"
   61 #ifdef COMPILER2
   62 #include "opto/runtime.hpp"
   63 #endif
   64 #if INCLUDE_ZGC
   65 #include "gc/z/zThreadLocalData.hpp"
   66 #endif
   67 
   68 // Declaration and definition of StubGenerator (no .hpp file).
   69 // For a more detailed description of the stub routine structure
   70 // see the comment in stubRoutines.hpp
   71 
   72 #undef __
   73 #define __ _masm->
   74 
   75 #ifdef PRODUCT
   76 #define BLOCK_COMMENT(str) /* nothing */
   77 #else
   78 #define BLOCK_COMMENT(str) __ block_comment(str)
   79 #endif
   80 
   81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   82 
   83 // Stub Code definitions
   84 
   85 class StubGenerator: public StubCodeGenerator {
   86  private:
   87 
   88 #ifdef PRODUCT
   89 #define inc_counter_np(counter) ((void)0)
   90 #else
   91   void inc_counter_np_(uint& counter) {
   92     __ incrementw(ExternalAddress((address)&counter));
   93   }
   94 #define inc_counter_np(counter) \
   95   BLOCK_COMMENT("inc_counter " #counter); \
   96   inc_counter_np_(counter);
   97 #endif
   98 
   99   // Call stubs are used to call Java from C
  100   //
  101   // Arguments:
  102   //    c_rarg0:   call wrapper address                   address
  103   //    c_rarg1:   result                                 address
  104   //    c_rarg2:   result type                            BasicType
  105   //    c_rarg3:   method                                 Method*
  106   //    c_rarg4:   (interpreter) entry point              address
  107   //    c_rarg5:   parameters                             intptr_t*
  108   //    c_rarg6:   parameter size (in words)              int
  109   //    c_rarg7:   thread                                 Thread*
  110   //
  111   // There is no return from the stub itself as any Java result
  112   // is written to result
  113   //
  114   // we save r30 (lr) as the return PC at the base of the frame and
   115   // link r29 (fp) below it as the frame pointer, installing the
   116   // current sp (r31) value into fp.
  117   //
  118   // we save r0-r7, which accounts for all the c arguments.
  119   //
  120   // TODO: strictly do we need to save them all? they are treated as
  121   // volatile by C so could we omit saving the ones we are going to
  122   // place in global registers (thread? method?) or those we only use
  123   // during setup of the Java call?
  124   //
  125   // we don't need to save r8 which C uses as an indirect result location
  126   // return register.
  127   //
  128   // we don't need to save r9-r15 which both C and Java treat as
  129   // volatile
  130   //
   131   // we don't need to save r16-r18 because Java does not use them
  132   //
  133   // we save r19-r28 which Java uses as scratch registers and C
  134   // expects to be callee-save
  135   //
  136   // we save the bottom 64 bits of each value stored in v8-v15; it is
  137   // the responsibility of the caller to preserve larger values.
  138   //
  139   // so the stub frame looks like this when we enter Java code
  140   //
  141   //     [ return_from_Java     ] <--- sp
  142   //     [ argument word n      ]
  143   //      ...
  144   // -29 [ argument word 1      ]
  145   // -28 [ saved Floating-point Control Register ]
  146   // -26 [ saved v15            ] <--- sp_after_call
  147   // -25 [ saved v14            ]
  148   // -24 [ saved v13            ]
  149   // -23 [ saved v12            ]
  150   // -22 [ saved v11            ]
  151   // -21 [ saved v10            ]
  152   // -20 [ saved v9             ]
  153   // -19 [ saved v8             ]
  154   // -18 [ saved r28            ]
  155   // -17 [ saved r27            ]
  156   // -16 [ saved r26            ]
  157   // -15 [ saved r25            ]
  158   // -14 [ saved r24            ]
  159   // -13 [ saved r23            ]
  160   // -12 [ saved r22            ]
  161   // -11 [ saved r21            ]
  162   // -10 [ saved r20            ]
  163   //  -9 [ saved r19            ]
  164   //  -8 [ call wrapper    (r0) ]
  165   //  -7 [ result          (r1) ]
  166   //  -6 [ result type     (r2) ]
  167   //  -5 [ method          (r3) ]
  168   //  -4 [ entry point     (r4) ]
  169   //  -3 [ parameters      (r5) ]
  170   //  -2 [ parameter size  (r6) ]
  171   //  -1 [ thread (r7)          ]
  172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  173   //   1 [ saved lr       (r30) ]
  174 
  175   // Call stub stack layout word offsets from fp
  176   enum call_stub_layout {
  177     sp_after_call_off  = -28,
  178 
  179     fpcr_off           = sp_after_call_off,
  180     d15_off            = -26,
  181     d13_off            = -24,
  182     d11_off            = -22,
  183     d9_off             = -20,
  184 
  185     r28_off            = -18,
  186     r26_off            = -16,
  187     r24_off            = -14,
  188     r22_off            = -12,
  189     r20_off            = -10,
  190     call_wrapper_off   =  -8,
  191     result_off         =  -7,
  192     result_type_off    =  -6,
  193     method_off         =  -5,
  194     entry_point_off    =  -4,
  195     parameter_size_off =  -2,
  196     thread_off         =  -1,
  197     fp_f               =   0,
  198     retaddr_off        =   1,
  199   };
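         // n.b. the enum only names every other save slot because the saves
         // below are done in pairs: e.g. r20_off (-10) is the address used by
         // the stp that stores r20 and r19 (slots -10 and -9), and d15_off
         // (-26) covers the stpd of v15 and v14.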
  200 
  201   address generate_call_stub(address& return_address) {
  202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  204            "adjust this code");
  205 
  206     StubId stub_id = StubId::stubgen_call_stub_id;
  207     StubCodeMark mark(this, stub_id);
  208     address start = __ pc();
  209 
  210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  211 
  212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  214     const Address result        (rfp, result_off         * wordSize);
  215     const Address result_type   (rfp, result_type_off    * wordSize);
  216     const Address method        (rfp, method_off         * wordSize);
  217     const Address entry_point   (rfp, entry_point_off    * wordSize);
  218     const Address parameter_size(rfp, parameter_size_off * wordSize);
  219 
  220     const Address thread        (rfp, thread_off         * wordSize);
  221 
  222     const Address d15_save      (rfp, d15_off * wordSize);
  223     const Address d13_save      (rfp, d13_off * wordSize);
  224     const Address d11_save      (rfp, d11_off * wordSize);
  225     const Address d9_save       (rfp, d9_off * wordSize);
  226 
  227     const Address r28_save      (rfp, r28_off * wordSize);
  228     const Address r26_save      (rfp, r26_off * wordSize);
  229     const Address r24_save      (rfp, r24_off * wordSize);
  230     const Address r22_save      (rfp, r22_off * wordSize);
  231     const Address r20_save      (rfp, r20_off * wordSize);
  232 
  233     // stub code
  234 
  235     address aarch64_entry = __ pc();
  236 
  237     // set up frame and move sp to end of save area
  238     __ enter();
  239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  240 
  241     // save register parameters and Java scratch/global registers
  242     // n.b. we save thread even though it gets installed in
  243     // rthread because we want to sanity check rthread later
  244     __ str(c_rarg7,  thread);
  245     __ strw(c_rarg6, parameter_size);
  246     __ stp(c_rarg4, c_rarg5,  entry_point);
  247     __ stp(c_rarg2, c_rarg3,  result_type);
  248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  249 
  250     __ stp(r20, r19,   r20_save);
  251     __ stp(r22, r21,   r22_save);
  252     __ stp(r24, r23,   r24_save);
  253     __ stp(r26, r25,   r26_save);
  254     __ stp(r28, r27,   r28_save);
  255 
  256     __ stpd(v9,  v8,   d9_save);
  257     __ stpd(v11, v10,  d11_save);
  258     __ stpd(v13, v12,  d13_save);
  259     __ stpd(v15, v14,  d15_save);
  260 
  261     __ get_fpcr(rscratch1);
  262     __ str(rscratch1, fpcr_save);
  263     // Set FPCR to the state we need. We do want Round to Nearest. We
  264     // don't want non-IEEE rounding modes or floating-point traps.
  265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  267     __ set_fpcr(rscratch1);
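           // For reference, the FPCR fields touched above (per the Arm
           // architecture definition of FPCR) are:
           //   bit 25     DN     default NaN enable
           //   bit 24     FZ     flush-to-zero enable
           //   bits 23:22 RMode  rounding mode (0b00 = round to nearest)
           //   bits 12:8  IXE, UFE, OFE, DZE, IOE exception trap enables
           // so the two bfi instructions select IEEE round-to-nearest
           // behaviour and disable all floating-point traps.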
  268 
  269     // install Java thread in global register now we have saved
  270     // whatever value it held
  271     __ mov(rthread, c_rarg7);
  272     // And method
  273     __ mov(rmethod, c_rarg3);
  274 
  275     // set up the heapbase register
  276     __ reinit_heapbase();
  277 
  278 #ifdef ASSERT
  279     // make sure we have no pending exceptions
  280     {
  281       Label L;
  282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  283       __ cmp(rscratch1, (u1)NULL_WORD);
  284       __ br(Assembler::EQ, L);
  285       __ stop("StubRoutines::call_stub: entered with pending exception");
  286       __ BIND(L);
  287     }
  288 #endif
  289     // pass parameters if any
  290     __ mov(esp, sp);
  291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  292     __ andr(sp, rscratch1, -2 * wordSize);
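           // n.b. the sub/andr above reserve space for the incoming Java
           // arguments (c_rarg6 words) and round sp down to a 16-byte
           // boundary, as AAPCS64 requires for the stack pointer.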
  293 
  294     BLOCK_COMMENT("pass parameters if any");
  295     Label parameters_done;
  296     // parameter count is still in c_rarg6
  297     // and parameter pointer identifying param 1 is in c_rarg5
  298     __ cbzw(c_rarg6, parameters_done);
  299 
  300     address loop = __ pc();
  301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  302     __ subsw(c_rarg6, c_rarg6, 1);
  303     __ push(rscratch1);
  304     __ br(Assembler::GT, loop);
  305 
  306     __ BIND(parameters_done);
  307 
   308     // call Java entry -- passing Method* and current sp
  309     //      rmethod: Method*
  310     //      r19_sender_sp: sender sp
  311     BLOCK_COMMENT("call Java function");
  312     __ mov(r19_sender_sp, sp);
  313     __ blr(c_rarg4);
  314 
  315     // we do this here because the notify will already have been done
  316     // if we get to the next instruction via an exception
  317     //
  318     // n.b. adding this instruction here affects the calculation of
  319     // whether or not a routine returns to the call stub (used when
  320     // doing stack walks) since the normal test is to check the return
  321     // pc against the address saved below. so we may need to allow for
  322     // this extra instruction in the check.
  323 
  324     // save current address for use by exception handling code
  325 
  326     return_address = __ pc();
  327 
  328     // store result depending on type (everything that is not
  329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  330     // n.b. this assumes Java returns an integral result in r0
  331     // and a floating result in j_farg0
  332     __ ldr(j_rarg2, result);
  333     Label is_long, is_float, is_double, exit;
  334     __ ldr(j_rarg1, result_type);
  335     __ cmp(j_rarg1, (u1)T_OBJECT);
  336     __ br(Assembler::EQ, is_long);
  337     __ cmp(j_rarg1, (u1)T_LONG);
  338     __ br(Assembler::EQ, is_long);
  339     __ cmp(j_rarg1, (u1)T_FLOAT);
  340     __ br(Assembler::EQ, is_float);
  341     __ cmp(j_rarg1, (u1)T_DOUBLE);
  342     __ br(Assembler::EQ, is_double);
  343 
  344     // handle T_INT case
  345     __ strw(r0, Address(j_rarg2));
  346 
  347     __ BIND(exit);
  348 
  349     // pop parameters
  350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  351 
  352 #ifdef ASSERT
  353     // verify that threads correspond
  354     {
  355       Label L, S;
  356       __ ldr(rscratch1, thread);
  357       __ cmp(rthread, rscratch1);
  358       __ br(Assembler::NE, S);
  359       __ get_thread(rscratch1);
  360       __ cmp(rthread, rscratch1);
  361       __ br(Assembler::EQ, L);
  362       __ BIND(S);
  363       __ stop("StubRoutines::call_stub: threads must correspond");
  364       __ BIND(L);
  365     }
  366 #endif
  367 
  368     __ pop_cont_fastpath(rthread);
  369 
  370     // restore callee-save registers
  371     __ ldpd(v15, v14,  d15_save);
  372     __ ldpd(v13, v12,  d13_save);
  373     __ ldpd(v11, v10,  d11_save);
  374     __ ldpd(v9,  v8,   d9_save);
  375 
  376     __ ldp(r28, r27,   r28_save);
  377     __ ldp(r26, r25,   r26_save);
  378     __ ldp(r24, r23,   r24_save);
  379     __ ldp(r22, r21,   r22_save);
  380     __ ldp(r20, r19,   r20_save);
  381 
  382     // restore fpcr
  383     __ ldr(rscratch1,  fpcr_save);
  384     __ set_fpcr(rscratch1);
  385 
  386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  387     __ ldrw(c_rarg2, result_type);
  388     __ ldr(c_rarg3,  method);
  389     __ ldp(c_rarg4, c_rarg5,  entry_point);
  390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  391 
  392     // leave frame and return to caller
  393     __ leave();
  394     __ ret(lr);
  395 
  396     // handle return types different from T_INT
  397 
  398     __ BIND(is_long);
  399     __ str(r0, Address(j_rarg2, 0));
  400     __ br(Assembler::AL, exit);
  401 
  402     __ BIND(is_float);
  403     __ strs(j_farg0, Address(j_rarg2, 0));
  404     __ br(Assembler::AL, exit);
  405 
  406     __ BIND(is_double);
  407     __ strd(j_farg0, Address(j_rarg2, 0));
  408     __ br(Assembler::AL, exit);
  409 
  410     return start;
  411   }
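         // The address returned above becomes StubRoutines::_call_stub_entry.
         // JavaCalls::call_helper presumably invokes it through the CallStub
         // function pointer type declared in stubRoutines.hpp, passing the
         // eight arguments documented in the block comment before this stub.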
  412 
  413   // Return point for a Java call if there's an exception thrown in
  414   // Java code.  The exception is caught and transformed into a
  415   // pending exception stored in JavaThread that can be tested from
  416   // within the VM.
  417   //
  418   // Note: Usually the parameters are removed by the callee. In case
  419   // of an exception crossing an activation frame boundary, that is
  420   // not the case if the callee is compiled code => need to setup the
   421   // not the case if the callee is compiled code => need to set up
   422   // the sp.
  423   // r0: exception oop
  424 
  425   address generate_catch_exception() {
  426     StubId stub_id = StubId::stubgen_catch_exception_id;
  427     StubCodeMark mark(this, stub_id);
  428     address start = __ pc();
  429 
  430     // same as in generate_call_stub():
  431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  432     const Address thread        (rfp, thread_off         * wordSize);
  433 
  434 #ifdef ASSERT
  435     // verify that threads correspond
  436     {
  437       Label L, S;
  438       __ ldr(rscratch1, thread);
  439       __ cmp(rthread, rscratch1);
  440       __ br(Assembler::NE, S);
  441       __ get_thread(rscratch1);
  442       __ cmp(rthread, rscratch1);
  443       __ br(Assembler::EQ, L);
  444       __ bind(S);
  445       __ stop("StubRoutines::catch_exception: threads must correspond");
  446       __ bind(L);
  447     }
  448 #endif
  449 
  450     // set pending exception
  451     __ verify_oop(r0);
  452 
  453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  454     __ mov(rscratch1, (address)__FILE__);
  455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  456     __ movw(rscratch1, (int)__LINE__);
  457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  458 
  459     // complete return to VM
  460     assert(StubRoutines::_call_stub_return_address != nullptr,
  461            "_call_stub_return_address must have been generated before");
  462     __ b(StubRoutines::_call_stub_return_address);
  463 
  464     return start;
  465   }
  466 
  467   // Continuation point for runtime calls returning with a pending
  468   // exception.  The pending exception check happened in the runtime
  469   // or native call stub.  The pending exception in Thread is
  470   // converted into a Java-level exception.
  471   //
  472   // Contract with Java-level exception handlers:
  473   // r0: exception
  474   // r3: throwing pc
  475   //
  476   // NOTE: At entry of this stub, exception-pc must be in LR !!
  477 
  478   // NOTE: this is always used as a jump target within generated code
   479   // so it just needs to be generated code with no prolog
  480 
  481   address generate_forward_exception() {
  482     StubId stub_id = StubId::stubgen_forward_exception_id;
  483     StubCodeMark mark(this, stub_id);
  484     address start = __ pc();
  485 
  486     // Upon entry, LR points to the return address returning into
  487     // Java (interpreted or compiled) code; i.e., the return address
  488     // becomes the throwing pc.
  489     //
  490     // Arguments pushed before the runtime call are still on the stack
  491     // but the exception handler will reset the stack pointer ->
  492     // ignore them.  A potential result in registers can be ignored as
  493     // well.
  494 
  495 #ifdef ASSERT
  496     // make sure this code is only executed if there is a pending exception
  497     {
  498       Label L;
  499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  500       __ cbnz(rscratch1, L);
  501       __ stop("StubRoutines::forward exception: no pending exception (1)");
  502       __ bind(L);
  503     }
  504 #endif
  505 
  506     // compute exception handler into r19
  507 
  508     // call the VM to find the handler address associated with the
  509     // caller address. pass thread in r0 and caller pc (ret address)
  510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  511     // the stack.
  512     __ mov(c_rarg1, lr);
  513     // lr will be trashed by the VM call so we move it to R19
  514     // (callee-saved) because we also need to pass it to the handler
  515     // returned by this call.
  516     __ mov(r19, lr);
  517     BLOCK_COMMENT("call exception_handler_for_return_address");
  518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  519                          SharedRuntime::exception_handler_for_return_address),
  520                     rthread, c_rarg1);
  521     // Reinitialize the ptrue predicate register, in case the external runtime
  522     // call clobbers ptrue reg, as we may return to SVE compiled code.
  523     __ reinitialize_ptrue();
  524 
  525     // we should not really care that lr is no longer the callee
  526     // address. we saved the value the handler needs in r19 so we can
  527     // just copy it to r3. however, the C2 handler will push its own
   528     // frame and then call into the VM, and the VM code asserts that
  529     // the PC for the frame above the handler belongs to a compiled
  530     // Java method. So, we restore lr here to satisfy that assert.
  531     __ mov(lr, r19);
  532     // setup r0 & r3 & clear pending exception
  533     __ mov(r3, r19);
  534     __ mov(r19, r0);
  535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  537 
  538 #ifdef ASSERT
  539     // make sure exception is set
  540     {
  541       Label L;
  542       __ cbnz(r0, L);
  543       __ stop("StubRoutines::forward exception: no pending exception (2)");
  544       __ bind(L);
  545     }
  546 #endif
  547 
  548     // continue at exception handler
  549     // r0: exception
  550     // r3: throwing pc
  551     // r19: exception handler
  552     __ verify_oop(r0);
  553     __ br(r19);
  554 
  555     return start;
  556   }
  557 
  558   // Non-destructive plausibility checks for oops
  559   //
  560   // Arguments:
  561   //    r0: oop to verify
  562   //    rscratch1: error message
  563   //
  564   // Stack after saving c_rarg3:
  565   //    [tos + 0]: saved c_rarg3
  566   //    [tos + 1]: saved c_rarg2
  567   //    [tos + 2]: saved lr
  568   //    [tos + 3]: saved rscratch2
  569   //    [tos + 4]: saved r0
  570   //    [tos + 5]: saved rscratch1
  571   address generate_verify_oop() {
  572     StubId stub_id = StubId::stubgen_verify_oop_id;
  573     StubCodeMark mark(this, stub_id);
  574     address start = __ pc();
  575 
  576     Label exit, error;
  577 
  578     // save c_rarg2 and c_rarg3
  579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  580 
  581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  583     __ ldr(c_rarg3, Address(c_rarg2));
  584     __ add(c_rarg3, c_rarg3, 1);
  585     __ str(c_rarg3, Address(c_rarg2));
  586 
  587     // object is in r0
  588     // make sure object is 'reasonable'
  589     __ cbz(r0, exit); // if obj is null it is OK
  590 
  591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  593 
  594     // return if everything seems ok
  595     __ bind(exit);
  596 
  597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  598     __ ret(lr);
  599 
  600     // handle errors
  601     __ bind(error);
  602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  603 
  604     __ push(RegSet::range(r0, r29), sp);
  605     // debug(char* msg, int64_t pc, int64_t regs[])
  606     __ mov(c_rarg0, rscratch1);      // pass address of error message
  607     __ mov(c_rarg1, lr);             // pass return address
  608     __ mov(c_rarg2, sp);             // pass address of regs on stack
  609 #ifndef PRODUCT
  610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  611 #endif
  612     BLOCK_COMMENT("call MacroAssembler::debug");
  613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  614     __ blr(rscratch1);
  615     __ hlt(0);
  616 
  617     return start;
  618   }
  619 
  620   // Generate indices for iota vector.
  621   address generate_iota_indices(StubId stub_id) {
  622     __ align(CodeEntryAlignment);
  623     StubCodeMark mark(this, stub_id);
  624     address start = __ pc();
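           // The table is read little-endian, so within each emit_data64 the
           // lowest byte lands in lane 0.  Each group below supplies ascending
           // lane indices for one element size (B/H/S/D), followed by the
           // equivalent floating-point constants for S and D lanes.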
  625     // B
  626     __ emit_data64(0x0706050403020100, relocInfo::none);
  627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  628     // H
  629     __ emit_data64(0x0003000200010000, relocInfo::none);
  630     __ emit_data64(0x0007000600050004, relocInfo::none);
  631     // S
  632     __ emit_data64(0x0000000100000000, relocInfo::none);
  633     __ emit_data64(0x0000000300000002, relocInfo::none);
  634     // D
  635     __ emit_data64(0x0000000000000000, relocInfo::none);
  636     __ emit_data64(0x0000000000000001, relocInfo::none);
  637     // S - FP
  638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  640     // D - FP
  641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  643     return start;
  644   }
  645 
  646   // The inner part of zero_words().  This is the bulk operation,
  647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  648   // caller is responsible for zeroing the last few words.
  649   //
  650   // Inputs:
  651   // r10: the HeapWord-aligned base address of an array to zero.
  652   // r11: the count in HeapWords, r11 > 0.
  653   //
  654   // Returns r10 and r11, adjusted for the caller to clear.
  655   // r10: the base address of the tail of words left to clear.
  656   // r11: the number of words in the tail.
  657   //      r11 < MacroAssembler::zero_words_block_size.
  658 
  659   address generate_zero_blocks() {
  660     Label done;
  661     Label base_aligned;
  662 
  663     Register base = r10, cnt = r11;
  664 
  665     __ align(CodeEntryAlignment);
  666     StubId stub_id = StubId::stubgen_zero_blocks_id;
  667     StubCodeMark mark(this, stub_id);
  668     address start = __ pc();
  669 
  670     if (UseBlockZeroing) {
  671       int zva_length = VM_Version::zva_length();
  672 
  673       // Ensure ZVA length can be divided by 16. This is required by
  674       // the subsequent operations.
  675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  676 
  677       __ tbz(base, 3, base_aligned);
  678       __ str(zr, Address(__ post(base, 8)));
  679       __ sub(cnt, cnt, 1);
  680       __ bind(base_aligned);
  681 
  682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  683       // alignment.
  684       Label small;
  685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
  686       __ subs(rscratch1, cnt, low_limit >> 3);
  687       __ br(Assembler::LT, small);
  688       __ zero_dcache_blocks(base, cnt);
  689       __ bind(small);
  690     }
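           // n.b. the tbz/str above zeroes a single leading word when base is
           // only 8-byte aligned, so the block-zeroing path starts from a
           // 16-byte aligned base.  cnt is in HeapWords while zva_length and
           // BlockZeroingLowLimit are in bytes, hence the >> 3 when computing
           // the cut-off for the small case.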
  691 
  692     {
  693       // Number of stp instructions we'll unroll
  694       const int unroll =
  695         MacroAssembler::zero_words_block_size / 2;
  696       // Clear the remaining blocks.
  697       Label loop;
  698       __ subs(cnt, cnt, unroll * 2);
  699       __ br(Assembler::LT, done);
  700       __ bind(loop);
  701       for (int i = 0; i < unroll; i++)
  702         __ stp(zr, zr, __ post(base, 16));
  703       __ subs(cnt, cnt, unroll * 2);
  704       __ br(Assembler::GE, loop);
  705       __ bind(done);
  706       __ add(cnt, cnt, unroll * 2);
  707     }
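           // The add above compensates for the final failed subs, so on exit
           // cnt holds the number of words still to clear, which is less than
           // zero_words_block_size as promised in the header comment.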
  708 
  709     __ ret(lr);
  710 
  711     return start;
  712   }
  713 
  714 
  715   typedef enum {
  716     copy_forwards = 1,
  717     copy_backwards = -1
  718   } copy_direction;
  719 
  720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  721   // for arraycopy stubs.
  722   class ArrayCopyBarrierSetHelper : StackObj {
  723     BarrierSetAssembler* _bs_asm;
  724     MacroAssembler* _masm;
  725     DecoratorSet _decorators;
  726     BasicType _type;
  727     Register _gct1;
  728     Register _gct2;
  729     Register _gct3;
  730     FloatRegister _gcvt1;
  731     FloatRegister _gcvt2;
  732     FloatRegister _gcvt3;
  733 
  734   public:
  735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  736                               DecoratorSet decorators,
  737                               BasicType type,
  738                               Register gct1,
  739                               Register gct2,
  740                               Register gct3,
  741                               FloatRegister gcvt1,
  742                               FloatRegister gcvt2,
  743                               FloatRegister gcvt3)
  744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  745         _masm(masm),
  746         _decorators(decorators),
  747         _type(type),
  748         _gct1(gct1),
  749         _gct2(gct2),
  750         _gct3(gct3),
  751         _gcvt1(gcvt1),
  752         _gcvt2(gcvt2),
  753         _gcvt3(gcvt3) {
  754     }
  755 
  756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  758                             dst1, dst2, src,
  759                             _gct1, _gct2, _gcvt1);
  760     }
  761 
  762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  764                              dst, src1, src2,
  765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  766     }
  767 
  768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  770                             dst1, dst2, src,
  771                             _gct1);
  772     }
  773 
  774     void copy_store_at_16(Address dst, Register src1, Register src2) {
  775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  776                              dst, src1, src2,
  777                              _gct1, _gct2, _gct3);
  778     }
  779 
  780     void copy_load_at_8(Register dst, Address src) {
  781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  782                             dst, noreg, src,
  783                             _gct1);
  784     }
  785 
  786     void copy_store_at_8(Address dst, Register src) {
  787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  788                              dst, src, noreg,
  789                              _gct1, _gct2, _gct3);
  790     }
  791   };
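         // The helper above just forwards to the BarrierSetAssembler
         // copy_load_at/copy_store_at hooks with the decorators, element type
         // and GC temp registers bound once at construction, so the copy loops
         // below read like plain 8/16/32 byte loads and stores.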
  792 
  793   // Bulk copy of blocks of 8 words.
  794   //
  795   // count is a count of words.
  796   //
  797   // Precondition: count >= 8
  798   //
  799   // Postconditions:
  800   //
  801   // The least significant bit of count contains the remaining count
  802   // of words to copy.  The rest of count is trash.
  803   //
  804   // s and d are adjusted to point to the remaining words to copy
  805   //
  806   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  807     BasicType type;
  808     copy_direction direction;
  809 
  810     switch (stub_id) {
  811     case StubId::stubgen_copy_byte_f_id:
  812       direction = copy_forwards;
  813       type = T_BYTE;
  814       break;
  815     case StubId::stubgen_copy_byte_b_id:
  816       direction = copy_backwards;
  817       type = T_BYTE;
  818       break;
  819     case StubId::stubgen_copy_oop_f_id:
  820       direction = copy_forwards;
  821       type = T_OBJECT;
  822       break;
  823     case StubId::stubgen_copy_oop_b_id:
  824       direction = copy_backwards;
  825       type = T_OBJECT;
  826       break;
  827     case StubId::stubgen_copy_oop_uninit_f_id:
  828       direction = copy_forwards;
  829       type = T_OBJECT;
  830       break;
  831     case StubId::stubgen_copy_oop_uninit_b_id:
  832       direction = copy_backwards;
  833       type = T_OBJECT;
  834       break;
  835     default:
  836       ShouldNotReachHere();
  837     }
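           // n.b. the oop_uninit cases set the same T_OBJECT type and the same
           // direction as the corresponding plain oop cases; the difference is
           // expected to be carried entirely by the decorator set the caller
           // passes in (e.g. IS_DEST_UNINITIALIZED), not by anything chosen here.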
  838 
  839     int unit = wordSize * direction;
  840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
  841 
  842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  844     const Register stride = r14;
  845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  848 
  849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  850     assert_different_registers(s, d, count, rscratch1, rscratch2);
  851 
  852     Label again, drain;
  853 
  854     __ align(CodeEntryAlignment);
  855 
  856     StubCodeMark mark(this, stub_id);
  857 
  858     address start = __ pc();
  859 
  860     Label unaligned_copy_long;
  861     if (AvoidUnalignedAccesses) {
  862       __ tbnz(d, 3, unaligned_copy_long);
  863     }
  864 
  865     if (direction == copy_forwards) {
  866       __ sub(s, s, bias);
  867       __ sub(d, d, bias);
  868     }
  869 
  870 #ifdef ASSERT
  871     // Make sure we are never given < 8 words
  872     {
  873       Label L;
  874       __ cmp(count, (u1)8);
  875       __ br(Assembler::GE, L);
   876       __ stop("generate_copy_longs called with < 8 words");
  877       __ bind(L);
  878     }
  879 #endif
  880 
  881     // Fill 8 registers
  882     if (UseSIMDForMemoryOps) {
  883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  885     } else {
  886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  890     }
  891 
  892     __ subs(count, count, 16);
  893     __ br(Assembler::LO, drain);
  894 
  895     int prefetch = PrefetchCopyIntervalInBytes;
  896     bool use_stride = false;
  897     if (direction == copy_backwards) {
  898       use_stride = prefetch > 256;
  899       prefetch = -prefetch;
  900       if (use_stride) __ mov(stride, prefetch);
  901     }
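           // A register-based stride is used for backwards copies with a large
           // prefetch distance because a negative offset of that size may not
           // fit in the immediate field of the prfm instruction.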
  902 
  903     __ bind(again);
  904 
  905     if (PrefetchCopyIntervalInBytes > 0)
  906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  907 
  908     if (UseSIMDForMemoryOps) {
  909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  913     } else {
  914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  922     }
  923 
  924     __ subs(count, count, 8);
  925     __ br(Assembler::HS, again);
  926 
  927     // Drain
  928     __ bind(drain);
  929     if (UseSIMDForMemoryOps) {
  930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  932     } else {
  933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  937     }
  938 
  939     {
  940       Label L1, L2;
  941       __ tbz(count, exact_log2(4), L1);
  942       if (UseSIMDForMemoryOps) {
  943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  945       } else {
  946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  950       }
  951       __ bind(L1);
  952 
  953       if (direction == copy_forwards) {
  954         __ add(s, s, bias);
  955         __ add(d, d, bias);
  956       }
  957 
  958       __ tbz(count, 1, L2);
  959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  961       __ bind(L2);
  962     }
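           // At this point the low bits of count still encode the residual
           // word count: bit 2 selected the optional 4-word subblock above,
           // bit 1 the 2-word subblock, and any final odd word (bit 0) is
           // left for the caller, per the postcondition in the header comment.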
  963 
  964     __ ret(lr);
  965 
  966     if (AvoidUnalignedAccesses) {
  967       Label drain, again;
  968       // Register order for storing. Order is different for backward copy.
  969 
  970       __ bind(unaligned_copy_long);
  971 
  972       // source address is even aligned, target odd aligned
  973       //
  974       // when forward copying word pairs we read long pairs at offsets
  975       // {0, 2, 4, 6} (in long words). when backwards copying we read
  976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  977       // address by -2 in the forwards case so we can compute the
  978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  979       // or -1.
  980       //
  981       // when forward copying we need to store 1 word, 3 pairs and
  982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   983       // zero offset we adjust the destination by -1, which means we
   984       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
   985       //
   986       // When backwards copying we need to store 1 word, 3 pairs and
  987       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
  988       // offsets {1, 3, 5, 7, 8} * unit.
  989 
  990       if (direction == copy_forwards) {
  991         __ sub(s, s, 16);
  992         __ sub(d, d, 8);
  993       }
  994 
  995       // Fill 8 registers
  996       //
  997       // for forwards copy s was offset by -16 from the original input
  998       // value of s so the register contents are at these offsets
   999       // relative to the 64 byte block addressed by that original input
 1000       // and so on for each successive 64 byte block when s is updated
 1001       //
 1002       // t0 at offset 0,  t1 at offset 8
 1003       // t2 at offset 16, t3 at offset 24
 1004       // t4 at offset 32, t5 at offset 40
 1005       // t6 at offset 48, t7 at offset 56
 1006 
 1007       // for backwards copy s was not offset so the register contents
 1008       // are at these offsets into the preceding 64 byte block
 1009       // relative to that original input and so on for each successive
 1010       // preceding 64 byte block when s is updated. this explains the
 1011       // slightly counter-intuitive looking pattern of register usage
 1012       // in the stp instructions for backwards copy.
 1013       //
 1014       // t0 at offset -16, t1 at offset -8
 1015       // t2 at offset -32, t3 at offset -24
 1016       // t4 at offset -48, t5 at offset -40
 1017       // t6 at offset -64, t7 at offset -56
 1018 
 1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1023 
 1024       __ subs(count, count, 16);
 1025       __ br(Assembler::LO, drain);
 1026 
 1027       int prefetch = PrefetchCopyIntervalInBytes;
 1028       bool use_stride = false;
 1029       if (direction == copy_backwards) {
 1030         use_stride = prefetch > 256;
 1031         prefetch = -prefetch;
 1032         if (use_stride) __ mov(stride, prefetch);
 1033       }
 1034 
 1035       __ bind(again);
 1036 
 1037       if (PrefetchCopyIntervalInBytes > 0)
 1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1039 
 1040       if (direction == copy_forwards) {
 1041         // allowing for the offset of -8 the store instructions place
  1042         // registers into the target 64 byte block at the following
 1043         // offsets
 1044         //
 1045         // t0 at offset 0
 1046         // t1 at offset 8,  t2 at offset 16
 1047         // t3 at offset 24, t4 at offset 32
 1048         // t5 at offset 40, t6 at offset 48
 1049         // t7 at offset 56
 1050 
 1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1060       } else {
 1061         // d was not offset when we started so the registers are
  1062         // written into the 64 byte block preceding d with the following
 1063         // offsets
 1064         //
 1065         // t1 at offset -8
 1066         // t3 at offset -24, t0 at offset -16
  1067         // t5 at offset -40, t2 at offset -32
 1068         // t7 at offset -56, t4 at offset -48
 1069         //                   t6 at offset -64
 1070         //
 1071         // note that this matches the offsets previously noted for the
 1072         // loads
 1073 
 1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1083       }
 1084 
 1085       __ subs(count, count, 8);
 1086       __ br(Assembler::HS, again);
 1087 
 1088       // Drain
 1089       //
 1090       // this uses the same pattern of offsets and register arguments
 1091       // as above
 1092       __ bind(drain);
 1093       if (direction == copy_forwards) {
 1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1099       } else {
 1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1105       }
 1106       // now we need to copy any remaining part block which may
  1107       // include a 4 word subblock and/or a 2 word subblock.
 1108       // bits 2 and 1 in the count are the tell-tale for whether we
 1109       // have each such subblock
 1110       {
 1111         Label L1, L2;
 1112         __ tbz(count, exact_log2(4), L1);
 1113         // this is the same as above but copying only 4 longs hence
 1114         // with only one intervening stp between the str instructions
 1115         // but note that the offsets and registers still follow the
 1116         // same pattern
 1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1119         if (direction == copy_forwards) {
 1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1123         } else {
 1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1127         }
 1128         __ bind(L1);
 1129 
 1130         __ tbz(count, 1, L2);
 1131         // this is the same as above but copying only 2 longs hence
 1132         // there is no intervening stp between the str instructions
 1133         // but note that the offset and register patterns are still
 1134         // the same
 1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1136         if (direction == copy_forwards) {
 1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1139         } else {
 1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1142         }
 1143         __ bind(L2);
 1144 
 1145         // for forwards copy we need to re-adjust the offsets we
  1146         // applied so that s and d follow the last words written
 1147 
 1148         if (direction == copy_forwards) {
 1149           __ add(s, s, 16);
 1150           __ add(d, d, 8);
 1151         }
 1152 
 1153       }
 1154 
 1155       __ ret(lr);
 1156     }
 1157 
 1158     return start;
 1159   }
 1160 
 1161   // Small copy: less than 16 bytes.
 1162   //
 1163   // NB: Ignores all of the bits of count which represent more than 15
 1164   // bytes, so a caller doesn't have to mask them.
 1165 
 1166   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1167     bool is_backwards = step < 0;
 1168     size_t granularity = g_uabs(step);
 1169     int direction = is_backwards ? -1 : 1;
 1170 
 1171     Label Lword, Lint, Lshort, Lbyte;
 1172 
 1173     assert(granularity
 1174            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1175 
 1176     const Register t0 = r3;
 1177     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1178     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1179 
 1180     // ??? I don't know if this bit-test-and-branch is the right thing
 1181     // to do.  It does a lot of jumping, resulting in several
 1182     // mispredicted branches.  It might make more sense to do this
 1183     // with something like Duff's device with a single computed branch.
 1184 
 1185     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1186     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1187     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1188     __ bind(Lword);
 1189 
 1190     if (granularity <= sizeof (jint)) {
 1191       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1192       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1193       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1194       __ bind(Lint);
 1195     }
 1196 
 1197     if (granularity <= sizeof (jshort)) {
 1198       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1199       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1200       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1201       __ bind(Lshort);
 1202     }
 1203 
 1204     if (granularity <= sizeof (jbyte)) {
 1205       __ tbz(count, 0, Lbyte);
 1206       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1207       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1208       __ bind(Lbyte);
 1209     }
 1210   }
 1211 
 1212   // All-singing all-dancing memory copy.
 1213   //
 1214   // Copy count units of memory from s to d.  The size of a unit is
 1215   // step, which can be positive or negative depending on the direction
 1216   // of copy.  If is_aligned is false, we align the source address.
 1217   //
 1218 
 1219   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1220                    Register s, Register d, Register count, int step) {
 1221     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1222     bool is_backwards = step < 0;
 1223     unsigned int granularity = g_uabs(step);
 1224     const Register t0 = r3, t1 = r4;
 1225 
  1226     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't matter
  1227     // because we always load all the data before writing anything
 1228     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1229     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1230     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1231     const Register send = r17, dend = r16;
 1232     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1233     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1234     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1235 
 1236     if (PrefetchCopyIntervalInBytes > 0)
 1237       __ prfm(Address(s, 0), PLDL1KEEP);
 1238     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1239     __ br(Assembler::HI, copy_big);
 1240 
 1241     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1242     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1243 
 1244     __ cmp(count, u1(16/granularity));
 1245     __ br(Assembler::LS, copy16);
 1246 
 1247     __ cmp(count, u1(64/granularity));
 1248     __ br(Assembler::HI, copy80);
 1249 
 1250     __ cmp(count, u1(32/granularity));
 1251     __ br(Assembler::LS, copy32);
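           // Dispatch so far: more than 80 (96 with SIMD) bytes -> copy_big,
           // 0..16 bytes -> copy16, 65..80/96 bytes -> copy80, 17..32 bytes
           // -> copy32, and 33..64 bytes fall through to the code below.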
 1252 
 1253     // 33..64 bytes
 1254     if (UseSIMDForMemoryOps) {
 1255       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1256       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1257       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1258       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1259     } else {
 1260       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1261       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1262       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1263       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1264 
 1265       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1266       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1267       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1268       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1269     }
 1270     __ b(finish);
 1271 
 1272     // 17..32 bytes
 1273     __ bind(copy32);
 1274     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1275     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1276 
 1277     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1278     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1279     __ b(finish);
 1280 
 1281     // 65..80/96 bytes
  1282     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1283     __ bind(copy80);
 1284     if (UseSIMDForMemoryOps) {
 1285       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1286       bs.copy_load_at_32(v2, v3, Address(s, 32));
  1287       // Unaligned pointers can be an issue for copying.
  1288       // The issue is more likely when the granularity of the data is less
  1289       // than 4 bytes (sizeof(jint)). Pointers for arrays of jint are at least
  1290       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
  1291       // The largest performance drop has been seen for the range 65-80 bytes.
  1292       // For such cases using a pair of ldp/stp instead of the third pair of
  1293       // ldpq/stpq fixes the performance issue.
 1294       if (granularity < sizeof (jint)) {
 1295         Label copy96;
 1296         __ cmp(count, u1(80/granularity));
 1297         __ br(Assembler::HI, copy96);
 1298         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1299 
 1300         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1301         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1302 
 1303         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1304         __ b(finish);
 1305 
 1306         __ bind(copy96);
 1307       }
 1308       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1309 
 1310       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1311       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1312 
 1313       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1314     } else {
 1315       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1316       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1317       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1318       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1319       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1320 
 1321       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1322       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1323       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1324       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1325       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1326     }
 1327     __ b(finish);
 1328 
 1329     // 0..16 bytes
 1330     __ bind(copy16);
 1331     __ cmp(count, u1(8/granularity));
 1332     __ br(Assembler::LO, copy8);
 1333 
 1334     // 8..16 bytes
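           // Load the first and last 8 bytes and store both; for counts between
           // 8 and 16 bytes the two accesses overlap in the middle, so the whole
           // range is copied without branching on the exact length.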
 1335     bs.copy_load_at_8(t0, Address(s, 0));
 1336     bs.copy_load_at_8(t1, Address(send, -8));
 1337     bs.copy_store_at_8(Address(d, 0), t0);
 1338     bs.copy_store_at_8(Address(dend, -8), t1);
 1339     __ b(finish);
 1340 
 1341     if (granularity < 8) {
 1342       // 4..7 bytes
 1343       __ bind(copy8);
 1344       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1345       __ ldrw(t0, Address(s, 0));
 1346       __ ldrw(t1, Address(send, -4));
 1347       __ strw(t0, Address(d, 0));
 1348       __ strw(t1, Address(dend, -4));
 1349       __ b(finish);
 1350       if (granularity < 4) {
 1351         // 0..3 bytes
 1352         __ bind(copy4);
 1353         __ cbz(count, finish); // get rid of 0 case
 1354         if (granularity == 2) {
 1355           __ ldrh(t0, Address(s, 0));
 1356           __ strh(t0, Address(d, 0));
 1357         } else { // granularity == 1
 1358           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1359           // the first and last byte.
 1360           // Handle the 3 byte case by loading and storing base + count/2
 1361           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1362           // This does mean that in the 1 byte case we load/store the same
 1363           // byte 3 times.
 1364           __ lsr(count, count, 1);
 1365           __ ldrb(t0, Address(s, 0));
 1366           __ ldrb(t1, Address(send, -1));
 1367           __ ldrb(t2, Address(s, count));
 1368           __ strb(t0, Address(d, 0));
 1369           __ strb(t1, Address(dend, -1));
 1370           __ strb(t2, Address(d, count));
 1371         }
 1372         __ b(finish);
 1373       }
 1374     }
 1375 
 1376     __ bind(copy_big);
 1377     if (is_backwards) {
 1378       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1379       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1380     }
 1381 
  1382     // Now that we've got the small case out of the way, we can align the
  1383     // source address on a 2-word boundary.
 1384 
    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
    // Up until here we have used t9, which aliases r15, but from here on that
    // register cannot be used as a temp register, as it contains the count.
 1389 
 1390     Label aligned;
 1391 
 1392     if (is_aligned) {
 1393       // We may have to adjust by 1 word to get s 2-word-aligned.
 1394       __ tbz(s, exact_log2(wordSize), aligned);
 1395       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1396       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1397       __ sub(count, count, wordSize/granularity);
 1398     } else {
 1399       if (is_backwards) {
 1400         __ andr(r15, s, 2 * wordSize - 1);
 1401       } else {
 1402         __ neg(r15, s);
 1403         __ andr(r15, r15, 2 * wordSize - 1);
 1404       }
 1405       // r15 is the byte adjustment needed to align s.
 1406       __ cbz(r15, aligned);
 1407       int shift = exact_log2(granularity);
 1408       if (shift > 0) {
 1409         __ lsr(r15, r15, shift);
 1410       }
 1411       __ sub(count, count, r15);
 1412 
 1413 #if 0
 1414       // ?? This code is only correct for a disjoint copy.  It may or
 1415       // may not make sense to use it in that case.
 1416 
 1417       // Copy the first pair; s and d may not be aligned.
 1418       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1419       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1420 
 1421       // Align s and d, adjust count
 1422       if (is_backwards) {
 1423         __ sub(s, s, r15);
 1424         __ sub(d, d, r15);
 1425       } else {
 1426         __ add(s, s, r15);
 1427         __ add(d, d, r15);
 1428       }
 1429 #else
 1430       copy_memory_small(decorators, type, s, d, r15, step);
 1431 #endif
 1432     }
 1433 
 1434     __ bind(aligned);
 1435 
 1436     // s is now 2-word-aligned.
 1437 
 1438     // We have a count of units and some trailing bytes. Adjust the
 1439     // count and do a bulk copy of words. If the shift is zero
 1440     // perform a move instead to benefit from zero latency moves.
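    // For example, for a byte copy (granularity == 1) the shift is 3, so r15
    // becomes the number of whole 8-byte words to copy in bulk; for a long
    // copy the shift is 0 and r15 is simply the element count.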
 1441     int shift = exact_log2(wordSize/granularity);
 1442     if (shift > 0) {
 1443       __ lsr(r15, count, shift);
 1444     } else {
 1445       __ mov(r15, count);
 1446     }
 1447     if (direction == copy_forwards) {
 1448       if (type != T_OBJECT) {
 1449         __ bl(StubRoutines::aarch64::copy_byte_f());
 1450       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1451         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1452       } else {
 1453         __ bl(StubRoutines::aarch64::copy_oop_f());
 1454       }
 1455     } else {
 1456       if (type != T_OBJECT) {
 1457         __ bl(StubRoutines::aarch64::copy_byte_b());
 1458       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1459         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1460       } else {
 1461         __ bl(StubRoutines::aarch64::copy_oop_b());
 1462       }
 1463     }
 1464 
 1465     // And the tail.
 1466     copy_memory_small(decorators, type, s, d, count, step);
 1467 
 1468     if (granularity >= 8) __ bind(copy8);
 1469     if (granularity >= 4) __ bind(copy4);
 1470     __ bind(finish);
 1471   }
 1472 
 1473 
 1474   void clobber_registers() {
 1475 #ifdef ASSERT
 1476     RegSet clobbered
 1477       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1478     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1479     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1480     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1481       __ mov(*it, rscratch1);
 1482     }
 1483 #endif
 1484 
 1485   }
 1486 
 1487   // Scan over array at a for count oops, verifying each one.
 1488   // Preserves a and count, clobbers rscratch1 and rscratch2.
 1489   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1490     Label loop, end;
 1491     __ mov(rscratch1, a);
 1492     __ mov(rscratch2, zr);
 1493     __ bind(loop);
 1494     __ cmp(rscratch2, count);
 1495     __ br(Assembler::HS, end);
 1496     if (size == wordSize) {
 1497       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1498       __ verify_oop(temp);
 1499     } else {
 1500       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1501       __ decode_heap_oop(temp); // calls verify_oop
 1502     }
 1503     __ add(rscratch2, rscratch2, 1);
 1504     __ b(loop);
 1505     __ bind(end);
 1506   }
 1507 
 1508   // Arguments:
 1509   //   stub_id - is used to name the stub and identify all details of
 1510   //             how to perform the copy.
 1511   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //             unless it is null
 1514   //
 1515   // Inputs:
 1516   //   c_rarg0   - source array address
 1517   //   c_rarg1   - destination array address
 1518   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1519   //
 1520   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1521   // the hardware handle it.  The two dwords within qwords that span
 1522   // cache line boundaries will still be loaded and stored atomically.
 1523   //
 1524   // Side Effects: nopush_entry is set to the (post push) entry point
 1525   //               so it can be used by the corresponding conjoint
 1526   //               copy method
 1527   //
 1528   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1529     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1530     RegSet saved_reg = RegSet::of(s, d, count);
 1531     int size;
 1532     bool aligned;
 1533     bool is_oop;
 1534     bool dest_uninitialized;
 1535     switch (stub_id) {
 1536     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1537       size = sizeof(jbyte);
 1538       aligned = false;
 1539       is_oop = false;
 1540       dest_uninitialized = false;
 1541       break;
 1542     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1543       size = sizeof(jbyte);
 1544       aligned = true;
 1545       is_oop = false;
 1546       dest_uninitialized = false;
 1547       break;
 1548     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1549       size = sizeof(jshort);
 1550       aligned = false;
 1551       is_oop = false;
 1552       dest_uninitialized = false;
 1553       break;
 1554     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1555       size = sizeof(jshort);
 1556       aligned = true;
 1557       is_oop = false;
 1558       dest_uninitialized = false;
 1559       break;
 1560     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1561       size = sizeof(jint);
 1562       aligned = false;
 1563       is_oop = false;
 1564       dest_uninitialized = false;
 1565       break;
 1566     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1567       size = sizeof(jint);
 1568       aligned = true;
 1569       is_oop = false;
 1570       dest_uninitialized = false;
 1571       break;
 1572     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1573       // since this is always aligned we can (should!) use the same
 1574       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1575       ShouldNotReachHere();
 1576       break;
 1577     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1578       size = sizeof(jlong);
 1579       aligned = true;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1584       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1585       aligned = !UseCompressedOops;
 1586       is_oop = true;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1590       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1591       aligned = !UseCompressedOops;
 1592       is_oop = true;
 1593       dest_uninitialized = false;
 1594       break;
 1595     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1596       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1597       aligned = !UseCompressedOops;
 1598       is_oop = true;
 1599       dest_uninitialized = true;
 1600       break;
 1601     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1602       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1603       aligned = !UseCompressedOops;
 1604       is_oop = true;
 1605       dest_uninitialized = true;
 1606       break;
 1607     default:
 1608       ShouldNotReachHere();
 1609       break;
 1610     }
 1611 
 1612     __ align(CodeEntryAlignment);
 1613     StubCodeMark mark(this, stub_id);
 1614     address start = __ pc();
 1615     __ enter();
 1616 
 1617     if (nopush_entry != nullptr) {
 1618       *nopush_entry = __ pc();
 1619       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1620       BLOCK_COMMENT("Entry:");
 1621     }
 1622 
 1623     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1624     if (dest_uninitialized) {
 1625       decorators |= IS_DEST_UNINITIALIZED;
 1626     }
 1627     if (aligned) {
 1628       decorators |= ARRAYCOPY_ALIGNED;
 1629     }
 1630 
 1631     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1632     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1633 
 1634     if (is_oop) {
 1635       // save regs before copy_memory
 1636       __ push(RegSet::of(d, count), sp);
 1637     }
 1638     {
 1639       // UnsafeMemoryAccess page error: continue after unsafe access
 1640       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1641       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1642       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1643     }
 1644 
 1645     if (is_oop) {
 1646       __ pop(RegSet::of(d, count), sp);
 1647       if (VerifyOops)
 1648         verify_oop_array(size, d, count, r16);
 1649     }
 1650 
 1651     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1652 
 1653     __ leave();
 1654     __ mov(r0, zr); // return 0
 1655     __ ret(lr);
 1656     return start;
 1657   }
 1658 
 1659   // Arguments:
 1660   //   stub_id - is used to name the stub and identify all details of
 1661   //             how to perform the copy.
 1662   //
  //   nooverlap_target - identifies the (post push) entry for the
 1664   //             corresponding disjoint copy routine which can be
 1665   //             jumped to if the ranges do not actually overlap
 1666   //
  //   nopush_entry - is assigned to the stub's post push entry point
  //             unless it is null
 1669   //
 1670   //
 1671   // Inputs:
 1672   //   c_rarg0   - source array address
 1673   //   c_rarg1   - destination array address
 1674   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1675   //
 1676   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1677   // the hardware handle it.  The two dwords within qwords that span
 1678   // cache line boundaries will still be loaded and stored atomically.
 1679   //
 1680   // Side Effects:
  //   nopush_entry is set to the stub's (post push) entry point so it
  //   can be used by the generic and unsafe copy stubs
 1683   //
 1684   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1685     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1686     RegSet saved_regs = RegSet::of(s, d, count);
 1687     int size;
 1688     bool aligned;
 1689     bool is_oop;
 1690     bool dest_uninitialized;
 1691     switch (stub_id) {
 1692     case StubId::stubgen_jbyte_arraycopy_id:
 1693       size = sizeof(jbyte);
 1694       aligned = false;
 1695       is_oop = false;
 1696       dest_uninitialized = false;
 1697       break;
 1698     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1699       size = sizeof(jbyte);
 1700       aligned = true;
 1701       is_oop = false;
 1702       dest_uninitialized = false;
 1703       break;
 1704     case StubId::stubgen_jshort_arraycopy_id:
 1705       size = sizeof(jshort);
 1706       aligned = false;
 1707       is_oop = false;
 1708       dest_uninitialized = false;
 1709       break;
 1710     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1711       size = sizeof(jshort);
 1712       aligned = true;
 1713       is_oop = false;
 1714       dest_uninitialized = false;
 1715       break;
 1716     case StubId::stubgen_jint_arraycopy_id:
 1717       size = sizeof(jint);
 1718       aligned = false;
 1719       is_oop = false;
 1720       dest_uninitialized = false;
 1721       break;
 1722     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1723       size = sizeof(jint);
 1724       aligned = true;
 1725       is_oop = false;
 1726       dest_uninitialized = false;
 1727       break;
 1728     case StubId::stubgen_jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1731       ShouldNotReachHere();
 1732       break;
 1733     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1734       size = sizeof(jlong);
 1735       aligned = true;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case StubId::stubgen_oop_arraycopy_id:
 1740       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1741       aligned = !UseCompressedOops;
 1742       is_oop = true;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1746       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1747       aligned = !UseCompressedOops;
 1748       is_oop = true;
 1749       dest_uninitialized = false;
 1750       break;
 1751     case StubId::stubgen_oop_arraycopy_uninit_id:
 1752       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1753       aligned = !UseCompressedOops;
 1754       is_oop = true;
 1755       dest_uninitialized = true;
 1756       break;
 1757     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1758       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1759       aligned = !UseCompressedOops;
 1760       is_oop = true;
 1761       dest_uninitialized = true;
 1762       break;
 1763     default:
 1764       ShouldNotReachHere();
 1765     }
 1766 
 1767     StubCodeMark mark(this, stub_id);
 1768     address start = __ pc();
 1769     __ enter();
 1770 
 1771     if (nopush_entry != nullptr) {
 1772       *nopush_entry = __ pc();
 1773       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1774       BLOCK_COMMENT("Entry:");
 1775     }
 1776 
 1777     // use fwd copy when (d-s) above_equal (count*size)
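    // For illustration, with jint elements (size == 4) and count == 8
    // (32 bytes) starting at s == 0x1000:
    //   d == 0x1010: d - s == 0x10 <  0x20 -> ranges overlap, copy backwards here
    //   d == 0x1020: d - s == 0x20 >= 0x20 -> branch to the disjoint (forward) stub
    //   d == 0x0ff0: d - s wraps to a huge unsigned value, so we also take the
    //                forward stub; forward copy is safe whenever d is below s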
 1778     Label L_overlapping;
 1779     __ sub(rscratch1, d, s);
 1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1781     __ br(Assembler::LO, L_overlapping);
 1782     __ b(RuntimeAddress(nooverlap_target));
 1783     __ bind(L_overlapping);
 1784 
 1785     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1786     if (dest_uninitialized) {
 1787       decorators |= IS_DEST_UNINITIALIZED;
 1788     }
 1789     if (aligned) {
 1790       decorators |= ARRAYCOPY_ALIGNED;
 1791     }
 1792 
 1793     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1794     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1795 
 1796     if (is_oop) {
 1797       // save regs before copy_memory
 1798       __ push(RegSet::of(d, count), sp);
 1799     }
 1800     {
 1801       // UnsafeMemoryAccess page error: continue after unsafe access
 1802       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1803       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1804       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1805     }
 1806     if (is_oop) {
 1807       __ pop(RegSet::of(d, count), sp);
 1808       if (VerifyOops)
 1809         verify_oop_array(size, d, count, r16);
 1810     }
 1811     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1812     __ leave();
 1813     __ mov(r0, zr); // return 0
 1814     __ ret(lr);
 1815     return start;
 1816   }
 1817 
 1818   // Helper for generating a dynamic type check.
 1819   // Smashes rscratch1, rscratch2.
 1820   void generate_type_check(Register sub_klass,
 1821                            Register super_check_offset,
 1822                            Register super_klass,
 1823                            Register temp1,
 1824                            Register temp2,
 1825                            Register result,
 1826                            Label& L_success) {
 1827     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1828 
 1829     BLOCK_COMMENT("type_check:");
 1830 
 1831     Label L_miss;
 1832 
 1833     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1834                                      super_check_offset);
 1835     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1836 
 1837     // Fall through on failure!
 1838     __ BIND(L_miss);
 1839   }
 1840 
 1841   //
 1842   //  Generate checkcasting array copy stub
 1843   //
 1844   //  Input:
 1845   //    c_rarg0   - source array address
 1846   //    c_rarg1   - destination array address
 1847   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1848   //    c_rarg3   - size_t ckoff (super_check_offset)
 1849   //    c_rarg4   - oop ckval (super_klass)
 1850   //
 1851   //  Output:
 1852   //    r0 ==  0  -  success
 1853   //    r0 == -1^K - failure, where K is partial transfer count
 1854   //
 1855   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1856     bool dest_uninitialized;
 1857     switch (stub_id) {
 1858     case StubId::stubgen_checkcast_arraycopy_id:
 1859       dest_uninitialized = false;
 1860       break;
 1861     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1862       dest_uninitialized = true;
 1863       break;
 1864     default:
 1865       ShouldNotReachHere();
 1866     }
 1867 
 1868     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1869 
 1870     // Input registers (after setup_arg_regs)
 1871     const Register from        = c_rarg0;   // source array address
 1872     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
 1874     const Register ckoff       = c_rarg3;   // super_check_offset
 1875     const Register ckval       = c_rarg4;   // super_klass
 1876 
 1877     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1878     RegSet wb_post_saved_regs = RegSet::of(count);
 1879 
 1880     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1881     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
 1883     const Register start_to    = r20;       // destination array start address
 1884     const Register r19_klass   = r19;       // oop._klass
 1885 
 1886     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1887     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1888 
 1889     //---------------------------------------------------------------
 1890     // Assembler stub will be used for this call to arraycopy
 1891     // if the two arrays are subtypes of Object[] but the
 1892     // destination array type is not equal to or a supertype
 1893     // of the source type.  Each element must be separately
 1894     // checked.
 1895 
 1896     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1897                                copied_oop, r19_klass, count_save);
 1898 
 1899     __ align(CodeEntryAlignment);
 1900     StubCodeMark mark(this, stub_id);
 1901     address start = __ pc();
 1902 
 1903     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1904 
 1905 #ifdef ASSERT
 1906     // caller guarantees that the arrays really are different
 1907     // otherwise, we would have to make conjoint checks
 1908     { Label L;
 1909       __ b(L);                  // conjoint check not yet implemented
 1910       __ stop("checkcast_copy within a single array");
 1911       __ bind(L);
 1912     }
 1913 #endif //ASSERT
 1914 
 1915     // Caller of this entry point must set up the argument registers.
 1916     if (nopush_entry != nullptr) {
 1917       *nopush_entry = __ pc();
 1918       BLOCK_COMMENT("Entry:");
 1919     }
 1920 
    // Empty array:  Nothing to do.
 1922     __ cbz(count, L_done);
 1923     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1924 
 1925 #ifdef ASSERT
 1926     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1927     // The ckoff and ckval must be mutually consistent,
 1928     // even though caller generates both.
 1929     { Label L;
 1930       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1931       __ ldrw(start_to, Address(ckval, sco_offset));
 1932       __ cmpw(ckoff, start_to);
 1933       __ br(Assembler::EQ, L);
 1934       __ stop("super_check_offset inconsistent");
 1935       __ bind(L);
 1936     }
 1937 #endif //ASSERT
 1938 
 1939     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1940     bool is_oop = true;
 1941     int element_size = UseCompressedOops ? 4 : 8;
 1942     if (dest_uninitialized) {
 1943       decorators |= IS_DEST_UNINITIALIZED;
 1944     }
 1945 
 1946     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1947     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1948 
 1949     // save the original count
 1950     __ mov(count_save, count);
 1951 
 1952     // Copy from low to high addresses
 1953     __ mov(start_to, to);              // Save destination array start address
 1954     __ b(L_load_element);
 1955 
 1956     // ======== begin loop ========
 1957     // (Loop is rotated; its entry is L_load_element.)
 1958     // Loop control:
 1959     //   for (; count != 0; count--) {
 1960     //     copied_oop = load_heap_oop(from++);
 1961     //     ... generate_type_check ...;
 1962     //     store_heap_oop(to++, copied_oop);
 1963     //   }
 1964     __ align(OptoLoopAlignment);
 1965 
 1966     __ BIND(L_store_element);
 1967     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1968                       __ post(to, element_size), copied_oop, noreg,
 1969                       gct1, gct2, gct3);
 1970     __ sub(count, count, 1);
 1971     __ cbz(count, L_do_card_marks);
 1972 
 1973     // ======== loop entry is here ========
 1974     __ BIND(L_load_element);
 1975     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1976                      copied_oop, noreg, __ post(from, element_size),
 1977                      gct1);
 1978     __ cbz(copied_oop, L_store_element);
 1979 
 1980     __ load_klass(r19_klass, copied_oop);// query the object klass
 1981 
 1982     BLOCK_COMMENT("type_check:");
 1983     generate_type_check(/*sub_klass*/r19_klass,
 1984                         /*super_check_offset*/ckoff,
 1985                         /*super_klass*/ckval,
 1986                         /*r_array_base*/gct1,
 1987                         /*temp2*/gct2,
 1988                         /*result*/r10, L_store_element);
 1989 
 1990     // Fall through on failure!
 1991 
 1992     // ======== end loop ========
 1993 
 1994     // It was a real error; we must depend on the caller to finish the job.
 1995     // Register count = remaining oops, count_orig = total oops.
 1996     // Emit GC store barriers for the oops we have copied and report
 1997     // their number to the caller.
 1998 
 1999     __ subs(count, count_save, count);     // K = partially copied oop count
 2000     __ eon(count, count, zr);              // report (-1^K) to caller
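    // (eon with zr is a bitwise NOT, so e.g. K == 2 is reported as ~2 == -3;
    // the caller can recover K by inverting the returned value)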
 2001     __ br(Assembler::EQ, L_done_pop);
 2002 
 2003     __ BIND(L_do_card_marks);
 2004     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2005 
 2006     __ bind(L_done_pop);
 2007     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2008     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2009 
 2010     __ bind(L_done);
 2011     __ mov(r0, count);
 2012     __ leave();
 2013     __ ret(lr);
 2014 
 2015     return start;
 2016   }
 2017 
 2018   // Perform range checks on the proposed arraycopy.
 2019   // Kills temp, but nothing else.
 2020   // Also, clean the sign bits of src_pos and dst_pos.
 2021   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2022                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
 2024                               Register dst_pos, // destination position (c_rarg3)
 2025                               Register length,
 2026                               Register temp,
 2027                               Label& L_failed) {
 2028     BLOCK_COMMENT("arraycopy_range_checks:");
 2029 
 2030     assert_different_registers(rscratch1, temp);
 2031 
 2032     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2033     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2034     __ addw(temp, length, src_pos);
 2035     __ cmpw(temp, rscratch1);
 2036     __ br(Assembler::HI, L_failed);
 2037 
 2038     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2039     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2040     __ addw(temp, length, dst_pos);
 2041     __ cmpw(temp, rscratch1);
 2042     __ br(Assembler::HI, L_failed);
 2043 
 2044     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
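    // (a 32-bit movw with identical source and destination zero-extends the
    // value, clearing bits 63:32 of the X register)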
 2045     __ movw(src_pos, src_pos);
 2046     __ movw(dst_pos, dst_pos);
 2047 
 2048     BLOCK_COMMENT("arraycopy_range_checks done");
 2049   }
 2050 
 2051   // These stubs get called from some dumb test routine.
 2052   // I'll write them properly when they're called from
 2053   // something that's actually doing something.
 2054   static void fake_arraycopy_stub(address src, address dst, int count) {
 2055     assert(count == 0, "huh?");
 2056   }
 2057 
 2058 
 2059   //
 2060   //  Generate 'unsafe' array copy stub
 2061   //  Though just as safe as the other stubs, it takes an unscaled
 2062   //  size_t argument instead of an element count.
 2063   //
 2064   //  Input:
 2065   //    c_rarg0   - source array address
 2066   //    c_rarg1   - destination array address
 2067   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2068   //
 2069   // Examines the alignment of the operands and dispatches
 2070   // to a long, int, short, or byte copy loop.
 2071   //
 2072   address generate_unsafe_copy(address byte_copy_entry,
 2073                                address short_copy_entry,
 2074                                address int_copy_entry,
 2075                                address long_copy_entry) {
 2076     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2077 
 2078     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2079     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2080 
 2081     __ align(CodeEntryAlignment);
 2082     StubCodeMark mark(this, stub_id);
 2083     address start = __ pc();
 2084     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2085 
 2086     // bump this on entry, not on exit:
 2087     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2088 
 2089     __ orr(rscratch1, s, d);
 2090     __ orr(rscratch1, rscratch1, count);
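    // The lowest set bit of rscratch1 == (s | d | count) bounds the common
    // alignment of all three: a multiple of 8 lets us use the long copy, a
    // multiple of 4 the int copy, an even value the short copy, and anything
    // odd falls back to the byte copy.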
 2091 
 2092     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2093     __ cbz(rscratch1, L_long_aligned);
 2094     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2095     __ cbz(rscratch1, L_int_aligned);
 2096     __ tbz(rscratch1, 0, L_short_aligned);
 2097     __ b(RuntimeAddress(byte_copy_entry));
 2098 
 2099     __ BIND(L_short_aligned);
 2100     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2101     __ b(RuntimeAddress(short_copy_entry));
 2102     __ BIND(L_int_aligned);
 2103     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2104     __ b(RuntimeAddress(int_copy_entry));
 2105     __ BIND(L_long_aligned);
 2106     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2107     __ b(RuntimeAddress(long_copy_entry));
 2108 
 2109     return start;
 2110   }
 2111 
 2112   //
 2113   //  Generate generic array copy stubs
 2114   //
 2115   //  Input:
 2116   //    c_rarg0    -  src oop
 2117   //    c_rarg1    -  src_pos (32-bits)
 2118   //    c_rarg2    -  dst oop
 2119   //    c_rarg3    -  dst_pos (32-bits)
 2120   //    c_rarg4    -  element count (32-bits)
 2121   //
 2122   //  Output:
 2123   //    r0 ==  0  -  success
 2124   //    r0 == -1^K - failure, where K is partial transfer count
 2125   //
 2126   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2127                                 address int_copy_entry, address oop_copy_entry,
 2128                                 address long_copy_entry, address checkcast_copy_entry) {
 2129     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2130 
 2131     Label L_failed, L_objArray;
 2132     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2133 
 2134     // Input registers
 2135     const Register src        = c_rarg0;  // source array oop
 2136     const Register src_pos    = c_rarg1;  // source position
 2137     const Register dst        = c_rarg2;  // destination array oop
 2138     const Register dst_pos    = c_rarg3;  // destination position
 2139     const Register length     = c_rarg4;
 2140 
 2141 
 2142     // Registers used as temps
 2143     const Register dst_klass  = c_rarg5;
 2144 
 2145     __ align(CodeEntryAlignment);
 2146 
 2147     StubCodeMark mark(this, stub_id);
 2148 
 2149     address start = __ pc();
 2150 
 2151     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2152 
 2153     // bump this on entry, not on exit:
 2154     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2155 
 2156     //-----------------------------------------------------------------------
 2157     // Assembler stub will be used for this call to arraycopy
 2158     // if the following conditions are met:
 2159     //
 2160     // (1) src and dst must not be null.
 2161     // (2) src_pos must not be negative.
 2162     // (3) dst_pos must not be negative.
 2163     // (4) length  must not be negative.
 2164     // (5) src klass and dst klass should be the same and not null.
 2165     // (6) src and dst should be arrays.
 2166     // (7) src_pos + length must not exceed length of src.
 2167     // (8) dst_pos + length must not exceed length of dst.
 2168     //
 2169 
 2170     //  if (src == nullptr) return -1;
 2171     __ cbz(src, L_failed);
 2172 
 2173     //  if (src_pos < 0) return -1;
 2174     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2175 
 2176     //  if (dst == nullptr) return -1;
 2177     __ cbz(dst, L_failed);
 2178 
 2179     //  if (dst_pos < 0) return -1;
 2180     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2181 
 2182     // registers used as temp
 2183     const Register scratch_length    = r16; // elements count to copy
 2184     const Register scratch_src_klass = r17; // array klass
 2185     const Register lh                = r15; // layout helper
 2186 
 2187     //  if (length < 0) return -1;
 2188     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2189     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2190 
 2191     __ load_klass(scratch_src_klass, src);
 2192 #ifdef ASSERT
 2193     //  assert(src->klass() != nullptr);
 2194     {
 2195       BLOCK_COMMENT("assert klasses not null {");
 2196       Label L1, L2;
 2197       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2198       __ bind(L1);
 2199       __ stop("broken null klass");
 2200       __ bind(L2);
 2201       __ load_klass(rscratch1, dst);
 2202       __ cbz(rscratch1, L1);     // this would be broken also
 2203       BLOCK_COMMENT("} assert klasses not null done");
 2204     }
 2205 #endif
 2206 
 2207     // Load layout helper (32-bits)
 2208     //
 2209     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2210     // 32        30    24            16              8     2                 0
 2211     //
 2212     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2213     //
 2214 
 2215     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2216 
 2217     // Handle objArrays completely differently...
 2218     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2219     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2220     __ movw(rscratch1, objArray_lh);
 2221     __ eorw(rscratch2, lh, rscratch1);
 2222     __ cbzw(rscratch2, L_objArray);
 2223 
 2224     //  if (src->klass() != dst->klass()) return -1;
 2225     __ load_klass(rscratch2, dst);
 2226     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2227     __ cbnz(rscratch2, L_failed);
 2228 
 2229     //  if (!src->is_Array()) return -1;
 2230     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2231 
 2232     // At this point, it is known to be a typeArray (array_tag 0x3).
 2233 #ifdef ASSERT
 2234     {
 2235       BLOCK_COMMENT("assert primitive array {");
 2236       Label L;
 2237       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2238       __ cmpw(lh, rscratch2);
 2239       __ br(Assembler::GE, L);
 2240       __ stop("must be a primitive array");
 2241       __ bind(L);
 2242       BLOCK_COMMENT("} assert primitive array done");
 2243     }
 2244 #endif
 2245 
 2246     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2247                            rscratch2, L_failed);
 2248 
 2249     // TypeArrayKlass
 2250     //
 2251     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2252     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2253     //
 2254 
 2255     const Register rscratch1_offset = rscratch1;    // array offset
 2256     const Register r15_elsize = lh; // element size
 2257 
 2258     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2259            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2260     __ add(src, src, rscratch1_offset);           // src array offset
 2261     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2262     BLOCK_COMMENT("choose copy loop based on element size");
 2263 
 2264     // next registers should be set before the jump to corresponding stub
 2265     const Register from     = c_rarg0;  // source array address
 2266     const Register to       = c_rarg1;  // destination array address
 2267     const Register count    = c_rarg2;  // elements count
 2268 
    // 'from', 'to', 'count' must be set in this order since they occupy the
    // same registers as 'src', 'src_pos', 'dst', which are read before being
    // overwritten.
 2271 
 2272     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2273 
 2274     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2275     // size in bytes).  We do a simple bitwise binary search.
 2276   __ BIND(L_copy_bytes);
 2277     __ tbnz(r15_elsize, 1, L_copy_ints);
 2278     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2279     __ lea(from, Address(src, src_pos));// src_addr
 2280     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2281     __ movw(count, scratch_length); // length
 2282     __ b(RuntimeAddress(byte_copy_entry));
 2283 
 2284   __ BIND(L_copy_shorts);
 2285     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2286     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2287     __ movw(count, scratch_length); // length
 2288     __ b(RuntimeAddress(short_copy_entry));
 2289 
 2290   __ BIND(L_copy_ints);
 2291     __ tbnz(r15_elsize, 0, L_copy_longs);
 2292     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2293     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2294     __ movw(count, scratch_length); // length
 2295     __ b(RuntimeAddress(int_copy_entry));
 2296 
 2297   __ BIND(L_copy_longs);
 2298 #ifdef ASSERT
 2299     {
 2300       BLOCK_COMMENT("assert long copy {");
 2301       Label L;
 2302       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2303       __ cmpw(r15_elsize, LogBytesPerLong);
 2304       __ br(Assembler::EQ, L);
 2305       __ stop("must be long copy, but elsize is wrong");
 2306       __ bind(L);
 2307       BLOCK_COMMENT("} assert long copy done");
 2308     }
 2309 #endif
 2310     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2311     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2312     __ movw(count, scratch_length); // length
 2313     __ b(RuntimeAddress(long_copy_entry));
 2314 
 2315     // ObjArrayKlass
 2316   __ BIND(L_objArray);
 2317     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2318 
 2319     Label L_plain_copy, L_checkcast_copy;
 2320     //  test array classes for subtyping
 2321     __ load_klass(r15, dst);
 2322     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2323     __ br(Assembler::NE, L_checkcast_copy);
 2324 
 2325     // Identically typed arrays can be copied without element-wise checks.
 2326     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2327                            rscratch2, L_failed);
 2328 
 2329     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2330     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2331     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2332     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2333     __ movw(count, scratch_length); // length
 2334   __ BIND(L_plain_copy);
 2335     __ b(RuntimeAddress(oop_copy_entry));
 2336 
 2337   __ BIND(L_checkcast_copy);
 2338     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2339     {
 2340       // Before looking at dst.length, make sure dst is also an objArray.
 2341       __ ldrw(rscratch1, Address(r15, lh_offset));
 2342       __ movw(rscratch2, objArray_lh);
 2343       __ eorw(rscratch1, rscratch1, rscratch2);
 2344       __ cbnzw(rscratch1, L_failed);
 2345 
 2346       // It is safe to examine both src.length and dst.length.
 2347       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2348                              r15, L_failed);
 2349 
 2350       __ load_klass(dst_klass, dst); // reload
 2351 
 2352       // Marshal the base address arguments now, freeing registers.
 2353       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2354       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2355       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2356       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2357       __ movw(count, length);           // length (reloaded)
 2358       Register sco_temp = c_rarg3;      // this register is free now
 2359       assert_different_registers(from, to, count, sco_temp,
 2360                                  dst_klass, scratch_src_klass);
 2361       // assert_clean_int(count, sco_temp);
 2362 
 2363       // Generate the type check.
 2364       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2365       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2366 
 2367       // Smashes rscratch1, rscratch2
 2368       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2369                           L_plain_copy);
 2370 
 2371       // Fetch destination element klass from the ObjArrayKlass header.
 2372       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2373       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2374       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2375 
 2376       // the checkcast_copy loop needs two extra arguments:
 2377       assert(c_rarg3 == sco_temp, "#3 already in place");
 2378       // Set up arguments for checkcast_copy_entry.
 2379       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2380       __ b(RuntimeAddress(checkcast_copy_entry));
 2381     }
 2382 
 2383   __ BIND(L_failed);
 2384     __ mov(r0, -1);
 2385     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2386     __ ret(lr);
 2387 
 2388     return start;
 2389   }
 2390 
 2391   //
 2392   // Generate stub for array fill. If "aligned" is true, the
 2393   // "to" address is assumed to be heapword aligned.
 2394   //
 2395   // Arguments for generated stub:
 2396   //   to:    c_rarg0
 2397   //   value: c_rarg1
 2398   //   count: c_rarg2 treated as signed
 2399   //
 2400   address generate_fill(StubId stub_id) {
 2401     BasicType t;
 2402     bool aligned;
 2403 
 2404     switch (stub_id) {
 2405     case StubId::stubgen_jbyte_fill_id:
 2406       t = T_BYTE;
 2407       aligned = false;
 2408       break;
 2409     case StubId::stubgen_jshort_fill_id:
 2410       t = T_SHORT;
 2411       aligned = false;
 2412       break;
 2413     case StubId::stubgen_jint_fill_id:
 2414       t = T_INT;
 2415       aligned = false;
 2416       break;
 2417     case StubId::stubgen_arrayof_jbyte_fill_id:
 2418       t = T_BYTE;
 2419       aligned = true;
 2420       break;
 2421     case StubId::stubgen_arrayof_jshort_fill_id:
 2422       t = T_SHORT;
 2423       aligned = true;
 2424       break;
 2425     case StubId::stubgen_arrayof_jint_fill_id:
 2426       t = T_INT;
 2427       aligned = true;
 2428       break;
 2429     default:
 2430       ShouldNotReachHere();
 2431     };
 2432 
 2433     __ align(CodeEntryAlignment);
 2434     StubCodeMark mark(this, stub_id);
 2435     address start = __ pc();
 2436 
 2437     BLOCK_COMMENT("Entry:");
 2438 
 2439     const Register to        = c_rarg0;  // source array address
 2440     const Register value     = c_rarg1;  // value
 2441     const Register count     = c_rarg2;  // elements count
 2442 
 2443     const Register bz_base = r10;        // base for block_zero routine
 2444     const Register cnt_words = r11;      // temp register
 2445 
 2446     __ enter();
 2447 
 2448     Label L_fill_elements, L_exit1;
 2449 
 2450     int shift = -1;
 2451     switch (t) {
 2452       case T_BYTE:
 2453         shift = 0;
 2454         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2455         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2456         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2457         __ br(Assembler::LO, L_fill_elements);
 2458         break;
 2459       case T_SHORT:
 2460         shift = 1;
 2461         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2462         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2463         __ br(Assembler::LO, L_fill_elements);
 2464         break;
 2465       case T_INT:
 2466         shift = 2;
 2467         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2468         __ br(Assembler::LO, L_fill_elements);
 2469         break;
 2470       default: ShouldNotReachHere();
 2471     }
 2472 
 2473     // Align source address at 8 bytes address boundary.
 2474     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2475     if (!aligned) {
 2476       switch (t) {
 2477         case T_BYTE:
 2478           // One byte misalignment happens only for byte arrays.
 2479           __ tbz(to, 0, L_skip_align1);
 2480           __ strb(value, Address(__ post(to, 1)));
 2481           __ subw(count, count, 1);
 2482           __ bind(L_skip_align1);
 2483           // Fallthrough
 2484         case T_SHORT:
 2485           // Two bytes misalignment happens only for byte and short (char) arrays.
 2486           __ tbz(to, 1, L_skip_align2);
 2487           __ strh(value, Address(__ post(to, 2)));
 2488           __ subw(count, count, 2 >> shift);
 2489           __ bind(L_skip_align2);
 2490           // Fallthrough
 2491         case T_INT:
 2492           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2493           __ tbz(to, 2, L_skip_align4);
 2494           __ strw(value, Address(__ post(to, 4)));
 2495           __ subw(count, count, 4 >> shift);
 2496           __ bind(L_skip_align4);
 2497           break;
 2498         default: ShouldNotReachHere();
 2499       }
 2500     }
 2501 
 2502     //
 2503     //  Fill large chunks
 2504     //
 2505     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2506     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2507     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2508     if (UseBlockZeroing) {
 2509       Label non_block_zeroing, rest;
 2510       // If the fill value is zero we can use the fast zero_words().
 2511       __ cbnz(value, non_block_zeroing);
 2512       __ mov(bz_base, to);
 2513       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2514       address tpc = __ zero_words(bz_base, cnt_words);
 2515       if (tpc == nullptr) {
 2516         fatal("CodeCache is full at generate_fill");
 2517       }
 2518       __ b(rest);
 2519       __ bind(non_block_zeroing);
 2520       __ fill_words(to, cnt_words, value);
 2521       __ bind(rest);
 2522     } else {
 2523       __ fill_words(to, cnt_words, value);
 2524     }
 2525 
 2526     // Remaining count is less than 8 bytes. Fill it by a single store.
 2527     // Note that the total length is no less than 8 bytes.
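    // For illustration, with T_BYTE and 5 trailing bytes: 'to' is advanced to
    // the end of the region and the 8-byte store below writes the last 5 new
    // bytes plus 3 bytes that were already filled with the same pattern.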
 2528     if (t == T_BYTE || t == T_SHORT) {
 2529       Label L_exit1;
 2530       __ cbzw(count, L_exit1);
 2531       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2532       __ str(value, Address(to, -8));    // overwrite some elements
 2533       __ bind(L_exit1);
 2534       __ leave();
 2535       __ ret(lr);
 2536     }
 2537 
    // Handle fills of less than 8 bytes.
 2539     Label L_fill_2, L_fill_4, L_exit2;
 2540     __ bind(L_fill_elements);
 2541     switch (t) {
 2542       case T_BYTE:
 2543         __ tbz(count, 0, L_fill_2);
 2544         __ strb(value, Address(__ post(to, 1)));
 2545         __ bind(L_fill_2);
 2546         __ tbz(count, 1, L_fill_4);
 2547         __ strh(value, Address(__ post(to, 2)));
 2548         __ bind(L_fill_4);
 2549         __ tbz(count, 2, L_exit2);
 2550         __ strw(value, Address(to));
 2551         break;
 2552       case T_SHORT:
 2553         __ tbz(count, 0, L_fill_4);
 2554         __ strh(value, Address(__ post(to, 2)));
 2555         __ bind(L_fill_4);
 2556         __ tbz(count, 1, L_exit2);
 2557         __ strw(value, Address(to));
 2558         break;
 2559       case T_INT:
 2560         __ cbzw(count, L_exit2);
 2561         __ strw(value, Address(to));
 2562         break;
 2563       default: ShouldNotReachHere();
 2564     }
 2565     __ bind(L_exit2);
 2566     __ leave();
 2567     __ ret(lr);
 2568     return start;
 2569   }
 2570 
 2571   address generate_unsafecopy_common_error_exit() {
 2572     address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
 2576     return start_pc;
 2577   }
 2578 
 2579   //
 2580   //  Generate 'unsafe' set memory stub
 2581   //  Though just as safe as the other stubs, it takes an unscaled
 2582   //  size_t (# bytes) argument instead of an element count.
 2583   //
 2584   //  This fill operation is atomicity preserving: as long as the
 2585   //  address supplied is sufficiently aligned, all writes of up to 64
 2586   //  bits in size are single-copy atomic.
 2587   //
 2588   //  Input:
 2589   //    c_rarg0   - destination array address
 2590   //    c_rarg1   - byte count (size_t)
 2591   //    c_rarg2   - byte value
 2592   //
 2593   address generate_unsafe_setmemory() {
 2594     __ align(CodeEntryAlignment);
 2595     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2596     address start = __ pc();
 2597 
 2598     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2599     Label tail;
 2600 
 2601     UnsafeMemoryAccessMark umam(this, true, false);
 2602 
 2603     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2604 
 2605     __ dup(v0, __ T16B, value);
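    // Replicate the low byte of 'value' into all 16 lanes of v0 so the
    // stores below can write the pattern 16 or 32 bytes at a time.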
 2606 
 2607     if (AvoidUnalignedAccesses) {
 2608       __ cmp(count, (u1)16);
 2609       __ br(__ LO, tail);
 2610 
 2611       __ mov(rscratch1, 16);
 2612       __ andr(rscratch2, dest, 15);
 2613       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2614       __ strq(v0, Address(dest));
 2615       __ sub(count, count, rscratch1);
 2616       __ add(dest, dest, rscratch1);
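      // For illustration: with dest == 0x1003 the store above writes 16 bytes
      // at the unaligned address, then dest advances by 13 to the next 16-byte
      // boundary (0x1010) and count drops by 13; an already aligned dest
      // simply skips past the 16 bytes it has just written.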
 2617     }
 2618 
 2619     __ subs(count, count, (u1)64);
 2620     __ br(__ LO, tail);
 2621     {
 2622       Label again;
 2623       __ bind(again);
 2624       __ stpq(v0, v0, Address(dest));
 2625       __ stpq(v0, v0, Address(dest, 32));
 2626 
 2627       __ subs(count, count, 64);
 2628       __ add(dest, dest, 64);
 2629       __ br(__ HS, again);
 2630     }
 2631 
 2632     __ bind(tail);
 2633     // The count of bytes is off by 64, but we don't need to correct
 2634     // it because we're only going to use the least-significant few
 2635     // count bits from here on.
 2636     // __ add(count, count, 64);
 2637 
 2638     {
 2639       Label dont;
 2640       __ tbz(count, exact_log2(32), dont);
 2641       __ stpq(v0, v0, __ post(dest, 32));
 2642       __ bind(dont);
 2643     }
 2644     {
 2645       Label dont;
 2646       __ tbz(count, exact_log2(16), dont);
 2647       __ strq(v0, __ post(dest, 16));
 2648       __ bind(dont);
 2649     }
 2650     {
 2651       Label dont;
 2652       __ tbz(count, exact_log2(8), dont);
 2653       __ strd(v0, __ post(dest, 8));
 2654       __ bind(dont);
 2655     }
 2656 
 2657     Label finished;
 2658     __ tst(count, 7);
 2659     __ br(__ EQ, finished);
 2660 
 2661     {
 2662       Label dont;
 2663       __ tbz(count, exact_log2(4), dont);
 2664       __ strs(v0, __ post(dest, 4));
 2665       __ bind(dont);
 2666     }
 2667     {
 2668       Label dont;
 2669       __ tbz(count, exact_log2(2), dont);
 2670       __ bfi(value, value, 8, 8);
 2671       __ strh(value, __ post(dest, 2));
 2672       __ bind(dont);
 2673     }
 2674     {
 2675       Label dont;
 2676       __ tbz(count, exact_log2(1), dont);
 2677       __ strb(value, Address(dest));
 2678       __ bind(dont);
 2679     }
 2680 
 2681     __ bind(finished);
 2682     __ leave();
 2683     __ ret(lr);
 2684 
 2685     return start;
 2686   }
 2687 
 2688   address generate_data_cache_writeback() {
 2689     const Register line        = c_rarg0;  // address of line to write back
 2690 
 2691     __ align(CodeEntryAlignment);
 2692 
 2693     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2694     StubCodeMark mark(this, stub_id);
 2695 
 2696     address start = __ pc();
 2697     __ enter();
 2698     __ cache_wb(Address(line, 0));
 2699     __ leave();
 2700     __ ret(lr);
 2701 
 2702     return start;
 2703   }
 2704 
 2705   address generate_data_cache_writeback_sync() {
 2706     const Register is_pre     = c_rarg0;  // pre or post sync
 2707 
 2708     __ align(CodeEntryAlignment);
 2709 
 2710     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2711     StubCodeMark mark(this, stub_id);
 2712 
 2713     // pre wbsync is a no-op
    // post wbsync translates to a memory barrier (the AArch64 analogue of an sfence)
 2715 
 2716     Label skip;
 2717     address start = __ pc();
 2718     __ enter();
 2719     __ cbnz(is_pre, skip);
 2720     __ cache_wbsync(false);
 2721     __ bind(skip);
 2722     __ leave();
 2723     __ ret(lr);
 2724 
 2725     return start;
 2726   }
 2727 
 2728   void generate_arraycopy_stubs() {
 2729     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2730     // entry immediately following their stack push. This can be used
 2731     // as a post-push branch target for compatible stubs when they
 2732     // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special-case
    // fallback for its compatible conjoint copy stub.
 2735     //
    // A nopush entry is always returned in the following local and
 2737     // then published by assigning to the appropriate entry field in
 2738     // class StubRoutines. The entry value is then passed to the
 2739     // generator for the compatible stub. That means the entry must be
 2740     // listed when saving to/restoring from the AOT cache, ensuring
 2741     // that the inter-stub jumps are noted at AOT-cache save and
 2742     // relocated at AOT cache load.
 2743     address nopush_entry;
 2744 
 2745     // generate the common exit first so later stubs can rely on it if
 2746     // they want an UnsafeMemoryAccess exit non-local to the stub
 2747     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2748     // register the stub as the default exit with class UnsafeMemoryAccess
 2749     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2750 
    // generate and publish aarch64-specific bulk copy routines first
 2752     // so we can call them from other copy stubs
 2753     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2754     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2755 
 2756     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2757     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2758 
 2759     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2760     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2761 
 2762     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2763 
 2764     //*** jbyte
 2765     // Always need aligned and unaligned versions
 2766     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2767     // disjoint nopush entry is needed by conjoint copy
 2768     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2769     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2770     // conjoint nopush entry is needed by generic/unsafe copy
 2771     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2772     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2773     // disjoint arrayof nopush entry is needed by conjoint copy
 2774     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2775     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2776 
 2777     //*** jshort
 2778     // Always need aligned and unaligned versions
 2779     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2780     // disjoint nopush entry is needed by conjoint copy
 2781     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2782     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2783     // conjoint nopush entry is used by generic/unsafe copy
 2784     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2785     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2786     // disjoint arrayof nopush entry is needed by conjoint copy
 2787     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2788     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2789 
 2790     //*** jint
 2791     // Aligned versions
 2792     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2793     // disjoint arrayof nopush entry is needed by conjoint copy
 2794     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2795     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2796     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
 2797     // jint_arraycopy_nopush always points to the unaligned version.
 2798     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2799     // disjoint nopush entry is needed by conjoint copy
 2800     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2801     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2802     // conjoint nopush entry is needed by generic/unsafe copy
 2803     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2804 
 2805     //*** jlong
 2806     // It is always aligned
 2807     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2808     // disjoint arrayof nopush entry is needed by conjoint copy
 2809     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2810     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2811     // conjoint nopush entry is needed by generic/unsafe copy
 2812     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2813     // disjoint normal/nopush and conjoint normal entries are not
 2814     // generated since the arrayof versions are the same
 2815     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2816     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2817     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2818 
 2819     //*** oops
 2820     {
 2821       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2822         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2823       // disjoint arrayof nopush entry is needed by conjoint copy
 2824       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2825       StubRoutines::_arrayof_oop_arraycopy
 2826         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2827       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2828       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2829       // Aligned versions without pre-barriers
 2830       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2831         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2832       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2833       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2834       // note that we don't need a returned nopush entry because the
 2835       // generic/unsafe copy does not cater for uninit arrays.
 2836       StubRoutines::_arrayof_oop_arraycopy_uninit
 2837         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2838     }
 2839 
 2840     // for oop copies reuse arrayof entries for non-arrayof cases
 2841     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2842     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2843     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2844     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2845     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2846     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2847 
 2848     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2849     // checkcast nopush entry is needed by generic copy
 2850     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2851     // note that we don't need a returned nopush entry because the
 2852     // generic copy does not cater for uninit arrays.
 2853     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2854 
 2855     // unsafe arraycopy may fall back on conjoint stubs
 2856     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2857                                                               StubRoutines::_jshort_arraycopy_nopush,
 2858                                                               StubRoutines::_jint_arraycopy_nopush,
 2859                                                               StubRoutines::_jlong_arraycopy_nopush);
 2860 
 2861     // generic arraycopy may fall back on conjoint stubs
 2862     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2863                                                                StubRoutines::_jshort_arraycopy_nopush,
 2864                                                                StubRoutines::_jint_arraycopy_nopush,
 2865                                                                StubRoutines::_oop_arraycopy_nopush,
 2866                                                                StubRoutines::_jlong_arraycopy_nopush,
 2867                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2868 
 2869     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2870     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2871     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2872     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2873     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2874     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2875   }
 2876 
 2877   void generate_math_stubs() { Unimplemented(); }
 2878 
 2879   // Arguments:
 2880   //
 2881   // Inputs:
 2882   //   c_rarg0   - source byte array address
 2883   //   c_rarg1   - destination byte array address
 2884   //   c_rarg2   - K (key) in little endian int array
 2885   //
 2886   address generate_aescrypt_encryptBlock() {
          assert(UseAES, "need AES cryptographic extension support");
 2887     __ align(CodeEntryAlignment);
 2888     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2889     StubCodeMark mark(this, stub_id);
 2890 
 2891     const Register from        = c_rarg0;  // source array address
 2892     const Register to          = c_rarg1;  // destination array address
 2893     const Register key         = c_rarg2;  // key array address
 2894     const Register keylen      = rscratch1;
 2895 
 2896     address start = __ pc();
 2897     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2898 
 2899     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2900 
 2901     __ aesenc_loadkeys(key, keylen);
 2902     __ aesecb_encrypt(from, to, keylen);
 2903 
 2904     __ mov(r0, 0);
 2905 
 2906     __ leave();
 2907     __ ret(lr);
 2908 
 2909     return start;
 2910   }
 2911 
 2912   // Arguments:
 2913   //
 2914   // Inputs:
 2915   //   c_rarg0   - source byte array address
 2916   //   c_rarg1   - destination byte array address
 2917   //   c_rarg2   - K (key) in little endian int array
 2918   //
 2919   address generate_aescrypt_decryptBlock() {
 2920     assert(UseAES, "need AES cryptographic extension support");
 2921     __ align(CodeEntryAlignment);
 2922     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2923     StubCodeMark mark(this, stub_id);
 2924     Label L_doLast;
 2925 
 2926     const Register from        = c_rarg0;  // source array address
 2927     const Register to          = c_rarg1;  // destination array address
 2928     const Register key         = c_rarg2;  // key array address
 2929     const Register keylen      = rscratch1;
 2930 
 2931     address start = __ pc();
 2932     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2933 
 2934     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2935 
 2936     __ aesecb_decrypt(from, to, key, keylen);
 2937 
 2938     __ mov(r0, 0);
 2939 
 2940     __ leave();
 2941     __ ret(lr);
 2942 
 2943     return start;
 2944   }
 2945 
 2946   // Arguments:
 2947   //
 2948   // Inputs:
 2949   //   c_rarg0   - source byte array address
 2950   //   c_rarg1   - destination byte array address
 2951   //   c_rarg2   - K (key) in little endian int array
 2952   //   c_rarg3   - r vector byte array address
 2953   //   c_rarg4   - input length
 2954   //
 2955   // Output:
 2956   //   r0        - input length
 2957   //
 2958   address generate_cipherBlockChaining_encryptAESCrypt() {
 2959     assert(UseAES, "need AES cryptographic extension support");
 2960     __ align(CodeEntryAlignment);
 2961     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2962     StubCodeMark mark(this, stub_id);
 2963 
 2964     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2965 
 2966     const Register from        = c_rarg0;  // source array address
 2967     const Register to          = c_rarg1;  // destination array address
 2968     const Register key         = c_rarg2;  // key array address
 2969     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2970                                            // and left holding the last encrypted block
 2971     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2972     const Register keylen      = rscratch1;
 2973 
 2974     address start = __ pc();
 2975 
 2976       __ enter();
 2977 
 2978       __ movw(rscratch2, len_reg);
 2979 
 2980       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2981 
 2982       __ ld1(v0, __ T16B, rvec);
 2983 
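            // keylen holds the length of the expanded key int array:
            // 44 words for AES-128, 52 for AES-192, 60 for AES-256.
            // Shorter keys branch past loading the extra round keys.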
 2984       __ cmpw(keylen, 52);
 2985       __ br(Assembler::CC, L_loadkeys_44);
 2986       __ br(Assembler::EQ, L_loadkeys_52);
 2987 
 2988       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2989       __ rev32(v17, __ T16B, v17);
 2990       __ rev32(v18, __ T16B, v18);
 2991     __ BIND(L_loadkeys_52);
 2992       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2993       __ rev32(v19, __ T16B, v19);
 2994       __ rev32(v20, __ T16B, v20);
 2995     __ BIND(L_loadkeys_44);
 2996       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2997       __ rev32(v21, __ T16B, v21);
 2998       __ rev32(v22, __ T16B, v22);
 2999       __ rev32(v23, __ T16B, v23);
 3000       __ rev32(v24, __ T16B, v24);
 3001       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3002       __ rev32(v25, __ T16B, v25);
 3003       __ rev32(v26, __ T16B, v26);
 3004       __ rev32(v27, __ T16B, v27);
 3005       __ rev32(v28, __ T16B, v28);
 3006       __ ld1(v29, v30, v31, __ T16B, key);
 3007       __ rev32(v29, __ T16B, v29);
 3008       __ rev32(v30, __ T16B, v30);
 3009       __ rev32(v31, __ T16B, v31);
 3010 
 3011     __ BIND(L_aes_loop);
 3012       __ ld1(v1, __ T16B, __ post(from, 16));
 3013       __ eor(v0, __ T16B, v0, v1);
 3014 
 3015       __ br(Assembler::CC, L_rounds_44);
 3016       __ br(Assembler::EQ, L_rounds_52);
 3017 
 3018       __ aese(v0, v17); __ aesmc(v0, v0);
 3019       __ aese(v0, v18); __ aesmc(v0, v0);
 3020     __ BIND(L_rounds_52);
 3021       __ aese(v0, v19); __ aesmc(v0, v0);
 3022       __ aese(v0, v20); __ aesmc(v0, v0);
 3023     __ BIND(L_rounds_44);
 3024       __ aese(v0, v21); __ aesmc(v0, v0);
 3025       __ aese(v0, v22); __ aesmc(v0, v0);
 3026       __ aese(v0, v23); __ aesmc(v0, v0);
 3027       __ aese(v0, v24); __ aesmc(v0, v0);
 3028       __ aese(v0, v25); __ aesmc(v0, v0);
 3029       __ aese(v0, v26); __ aesmc(v0, v0);
 3030       __ aese(v0, v27); __ aesmc(v0, v0);
 3031       __ aese(v0, v28); __ aesmc(v0, v0);
 3032       __ aese(v0, v29); __ aesmc(v0, v0);
 3033       __ aese(v0, v30);
 3034       __ eor(v0, __ T16B, v0, v31);
 3035 
 3036       __ st1(v0, __ T16B, __ post(to, 16));
 3037 
 3038       __ subw(len_reg, len_reg, 16);
 3039       __ cbnzw(len_reg, L_aes_loop);
 3040 
 3041       __ st1(v0, __ T16B, rvec);
 3042 
 3043       __ mov(r0, rscratch2);
 3044 
 3045       __ leave();
 3046       __ ret(lr);
 3047 
 3048       return start;
 3049   }
 3050 
 3051   // Arguments:
 3052   //
 3053   // Inputs:
 3054   //   c_rarg0   - source byte array address
 3055   //   c_rarg1   - destination byte array address
 3056   //   c_rarg2   - K (key) in little endian int array
 3057   //   c_rarg3   - r vector byte array address
 3058   //   c_rarg4   - input length
 3059   //
 3060   // Output:
 3061   //   r0        - input length
 3062   //
 3063   address generate_cipherBlockChaining_decryptAESCrypt() {
 3064     assert(UseAES, "need AES cryptographic extension support");
 3065     __ align(CodeEntryAlignment);
 3066     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3067     StubCodeMark mark(this, stub_id);
 3068 
 3069     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3070 
 3071     const Register from        = c_rarg0;  // source array address
 3072     const Register to          = c_rarg1;  // destination array address
 3073     const Register key         = c_rarg2;  // key array address
 3074     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3075                                            // and left holding the last ciphertext block processed
 3076     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3077     const Register keylen      = rscratch1;
 3078 
 3079     address start = __ pc();
 3080 
 3081       __ enter();
 3082 
 3083       __ movw(rscratch2, len_reg);
 3084 
 3085       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3086 
 3087       __ ld1(v2, __ T16B, rvec);
 3088 
 3089       __ ld1(v31, __ T16B, __ post(key, 16));
 3090       __ rev32(v31, __ T16B, v31);
 3091 
 3092       __ cmpw(keylen, 52);
 3093       __ br(Assembler::CC, L_loadkeys_44);
 3094       __ br(Assembler::EQ, L_loadkeys_52);
 3095 
 3096       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3097       __ rev32(v17, __ T16B, v17);
 3098       __ rev32(v18, __ T16B, v18);
 3099     __ BIND(L_loadkeys_52);
 3100       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3101       __ rev32(v19, __ T16B, v19);
 3102       __ rev32(v20, __ T16B, v20);
 3103     __ BIND(L_loadkeys_44);
 3104       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3105       __ rev32(v21, __ T16B, v21);
 3106       __ rev32(v22, __ T16B, v22);
 3107       __ rev32(v23, __ T16B, v23);
 3108       __ rev32(v24, __ T16B, v24);
 3109       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3110       __ rev32(v25, __ T16B, v25);
 3111       __ rev32(v26, __ T16B, v26);
 3112       __ rev32(v27, __ T16B, v27);
 3113       __ rev32(v28, __ T16B, v28);
 3114       __ ld1(v29, v30, __ T16B, key);
 3115       __ rev32(v29, __ T16B, v29);
 3116       __ rev32(v30, __ T16B, v30);
 3117 
 3118     __ BIND(L_aes_loop);
 3119       __ ld1(v0, __ T16B, __ post(from, 16));
 3120       __ orr(v1, __ T16B, v0, v0);
 3121 
 3122       __ br(Assembler::CC, L_rounds_44);
 3123       __ br(Assembler::EQ, L_rounds_52);
 3124 
 3125       __ aesd(v0, v17); __ aesimc(v0, v0);
 3126       __ aesd(v0, v18); __ aesimc(v0, v0);
 3127     __ BIND(L_rounds_52);
 3128       __ aesd(v0, v19); __ aesimc(v0, v0);
 3129       __ aesd(v0, v20); __ aesimc(v0, v0);
 3130     __ BIND(L_rounds_44);
 3131       __ aesd(v0, v21); __ aesimc(v0, v0);
 3132       __ aesd(v0, v22); __ aesimc(v0, v0);
 3133       __ aesd(v0, v23); __ aesimc(v0, v0);
 3134       __ aesd(v0, v24); __ aesimc(v0, v0);
 3135       __ aesd(v0, v25); __ aesimc(v0, v0);
 3136       __ aesd(v0, v26); __ aesimc(v0, v0);
 3137       __ aesd(v0, v27); __ aesimc(v0, v0);
 3138       __ aesd(v0, v28); __ aesimc(v0, v0);
 3139       __ aesd(v0, v29); __ aesimc(v0, v0);
 3140       __ aesd(v0, v30);
 3141       __ eor(v0, __ T16B, v0, v31);
 3142       __ eor(v0, __ T16B, v0, v2);
 3143 
 3144       __ st1(v0, __ T16B, __ post(to, 16));
 3145       __ orr(v2, __ T16B, v1, v1);
 3146 
 3147       __ subw(len_reg, len_reg, 16);
 3148       __ cbnzw(len_reg, L_aes_loop);
 3149 
 3150       __ st1(v2, __ T16B, rvec);
 3151 
 3152       __ mov(r0, rscratch2);
 3153 
 3154       __ leave();
 3155       __ ret(lr);
 3156 
 3157     return start;
 3158   }
 3159 
 3160   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3161   // Inputs: in, a 128-bit value (preserved).
 3162   // The least-significant 64-bit word is in the upper dword of each vector.
 3163   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3164   // Output: result
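        // Sketch of the carry trick: the unsigned compare leaves an
        // all-ones mask in the lane whose addition overflowed; after
        // swapping the two 64-bit lanes, subtracting that mask (i.e.
        // subtracting -1) carries a one into the most-significant dword.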
 3165   void be_add_128_64(FloatRegister result, FloatRegister in,
 3166                      FloatRegister inc, FloatRegister tmp) {
 3167     assert_different_registers(result, tmp, inc);
 3168 
 3169     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3170                                            // input
 3171     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3172     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3173                                            // MSD == 0 (must be!) to LSD
 3174     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3175   }
 3176 
 3177   // CTR AES crypt.
 3178   // Arguments:
 3179   //
 3180   // Inputs:
 3181   //   c_rarg0   - source byte array address
 3182   //   c_rarg1   - destination byte array address
 3183   //   c_rarg2   - K (key) in little endian int array
 3184   //   c_rarg3   - counter vector byte array address
 3185   //   c_rarg4   - input length
 3186   //   c_rarg5   - saved encryptedCounter start
 3187   //   c_rarg6   - saved used length
 3188   //
 3189   // Output:
 3190   //   r0       - input length
 3191   //
 3192   address generate_counterMode_AESCrypt() {
 3193     const Register in = c_rarg0;
 3194     const Register out = c_rarg1;
 3195     const Register key = c_rarg2;
 3196     const Register counter = c_rarg3;
 3197     const Register saved_len = c_rarg4, len = r10;
 3198     const Register saved_encrypted_ctr = c_rarg5;
 3199     const Register used_ptr = c_rarg6, used = r12;
 3200 
 3201     const Register offset = r7;
 3202     const Register keylen = r11;
 3203 
 3204     const unsigned char block_size = 16;
 3205     const int bulk_width = 4;
 3206     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3207     // performance with larger data sizes, but it also means that the
 3208     // fast path isn't used until there are at least 8 blocks, and up
 3209     // to 127 bytes of data will be processed on the slow path. For
 3210     // that reason, and also so as not to blow away too much icache, 4
 3211     // blocks seems like a sensible compromise.
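          // (With bulk_width == 4 the wide path kicks in at 64 bytes and
          // at most 63 trailing bytes go through the block/byte loops.)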
 3212 
 3213     // Algorithm:
 3214     //
 3215     //    if (len == 0) {
 3216     //        goto DONE;
 3217     //    }
 3218     //    int result = len;
 3219     //    do {
 3220     //        if (used >= blockSize) {
 3221     //            if (len >= bulk_width * blockSize) {
 3222     //                CTR_large_block();
 3223     //                if (len == 0)
 3224     //                    goto DONE;
 3225     //            }
 3226     //            for (;;) {
 3227     //                16ByteVector v0 = counter;
 3228     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3229     //                used = 0;
 3230     //                if (len < blockSize)
 3231     //                    break;    /* goto NEXT */
 3232     //                16ByteVector v1 = load16Bytes(in, offset);
 3233     //                v1 = v1 ^ encryptedCounter;
 3234     //                store16Bytes(out, offset);
 3235     //                used = blockSize;
 3236     //                offset += blockSize;
 3237     //                len -= blockSize;
 3238     //                if (len == 0)
 3239     //                    goto DONE;
 3240     //            }
 3241     //        }
 3242     //      NEXT:
 3243     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3244     //        len--;
 3245     //    } while (len != 0);
 3246     //  DONE:
 3247     //    return result;
 3248     //
 3249     // CTR_large_block()
 3250     //    Wide bulk encryption of whole blocks.
 3251 
 3252     __ align(CodeEntryAlignment);
 3253     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3254     StubCodeMark mark(this, stub_id);
 3255     const address start = __ pc();
 3256     __ enter();
 3257 
 3258     Label DONE, CTR_large_block, large_block_return;
 3259     __ ldrw(used, Address(used_ptr));
 3260     __ cbzw(saved_len, DONE);
 3261 
 3262     __ mov(len, saved_len);
 3263     __ mov(offset, 0);
 3264 
 3265     // Compute #rounds for AES based on the length of the key array
 3266     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3267 
 3268     __ aesenc_loadkeys(key, keylen);
 3269 
 3270     {
 3271       Label L_CTR_loop, NEXT;
 3272 
 3273       __ bind(L_CTR_loop);
 3274 
 3275       __ cmp(used, block_size);
 3276       __ br(__ LO, NEXT);
 3277 
 3278       // Maybe we have a lot of data
 3279       __ subsw(rscratch1, len, bulk_width * block_size);
 3280       __ br(__ HS, CTR_large_block);
 3281       __ BIND(large_block_return);
 3282       __ cbzw(len, DONE);
 3283 
 3284       // Setup the counter
 3285       __ movi(v4, __ T4S, 0);
 3286       __ movi(v5, __ T4S, 1);
 3287       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3288 
 3289       // 128-bit big-endian increment
 3290       __ ld1(v0, __ T16B, counter);
 3291       __ rev64(v16, __ T16B, v0);
 3292       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3293       __ rev64(v16, __ T16B, v16);
 3294       __ st1(v16, __ T16B, counter);
 3295       // Previous counter value is in v0
 3296       // v4 contains { 0, 1 }
 3297 
 3298       {
 3299         // We have fewer than bulk_width blocks of data left. Encrypt
 3300         // them one by one until there is less than a full block
 3301         // remaining, being careful to save both the encrypted counter
 3302         // and the counter.
 3303 
 3304         Label inner_loop;
 3305         __ bind(inner_loop);
 3306         // Counter to encrypt is in v0
 3307         __ aesecb_encrypt(noreg, noreg, keylen);
 3308         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3309 
 3310         // Do we have a remaining full block?
 3311 
 3312         __ mov(used, 0);
 3313         __ cmp(len, block_size);
 3314         __ br(__ LO, NEXT);
 3315 
 3316         // Yes, we have a full block
 3317         __ ldrq(v1, Address(in, offset));
 3318         __ eor(v1, __ T16B, v1, v0);
 3319         __ strq(v1, Address(out, offset));
 3320         __ mov(used, block_size);
 3321         __ add(offset, offset, block_size);
 3322 
 3323         __ subw(len, len, block_size);
 3324         __ cbzw(len, DONE);
 3325 
 3326         // Increment the counter, store it back
 3327         __ orr(v0, __ T16B, v16, v16);
 3328         __ rev64(v16, __ T16B, v16);
 3329         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3330         __ rev64(v16, __ T16B, v16);
 3331         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3332 
 3333         __ b(inner_loop);
 3334       }
 3335 
 3336       __ BIND(NEXT);
 3337 
 3338       // Encrypt a single byte, and loop.
 3339       // We expect this to be a rare event.
 3340       __ ldrb(rscratch1, Address(in, offset));
 3341       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3342       __ eor(rscratch1, rscratch1, rscratch2);
 3343       __ strb(rscratch1, Address(out, offset));
 3344       __ add(offset, offset, 1);
 3345       __ add(used, used, 1);
 3346       __ subw(len, len, 1);
 3347       __ cbnzw(len, L_CTR_loop);
 3348     }
 3349 
 3350     __ bind(DONE);
 3351     __ strw(used, Address(used_ptr));
 3352     __ mov(r0, saved_len);
 3353 
 3354     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3355     __ ret(lr);
 3356 
 3357     // Bulk encryption
 3358 
 3359     __ BIND(CTR_large_block);
 3360     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3361 
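          // v8..v15 are callee-saved under the AAPCS64 (in their low 64
          // bits), and the bulk loop below clobbers them, so save them
          // here and restore them before rejoining the slow path.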
 3362     if (bulk_width == 8) {
 3363       __ sub(sp, sp, 4 * 16);
 3364       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3365     }
 3366     __ sub(sp, sp, 4 * 16);
 3367     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3368     RegSet saved_regs = (RegSet::of(in, out, offset)
 3369                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3370     __ push(saved_regs, sp);
 3371     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3372     __ add(in, in, offset);
 3373     __ add(out, out, offset);
 3374 
 3375     // Keys should already be loaded into the correct registers
 3376 
 3377     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3378     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3379 
 3380     // AES/CTR loop
 3381     {
 3382       Label L_CTR_loop;
 3383       __ BIND(L_CTR_loop);
 3384 
 3385       // Setup the counters
 3386       __ movi(v8, __ T4S, 0);
 3387       __ movi(v9, __ T4S, 1);
 3388       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3389 
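            // Materialize bulk_width successive counters in v0..v(bulk_width-1):
            // each iteration converts the running (dword-byte-reversed) counter
            // in v16 back to memory order, then big-endian increments v16 by one.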
 3390       for (int i = 0; i < bulk_width; i++) {
 3391         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3392         __ rev64(v0_ofs, __ T16B, v16);
 3393         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3394       }
 3395 
 3396       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3397 
 3398       // Encrypt the counters
 3399       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3400 
 3401       if (bulk_width == 8) {
 3402         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3403       }
 3404 
 3405       // XOR the encrypted counters with the inputs
 3406       for (int i = 0; i < bulk_width; i++) {
 3407         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3408         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3409         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3410       }
 3411 
 3412       // Write the encrypted data
 3413       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3414       if (bulk_width == 8) {
 3415         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3416       }
 3417 
 3418       __ subw(len, len, 16 * bulk_width);
 3419       __ cbnzw(len, L_CTR_loop);
 3420     }
 3421 
 3422     // Save the counter back where it goes
 3423     __ rev64(v16, __ T16B, v16);
 3424     __ st1(v16, __ T16B, counter);
 3425 
 3426     __ pop(saved_regs, sp);
 3427 
 3428     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3429     if (bulk_width == 8) {
 3430       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3431     }
 3432 
 3433     __ andr(rscratch1, len, -16 * bulk_width);
 3434     __ sub(len, len, rscratch1);
 3435     __ add(offset, offset, rscratch1);
 3436     __ mov(used, 16);
 3437     __ strw(used, Address(used_ptr));
 3438     __ b(large_block_return);
 3439 
 3440     return start;
 3441   }
 3442 
 3443   // Vector AES Galois Counter Mode implementation. Parameters:
 3444   //
 3445   // in = c_rarg0
 3446   // len = c_rarg1
 3447   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3448   // out = c_rarg3
 3449   // key = c_rarg4
 3450   // state = c_rarg5 - GHASH.state
 3451   // subkeyHtbl = c_rarg6 - powers of H
 3452   // counter = c_rarg7 - 16 bytes of CTR
 3453   // return - number of processed bytes
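        //
        // len is rounded down to a multiple of 128 bytes (8 AES blocks);
        // the stub processes exactly that many bytes and returns the count,
        // leaving any remainder to the caller.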
 3454   address generate_galoisCounterMode_AESCrypt() {
 3455     Label ghash_polynomial; // local data generated after code
 3456 
 3457     __ align(CodeEntryAlignment);
 3458     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3459     StubCodeMark mark(this, stub_id);
 3460     address start = __ pc();
 3461     __ enter();
 3462 
 3463     const Register in = c_rarg0;
 3464     const Register len = c_rarg1;
 3465     const Register ct = c_rarg2;
 3466     const Register out = c_rarg3;
 3467     // and updated with the incremented counter in the end
 3468 
 3469     const Register key = c_rarg4;
 3470     const Register state = c_rarg5;
 3471 
 3472     const Register subkeyHtbl = c_rarg6;
 3473 
 3474     const Register counter = c_rarg7;
 3475 
 3476     const Register keylen = r10;
 3477     // Save state before entering routine
 3478     __ sub(sp, sp, 4 * 16);
 3479     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3480     __ sub(sp, sp, 4 * 16);
 3481     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3482 
 3483     // __ andr(len, len, -512);
 3484     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3485     __ str(len, __ pre(sp, -2 * wordSize));
 3486 
 3487     Label DONE;
 3488     __ cbz(len, DONE);
 3489 
 3490     // Compute #rounds for AES based on the length of the key array
 3491     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3492 
 3493     __ aesenc_loadkeys(key, keylen);
 3494     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3495     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3496 
 3497     // AES/CTR loop
 3498     {
 3499       Label L_CTR_loop;
 3500       __ BIND(L_CTR_loop);
 3501 
 3502       // Setup the counters
 3503       __ movi(v8, __ T4S, 0);
 3504       __ movi(v9, __ T4S, 1);
 3505       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3506 
 3507       assert(v0->encoding() < v8->encoding(), "");
 3508       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3509         FloatRegister f = as_FloatRegister(i);
 3510         __ rev32(f, __ T16B, v16);
 3511         __ addv(v16, __ T4S, v16, v8);
 3512       }
 3513 
 3514       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3515 
 3516       // Encrypt the counters
 3517       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3518 
 3519       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3520 
 3521       // XOR the encrypted counters with the inputs
 3522       for (int i = 0; i < 8; i++) {
 3523         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3524         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3525         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3526       }
 3527       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3528       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3529 
 3530       __ subw(len, len, 16 * 8);
 3531       __ cbnzw(len, L_CTR_loop);
 3532     }
 3533 
 3534     __ rev32(v16, __ T16B, v16);
 3535     __ st1(v16, __ T16B, counter);
 3536 
 3537     __ ldr(len, Address(sp));
 3538     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3539 
 3540     // GHASH/CTR loop
 3541     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3542                                 len, /*unrolls*/4);
 3543 
 3544 #ifdef ASSERT
 3545     { Label L;
 3546       __ cmp(len, (unsigned char)0);
 3547       __ br(Assembler::EQ, L);
 3548       __ stop("stubGenerator: abort");
 3549       __ bind(L);
 3550     }
 3551 #endif
 3552 
 3553     __ bind(DONE);
 3554     // Return the number of bytes processed
 3555     __ ldr(r0, __ post(sp, 2 * wordSize));
 3556 
 3557     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3558     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3559 
 3560     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3561     __ ret(lr);
 3562 
 3563     // bind label and generate polynomial data
 3564     __ align(wordSize * 2);
 3565     __ bind(ghash_polynomial);
 3566     __ emit_int64(0x87);  // The low-order bits of the field
 3567                           // polynomial (i.e. p = z^7+z^2+z+1)
 3568                           // repeated in the low and high parts of a
 3569                           // 128-bit vector
 3570     __ emit_int64(0x87);
 3571 
 3572     return start;
 3573   }
 3574 
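        // Caches one 64-byte message block in eight 64-bit general-purpose
        // registers so the MD5 round helpers can pull 4-byte message words
        // without touching memory again.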
 3575   class Cached64Bytes {
 3576   private:
 3577     MacroAssembler *_masm;
 3578     Register _regs[8];
 3579 
 3580   public:
 3581     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3582       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3583       auto it = rs.begin();
 3584       for (auto &r: _regs) {
 3585         r = *it;
 3586         ++it;
 3587       }
 3588     }
 3589 
 3590     void gen_loads(Register base) {
 3591       for (int i = 0; i < 8; i += 2) {
 3592         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3593       }
 3594     }
 3595 
 3596     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3597     void extract_u32(Register dest, int i) {
 3598       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3599     }
 3600   };
 3601 
 3602   // Utility routines for md5.
 3603   // Clobbers r10 and r11.
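        // Each helper computes one MD5 step:
        //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + x[k] + t, s)
        // where f is F = (b & c) | (~b & d) for FF, G = (b & d) | (c & ~d)
        // for GG, H = b ^ c ^ d for HH, and I = c ^ (b | ~d) for II.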
 3604   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3605               int k, int s, int t) {
 3606     Register rscratch3 = r10;
 3607     Register rscratch4 = r11;
 3608 
 3609     __ eorw(rscratch3, r3, r4);
 3610     __ movw(rscratch2, t);
 3611     __ andw(rscratch3, rscratch3, r2);
 3612     __ addw(rscratch4, r1, rscratch2);
 3613     reg_cache.extract_u32(rscratch1, k);
 3614     __ eorw(rscratch3, rscratch3, r4);
 3615     __ addw(rscratch4, rscratch4, rscratch1);
 3616     __ addw(rscratch3, rscratch3, rscratch4);
 3617     __ rorw(rscratch2, rscratch3, 32 - s);
 3618     __ addw(r1, rscratch2, r2);
 3619   }
 3620 
 3621   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3622               int k, int s, int t) {
 3623     Register rscratch3 = r10;
 3624     Register rscratch4 = r11;
 3625 
 3626     reg_cache.extract_u32(rscratch1, k);
 3627     __ movw(rscratch2, t);
 3628     __ addw(rscratch4, r1, rscratch2);
 3629     __ addw(rscratch4, rscratch4, rscratch1);
 3630     __ bicw(rscratch2, r3, r4);
 3631     __ andw(rscratch3, r2, r4);
 3632     __ addw(rscratch2, rscratch2, rscratch4);
 3633     __ addw(rscratch2, rscratch2, rscratch3);
 3634     __ rorw(rscratch2, rscratch2, 32 - s);
 3635     __ addw(r1, rscratch2, r2);
 3636   }
 3637 
 3638   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3639               int k, int s, int t) {
 3640     Register rscratch3 = r10;
 3641     Register rscratch4 = r11;
 3642 
 3643     __ eorw(rscratch3, r3, r4);
 3644     __ movw(rscratch2, t);
 3645     __ addw(rscratch4, r1, rscratch2);
 3646     reg_cache.extract_u32(rscratch1, k);
 3647     __ eorw(rscratch3, rscratch3, r2);
 3648     __ addw(rscratch4, rscratch4, rscratch1);
 3649     __ addw(rscratch3, rscratch3, rscratch4);
 3650     __ rorw(rscratch2, rscratch3, 32 - s);
 3651     __ addw(r1, rscratch2, r2);
 3652   }
 3653 
 3654   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3655               int k, int s, int t) {
 3656     Register rscratch3 = r10;
 3657     Register rscratch4 = r11;
 3658 
 3659     __ movw(rscratch3, t);
 3660     __ ornw(rscratch2, r2, r4);
 3661     __ addw(rscratch4, r1, rscratch3);
 3662     reg_cache.extract_u32(rscratch1, k);
 3663     __ eorw(rscratch3, rscratch2, r3);
 3664     __ addw(rscratch4, rscratch4, rscratch1);
 3665     __ addw(rscratch3, rscratch3, rscratch4);
 3666     __ rorw(rscratch2, rscratch3, 32 - s);
 3667     __ addw(r1, rscratch2, r2);
 3668   }
 3669 
 3670   // Arguments:
 3671   //
 3672   // Inputs:
 3673   //   c_rarg0   - byte[]  source+offset
 3674   //   c_rarg1   - int[]   SHA.state
 3675   //   c_rarg2   - int     offset
 3676   //   c_rarg3   - int     limit
 3677   //
 3678   address generate_md5_implCompress(StubId stub_id) {
 3679     bool multi_block;
 3680     switch (stub_id) {
 3681     case StubId::stubgen_md5_implCompress_id:
 3682       multi_block = false;
 3683       break;
 3684     case StubId::stubgen_md5_implCompressMB_id:
 3685       multi_block = true;
 3686       break;
 3687     default:
 3688       ShouldNotReachHere();
 3689     }
 3690     __ align(CodeEntryAlignment);
 3691 
 3692     StubCodeMark mark(this, stub_id);
 3693     address start = __ pc();
 3694 
 3695     Register buf       = c_rarg0;
 3696     Register state     = c_rarg1;
 3697     Register ofs       = c_rarg2;
 3698     Register limit     = c_rarg3;
 3699     Register a         = r4;
 3700     Register b         = r5;
 3701     Register c         = r6;
 3702     Register d         = r7;
 3703     Register rscratch3 = r10;
 3704     Register rscratch4 = r11;
 3705 
 3706     Register state_regs[2] = { r12, r13 };
 3707     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3708     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3709 
 3710     __ push(saved_regs, sp);
 3711 
 3712     __ ldp(state_regs[0], state_regs[1], Address(state));
 3713     __ ubfx(a, state_regs[0],  0, 32);
 3714     __ ubfx(b, state_regs[0], 32, 32);
 3715     __ ubfx(c, state_regs[1],  0, 32);
 3716     __ ubfx(d, state_regs[1], 32, 32);
 3717 
 3718     Label md5_loop;
 3719     __ BIND(md5_loop);
 3720 
 3721     reg_cache.gen_loads(buf);
 3722 
 3723     // Round 1
 3724     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3725     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3726     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3727     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3728     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3729     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3730     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3731     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3732     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3733     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3734     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3735     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3736     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3737     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3738     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3739     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3740 
 3741     // Round 2
 3742     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3743     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3744     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3745     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3746     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3747     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3748     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3749     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3750     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3751     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3752     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3753     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3754     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3755     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3756     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3757     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3758 
 3759     // Round 3
 3760     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3761     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3762     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3763     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3764     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3765     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3766     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3767     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3768     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3769     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3770     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3771     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3772     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3773     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3774     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3775     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3776 
 3777     // Round 4
 3778     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3779     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3780     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3781     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3782     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3783     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3784     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3785     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3786     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3787     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3788     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3789     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3790     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3791     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3792     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3793     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3794 
 3795     __ addw(a, state_regs[0], a);
 3796     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3797     __ addw(b, rscratch2, b);
 3798     __ addw(c, state_regs[1], c);
 3799     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3800     __ addw(d, rscratch4, d);
 3801 
 3802     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3803     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3804 
 3805     if (multi_block) {
 3806       __ add(buf, buf, 64);
 3807       __ add(ofs, ofs, 64);
 3808       __ cmp(ofs, limit);
 3809       __ br(Assembler::LE, md5_loop);
 3810       __ mov(c_rarg0, ofs); // return ofs
 3811     }
 3812 
 3813     // write hash values back in the correct order
 3814     __ stp(state_regs[0], state_regs[1], Address(state));
 3815 
 3816     __ pop(saved_regs, sp);
 3817 
 3818     __ ret(lr);
 3819 
 3820     return start;
 3821   }
 3822 
 3823   // Arguments:
 3824   //
 3825   // Inputs:
 3826   //   c_rarg0   - byte[]  source+offset
 3827   //   c_rarg1   - int[]   SHA.state
 3828   //   c_rarg2   - int     offset
 3829   //   c_rarg3   - int     limit
 3830   //
 3831   address generate_sha1_implCompress(StubId stub_id) {
 3832     bool multi_block;
 3833     switch (stub_id) {
 3834     case StubId::stubgen_sha1_implCompress_id:
 3835       multi_block = false;
 3836       break;
 3837     case StubId::stubgen_sha1_implCompressMB_id:
 3838       multi_block = true;
 3839       break;
 3840     default:
 3841       ShouldNotReachHere();
 3842     }
 3843 
 3844     __ align(CodeEntryAlignment);
 3845 
 3846     StubCodeMark mark(this, stub_id);
 3847     address start = __ pc();
 3848 
 3849     Register buf   = c_rarg0;
 3850     Register state = c_rarg1;
 3851     Register ofs   = c_rarg2;
 3852     Register limit = c_rarg3;
 3853 
 3854     Label keys;
 3855     Label sha1_loop;
 3856 
 3857     // load the keys into v0..v3
 3858     __ adr(rscratch1, keys);
 3859     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3860     // load the 5-word state into v6, v7
 3861     __ ldrq(v6, Address(state, 0));
 3862     __ ldrs(v7, Address(state, 16));
 3863 
 3864 
 3865     __ BIND(sha1_loop);
 3866     // load 64 bytes of data into v16..v19
 3867     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3868     __ rev32(v16, __ T16B, v16);
 3869     __ rev32(v17, __ T16B, v17);
 3870     __ rev32(v18, __ T16B, v18);
 3871     __ rev32(v19, __ T16B, v19);
 3872 
 3873     // do the sha1
 3874     __ addv(v4, __ T4S, v16, v0);
 3875     __ orr(v20, __ T16B, v6, v6);
 3876 
 3877     FloatRegister d0 = v16;
 3878     FloatRegister d1 = v17;
 3879     FloatRegister d2 = v18;
 3880     FloatRegister d3 = v19;
 3881 
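          // 20 iterations of 4 SHA-1 rounds each (80 rounds): sha1c covers
          // rounds 0-19, sha1p rounds 20-39 and 60-79, sha1m rounds 40-59,
          // and sha1su0/sha1su1 extend the message schedule for the first
          // 16 iterations.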
 3882     for (int round = 0; round < 20; round++) {
 3883       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3884       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3885       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3886       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3887       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3888 
 3889       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3890       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3891       __ sha1h(tmp2, __ T4S, v20);
 3892       if (round < 5)
 3893         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3894       else if (round < 10 || round >= 15)
 3895         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3896       else
 3897         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3898       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3899 
 3900       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3901     }
 3902 
 3903     __ addv(v7, __ T2S, v7, v21);
 3904     __ addv(v6, __ T4S, v6, v20);
 3905 
 3906     if (multi_block) {
 3907       __ add(ofs, ofs, 64);
 3908       __ cmp(ofs, limit);
 3909       __ br(Assembler::LE, sha1_loop);
 3910       __ mov(c_rarg0, ofs); // return ofs
 3911     }
 3912 
 3913     __ strq(v6, Address(state, 0));
 3914     __ strs(v7, Address(state, 16));
 3915 
 3916     __ ret(lr);
 3917 
 3918     __ bind(keys);
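          // the four SHA-1 round constants, one per group of 20 rounds,
          // broadcast into v0..v3 by the ld4r above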
 3919     __ emit_int32(0x5a827999);
 3920     __ emit_int32(0x6ed9eba1);
 3921     __ emit_int32(0x8f1bbcdc);
 3922     __ emit_int32(0xca62c1d6);
 3923 
 3924     return start;
 3925   }
 3926 
 3927 
 3928   // Arguments:
 3929   //
 3930   // Inputs:
 3931   //   c_rarg0   - byte[]  source+offset
 3932   //   c_rarg1   - int[]   SHA.state
 3933   //   c_rarg2   - int     offset
 3934   //   c_rarg3   - int     limit
 3935   //
 3936   address generate_sha256_implCompress(StubId stub_id) {
 3937     bool multi_block;
 3938     switch (stub_id) {
 3939     case StubId::stubgen_sha256_implCompress_id:
 3940       multi_block = false;
 3941       break;
 3942     case StubId::stubgen_sha256_implCompressMB_id:
 3943       multi_block = true;
 3944       break;
 3945     default:
 3946       ShouldNotReachHere();
 3947     }
 3948 
 3949     static const uint32_t round_consts[64] = {
 3950       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3951       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3952       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3953       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3954       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3955       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3956       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3957       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3958       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3959       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3960       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3961       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3962       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3963       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3964       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3965       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3966     };
 3967 
 3968     __ align(CodeEntryAlignment);
 3969 
 3970     StubCodeMark mark(this, stub_id);
 3971     address start = __ pc();
 3972 
 3973     Register buf   = c_rarg0;
 3974     Register state = c_rarg1;
 3975     Register ofs   = c_rarg2;
 3976     Register limit = c_rarg3;
 3977 
 3978     Label sha1_loop;
 3979 
 3980     __ stpd(v8, v9, __ pre(sp, -32));
 3981     __ stpd(v10, v11, Address(sp, 16));
 3982 
 3983     // dga == v0
 3984     // dgb == v1
 3985     // dg0 == v2
 3986     // dg1 == v3
 3987     // dg2 == v4
 3988     // t0 == v6
 3989     // t1 == v7
 3990 
 3991     // load 16 keys to v16..v31
 3992     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3993     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3994     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3995     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3996     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3997 
 3998     // load 8 words (256 bits) state
 3999     __ ldpq(v0, v1, state);
 4000 
 4001     __ BIND(sha1_loop);
 4002     // load 64 bytes of data into v8..v11
 4003     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4004     __ rev32(v8, __ T16B, v8);
 4005     __ rev32(v9, __ T16B, v9);
 4006     __ rev32(v10, __ T16B, v10);
 4007     __ rev32(v11, __ T16B, v11);
 4008 
 4009     __ addv(v6, __ T4S, v8, v16);
 4010     __ orr(v2, __ T16B, v0, v0);
 4011     __ orr(v3, __ T16B, v1, v1);
 4012 
 4013     FloatRegister d0 = v8;
 4014     FloatRegister d1 = v9;
 4015     FloatRegister d2 = v10;
 4016     FloatRegister d3 = v11;
 4017 
 4018 
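          // 16 iterations of 4 SHA-256 rounds each (64 rounds); sha256su0/su1
          // extend the message schedule for the first 12 iterations, after
          // which all remaining schedule words are already available.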
 4019     for (int round = 0; round < 16; round++) {
 4020       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4021       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4022       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4023       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4024 
 4025       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4026        __ orr(v4, __ T16B, v2, v2);
 4027       if (round < 15)
 4028         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4029       __ sha256h(v2, __ T4S, v3, tmp2);
 4030       __ sha256h2(v3, __ T4S, v4, tmp2);
 4031       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4032 
 4033       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4034     }
 4035 
 4036     __ addv(v0, __ T4S, v0, v2);
 4037     __ addv(v1, __ T4S, v1, v3);
 4038 
 4039     if (multi_block) {
 4040       __ add(ofs, ofs, 64);
 4041       __ cmp(ofs, limit);
 4042       __ br(Assembler::LE, sha1_loop);
 4043       __ mov(c_rarg0, ofs); // return ofs
 4044     }
 4045 
 4046     __ ldpd(v10, v11, Address(sp, 16));
 4047     __ ldpd(v8, v9, __ post(sp, 32));
 4048 
 4049     __ stpq(v0, v1, state);
 4050 
 4051     __ ret(lr);
 4052 
 4053     return start;
 4054   }
 4055 
 4056   // Double rounds for sha512.
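        // vi0..vi4 hold the rotating working state, vrc0/vrc1 the current
        // and next round-constant pairs (the next pair is fetched while
        // dr < 36), and vin0..vin4 the message-schedule vectors, which are
        // only extended while dr < 32 (the last 8 double-rounds need no
        // new schedule words).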
 4057   void sha512_dround(int dr,
 4058                      FloatRegister vi0, FloatRegister vi1,
 4059                      FloatRegister vi2, FloatRegister vi3,
 4060                      FloatRegister vi4, FloatRegister vrc0,
 4061                      FloatRegister vrc1, FloatRegister vin0,
 4062                      FloatRegister vin1, FloatRegister vin2,
 4063                      FloatRegister vin3, FloatRegister vin4) {
 4064       if (dr < 36) {
 4065         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4066       }
 4067       __ addv(v5, __ T2D, vrc0, vin0);
 4068       __ ext(v6, __ T16B, vi2, vi3, 8);
 4069       __ ext(v5, __ T16B, v5, v5, 8);
 4070       __ ext(v7, __ T16B, vi1, vi2, 8);
 4071       __ addv(vi3, __ T2D, vi3, v5);
 4072       if (dr < 32) {
 4073         __ ext(v5, __ T16B, vin3, vin4, 8);
 4074         __ sha512su0(vin0, __ T2D, vin1);
 4075       }
 4076       __ sha512h(vi3, __ T2D, v6, v7);
 4077       if (dr < 32) {
 4078         __ sha512su1(vin0, __ T2D, vin2, v5);
 4079       }
 4080       __ addv(vi4, __ T2D, vi1, vi3);
 4081       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4082   }
 4083 
 4084   // Arguments:
 4085   //
 4086   // Inputs:
 4087   //   c_rarg0   - byte[]  source+offset
 4088   //   c_rarg1   - int[]   SHA.state
 4089   //   c_rarg2   - int     offset
 4090   //   c_rarg3   - int     limit
 4091   //
 4092   address generate_sha512_implCompress(StubId stub_id) {
 4093     bool multi_block;
 4094     switch (stub_id) {
 4095     case StubId::stubgen_sha512_implCompress_id:
 4096       multi_block = false;
 4097       break;
 4098     case StubId::stubgen_sha512_implCompressMB_id:
 4099       multi_block = true;
 4100       break;
 4101     default:
 4102       ShouldNotReachHere();
 4103     }
 4104 
 4105     static const uint64_t round_consts[80] = {
 4106       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4107       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4108       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4109       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4110       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4111       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4112       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4113       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4114       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4115       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4116       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4117       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4118       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4119       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4120       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4121       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4122       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4123       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4124       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4125       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4126       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4127       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4128       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4129       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4130       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4131       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4132       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4133     };
 4134 
 4135     __ align(CodeEntryAlignment);
 4136 
 4137     StubCodeMark mark(this, stub_id);
 4138     address start = __ pc();
 4139 
 4140     Register buf   = c_rarg0;
 4141     Register state = c_rarg1;
 4142     Register ofs   = c_rarg2;
 4143     Register limit = c_rarg3;
 4144 
 4145     __ stpd(v8, v9, __ pre(sp, -64));
 4146     __ stpd(v10, v11, Address(sp, 16));
 4147     __ stpd(v12, v13, Address(sp, 32));
 4148     __ stpd(v14, v15, Address(sp, 48));
 4149 
 4150     Label sha512_loop;
 4151 
 4152     // load state
 4153     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4154 
 4155     // load first 4 round constants
 4156     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4157     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4158 
 4159     __ BIND(sha512_loop);
 4160     // load 128B of data into v12..v19
 4161     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4162     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4163     __ rev64(v12, __ T16B, v12);
 4164     __ rev64(v13, __ T16B, v13);
 4165     __ rev64(v14, __ T16B, v14);
 4166     __ rev64(v15, __ T16B, v15);
 4167     __ rev64(v16, __ T16B, v16);
 4168     __ rev64(v17, __ T16B, v17);
 4169     __ rev64(v18, __ T16B, v18);
 4170     __ rev64(v19, __ T16B, v19);
 4171 
 4172     __ mov(rscratch2, rscratch1);
 4173 
 4174     __ mov(v0, __ T16B, v8);
 4175     __ mov(v1, __ T16B, v9);
 4176     __ mov(v2, __ T16B, v10);
 4177     __ mov(v3, __ T16B, v11);
 4178 
 4179     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4180     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4181     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4182     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4183     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4184     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4185     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4186     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4187     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4188     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4189     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4190     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4191     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4192     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4193     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4194     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4195     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4196     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4197     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4198     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4199     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4200     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4201     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4202     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4203     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4204     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4205     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4206     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4207     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4208     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4209     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4210     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4211     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4212     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4213     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4214     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4215     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4216     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4217     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4218     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4219 
 4220     __ addv(v8, __ T2D, v8, v0);
 4221     __ addv(v9, __ T2D, v9, v1);
 4222     __ addv(v10, __ T2D, v10, v2);
 4223     __ addv(v11, __ T2D, v11, v3);
 4224 
 4225     if (multi_block) {
 4226       __ add(ofs, ofs, 128);
 4227       __ cmp(ofs, limit);
 4228       __ br(Assembler::LE, sha512_loop);
 4229       __ mov(c_rarg0, ofs); // return ofs
 4230     }
 4231 
 4232     __ st1(v8, v9, v10, v11, __ T2D, state);
 4233 
 4234     __ ldpd(v14, v15, Address(sp, 48));
 4235     __ ldpd(v12, v13, Address(sp, 32));
 4236     __ ldpd(v10, v11, Address(sp, 16));
 4237     __ ldpd(v8, v9, __ post(sp, 64));
 4238 
 4239     __ ret(lr);
 4240 
 4241     return start;
 4242   }
 4243 
 4244   // Execute one round of keccak of two computations in parallel.
 4245   // One of the states should be loaded into the lower halves of
 4246   // the vector registers v0-v24, the other should be loaded into
 4247   // the upper halves of those registers. The ld1r instruction loads
 4248   // the round constant into both halves of register v31.
 4249   // Intermediate results c0...c5 and d0...d5 are computed
 4250   // in registers v25...v30.
 4251   // All vector instructions that are used operate on both register
 4252   // halves in parallel.
 4253   // If only a single computation is needed, one can load only the lower halves.
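        //
        // For orientation, one round of Keccak-f[1600] on the flat 25-lane
        // state a[0..24] (lane i sits in column i % 5 of row i / 5) can be
        // written in scalar form roughly as follows, where rol64() is a
        // 64-bit left rotation, r[] holds the standard rho offsets and pi()
        // is the standard lane permutation (illustrative sketch only, not
        // compiled here):
        //
        //   for (int x = 0; x < 5; x++)                              // theta
        //     c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];
        //   for (int x = 0; x < 5; x++)
        //     d[x] = c[(x+4)%5] ^ rol64(c[(x+1)%5], 1);
        //   for (int i = 0; i < 25; i++)
        //     b[pi(i)] = rol64(a[i] ^ d[i%5], r[i]);                 // rho+pi
        //   for (int y = 0; y < 25; y += 5)                          // chi
        //     for (int x = 0; x < 5; x++)
        //       a[y+x] = b[y+x] ^ (~b[y+(x+1)%5] & b[y+(x+2)%5]);
        //   a[0] ^= round_constants[round];                          // iota
        //
        // The code below fuses these steps with eor3/rax1/xar/bcax and
        // renames lanes on the fly; primed names in the comments mark lanes
        // that end up in a register other than their "home" one.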
 4254   void keccak_round(Register rscratch1) {
 4255   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4256   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4257   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4258   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4259   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4260   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4261   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4262   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4263   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4264   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4265 
 4266   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4267   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4268   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4269   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4270   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4271 
 4272   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4273   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4274   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4275   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4276   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4277   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4278   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4279   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4280   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4281   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4282   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4283   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4284   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4285   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4286   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4287   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4288   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4289   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4290   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4291   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4292   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4293   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4294   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4295   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4296   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4297 
 4298   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
 4299   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4300   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4301   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4302   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4303 
 4304   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4305 
 4306   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4307   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4308   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4309   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4310   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4311 
 4312   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4313   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4314   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4315   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4316   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4317 
 4318   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4319   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4320   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4321   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4322   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4323 
 4324   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4325   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4326   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4327   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4328   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4329 
 4330   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4331   }
 4332 
 4333   // Arguments:
 4334   //
 4335   // Inputs:
 4336   //   c_rarg0   - byte[]  source+offset
 4337   //   c_rarg1   - byte[]  SHA.state
 4338   //   c_rarg2   - int     block_size
 4339   //   c_rarg3   - int     offset
 4340   //   c_rarg4   - int     limit
 4341   //
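        // The block_size argument is the sponge rate in bytes; it selects
        // the variant and therefore how many input bytes are absorbed per
        // iteration, as decoded by the branches below:
        //   block_size ==  72  -> SHA3-512
        //   block_size == 104  -> SHA3-384
        //   block_size == 136  -> SHA3-256 or SHAKE256
        //   block_size == 144  -> SHA3-224
        //   block_size == 168  -> SHAKE128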
 4342   address generate_sha3_implCompress(StubId stub_id) {
 4343     bool multi_block;
 4344     switch (stub_id) {
 4345     case StubId::stubgen_sha3_implCompress_id:
 4346       multi_block = false;
 4347       break;
 4348     case StubId::stubgen_sha3_implCompressMB_id:
 4349       multi_block = true;
 4350       break;
 4351     default:
 4352       ShouldNotReachHere();
 4353     }
 4354 
 4355     static const uint64_t round_consts[24] = {
 4356       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4357       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4358       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4359       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4360       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4361       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4362       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4363       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4364     };
 4365 
 4366     __ align(CodeEntryAlignment);
 4367 
 4368     StubCodeMark mark(this, stub_id);
 4369     address start = __ pc();
 4370 
 4371     Register buf           = c_rarg0;
 4372     Register state         = c_rarg1;
 4373     Register block_size    = c_rarg2;
 4374     Register ofs           = c_rarg3;
 4375     Register limit         = c_rarg4;
 4376 
 4377     Label sha3_loop, rounds24_loop;
 4378     Label sha3_512_or_sha3_384, shake128;
 4379 
 4380     __ stpd(v8, v9, __ pre(sp, -64));
 4381     __ stpd(v10, v11, Address(sp, 16));
 4382     __ stpd(v12, v13, Address(sp, 32));
 4383     __ stpd(v14, v15, Address(sp, 48));
 4384 
 4385     // load state
 4386     __ add(rscratch1, state, 32);
 4387     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4388     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4389     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4390     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4391     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4392     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4393     __ ld1(v24, __ T1D, rscratch1);
 4394 
 4395     __ BIND(sha3_loop);
 4396 
 4397     // 24 keccak rounds
 4398     __ movw(rscratch2, 24);
 4399 
 4400     // load round_constants base
 4401     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4402 
 4403     // load input
 4404     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4405     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4406     __ eor(v0, __ T8B, v0, v25);
 4407     __ eor(v1, __ T8B, v1, v26);
 4408     __ eor(v2, __ T8B, v2, v27);
 4409     __ eor(v3, __ T8B, v3, v28);
 4410     __ eor(v4, __ T8B, v4, v29);
 4411     __ eor(v5, __ T8B, v5, v30);
 4412     __ eor(v6, __ T8B, v6, v31);
 4413 
 4414     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4415     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4416 
 4417     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4418     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4419     __ eor(v7, __ T8B, v7, v25);
 4420     __ eor(v8, __ T8B, v8, v26);
 4421     __ eor(v9, __ T8B, v9, v27);
 4422     __ eor(v10, __ T8B, v10, v28);
 4423     __ eor(v11, __ T8B, v11, v29);
 4424     __ eor(v12, __ T8B, v12, v30);
 4425     __ eor(v13, __ T8B, v13, v31);
 4426 
 4427     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4428     __ eor(v14, __ T8B, v14, v25);
 4429     __ eor(v15, __ T8B, v15, v26);
 4430     __ eor(v16, __ T8B, v16, v27);
 4431 
 4432     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4433     __ andw(c_rarg5, block_size, 48);
 4434     __ cbzw(c_rarg5, rounds24_loop);
 4435 
 4436     __ tbnz(block_size, 5, shake128);
 4437     // block_size == 144, bit5 == 0, SHA3-224
 4438     __ ldrd(v28, __ post(buf, 8));
 4439     __ eor(v17, __ T8B, v17, v28);
 4440     __ b(rounds24_loop);
 4441 
 4442     __ BIND(shake128);
 4443     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4444     __ eor(v17, __ T8B, v17, v28);
 4445     __ eor(v18, __ T8B, v18, v29);
 4446     __ eor(v19, __ T8B, v19, v30);
 4447     __ eor(v20, __ T8B, v20, v31);
 4448     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4449 
 4450     __ BIND(sha3_512_or_sha3_384);
 4451     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4452     __ eor(v7, __ T8B, v7, v25);
 4453     __ eor(v8, __ T8B, v8, v26);
 4454     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4455 
 4456     // SHA3-384
 4457     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4458     __ eor(v9,  __ T8B, v9,  v27);
 4459     __ eor(v10, __ T8B, v10, v28);
 4460     __ eor(v11, __ T8B, v11, v29);
 4461     __ eor(v12, __ T8B, v12, v30);
 4462 
 4463     __ BIND(rounds24_loop);
 4464     __ subw(rscratch2, rscratch2, 1);
 4465 
 4466     keccak_round(rscratch1);
 4467 
 4468     __ cbnzw(rscratch2, rounds24_loop);
 4469 
 4470     if (multi_block) {
 4471       __ add(ofs, ofs, block_size);
 4472       __ cmp(ofs, limit);
 4473       __ br(Assembler::LE, sha3_loop);
 4474       __ mov(c_rarg0, ofs); // return ofs
 4475     }
 4476 
 4477     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4478     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4479     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4480     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4481     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4482     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4483     __ st1(v24, __ T1D, state);
 4484 
 4485     // restore callee-saved registers
 4486     __ ldpd(v14, v15, Address(sp, 48));
 4487     __ ldpd(v12, v13, Address(sp, 32));
 4488     __ ldpd(v10, v11, Address(sp, 16));
 4489     __ ldpd(v8, v9, __ post(sp, 64));
 4490 
 4491     __ ret(lr);
 4492 
 4493     return start;
 4494   }
 4495 
 4496   // Inputs:
 4497   //   c_rarg0   - long[]  state0
 4498   //   c_rarg1   - long[]  state1
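        //
        // The two states are processed in parallel: state0[i] is loaded into
        // lane 0 and state1[i] into lane 1 of vector register v<i>, so each
        // instruction issued by keccak_round() advances both computations at
        // once.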
 4499   address generate_double_keccak() {
 4500     static const uint64_t round_consts[24] = {
 4501       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4502       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4503       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4504       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4505       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4506       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4507       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4508       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4509     };
 4510 
 4511     // Implements the double_keccak() method of the
 4512     // sun.security.provider.SHA3Parallel class
 4513     __ align(CodeEntryAlignment);
 4514     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4515     address start = __ pc();
 4516     __ enter();
 4517 
 4518     Register state0        = c_rarg0;
 4519     Register state1        = c_rarg1;
 4520 
 4521     Label rounds24_loop;
 4522 
 4523     // save callee-saved registers
 4524     __ stpd(v8, v9, __ pre(sp, -64));
 4525     __ stpd(v10, v11, Address(sp, 16));
 4526     __ stpd(v12, v13, Address(sp, 32));
 4527     __ stpd(v14, v15, Address(sp, 48));
 4528 
 4529     // load states
 4530     __ add(rscratch1, state0, 32);
 4531     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4532     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4533     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4534     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4535     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4536     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4537     __ ld1(v24, __ D, 0, rscratch1);
 4538     __ add(rscratch1, state1, 32);
 4539     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4540     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4541     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4542     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4543     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4544     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4545     __ ld1(v24, __ D, 1, rscratch1);
 4546 
 4547     // 24 keccak rounds
 4548     __ movw(rscratch2, 24);
 4549 
 4550     // load round_constants base
 4551     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4552 
 4553     __ BIND(rounds24_loop);
 4554     __ subw(rscratch2, rscratch2, 1);
 4555     keccak_round(rscratch1);
 4556     __ cbnzw(rscratch2, rounds24_loop);
 4557 
 4558     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4559     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4560     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4561     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4562     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4563     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4564     __ st1(v24, __ D, 0, state0);
 4565     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4566     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4567     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4568     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4569     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4570     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4571     __ st1(v24, __ D, 1, state1);
 4572 
 4573     // restore callee-saved vector registers
 4574     __ ldpd(v14, v15, Address(sp, 48));
 4575     __ ldpd(v12, v13, Address(sp, 32));
 4576     __ ldpd(v10, v11, Address(sp, 16));
 4577     __ ldpd(v8, v9, __ post(sp, 64));
 4578 
 4579     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4580     __ mov(r0, zr); // return 0
 4581     __ ret(lr);
 4582 
 4583     return start;
 4584   }
 4585 
 4586   // ChaCha20 block function.  This version parallelizes the 32-bit
 4587   // state elements on each of 16 vectors, producing 4 blocks of
 4588   // keystream at a time.
 4589   //
 4590   // state (int[16]) = c_rarg0
 4591   // keystream (byte[256]) = c_rarg1
 4592   // return - number of bytes of produced keystream (always 256)
 4593   //
 4594   // This implementation takes each 32-bit integer from the state
 4595   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4596   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4597   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4598   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4599   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4600   // operations represented by one QUARTERROUND function, we instead stack all
 4601   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4602   // and then do the same for the second set of 4 quarter rounds.  This removes
 4603   // some latency that would otherwise be incurred by waiting for an add to
 4604   // complete before performing an xor (which depends on the result of the
 4605   // add), etc. An adjustment happens between the first and second groups of 4
 4606   // quarter rounds, but this is done only in the inputs to the macro functions
 4607   // that generate the assembly instructions - these adjustments themselves are
 4608   // not part of the resulting assembly.
 4609   // The 4 registers v0-v3 are used during the quarter round operations as
 4610   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4611   // registers become the vectors involved in adding the start state back onto
 4612   // the post-QR working state.  After the adds are complete, each of the 16
 4613   // vectors write their first lane back to the keystream buffer, followed
 4614   // by the second lane from all vectors and so on.
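        //
        // For reference, the RFC 7539 quarter round on 32-bit words is:
        //
        //   a += b;  d ^= a;  d = rol32(d, 16);
        //   c += d;  b ^= c;  b = rol32(b, 12);
        //   a += b;  d ^= a;  d = rol32(d,  8);
        //   c += d;  b ^= c;  b = rol32(b,  7);
        //
        // Here a/b/c/d are each a set of four 4S vector registers, so every
        // group of cc20_qr_* calls below performs four such quarter rounds
        // in lock-step.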
 4615   address generate_chacha20Block_blockpar() {
 4616     Label L_twoRounds, L_cc20_const;
 4617     __ align(CodeEntryAlignment);
 4618     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4619     StubCodeMark mark(this, stub_id);
 4620     address start = __ pc();
 4621     __ enter();
 4622 
 4623     int i, j;
 4624     const Register state = c_rarg0;
 4625     const Register keystream = c_rarg1;
 4626     const Register loopCtr = r10;
 4627     const Register tmpAddr = r11;
 4628     const FloatRegister ctrAddOverlay = v28;
 4629     const FloatRegister lrot8Tbl = v29;
 4630 
 4631     // Organize SIMD registers in an array that facilitates
 4632     // putting repetitive opcodes into loop structures.  It is
 4633     // important that each grouping of 4 registers is monotonically
 4634     // increasing to support the requirements of multi-register
 4635     // instructions (e.g. ld4r, st4, etc.)
 4636     const FloatRegister workSt[16] = {
 4637          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4638         v20, v21, v22, v23, v24, v25, v26, v27
 4639     };
 4640 
 4641     // Pull in constant data.  The first 16 bytes are the add overlay
 4642     // which is applied to the vector holding the counter (state[12]).
 4643     // The second 16 bytes is the index register for the 8-bit left
 4644     // rotation tbl instruction.
 4645     __ adr(tmpAddr, L_cc20_const);
 4646     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4647 
 4648     // Load from memory and interlace across 16 SIMD registers,
 4649     // with each word from memory being broadcast to all lanes of
 4650     // each successive SIMD register.
 4651     //      Addr(0) -> All lanes in workSt[i]
 4652     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4653     __ mov(tmpAddr, state);
 4654     for (i = 0; i < 16; i += 4) {
 4655       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4656           __ post(tmpAddr, 16));
 4657     }
 4658     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4659 
 4660     // Before entering the loop, create 5 4-register arrays.  These
 4661     // will hold the 4 registers that represent the a/b/c/d fields
 4662     // in the quarter round operation.  For instance the "b" field
 4663     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4664     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4665     // since it is part of a diagonal organization.  The aSet and scratch
 4666     // register sets are defined at declaration time because they do not change
 4667     // organization at any point during the 20-round processing.
 4668     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4669     FloatRegister bSet[4];
 4670     FloatRegister cSet[4];
 4671     FloatRegister dSet[4];
 4672     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4673 
 4674     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4675     __ mov(loopCtr, 10);
 4676     __ BIND(L_twoRounds);
 4677 
 4678     // Set to columnar organization and do the following 4 quarter-rounds:
 4679     // QUARTERROUND(0, 4, 8, 12)
 4680     // QUARTERROUND(1, 5, 9, 13)
 4681     // QUARTERROUND(2, 6, 10, 14)
 4682     // QUARTERROUND(3, 7, 11, 15)
 4683     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4684     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4685     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4686 
 4687     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4688     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4689     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4690 
 4691     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4692     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4693     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4694 
 4695     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4696     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4697     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4698 
 4699     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4700     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4701     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4702 
 4703     // Set to diagonal organization and do the next 4 quarter-rounds:
 4704     // QUARTERROUND(0, 5, 10, 15)
 4705     // QUARTERROUND(1, 6, 11, 12)
 4706     // QUARTERROUND(2, 7, 8, 13)
 4707     // QUARTERROUND(3, 4, 9, 14)
 4708     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4709     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4710     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4711 
 4712     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4713     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4714     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4715 
 4716     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4717     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4718     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4719 
 4720     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4721     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4722     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4723 
 4724     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4725     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4726     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4727 
 4728     // Decrement and iterate
 4729     __ sub(loopCtr, loopCtr, 1);
 4730     __ cbnz(loopCtr, L_twoRounds);
 4731 
 4732     __ mov(tmpAddr, state);
 4733 
 4734     // Add the starting state back to the post-loop keystream
 4735     // state.  We read/interlace the state array from memory into
 4736     // 4 registers similar to what we did in the beginning.  Then
 4737     // add the counter overlay onto workSt[12] at the end.
 4738     for (i = 0; i < 16; i += 4) {
 4739       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4740       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4741       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4742       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4743       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4744     }
 4745     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4746 
 4747     // Write working state into the keystream buffer.  This is accomplished
 4748     // by taking the lane "i" from each of the four vectors and writing
 4749     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4750     // repeating with the next 4 vectors until all 16 vectors have been used.
 4751     // Then move to the next lane and repeat the process until all lanes have
 4752     // been written.
 4753     for (i = 0; i < 4; i++) {
 4754       for (j = 0; j < 16; j += 4) {
 4755         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4756             __ post(keystream, 16));
 4757       }
 4758     }
 4759 
 4760     __ mov(r0, 256);             // Return length of output keystream
 4761     __ leave();
 4762     __ ret(lr);
 4763 
 4764     // bind label and generate local constant data used by this stub
 4765     // The constant data is broken into two 128-bit segments to be loaded
 4766     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4767     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4768     // The second 128 bits are a table constant used for 8-bit left rotations.
 4769     __ BIND(L_cc20_const);
 4770     __ emit_int64(0x0000000100000000UL);
 4771     __ emit_int64(0x0000000300000002UL);
 4772     __ emit_int64(0x0605040702010003UL);
 4773     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4774 
 4775     return start;
 4776   }
 4777 
 4778   // Helpers to schedule parallel operation bundles across vector
 4779   // register sequences of size 2, 4 or 8.
 4780 
 4781   // Implement various primitive computations across vector sequences
 4782 
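        // Each helper below simply unrolls one SIMD opcode across the N
        // registers named by a VSeq.  For example (illustrative only, using
        // the VSeq base-register constructor seen later in this file):
        //
        //   VSeq<4> va(0), vb(4), vc(8);   // v0..v3, v4..v7, v8..v11
        //   vs_addv(va, __ T8H, vb, vc);   // addv v0,v4,v8 ... addv v3,v7,v11
        //
        // The asserts in each helper reject sequences where an output
        // register would overwrite an input that a later iteration of the
        // unrolled loop still needs to read.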
 4783   template<int N>
 4784   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4785                const VSeq<N>& v1, const VSeq<N>& v2) {
 4786     // output must not be constant
 4787     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4788     // output cannot overwrite pending inputs
 4789     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4790     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4791     for (int i = 0; i < N; i++) {
 4792       __ addv(v[i], T, v1[i], v2[i]);
 4793     }
 4794   }
 4795 
 4796   template<int N>
 4797   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4798                const VSeq<N>& v1, const VSeq<N>& v2) {
 4799     // output must not be constant
 4800     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4801     // output cannot overwrite pending inputs
 4802     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4803     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4804     for (int i = 0; i < N; i++) {
 4805       __ subv(v[i], T, v1[i], v2[i]);
 4806     }
 4807   }
 4808 
 4809   template<int N>
 4810   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4811                const VSeq<N>& v1, const VSeq<N>& v2) {
 4812     // output must not be constant
 4813     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4814     // output cannot overwrite pending inputs
 4815     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4816     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4817     for (int i = 0; i < N; i++) {
 4818       __ mulv(v[i], T, v1[i], v2[i]);
 4819     }
 4820   }
 4821 
 4822   template<int N>
 4823   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4824     // output must not be constant
 4825     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4826     // output cannot overwrite pending inputs
 4827     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4828     for (int i = 0; i < N; i++) {
 4829       __ negr(v[i], T, v1[i]);
 4830     }
 4831   }
 4832 
 4833   template<int N>
 4834   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4835                const VSeq<N>& v1, int shift) {
 4836     // output must not be constant
 4837     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4838     // output cannot overwrite pending inputs
 4839     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4840     for (int i = 0; i < N; i++) {
 4841       __ sshr(v[i], T, v1[i], shift);
 4842     }
 4843   }
 4844 
 4845   template<int N>
 4846   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4847     // output must not be constant
 4848     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4849     // output cannot overwrite pending inputs
 4850     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4851     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4852     for (int i = 0; i < N; i++) {
 4853       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4854     }
 4855   }
 4856 
 4857   template<int N>
 4858   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4859     // output must not be constant
 4860     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4861     // output cannot overwrite pending inputs
 4862     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4863     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4864     for (int i = 0; i < N; i++) {
 4865       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4866     }
 4867   }
 4868 
 4869   template<int N>
 4870   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4871     // output must not be constant
 4872     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4873     // output cannot overwrite pending inputs
 4874     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4875     for (int i = 0; i < N; i++) {
 4876       __ notr(v[i], __ T16B, v1[i]);
 4877     }
 4878   }
 4879 
 4880   template<int N>
 4881   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4882     // output must not be constant
 4883     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4884     // output cannot overwrite pending inputs
 4885     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4886     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4887     for (int i = 0; i < N; i++) {
 4888       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4889     }
 4890   }
 4891 
 4892   template<int N>
 4893   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4894     // output must not be constant
 4895     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4896     // output cannot overwrite pending inputs
 4897     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4898     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4899     for (int i = 0; i < N; i++) {
 4900       __ mlsv(v[i], T, v1[i], v2[i]);
 4901     }
 4902   }
 4903 
 4904   // load N/2 successive pairs of quadword values from memory in order
 4905   // into N successive vector registers of the sequence via the
 4906   // address supplied in base.
 4907   template<int N>
 4908   void vs_ldpq(const VSeq<N>& v, Register base) {
 4909     for (int i = 0; i < N; i += 2) {
 4910       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4911     }
 4912   }
 4913 
 4914   // load N/2 successive pairs of quadword values from memory in order
 4915   // into N vector registers of the sequence via the address supplied
 4916   // in base using post-increment addressing
 4917   template<int N>
 4918   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4919     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4920     for (int i = 0; i < N; i += 2) {
 4921       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4922     }
 4923   }
 4924 
 4925   // store N successive vector registers of the sequence into N/2
 4926   // successive pairs of quadword memory locations via the address
 4927   // supplied in base using post-increment addressing
 4928   template<int N>
 4929   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4930     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4931     for (int i = 0; i < N; i += 2) {
 4932       __ stpq(v[i], v[i+1], __ post(base, 32));
 4933     }
 4934   }
 4935 
 4936   // load N/2 pairs of quadword values from memory de-interleaved into
 4937   // N vector registers 2 at a time via the address supplied in base
 4938   // using post-increment addressing.
 4939   template<int N>
 4940   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4941     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4942     for (int i = 0; i < N; i += 2) {
 4943       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4944     }
 4945   }
 4946 
 4947   // store N vector registers interleaved into N/2 pairs of quadword
 4948   // memory locations via the address supplied in base using
 4949   // post-increment addressing.
 4950   template<int N>
 4951   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4952     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4953     for (int i = 0; i < N; i += 2) {
 4954       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4955     }
 4956   }
 4957 
 4958   // load N quadword values from memory de-interleaved into N vector
 4959   // registers 3 elements at a time via the address supplied in base.
 4960   template<int N>
 4961   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4962     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4963     for (int i = 0; i < N; i += 3) {
 4964       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4965     }
 4966   }
 4967 
 4968   // load N quadword values from memory de-interleaved into N vector
 4969   // registers 3 elements at a time via the address supplied in base
 4970   // using post-increment addressing.
 4971   template<int N>
 4972   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4973     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4974     for (int i = 0; i < N; i += 3) {
 4975       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4976     }
 4977   }
 4978 
 4979   // load N/2 pairs of quadword values from memory into N vector
 4980   // registers via the address supplied in base with each pair indexed
 4981   // using the start offset plus the corresponding entry in the
 4982   // offsets array
 4983   template<int N>
 4984   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4985     for (int i = 0; i < N/2; i++) {
 4986       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4987     }
 4988   }
 4989 
 4990   // store N vector registers into N/2 pairs of quadword memory
 4991   // locations via the address supplied in base with each pair indexed
 4992   // using the start offset plus the corresponding entry in the
 4993   // offsets array
 4994   template<int N>
 4995   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4996     for (int i = 0; i < N/2; i++) {
 4997       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4998     }
 4999   }
 5000 
 5001   // load N single quadword values from memory into N vector registers
 5002   // via the address supplied in base with each value indexed using
 5003   // the start offset plus the corresponding entry in the offsets
 5004   // array
 5005   template<int N>
 5006   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5007                       int start, int (&offsets)[N]) {
 5008     for (int i = 0; i < N; i++) {
 5009       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5010     }
 5011   }
 5012 
 5013   // store N vector registers into N single quadword memory locations
 5014   // via the address supplied in base with each value indexed using
 5015   // the start offset plus the corresponding entry in the offsets
 5016   // array
 5017   template<int N>
 5018   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5019                       int start, int (&offsets)[N]) {
 5020     for (int i = 0; i < N; i++) {
 5021       __ str(v[i], T, Address(base, start + offsets[i]));
 5022     }
 5023   }
 5024 
 5025   // load N/2 pairs of quadword values from memory de-interleaved into
 5026   // N vector registers 2 at a time via the address supplied in base
 5027   // with each pair indexed using the start offset plus the
 5028   // corresponding entry in the offsets array
 5029   template<int N>
 5030   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5031                       Register tmp, int start, int (&offsets)[N/2]) {
 5032     for (int i = 0; i < N/2; i++) {
 5033       __ add(tmp, base, start + offsets[i]);
 5034       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5035     }
 5036   }
 5037 
 5038   // store N vector registers 2 at a time interleaved into N/2 pairs
 5039   // of quadword memory locations via the address supplied in base
 5040   // with each pair indexed using the start offset plus the
 5041   // corresponding entry in the offsets array
 5042   template<int N>
 5043   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5044                       Register tmp, int start, int (&offsets)[N/2]) {
 5045     for (int i = 0; i < N/2; i++) {
 5046       __ add(tmp, base, start + offsets[i]);
 5047       __ st2(v[2*i], v[2*i+1], T, tmp);
 5048     }
 5049   }
 5050 
 5051   // Helper routines for various flavours of Montgomery multiply
 5052 
 5053   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5054   // multiplications in parallel
 5055   //
 5056 
 5057   // See the montMul() method of the sun.security.provider.ML_DSA
 5058   // class.
 5059   //
 5060   // Computes 4x4S results or 4x8H results
 5061   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5062   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5063   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5064   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5065   // Outputs: va - 4x4S or 4x8H vector register sequences
 5066   // vb, vc, vtmp and vq must all be disjoint
 5067   // va must be disjoint from all other inputs/temps or must equal vc
 5068   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5069   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
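        //
        // A scalar sketch of the 16-bit case, mirroring the four instruction
        // groups below (illustrative only; sqdmulh saturation is ignored and
        // q / q_inv stand for the two constants held in vq):
        //
        //   int16_t montmul(int16_t b, int16_t c) {
        //     int16_t aLow  = (int16_t)(b * c);             // mulv
        //     int16_t aHigh = (int16_t)((b * c * 2) >> 16); // sqdmulh
        //     int16_t m     = (int16_t)(aLow * q_inv);      // mulv
        //     int16_t n     = (int16_t)((m * q * 2) >> 16); // sqdmulh
        //     return (int16_t)((aHigh - n) >> 1);           // shsubv
        //   }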
 5070   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5071                    Assembler::SIMD_Arrangement T,
 5072                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5073     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5074     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5075     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5076     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5077 
 5078     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5079     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5080 
 5081     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5082 
 5083     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5084     assert(vs_disjoint(va, vb), "va and vb overlap");
 5085     assert(vs_disjoint(va, vq), "va and vq overlap");
 5086     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5087     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5088 
 5089     // schedule 4 streams of instructions across the vector sequences
 5090     for (int i = 0; i < 4; i++) {
 5091       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5092       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5093     }
 5094 
 5095     for (int i = 0; i < 4; i++) {
 5096       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5097     }
 5098 
 5099     for (int i = 0; i < 4; i++) {
 5100       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5101     }
 5102 
 5103     for (int i = 0; i < 4; i++) {
 5104       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5105     }
 5106   }
 5107 
 5108   // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
 5109   // multiplications in parallel
 5110   //
 5111 
 5112   // See the montMul() method of the sun.security.provider.ML_DSA
 5113   // class.
 5114   //
 5115   // Computes 2x4S results or 2x8H results
 5116   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5117   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
 5118   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5119   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
 5120   // Outputs: va - 2x4S or 2x8H vector register sequences
 5121   // vb, vc, vtmp and vq must all be disjoint
 5122   // va must be disjoint from all other inputs/temps or must equal vc
 5123   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5124   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5125   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5126                    Assembler::SIMD_Arrangement T,
 5127                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5128     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5129     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5130     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5131     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5132 
 5133     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5134     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5135 
 5136     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5137 
 5138     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5139     assert(vs_disjoint(va, vb), "va and vb overlap");
 5140     assert(vs_disjoint(va, vq), "va and vq overlap");
 5141     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5142     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5143 
 5144     // schedule 2 streams of instructions across the vector sequences
 5145     for (int i = 0; i < 2; i++) {
 5146       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5147       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5148     }
 5149 
 5150     for (int i = 0; i < 2; i++) {
 5151       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5152     }
 5153 
 5154     for (int i = 0; i < 2; i++) {
 5155       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5156     }
 5157 
 5158     for (int i = 0; i < 2; i++) {
 5159       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5160     }
 5161   }
 5162 
 5163   // Perform 16 16-bit Montgomery multiplications in parallel.
 5164   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5165                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5166     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5167     // It will assert that the register use is valid
 5168     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5169   }
 5170 
 5171   // Perform 32 16-bit Montgomery multiplications in parallel.
 5172   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5173                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5174     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5175     // It will assert that the register use is valid
 5176     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5177   }
 5178 
 5179   // Perform 64 16-bit Montgomery multiplications in parallel.
 5180   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5181                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5182     // Schedule two successive 4x8H multiplies via the montmul helper
 5183     // on the front and back halves of va, vb and vc. The helper will
 5184     // assert that the register use has no overlap conflicts on each
 5185     // individual call but we also need to ensure that the necessary
 5186     // disjoint/equality constraints are met across both calls.
 5187 
 5188     // vb, vc, vtmp and vq must be disjoint. va must either be
 5189     // disjoint from all other registers or equal vc
 5190 
 5191     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5192     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5193     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5194 
 5195     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5196     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5197 
 5198     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5199 
 5200     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5201     assert(vs_disjoint(va, vb), "va and vb overlap");
 5202     assert(vs_disjoint(va, vq), "va and vq overlap");
 5203     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5204 
 5205     // we multiply the front and back halves of each sequence 4 at a
 5206     // time because
 5207     //
 5208     // 1) we are currently only able to get 4-way instruction
 5209     // parallelism at best
 5210     //
 5211     // 2) we need registers for the constants in vq and temporary
 5212     // scratch registers to hold intermediate results so vtmp can only
 5213     // be a VSeq<4> which means we only have 4 scratch slots
 5214 
 5215     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5216     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5217   }
 5218 
 5219   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5220                                const VSeq<4>& vc,
 5221                                const VSeq<4>& vtmp,
 5222                                const VSeq<2>& vq) {
 5223     // compute a = montmul(a1, c)
 5224     kyber_montmul32(vc, va1, vc, vtmp, vq);
 5225     // output a1 = a0 - a
 5226     vs_subv(va1, __ T8H, va0, vc);
 5227     //    and a0 = a0 + a
 5228     vs_addv(va0, __ T8H, va0, vc);
 5229   }
 5230 
 5231   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5232                                const VSeq<4>& vb,
 5233                                const VSeq<4>& vtmp1,
 5234                                const VSeq<4>& vtmp2,
 5235                                const VSeq<2>& vq) {
 5236     // compute c = a0 - a1
 5237     vs_subv(vtmp1, __ T8H, va0, va1);
 5238     // output a0 = a0 + a1
 5239     vs_addv(va0, __ T8H, va0, va1);
 5240     // output a1 = b montmul c
 5241     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5242   }
 5243 
 5244   void load64shorts(const VSeq<8>& v, Register shorts) {
 5245     vs_ldpq_post(v, shorts);
 5246   }
 5247 
 5248   void load32shorts(const VSeq<4>& v, Register shorts) {
 5249     vs_ldpq_post(v, shorts);
 5250   }
 5251 
 5252   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5253     vs_stpq_post(v, tmpAddr);
 5254   }
 5255 
 5256   // Kyber NTT function.
 5257   // Implements
 5258   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5259   //
 5260   // coeffs (short[256]) = c_rarg0
 5261   // ntt_zetas (short[256]) = c_rarg1
 5262   address generate_kyberNtt() {
 5263 
 5264     __ align(CodeEntryAlignment);
 5265     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5266     StubCodeMark mark(this, stub_id);
 5267     address start = __ pc();
 5268     __ enter();
 5269 
 5270     const Register coeffs = c_rarg0;
 5271     const Register zetas = c_rarg1;
 5272 
 5273     const Register kyberConsts = r10;
 5274     const Register tmpAddr = r11;
 5275 
 5276     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5277     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5278     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5279 
 5280     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5281     // load the montmul constants
 5282     vs_ldpq(vq, kyberConsts);
 5283 
 5284     // Each level corresponds to an iteration of the outermost loop of the
 5285     // Java method seilerNTT(int[] coeffs). There are some differences
 5286     // from what is done in the seilerNTT() method, though:
 5287     // 1. The computation uses 16-bit signed values; we do not convert them
 5288     // to ints here.
 5289     // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
 5290     // this array for each level, which makes it easier to fill up the vector
 5291     // registers.
 5292     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
 5293     // multiplications (this is because that way there should not be any
 5294     // overflow during the inverse NTT computation); here we use R = 2^16 so
 5295     // that we can use the 16-bit arithmetic in the vector unit.
 5296     //
 5297     // On each level, we fill up the vector registers in such a way that the
 5298     // array elements that need to be multiplied by the zetas go into one
 5299     // set of vector registers while the corresponding ones that don't need to
 5300     // be multiplied, go into another set.
 5301     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5302     // registers interleaving the steps of 4 identical computations,
 5303     // each done on 8 16-bit values per register.
 5304 
 5305     // At levels 0-3 the coefficients multiplied by or added/subtracted
 5306     // to the zetas occur in discrete blocks whose size is some multiple
 5307     // of 32.
 5308 
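          // In scalar form each level performs the usual Cooley-Tukey
          // butterfly (illustrative sketch; len counts 16-bit coefficients
          // and starts at 128, i.e. a byte offset of 256):
          //
          //   t               = montmul(zeta, coeffs[j + len]);
          //   coeffs[j + len] = coeffs[j] - t;
          //   coeffs[j]       = coeffs[j] + t;
          //
          // Below, vs2 holds the t values, vs1 the coeffs[j] block and vs3
          // the new coeffs[j + len] block.
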
 5309     // level 0
 5310     __ add(tmpAddr, coeffs, 256);
 5311     load64shorts(vs1, tmpAddr);
 5312     load64shorts(vs2, zetas);
 5313     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5314     __ add(tmpAddr, coeffs, 0);
 5315     load64shorts(vs1, tmpAddr);
 5316     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5317     vs_addv(vs1, __ T8H, vs1, vs2);
 5318     __ add(tmpAddr, coeffs, 0);
 5319     vs_stpq_post(vs1, tmpAddr);
 5320     __ add(tmpAddr, coeffs, 256);
 5321     vs_stpq_post(vs3, tmpAddr);
 5322     // restore montmul constants
 5323     vs_ldpq(vq, kyberConsts);
 5324     load64shorts(vs1, tmpAddr);
 5325     load64shorts(vs2, zetas);
 5326     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5327     __ add(tmpAddr, coeffs, 128);
 5328     load64shorts(vs1, tmpAddr);
 5329     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5330     vs_addv(vs1, __ T8H, vs1, vs2);
 5331     __ add(tmpAddr, coeffs, 128);
 5332     store64shorts(vs1, tmpAddr);
 5333     __ add(tmpAddr, coeffs, 384);
 5334     store64shorts(vs3, tmpAddr);
 5335 
 5336     // level 1
 5337     // restore montmul constants
 5338     vs_ldpq(vq, kyberConsts);
 5339     __ add(tmpAddr, coeffs, 128);
 5340     load64shorts(vs1, tmpAddr);
 5341     load64shorts(vs2, zetas);
 5342     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5343     __ add(tmpAddr, coeffs, 0);
 5344     load64shorts(vs1, tmpAddr);
 5345     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5346     vs_addv(vs1, __ T8H, vs1, vs2);
 5347     __ add(tmpAddr, coeffs, 0);
 5348     store64shorts(vs1, tmpAddr);
 5349     store64shorts(vs3, tmpAddr);
 5350     vs_ldpq(vq, kyberConsts);
 5351     __ add(tmpAddr, coeffs, 384);
 5352     load64shorts(vs1, tmpAddr);
 5353     load64shorts(vs2, zetas);
 5354     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5355     __ add(tmpAddr, coeffs, 256);
 5356     load64shorts(vs1, tmpAddr);
 5357     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5358     vs_addv(vs1, __ T8H, vs1, vs2);
 5359     __ add(tmpAddr, coeffs, 256);
 5360     store64shorts(vs1, tmpAddr);
 5361     store64shorts(vs3, tmpAddr);
 5362 
 5363     // level 2
 5364     vs_ldpq(vq, kyberConsts);
 5365     int offsets1[4] = { 0, 32, 128, 160 };
 5366     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5367     load64shorts(vs2, zetas);
 5368     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5369     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5370     // kyber_subv_addv64();
 5371     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5372     vs_addv(vs1, __ T8H, vs1, vs2);
 5373     __ add(tmpAddr, coeffs, 0);
 5374     vs_stpq_post(vs_front(vs1), tmpAddr);
 5375     vs_stpq_post(vs_front(vs3), tmpAddr);
 5376     vs_stpq_post(vs_back(vs1), tmpAddr);
 5377     vs_stpq_post(vs_back(vs3), tmpAddr);
 5378     vs_ldpq(vq, kyberConsts);
 5379     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5380     load64shorts(vs2, zetas);
 5381     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5382     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5383     // kyber_subv_addv64();
 5384     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5385     vs_addv(vs1, __ T8H, vs1, vs2);
 5386     __ add(tmpAddr, coeffs, 256);
 5387     vs_stpq_post(vs_front(vs1), tmpAddr);
 5388     vs_stpq_post(vs_front(vs3), tmpAddr);
 5389     vs_stpq_post(vs_back(vs1), tmpAddr);
 5390     vs_stpq_post(vs_back(vs3), tmpAddr);
 5391 
 5392     // level 3
 5393     vs_ldpq(vq, kyberConsts);
 5394     int offsets2[4] = { 0, 64, 128, 192 };
 5395     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5396     load64shorts(vs2, zetas);
 5397     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5398     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5399     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5400     vs_addv(vs1, __ T8H, vs1, vs2);
 5401     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5402     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5403 
 5404     vs_ldpq(vq, kyberConsts);
 5405     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5406     load64shorts(vs2, zetas);
 5407     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5408     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5409     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5410     vs_addv(vs1, __ T8H, vs1, vs2);
 5411     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5412     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5413 
 5414     // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5417 
 5418     vs_ldpq(vq, kyberConsts);
 5419     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5420     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5421     load64shorts(vs2, zetas);
 5422     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5423     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5424     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5425     vs_addv(vs1, __ T8H, vs1, vs2);
 5426     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5427     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5428 
 5429     vs_ldpq(vq, kyberConsts);
 5430     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5431     load64shorts(vs2, zetas);
 5432     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5433     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5434     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5435     vs_addv(vs1, __ T8H, vs1, vs2);
 5436     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5437     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5438 
 5439     // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
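    //
    // For reference (an illustrative note, not generated code): an ld2 with
    // arrangement 2D de-interleaves four consecutive 64-bit chunks
    // d0, d1, d2, d3 from memory as
    //
    //   first register  <- { d0, d2 }
    //   second register <- { d1, d3 }
    //
    // so alternating 8-byte blocks (4 shorts each) land in the even and odd
    // registers of vs1, which is exactly the even/odd split consumed by
    // kyber_montmul32_sub_add below.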
 5442 
 5443     vs_ldpq(vq, kyberConsts);
 5444     int offsets4[4] = { 0, 32, 64, 96 };
 5445     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5446     load32shorts(vs_front(vs2), zetas);
 5447     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5448     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5449     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5450     load32shorts(vs_front(vs2), zetas);
 5451     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5452     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5453     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5454     load32shorts(vs_front(vs2), zetas);
 5455     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5456     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5457 
 5458     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5459     load32shorts(vs_front(vs2), zetas);
 5460     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5461     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5462 
 5463     // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 5466 
 5467     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5468     load32shorts(vs_front(vs2), zetas);
 5469     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5470     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5471     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5473     load32shorts(vs_front(vs2), zetas);
 5474     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5475     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5476 
 5477     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5478     load32shorts(vs_front(vs2), zetas);
 5479     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5480     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5481 
 5482     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5483     load32shorts(vs_front(vs2), zetas);
 5484     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5485     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5486 
 5487     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5488     __ mov(r0, zr); // return 0
 5489     __ ret(lr);
 5490 
 5491     return start;
 5492   }
 5493 
 5494   // Kyber Inverse NTT function
 5495   // Implements
 5496   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5497   //
 5498   // coeffs (short[256]) = c_rarg0
  // zetas (short[256]) = c_rarg1
 5500   address generate_kyberInverseNtt() {
 5501 
 5502     __ align(CodeEntryAlignment);
 5503     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5504     StubCodeMark mark(this, stub_id);
 5505     address start = __ pc();
 5506     __ enter();
 5507 
 5508     const Register coeffs = c_rarg0;
 5509     const Register zetas = c_rarg1;
 5510 
 5511     const Register kyberConsts = r10;
 5512     const Register tmpAddr = r11;
 5513     const Register tmpAddr2 = c_rarg2;
 5514 
 5515     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5516     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5517     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5518 
 5519     __ lea(kyberConsts,
 5520              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5521 
 5522     // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
 5525 
 5526     vs_ldpq(vq, kyberConsts);
 5527     int offsets4[4] = { 0, 32, 64, 96 };
 5528     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5529     load32shorts(vs_front(vs2), zetas);
 5530     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5531                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5532     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5533     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5534     load32shorts(vs_front(vs2), zetas);
 5535     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5536                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5537     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5538     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5539     load32shorts(vs_front(vs2), zetas);
 5540     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5541                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5542     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5543     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5544     load32shorts(vs_front(vs2), zetas);
 5545     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5546                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5547     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5548 
 5549     // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
 5552 
 5553     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5554     load32shorts(vs_front(vs2), zetas);
 5555     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5556                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5557     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5558     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5559     load32shorts(vs_front(vs2), zetas);
 5560     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5561                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5562     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5563 
 5564     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5565     load32shorts(vs_front(vs2), zetas);
 5566     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5567                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5568     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5569     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5570     load32shorts(vs_front(vs2), zetas);
 5571     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5572                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5573     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5574 
 5575     // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
 5578 
 5579     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5580     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5581     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5582     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5583     vs_subv(vs1, __ T8H, vs1, vs2);
 5584     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5585     load64shorts(vs2, zetas);
 5586     vs_ldpq(vq, kyberConsts);
 5587     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5588     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5589 
 5590     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5591     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5592     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5593     vs_subv(vs1, __ T8H, vs1, vs2);
 5594     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5595     load64shorts(vs2, zetas);
 5596     vs_ldpq(vq, kyberConsts);
 5597     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5598     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5599 
 5600     // Barrett reduction at indexes where overflow may happen
 5601 
 5602     // load q and the multiplier for the Barrett reduction
 5603     __ add(tmpAddr, kyberConsts, 16);
 5604     vs_ldpq(vq, tmpAddr);
 5605 
 5606     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5607     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5608     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
 5609     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5610     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5611     vs_sshr(vs2, __ T8H, vs2, 11);
 5612     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5613     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5614     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5615     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5616     vs_sshr(vs2, __ T8H, vs2, 11);
 5617     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5618     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5619 
 5620     // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size
    // is some multiple of 32 so they can be loaded using ldpq and suitable
    // indexes.
 5623 
 5624     int offsets2[4] = { 0, 64, 128, 192 };
 5625     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5626     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5627     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5628     vs_subv(vs1, __ T8H, vs1, vs2);
 5629     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5630     load64shorts(vs2, zetas);
 5631     vs_ldpq(vq, kyberConsts);
 5632     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5633     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5634 
 5635     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5636     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5637     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5638     vs_subv(vs1, __ T8H, vs1, vs2);
 5639     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5640     load64shorts(vs2, zetas);
 5641     vs_ldpq(vq, kyberConsts);
 5642     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5643     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5644 
 5645     // level 4
 5646 
 5647     int offsets1[4] = { 0, 32, 128, 160 };
 5648     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5649     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5650     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5651     vs_subv(vs1, __ T8H, vs1, vs2);
 5652     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5653     load64shorts(vs2, zetas);
 5654     vs_ldpq(vq, kyberConsts);
 5655     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5656     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5657 
 5658     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5659     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5660     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5661     vs_subv(vs1, __ T8H, vs1, vs2);
 5662     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5663     load64shorts(vs2, zetas);
 5664     vs_ldpq(vq, kyberConsts);
 5665     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5666     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5667 
 5668     // level 5
 5669 
 5670     __ add(tmpAddr, coeffs, 0);
 5671     load64shorts(vs1, tmpAddr);
 5672     __ add(tmpAddr, coeffs, 128);
 5673     load64shorts(vs2, tmpAddr);
 5674     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5675     vs_subv(vs1, __ T8H, vs1, vs2);
 5676     __ add(tmpAddr, coeffs, 0);
 5677     store64shorts(vs3, tmpAddr);
 5678     load64shorts(vs2, zetas);
 5679     vs_ldpq(vq, kyberConsts);
 5680     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5681     __ add(tmpAddr, coeffs, 128);
 5682     store64shorts(vs2, tmpAddr);
 5683 
 5684     load64shorts(vs1, tmpAddr);
 5685     __ add(tmpAddr, coeffs, 384);
 5686     load64shorts(vs2, tmpAddr);
 5687     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5688     vs_subv(vs1, __ T8H, vs1, vs2);
 5689     __ add(tmpAddr, coeffs, 256);
 5690     store64shorts(vs3, tmpAddr);
 5691     load64shorts(vs2, zetas);
 5692     vs_ldpq(vq, kyberConsts);
 5693     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5694     __ add(tmpAddr, coeffs, 384);
 5695     store64shorts(vs2, tmpAddr);
 5696 
 5697     // Barrett reduction at indexes where overflow may happen
 5698 
 5699     // load q and the multiplier for the Barrett reduction
 5700     __ add(tmpAddr, kyberConsts, 16);
 5701     vs_ldpq(vq, tmpAddr);
 5702 
 5703     int offsets0[2] = { 0, 256 };
 5704     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5705     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5706     vs_sshr(vs2, __ T8H, vs2, 11);
 5707     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5708     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5709 
 5710     // level 6
 5711 
 5712     __ add(tmpAddr, coeffs, 0);
 5713     load64shorts(vs1, tmpAddr);
 5714     __ add(tmpAddr, coeffs, 256);
 5715     load64shorts(vs2, tmpAddr);
 5716     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5717     vs_subv(vs1, __ T8H, vs1, vs2);
 5718     __ add(tmpAddr, coeffs, 0);
 5719     store64shorts(vs3, tmpAddr);
 5720     load64shorts(vs2, zetas);
 5721     vs_ldpq(vq, kyberConsts);
 5722     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5723     __ add(tmpAddr, coeffs, 256);
 5724     store64shorts(vs2, tmpAddr);
 5725 
 5726     __ add(tmpAddr, coeffs, 128);
 5727     load64shorts(vs1, tmpAddr);
 5728     __ add(tmpAddr, coeffs, 384);
 5729     load64shorts(vs2, tmpAddr);
 5730     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5731     vs_subv(vs1, __ T8H, vs1, vs2);
 5732     __ add(tmpAddr, coeffs, 128);
 5733     store64shorts(vs3, tmpAddr);
 5734     load64shorts(vs2, zetas);
 5735     vs_ldpq(vq, kyberConsts);
 5736     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5737     __ add(tmpAddr, coeffs, 384);
 5738     store64shorts(vs2, tmpAddr);
 5739 
 5740     // multiply by 2^-n
 5741 
 5742     // load toMont(2^-n mod q)
 5743     __ add(tmpAddr, kyberConsts, 48);
 5744     __ ldr(v29, __ Q, tmpAddr);
 5745 
 5746     vs_ldpq(vq, kyberConsts);
 5747     __ add(tmpAddr, coeffs, 0);
 5748     load64shorts(vs1, tmpAddr);
 5749     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5750     __ add(tmpAddr, coeffs, 0);
 5751     store64shorts(vs2, tmpAddr);
 5752 
    // tmpAddr now contains coeffs + 128 because store64shorts post-increments it
 5754     load64shorts(vs1, tmpAddr);
 5755     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5756     __ add(tmpAddr, coeffs, 128);
 5757     store64shorts(vs2, tmpAddr);
 5758 
 5759     // now tmpAddr contains coeffs + 256
 5760     load64shorts(vs1, tmpAddr);
 5761     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5762     __ add(tmpAddr, coeffs, 256);
 5763     store64shorts(vs2, tmpAddr);
 5764 
 5765     // now tmpAddr contains coeffs + 384
 5766     load64shorts(vs1, tmpAddr);
 5767     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5768     __ add(tmpAddr, coeffs, 384);
 5769     store64shorts(vs2, tmpAddr);
 5770 
 5771     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5772     __ mov(r0, zr); // return 0
 5773     __ ret(lr);
 5774 
 5775     return start;
 5776   }
 5777 
 5778   // Kyber multiply polynomials in the NTT domain.
 5779   // Implements
 5780   // static int implKyberNttMult(
 5781   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5782   //
 5783   // result (short[256]) = c_rarg0
 5784   // ntta (short[256]) = c_rarg1
 5785   // nttb (short[256]) = c_rarg2
 5786   // zetas (short[128]) = c_rarg3
 5787   address generate_kyberNttMult() {
 5788 
 5789     __ align(CodeEntryAlignment);
 5790     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5791     StubCodeMark mark(this, stub_id);
 5792     address start = __ pc();
 5793     __ enter();
 5794 
 5795     const Register result = c_rarg0;
 5796     const Register ntta = c_rarg1;
 5797     const Register nttb = c_rarg2;
 5798     const Register zetas = c_rarg3;
 5799 
 5800     const Register kyberConsts = r10;
 5801     const Register limit = r11;
 5802 
 5803     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5804     VSeq<4> vs3(16), vs4(20);
 5805     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5806     VSeq<2> vz(28);          // pair of zetas
 5807     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5808 
 5809     __ lea(kyberConsts,
 5810              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5811 
 5812     Label kyberNttMult_loop;
 5813 
 5814     __ add(limit, result, 512);
 5815 
 5816     // load q and qinv
 5817     vs_ldpq(vq, kyberConsts);
 5818 
 5819     // load R^2 mod q (to convert back from Montgomery representation)
 5820     __ add(kyberConsts, kyberConsts, 64);
 5821     __ ldr(v27, __ Q, kyberConsts);
 5822 
 5823     __ BIND(kyberNttMult_loop);
 5824 
 5825     // load 16 zetas
 5826     vs_ldpq_post(vz, zetas);
 5827 
 5828     // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5830     // are striped across pairs of vector registers
 5831     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5832     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5833     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5834     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5835 
 5836     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5837     // i.e. montmul the first and second halves of vs1 in order and
 5838     // then with one sequence reversed storing the two results in vs3
 5839     //
 5840     // vs3[0] <- montmul(a0, b0)
 5841     // vs3[1] <- montmul(a1, b1)
 5842     // vs3[2] <- montmul(a0, b1)
 5843     // vs3[3] <- montmul(a1, b0)
 5844     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5845     kyber_montmul16(vs_back(vs3),
 5846                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5847 
 5848     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5849     // i.e. montmul the first and second halves of vs4 in order and
 5850     // then with one sequence reversed storing the two results in vs1
 5851     //
 5852     // vs1[0] <- montmul(a2, b2)
 5853     // vs1[1] <- montmul(a3, b3)
 5854     // vs1[2] <- montmul(a2, b3)
 5855     // vs1[3] <- montmul(a3, b2)
 5856     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5857     kyber_montmul16(vs_back(vs1),
 5858                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5859 
 5860     // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
 5861     // We can schedule two montmuls at a time if we use a suitable vector
 5862     // sequence <vs3[1], vs1[1]>.
 5863     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5864     VSeq<2> vs5(vs3[1], delta);
 5865 
 5866     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5867     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5868     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5869 
 5870     // add results in pairs storing in vs3
 5871     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5872     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5873     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5874 
 5875     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5876     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5877     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
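
    // The sums above implement the ML-KEM base multiplication of coefficient
    // pairs in the NTT domain, i.e. working modulo (X^2 - zeta)
    //
    //   (a0 + a1*X) * (b0 + b1*X) = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
    //
    // which is why one montmul result in each group of four is multiplied
    // by a zeta before the pairwise additions.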
 5878 
 5879     // vs1 <- montmul(vs3, montRSquareModQ)
 5880     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5881 
 5882     // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. each pair of shorts striped across a register pair is stored
    // adjacent in memory
 5885     vs_st2_post(vs1, __ T8H, result);
 5886 
 5887     __ cmp(result, limit);
 5888     __ br(Assembler::NE, kyberNttMult_loop);
 5889 
 5890     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5891     __ mov(r0, zr); // return 0
 5892     __ ret(lr);
 5893 
 5894     return start;
 5895   }
 5896 
 5897   // Kyber add 2 polynomials.
 5898   // Implements
 5899   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5900   //
 5901   // result (short[256]) = c_rarg0
 5902   // a (short[256]) = c_rarg1
 5903   // b (short[256]) = c_rarg2
 5904   address generate_kyberAddPoly_2() {
 5905 
 5906     __ align(CodeEntryAlignment);
 5907     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5908     StubCodeMark mark(this, stub_id);
 5909     address start = __ pc();
 5910     __ enter();
 5911 
 5912     const Register result = c_rarg0;
 5913     const Register a = c_rarg1;
 5914     const Register b = c_rarg2;
 5915 
 5916     const Register kyberConsts = r11;
 5917 
 5918     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5919     // So, we can load, add and store the data in 3 groups of 11,
 5920     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5921     // registers. A further constraint is that the mapping needs
 5922     // to skip callee saves. So, we allocate the register
 5923     // sequences using two 8 sequences, two 2 sequences and two
 5924     // single registers.
 5925     VSeq<8> vs1_1(0);
 5926     VSeq<2> vs1_2(16);
 5927     FloatRegister vs1_3 = v28;
 5928     VSeq<8> vs2_1(18);
 5929     VSeq<2> vs2_2(26);
 5930     FloatRegister vs2_3 = v29;
 5931 
 5932     // two constant vector sequences
 5933     VSeq<8> vc_1(31, 0);
 5934     VSeq<2> vc_2(31, 0);
 5935 
 5936     FloatRegister vc_3 = v31;
 5937     __ lea(kyberConsts,
 5938              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5939 
 5940     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 5941     for (int i = 0; i < 3; i++) {
 5942       // load 80 or 88 values from a into vs1_1/2/3
 5943       vs_ldpq_post(vs1_1, a);
 5944       vs_ldpq_post(vs1_2, a);
 5945       if (i < 2) {
 5946         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5947       }
 5948       // load 80 or 88 values from b into vs2_1/2/3
 5949       vs_ldpq_post(vs2_1, b);
 5950       vs_ldpq_post(vs2_2, b);
 5951       if (i < 2) {
 5952         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5953       }
 5954       // sum 80 or 88 values across vs1 and vs2 into vs1
 5955       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5956       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5957       if (i < 2) {
 5958         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5959       }
 5960       // add constant to all 80 or 88 results
 5961       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5962       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5963       if (i < 2) {
 5964         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5965       }
 5966       // store 80 or 88 values
 5967       vs_stpq_post(vs1_1, result);
 5968       vs_stpq_post(vs1_2, result);
 5969       if (i < 2) {
 5970         __ str(vs1_3, __ Q, __ post(result, 16));
 5971       }
 5972     }
 5973 
 5974     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5975     __ mov(r0, zr); // return 0
 5976     __ ret(lr);
 5977 
 5978     return start;
 5979   }
 5980 
 5981   // Kyber add 3 polynomials.
 5982   // Implements
 5983   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 5984   //
 5985   // result (short[256]) = c_rarg0
 5986   // a (short[256]) = c_rarg1
 5987   // b (short[256]) = c_rarg2
 5988   // c (short[256]) = c_rarg3
 5989   address generate_kyberAddPoly_3() {
 5990 
 5991     __ align(CodeEntryAlignment);
 5992     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 5993     StubCodeMark mark(this, stub_id);
 5994     address start = __ pc();
 5995     __ enter();
 5996 
 5997     const Register result = c_rarg0;
 5998     const Register a = c_rarg1;
 5999     const Register b = c_rarg2;
 6000     const Register c = c_rarg3;
 6001 
 6002     const Register kyberConsts = r11;
 6003 
 6004     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6005     // quadwords.  So, we can load, add and store the data in 3
 6006     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6007     // of 10 or 11 registers. A further constraint is that the
 6008     // mapping needs to skip callee saves. So, we allocate the
 6009     // register sequences using two 8 sequences, two 2 sequences
 6010     // and two single registers.
 6011     VSeq<8> vs1_1(0);
 6012     VSeq<2> vs1_2(16);
 6013     FloatRegister vs1_3 = v28;
 6014     VSeq<8> vs2_1(18);
 6015     VSeq<2> vs2_2(26);
 6016     FloatRegister vs2_3 = v29;
 6017 
 6018     // two constant vector sequences
 6019     VSeq<8> vc_1(31, 0);
 6020     VSeq<2> vc_2(31, 0);
 6021 
 6022     FloatRegister vc_3 = v31;
 6023 
 6024     __ lea(kyberConsts,
 6025              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6026 
 6027     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6028     for (int i = 0; i < 3; i++) {
 6029       // load 80 or 88 values from a into vs1_1/2/3
 6030       vs_ldpq_post(vs1_1, a);
 6031       vs_ldpq_post(vs1_2, a);
 6032       if (i < 2) {
 6033         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6034       }
 6035       // load 80 or 88 values from b into vs2_1/2/3
 6036       vs_ldpq_post(vs2_1, b);
 6037       vs_ldpq_post(vs2_2, b);
 6038       if (i < 2) {
 6039         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6040       }
 6041       // sum 80 or 88 values across vs1 and vs2 into vs1
 6042       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6043       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6044       if (i < 2) {
 6045         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6046       }
 6047       // load 80 or 88 values from c into vs2_1/2/3
 6048       vs_ldpq_post(vs2_1, c);
 6049       vs_ldpq_post(vs2_2, c);
 6050       if (i < 2) {
 6051         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6052       }
 6053       // sum 80 or 88 values across vs1 and vs2 into vs1
 6054       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6055       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6056       if (i < 2) {
 6057         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6058       }
 6059       // add constant to all 80 or 88 results
 6060       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6061       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6062       if (i < 2) {
 6063         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6064       }
 6065       // store 80 or 88 values
 6066       vs_stpq_post(vs1_1, result);
 6067       vs_stpq_post(vs1_2, result);
 6068       if (i < 2) {
 6069         __ str(vs1_3, __ Q, __ post(result, 16));
 6070       }
 6071     }
 6072 
 6073     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6074     __ mov(r0, zr); // return 0
 6075     __ ret(lr);
 6076 
 6077     return start;
 6078   }
 6079 
 6080   // Kyber parse XOF output to polynomial coefficient candidates
 6081   // or decodePoly(12, ...).
 6082   // Implements
 6083   // static int implKyber12To16(
 6084   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6085   //
 6086   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6087   //
 6088   // condensed (byte[]) = c_rarg0
 6089   // condensedIndex = c_rarg1
 6090   // parsed (short[112 or 256]) = c_rarg2
 6091   // parsedLength (112 or 256) = c_rarg3
 6092   address generate_kyber12To16() {
 6093     Label L_F00, L_loop, L_end;
 6094 
 6095     __ align(CodeEntryAlignment);
 6096     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6097     StubCodeMark mark(this, stub_id);
 6098     address start = __ pc();
 6099     __ enter();
 6100 
 6101     const Register condensed = c_rarg0;
 6102     const Register condensedOffs = c_rarg1;
 6103     const Register parsed = c_rarg2;
 6104     const Register parsedLength = c_rarg3;
 6105 
 6106     const Register tmpAddr = r11;
 6107 
 6108     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6109     // quadwords so we need a 6 vector sequence for the inputs.
 6110     // Parsing produces 64 shorts, employing two 8 vector
 6111     // sequences to store and combine the intermediate data.
 6112     VSeq<6> vin(24);
 6113     VSeq<8> va(0), vb(16);
 6114 
 6115     __ adr(tmpAddr, L_F00);
 6116     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6117     __ add(condensed, condensed, condensedOffs);
 6118 
 6119     __ BIND(L_loop);
 6120     // load 96 (6 x 16B) byte values
 6121     vs_ld3_post(vin, __ T16B, condensed);
 6122 
 6123     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6124     // holds 48 (16x3) contiguous bytes from memory striped
 6125     // horizontally across each of the 16 byte lanes. Equivalently,
 6126     // that is 16 pairs of 12-bit integers. Likewise the back half
 6127     // holds the next 48 bytes in the same arrangement.
 6128 
 6129     // Each vector in the front half can also be viewed as a vertical
 6130     // strip across the 16 pairs of 12 bit integers. Each byte in
 6131     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6132     // byte in vin[1] stores the high 4 bits of the first int and the
 6133     // low 4 bits of the second int. Each byte in vin[2] stores the
 6134     // high 8 bits of the second int. Likewise the vectors in second
 6135     // half.
 6136 
 6137     // Converting the data to 16-bit shorts requires first of all
 6138     // expanding each of the 6 x 16B vectors into 6 corresponding
 6139     // pairs of 8H vectors. Mask, shift and add operations on the
 6140     // resulting vector pairs can be used to combine 4 and 8 bit
 6141     // parts of related 8H vector elements.
 6142     //
 6143     // The middle vectors (vin[2] and vin[5]) are actually expanded
 6144     // twice, one copy manipulated to provide the lower 4 bits
 6145     // belonging to the first short in a pair and another copy
 6146     // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
 6148     // and vb used to hold the expanded 8H elements are of length 8.
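
    // As a scalar sketch (illustrative only), each group of 3 bytes
    // b0, b1, b2 read from the condensed array encodes two 12-bit values
    //
    //   d0 = b0 | ((b1 & 0x0f) << 8)
    //   d1 = (b1 >> 4) | (b2 << 4)
    //
    // and the mask, shift and add steps below reassemble d0 and d1 as
    // 16-bit shorts, 16 pairs at a time per vector stripe.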
 6149 
 6150     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6151     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6152     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6153     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6154     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6155     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6156     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6157     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6158 
 6159     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6160     // and vb[4:5]
 6161     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6162     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6163     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6164     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6165     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6166     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6167 
 6168     // shift lo byte of copy 1 of the middle stripe into the high byte
 6169     __ shl(va[2], __ T8H, va[2], 8);
 6170     __ shl(va[3], __ T8H, va[3], 8);
 6171     __ shl(vb[2], __ T8H, vb[2], 8);
 6172     __ shl(vb[3], __ T8H, vb[3], 8);
 6173 
 6174     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6175     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6176     // are in bit positions [4..11].
 6177     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6178     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6179     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6180     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6181 
 6182     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6183     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6184     // copy2
 6185     __ andr(va[2], __ T16B, va[2], v31);
 6186     __ andr(va[3], __ T16B, va[3], v31);
 6187     __ ushr(va[4], __ T8H, va[4], 4);
 6188     __ ushr(va[5], __ T8H, va[5], 4);
 6189     __ andr(vb[2], __ T16B, vb[2], v31);
 6190     __ andr(vb[3], __ T16B, vb[3], v31);
 6191     __ ushr(vb[4], __ T8H, vb[4], 4);
 6192     __ ushr(vb[5], __ T8H, vb[5], 4);
 6193 
 6194     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6195     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6196     // n.b. the ordering ensures: i) inputs are consumed before they
 6197     // are overwritten ii) the order of 16-bit results across successive
 6198     // pairs of vectors in va and then vb reflects the order of the
 6199     // corresponding 12-bit inputs
 6200     __ addv(va[0], __ T8H, va[0], va[2]);
 6201     __ addv(va[2], __ T8H, va[1], va[3]);
 6202     __ addv(va[1], __ T8H, va[4], va[6]);
 6203     __ addv(va[3], __ T8H, va[5], va[7]);
 6204     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6205     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6206     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6207     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6208 
 6209     // store 64 results interleaved as shorts
 6210     vs_st2_post(vs_front(va), __ T8H, parsed);
 6211     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6212 
 6213     __ sub(parsedLength, parsedLength, 64);
 6214     __ cmp(parsedLength, (u1)64);
 6215     __ br(Assembler::GE, L_loop);
 6216     __ cbz(parsedLength, L_end);
 6217 
    // if anything is left it should be a final 72 bytes of input
    // i.e. 48 more 12-bit values. We handle this by loading
    // 48 bytes into all 16B lanes of front(vin) and only 24
    // bytes into the lower 8B lanes of back(vin)
 6222     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6223     vs_ld3(vs_back(vin), __ T8B, condensed);
 6224 
 6225     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6226     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6227     // 5 and target element 2 of vb duplicates element 4.
 6228     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6229     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6230     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6231     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6232     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6233     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6234 
 6235     // This time expand just the lower 8 lanes
 6236     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6237     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6238     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6239 
 6240     // shift lo byte of copy 1 of the middle stripe into the high byte
 6241     __ shl(va[2], __ T8H, va[2], 8);
 6242     __ shl(va[3], __ T8H, va[3], 8);
 6243     __ shl(vb[2], __ T8H, vb[2], 8);
 6244 
 6245     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6246     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6247     // int are in bit positions [4..11].
 6248     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6249     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6250     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6251 
 6252     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6253     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6254     // copy2
 6255     __ andr(va[2], __ T16B, va[2], v31);
 6256     __ andr(va[3], __ T16B, va[3], v31);
 6257     __ ushr(va[4], __ T8H, va[4], 4);
 6258     __ ushr(va[5], __ T8H, va[5], 4);
 6259     __ andr(vb[2], __ T16B, vb[2], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);

 6264     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6265     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6266 
 6267     // n.b. ordering ensures: i) inputs are consumed before they are
    // overwritten ii) order of 16-bit results across successive
 6269     // pairs of vectors in va and then lower half of vb reflects order
 6270     // of corresponding 12-bit inputs
 6271     __ addv(va[0], __ T8H, va[0], va[2]);
 6272     __ addv(va[2], __ T8H, va[1], va[3]);
 6273     __ addv(va[1], __ T8H, va[4], va[6]);
 6274     __ addv(va[3], __ T8H, va[5], va[7]);
 6275     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6276     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6277 
 6278     // store 48 results interleaved as shorts
 6279     vs_st2_post(vs_front(va), __ T8H, parsed);
 6280     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6281 
 6282     __ BIND(L_end);
 6283 
 6284     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6285     __ mov(r0, zr); // return 0
 6286     __ ret(lr);
 6287 
 6288     // bind label and generate constant data used by this stub
 6289     __ BIND(L_F00);
 6290     __ emit_int64(0x0f000f000f000f00);
 6291     __ emit_int64(0x0f000f000f000f00);
 6292 
 6293     return start;
 6294   }
 6295 
 6296   // Kyber Barrett reduce function.
 6297   // Implements
 6298   // static int implKyberBarrettReduce(short[] coeffs) {}
 6299   //
 6300   // coeffs (short[256]) = c_rarg0
 6301   address generate_kyberBarrettReduce() {
 6302 
 6303     __ align(CodeEntryAlignment);
 6304     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6305     StubCodeMark mark(this, stub_id);
 6306     address start = __ pc();
 6307     __ enter();
 6308 
 6309     const Register coeffs = c_rarg0;
 6310 
 6311     const Register kyberConsts = r10;
 6312     const Register result = r11;
 6313 
 6314     // As above we process 256 sets of values in total i.e. 32 x
 6315     // 8H quadwords. So, we can load, add and store the data in 3
 6316     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6317     // of 10 or 11 registers. A further constraint is that the
 6318     // mapping needs to skip callee saves. So, we allocate the
 6319     // register sequences using two 8 sequences, two 2 sequences
 6320     // and two single registers.
 6321     VSeq<8> vs1_1(0);
 6322     VSeq<2> vs1_2(16);
 6323     FloatRegister vs1_3 = v28;
 6324     VSeq<8> vs2_1(18);
 6325     VSeq<2> vs2_2(26);
 6326     FloatRegister vs2_3 = v29;
 6327 
 6328     // we also need a pair of corresponding constant sequences
 6329 
 6330     VSeq<8> vc1_1(30, 0);
 6331     VSeq<2> vc1_2(30, 0);
 6332     FloatRegister vc1_3 = v30; // for kyber_q
 6333 
 6334     VSeq<8> vc2_1(31, 0);
 6335     VSeq<2> vc2_2(31, 0);
 6336     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6337 
 6338     __ add(result, coeffs, 0);
 6339     __ lea(kyberConsts,
 6340              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6341 
 6342     // load q and the multiplier for the Barrett reduction
 6343     __ add(kyberConsts, kyberConsts, 16);
 6344     __ ldpq(vc1_3, vc2_3, kyberConsts);
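
    // As a scalar sketch (illustrative only, assuming the ML-KEM modulus
    // q = 3329 and a Barrett multiplier of roughly 2^26 / q), each 16-bit
    // lane of the loop below computes the equivalent of
    //
    //   int16_t t = (int16_t)(((int32_t)x * multiplier) >> 26);
    //   x = (int16_t)(x - t * q);
    //
    // which leaves x congruent to the input mod q but with a much smaller
    // absolute value.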
 6345 
 6346     for (int i = 0; i < 3; i++) {
 6347       // load 80 or 88 coefficients
 6348       vs_ldpq_post(vs1_1, coeffs);
 6349       vs_ldpq_post(vs1_2, coeffs);
 6350       if (i < 2) {
 6351         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6352       }
 6353 
 6354       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6355       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6356       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6357       if (i < 2) {
 6358         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6359       }
 6360 
 6361       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6362       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6363       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6364       if (i < 2) {
 6365         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6366       }
 6367 
 6368       // vs1 <- vs1 - vs2 * kyber_q
 6369       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6370       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6371       if (i < 2) {
 6372         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6373       }
 6374 
 6375       vs_stpq_post(vs1_1, result);
 6376       vs_stpq_post(vs1_2, result);
 6377       if (i < 2) {
 6378         __ str(vs1_3, __ Q, __ post(result, 16));
 6379       }
 6380     }
 6381 
 6382     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6383     __ mov(r0, zr); // return 0
 6384     __ ret(lr);
 6385 
 6386     return start;
 6387   }
 6388 
 6389 
  // Dilithium-specific montmul helper routines that generate parallel
  // code for, respectively, a single 4x4S vector sequence montmul or
  // two such multiplies in a row.
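  //
  // As a scalar sketch (illustrative only; it assumes the ML-DSA modulus
  // q = 8380417, R = 2^32 and a hypothetical constant QINV = q^-1 mod 2^32),
  // each 32-bit lane computes the equivalent of
  //
  //   int32_t montmul(int32_t a, int32_t b) {
  //     int64_t c = (int64_t)a * b;                   // full 64-bit product
  //     int32_t m = (int32_t)(c * QINV);              // m = c * q^-1 mod 2^32
  //     return (int32_t)((c - (int64_t)m * q) >> 32); // exact division by R
  //   }
  //
  // i.e. a result congruent to a * b * R^-1 mod q.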
 6393 
 6394   // Perform 16 32-bit Montgomery multiplications in parallel
 6395   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6396                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6397     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6398     // It will assert that the register use is valid
 6399     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6400   }
 6401 
 6402   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6403   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6404                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6405     // Schedule two successive 4x4S multiplies via the montmul helper
 6406     // on the front and back halves of va, vb and vc. The helper will
 6407     // assert that the register use has no overlap conflicts on each
 6408     // individual call but we also need to ensure that the necessary
 6409     // disjoint/equality constraints are met across both calls.
 6410 
 6411     // vb, vc, vtmp and vq must be disjoint. va must either be
 6412     // disjoint from all other registers or equal vc
 6413 
 6414     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6415     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6416     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6417 
 6418     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6419     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6420 
 6421     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6422 
 6423     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6424     assert(vs_disjoint(va, vb), "va and vb overlap");
 6425     assert(vs_disjoint(va, vq), "va and vq overlap");
 6426     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6427 
 6428     // We multiply the front and back halves of each sequence 4 at a
 6429     // time because
 6430     //
 6431     // 1) we are currently only able to get 4-way instruction
 6432     // parallelism at best
 6433     //
 6434     // 2) we need registers for the constants in vq and temporary
 6435     // scratch registers to hold intermediate results so vtmp can only
 6436     // be a VSeq<4> which means we only have 4 scratch slots.
 6437 
 6438     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6439     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6440   }
 6441 
 6442   // Perform combined montmul then add/sub on 4x4S vectors.
 6443   void dilithium_montmul16_sub_add(
 6444           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6445           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6446     // compute a = montmul(a1, c)
 6447     dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
 6449     vs_subv(va1, __ T4S, va0, vc);
 6450     //    and a0 = a0 + a
 6451     vs_addv(va0, __ T4S, va0, vc);
 6452   }
 6453 
  // Perform combined add/sub then montmul on 4x4S vectors.
 6455   void dilithium_sub_add_montmul16(
 6456           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6457           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6458     // compute c = a0 - a1
 6459     vs_subv(vtmp1, __ T4S, va0, va1);
 6460     // output a0 = a0 + a1
 6461     vs_addv(va0, __ T4S, va0, va1);
 6462     // output a1 = b montmul c
 6463     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6464   }
 6465 
 6466   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6467   // in the Java implementation come in sequences of at least 8, so we
 6468   // can use ldpq to collect the corresponding data into pairs of vector
 6469   // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // the vector registers v0-v7 and the zetas into the vector registers
  // v16-v23, then we do the (Montgomery) multiplications by the zetas in
  // parallel into v16-v23, load the coeffs corresponding to the 'j' indexes
  // into v0-v7, do the additions into v24-v31 and the subtractions into
  // v0-v7, and finally save the results back to the coeffs array.
 6476   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6477     const Register coeffs, const Register zetas) {
 6478     int c1 = 0;
 6479     int c2 = 512;
 6480     int startIncr;
 6481     // don't use callee save registers v8 - v15
 6482     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6483     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6484     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6485     int offsets[4] = { 0, 32, 64, 96 };
 6486 
 6487     for (int level = 0; level < 5; level++) {
 6488       int c1Start = c1;
 6489       int c2Start = c2;
 6490       if (level == 3) {
 6491         offsets[1] = 32;
 6492         offsets[2] = 128;
 6493         offsets[3] = 160;
 6494       } else if (level == 4) {
 6495         offsets[1] = 64;
 6496         offsets[2] = 128;
 6497         offsets[3] = 192;
 6498       }
 6499 
 6500       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6501       // time at 4 different offsets and multiply them in order by the
 6502       // next set of input values. So we employ indexed load and store
 6503       // pair instructions with arrangement 4S.
 6504       for (int i = 0; i < 4; i++) {
 6505         // reload q and qinv
 6506         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6507         // load 8x4S coefficients via second start pos == c2
 6508         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6509         // load next 8x4S inputs == b
 6510         vs_ldpq_post(vs2, zetas);
 6511         // compute a == c2 * b mod MONT_Q
 6512         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6513         // load 8x4s coefficients via first start pos == c1
 6514         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6515         // compute a1 =  c1 + a
 6516         vs_addv(vs3, __ T4S, vs1, vs2);
 6517         // compute a2 =  c1 - a
 6518         vs_subv(vs1, __ T4S, vs1, vs2);
 6519         // output a1 and a2
 6520         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6521         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6522 
 6523         int k = 4 * level + i;
 6524 
 6525         if (k > 7) {
 6526           startIncr = 256;
 6527         } else if (k == 5) {
 6528           startIncr = 384;
 6529         } else {
 6530           startIncr = 128;
 6531         }
 6532 
 6533         c1Start += startIncr;
 6534         c2Start += startIncr;
 6535       }
 6536 
 6537       c2 /= 2;
 6538     }
 6539   }
 6540 
 6541   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6542   // Implements the method
 6543   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // in the sun.security.provider package
 6545   //
 6546   // coeffs (int[256]) = c_rarg0
 6547   // zetas (int[256]) = c_rarg1
 6548   address generate_dilithiumAlmostNtt() {
 6549 
 6550     __ align(CodeEntryAlignment);
 6551     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6552     StubCodeMark mark(this, stub_id);
 6553     address start = __ pc();
 6554     __ enter();
 6555 
 6556     const Register coeffs = c_rarg0;
 6557     const Register zetas = c_rarg1;
 6558 
 6559     const Register tmpAddr = r9;
 6560     const Register dilithiumConsts = r10;
 6561     const Register result = r11;
 6562     // don't use callee save registers v8 - v15
 6563     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6564     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6565     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6566     int offsets[4] = { 0, 32, 64, 96};
 6567     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6568     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6569     __ add(result, coeffs, 0);
 6570     __ lea(dilithiumConsts,
 6571              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6572 
 6573     // Each level represents one iteration of the outer for loop of the Java version.
 6574 
 6575     // level 0-4
 6576     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6577 
 6578     // level 5
 6579 
 6580     // At level 5 the coefficients we need to combine with the zetas
 6581     // are grouped in memory in blocks of size 4. So, for both sets of
 6582     // coefficients we load 4 adjacent values at 8 different offsets
 6583     // using an indexed ldr with register variant Q and multiply them
 6584     // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
 6586     for (int i = 0; i < 1024; i += 256) {
 6587       // reload constants q, qinv each iteration as they get clobbered later
 6588       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6589       // load 32 (8x4S) coefficients via first offsets = c1
 6590       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6591       // load next 32 (8x4S) inputs = b
 6592       vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
 6594       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6595       // load 32 (8x4S) coefficients via second offsets = c2
 6596       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6597       // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6600       // write back new coefficients using same offsets
 6601       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6602       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6603     }
 6604 
 6605     // level 6
 6606     // At level 6 the coefficients we need to combine with the zetas
 6607     // are grouped in memory in pairs, the first two being montmul
 6608     // inputs and the second add/sub inputs. We can still implement
 6609     // the montmul+sub+add using 4-way parallelism but only if we
 6610     // combine the coefficients with the zetas 16 at a time. We load 8
 6611     // adjacent values at 4 different offsets using an ld2 load with
 6612     // arrangement 2D. That interleaves the lower and upper halves of
 6613     // each pair of quadwords into successive vector registers. We
 6614     // then need to montmul the 4 even elements of the coefficients
 6615     // register sequence by the zetas in order and then add/sub the 4
 6616     // odd elements of the coefficients register sequence. We use an
 6617     // equivalent st2 operation to store the results back into memory
 6618     // de-interleaved.
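           // For reference: an ld2 with arrangement 2D de-interleaves
           // 64-bit halves, i.e. for doublewords d0 d1 d2 d3 in memory the
           // two destination registers receive {d0, d2} and {d1, d3}; st2
           // with the same arrangement re-interleaves them on the way out.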
 6619     for (int i = 0; i < 1024; i += 128) {
 6620       // reload constants q, qinv each iteration as they get clobbered later
 6621       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6622       // load interleaved 16 (4x2D) coefficients via offsets
 6623       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6624       // load next 16 (4x4S) inputs
 6625       vs_ldpq_post(vs_front(vs2), zetas);
 6626       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6627       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6628                                   vs_front(vs2), vtmp, vq);
 6629       // store interleaved 16 (4x2D) coefficients via offsets
 6630       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6631     }
 6632 
 6633     // level 7
 6634     // At level 7 the coefficients we need to combine with the zetas
  6635     // occur singly with montmul inputs alternating with add/sub
 6636     // inputs. Once again we can use 4-way parallelism to combine 16
 6637     // zetas at a time. However, we have to load 8 adjacent values at
 6638     // 4 different offsets using an ld2 load with arrangement 4S. That
  6639     // interleaves the odd words of each pair into one
 6640     // coefficients vector register and the even words of the pair
 6641     // into the next register. We then need to montmul the 4 even
 6642     // elements of the coefficients register sequence by the zetas in
 6643     // order and then add/sub the 4 odd elements of the coefficients
 6644     // register sequence. We use an equivalent st2 operation to store
 6645     // the results back into memory de-interleaved.
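           // For reference: an ld2 with arrangement 4S de-interleaves
           // 32-bit words, i.e. for words w0 .. w7 in memory the two
           // destination registers receive {w0, w2, w4, w6} and
           // {w1, w3, w5, w7}; st2 re-interleaves them on the way out.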
 6646 
 6647     for (int i = 0; i < 1024; i += 128) {
 6648       // reload constants q, qinv each iteration as they get clobbered later
 6649       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6650       // load interleaved 16 (4x4S) coefficients via offsets
 6651       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6652       // load next 16 (4x4S) inputs
 6653       vs_ldpq_post(vs_front(vs2), zetas);
 6654       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6655       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6656                                   vs_front(vs2), vtmp, vq);
 6657       // store interleaved 16 (4x4S) coefficients via offsets
 6658       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6659     }
 6660     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6661     __ mov(r0, zr); // return 0
 6662     __ ret(lr);
 6663 
 6664     return start;
 6665   }
 6666 
 6667   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6668   // in the Java implementation come in sequences of at least 8, so we
 6669   // can use ldpq to collect the corresponding data into pairs of vector
  6670   // registers.
  6671   // We collect the coefficients that correspond to the 'j's into vs1 and
  6672   // the coefficients that correspond to the 'j+l's into vs2, then
  6673   // do the additions into vs3 and the subtractions into vs1, then
  6674   // save the result of the additions, load the zetas into vs2,
  6675   // do the (Montgomery) multiplications by zeta in parallel into vs2,
  6676   // and finally save the results back to the coeffs array.
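         //
         // As with the forward transform, this amounts (roughly) to the
         // scalar inverse butterfly, with montMul(a, b) denoting
         // Montgomery multiplication, a * b * 2^-32 mod q:
         //
         //   int tmp = coeffs[j];
         //   coeffs[j] = tmp + coeffs[j + l];
         //   coeffs[j + l] = montMul(zeta, tmp - coeffs[j + l]);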
 6677   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6678     const Register coeffs, const Register zetas) {
 6679     int c1 = 0;
 6680     int c2 = 32;
 6681     int startIncr;
 6682     int offsets[4];
 6683     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6684     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6685     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6686 
 6687     offsets[0] = 0;
 6688 
 6689     for (int level = 3; level < 8; level++) {
 6690       int c1Start = c1;
 6691       int c2Start = c2;
 6692       if (level == 3) {
 6693         offsets[1] = 64;
 6694         offsets[2] = 128;
 6695         offsets[3] = 192;
 6696       } else if (level == 4) {
 6697         offsets[1] = 32;
 6698         offsets[2] = 128;
 6699         offsets[3] = 160;
 6700       } else {
 6701         offsets[1] = 32;
 6702         offsets[2] = 64;
 6703         offsets[3] = 96;
 6704       }
 6705 
 6706       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6707       // time at 4 different offsets and multiply them in order by the
 6708       // next set of input values. So we employ indexed load and store
 6709       // pair instructions with arrangement 4S.
 6710       for (int i = 0; i < 4; i++) {
 6711         // load v1 32 (8x4S) coefficients relative to first start index
 6712         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6713         // load v2 32 (8x4S) coefficients relative to second start index
 6714         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
  6715         // a0 = v1 + v2 -- n.b. clobbers vq
 6716         vs_addv(vs3, __ T4S, vs1, vs2);
 6717         // a1 = v1 - v2
 6718         vs_subv(vs1, __ T4S, vs1, vs2);
  6719         // save a0 relative to first start index
 6720         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6721         // load constants q, qinv each iteration as they get clobbered above
 6722         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6723         // load b next 32 (8x4S) inputs
 6724         vs_ldpq_post(vs2, zetas);
 6725         // a = a1 montmul b
 6726         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6727         // save a relative to second start index
 6728         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6729 
 6730         int k = 4 * level + i;
 6731 
 6732         if (k < 24) {
 6733           startIncr = 256;
 6734         } else if (k == 25) {
 6735           startIncr = 384;
 6736         } else {
 6737           startIncr = 128;
 6738         }
 6739 
 6740         c1Start += startIncr;
 6741         c2Start += startIncr;
 6742       }
 6743 
 6744       c2 *= 2;
 6745     }
 6746   }
 6747 
 6748   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6749   // Implements the method
 6750   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6751   // the sun.security.provider.ML_DSA class.
 6752   //
 6753   // coeffs (int[256]) = c_rarg0
 6754   // zetas (int[256]) = c_rarg1
 6755   address generate_dilithiumAlmostInverseNtt() {
 6756 
 6757     __ align(CodeEntryAlignment);
 6758     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6759     StubCodeMark mark(this, stub_id);
 6760     address start = __ pc();
 6761     __ enter();
 6762 
 6763     const Register coeffs = c_rarg0;
 6764     const Register zetas = c_rarg1;
 6765 
 6766     const Register tmpAddr = r9;
 6767     const Register dilithiumConsts = r10;
 6768     const Register result = r11;
 6769     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6770     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6771     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6772     int offsets[4] = { 0, 32, 64, 96 };
 6773     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6774     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6775 
 6776     __ add(result, coeffs, 0);
 6777     __ lea(dilithiumConsts,
 6778              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6779 
 6780     // Each level represents one iteration of the outer for loop of the Java version
 6781 
 6782     // level 0
 6783     // At level 0 we need to interleave adjacent quartets of
 6784     // coefficients before we multiply and add/sub by the next 16
 6785     // zetas just as we did for level 7 in the multiply code. So we
 6786     // load and store the values using an ld2/st2 with arrangement 4S.
 6787     for (int i = 0; i < 1024; i += 128) {
 6788       // load constants q, qinv
 6789       // n.b. this can be moved out of the loop as they do not get
 6790       // clobbered by first two loops
 6791       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6792       // a0/a1 load interleaved 32 (8x4S) coefficients
 6793       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6794       // b load next 32 (8x4S) inputs
 6795       vs_ldpq_post(vs_front(vs2), zetas);
 6796       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6797       // n.b. second half of vs2 provides temporary register storage
 6798       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6799                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6800       // a0/a1 store interleaved 32 (8x4S) coefficients
 6801       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6802     }
 6803 
 6804     // level 1
 6805     // At level 1 we need to interleave pairs of adjacent pairs of
 6806     // coefficients before we multiply by the next 16 zetas just as we
 6807     // did for level 6 in the multiply code. So we load and store the
  6808     // values using an ld2/st2 with arrangement 2D.
 6809     for (int i = 0; i < 1024; i += 128) {
 6810       // a0/a1 load interleaved 32 (8x2D) coefficients
 6811       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6812       // b load next 16 (4x4S) inputs
 6813       vs_ldpq_post(vs_front(vs2), zetas);
 6814       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6815       // n.b. second half of vs2 provides temporary register storage
 6816       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6817                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6818       // a0/a1 store interleaved 32 (8x2D) coefficients
 6819       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6820     }
 6821 
 6822     // level 2
 6823     // At level 2 coefficients come in blocks of 4. So, we load 4
 6824     // adjacent coefficients at 8 distinct offsets for both the first
 6825     // and second coefficient sequences, using an ldr with register
 6826     // variant Q then combine them with next set of 32 zetas. Likewise
 6827     // we store the results using an str with register variant Q.
 6828     for (int i = 0; i < 1024; i += 256) {
 6829       // c0 load 32 (8x4S) coefficients via first offsets
 6830       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6831       // c1 load 32 (8x4S) coefficients via second offsets
  6832       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6833       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6834       vs_addv(vs3, __ T4S, vs1, vs2);
 6835       // c = c0 - c1
 6836       vs_subv(vs1, __ T4S, vs1, vs2);
 6837       // store a0 32 (8x4S) coefficients via first offsets
 6838       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6839       // b load 32 (8x4S) next inputs
 6840       vs_ldpq_post(vs2, zetas);
 6841       // reload constants q, qinv -- they were clobbered earlier
 6842       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6843       // compute a1 = b montmul c
 6844       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6845       // store a1 32 (8x4S) coefficients via second offsets
 6846       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6847     }
 6848 
 6849     // level 3-7
 6850     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6851 
 6852     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6853     __ mov(r0, zr); // return 0
 6854     __ ret(lr);
 6855 
 6856     return start;
 6857   }
 6858 
 6859   // Dilithium multiply polynomials in the NTT domain.
 6860   // Straightforward implementation of the method
 6861   // static int implDilithiumNttMult(
  6862   //              int[] result, int[] ntta, int[] nttb) {} of
 6863   // the sun.security.provider.ML_DSA class.
 6864   //
 6865   // result (int[256]) = c_rarg0
 6866   // poly1 (int[256]) = c_rarg1
 6867   // poly2 (int[256]) = c_rarg2
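         //
         // A note on the arithmetic: with montMul(a, b) = a * b * 2^-32 mod q
         // and R = 2^32, the loop below effectively computes
         //
         //   result[i] = montMul(montMul(poly1[i], poly2[i]), R^2 mod q)
         //             = poly1[i] * poly2[i] mod q
         //
         // i.e. the extra multiply by the RSQUARE constant cancels the two
         // Montgomery factors of R^-1.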
 6868   address generate_dilithiumNttMult() {
 6869 
  6870     __ align(CodeEntryAlignment);
 6871     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6872     StubCodeMark mark(this, stub_id);
 6873     address start = __ pc();
 6874     __ enter();
 6875 
 6876     Label L_loop;
 6877 
 6878     const Register result = c_rarg0;
 6879     const Register poly1 = c_rarg1;
 6880     const Register poly2 = c_rarg2;
 6881 
 6882     const Register dilithiumConsts = r10;
 6883     const Register len = r11;
 6884 
 6885     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6886     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6887     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6888     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6889 
 6890     __ lea(dilithiumConsts,
 6891              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6892 
 6893     // load constants q, qinv
 6894     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6895     // load constant rSquare into v29
 6896     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6897 
 6898     __ mov(len, zr);
 6899     __ add(len, len, 1024);
 6900 
 6901     __ BIND(L_loop);
 6902 
 6903     // b load 32 (8x4S) next inputs from poly1
 6904     vs_ldpq_post(vs1, poly1);
 6905     // c load 32 (8x4S) next inputs from poly2
 6906     vs_ldpq_post(vs2, poly2);
 6907     // compute a = b montmul c
 6908     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6909     // compute a = rsquare montmul a
 6910     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6911     // save a 32 (8x4S) results
 6912     vs_stpq_post(vs2, result);
 6913 
 6914     __ sub(len, len, 128);
 6915     __ cmp(len, (u1)128);
 6916     __ br(Assembler::GE, L_loop);
 6917 
 6918     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6919     __ mov(r0, zr); // return 0
 6920     __ ret(lr);
 6921 
 6922     return start;
 6923   }
 6924 
  6925   // Dilithium Montgomery multiply an array by a constant.
 6926   // A straightforward implementation of the method
 6927   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  6928   // of the sun.security.provider.ML_DSA class
 6929   //
 6930   // coeffs (int[256]) = c_rarg0
 6931   // constant (int) = c_rarg1
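         //
         // With the same montMul notation as above, each element is replaced
         // by montMul(coeffs[i], constant) = coeffs[i] * constant * 2^-32 mod q.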
 6932   address generate_dilithiumMontMulByConstant() {
 6933 
 6934     __ align(CodeEntryAlignment);
 6935     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6936     StubCodeMark mark(this, stub_id);
 6937     address start = __ pc();
 6938     __ enter();
 6939 
 6940     Label L_loop;
 6941 
 6942     const Register coeffs = c_rarg0;
 6943     const Register constant = c_rarg1;
 6944 
 6945     const Register dilithiumConsts = r10;
 6946     const Register result = r11;
 6947     const Register len = r12;
 6948 
 6949     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6950     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6951     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6952     VSeq<8> vconst(29, 0);             // for montmul by constant
 6953 
 6954     // results track inputs
 6955     __ add(result, coeffs, 0);
 6956     __ lea(dilithiumConsts,
 6957              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6958 
 6959     // load constants q, qinv -- they do not get clobbered by first two loops
 6960     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6961     // copy caller supplied constant across vconst
 6962     __ dup(vconst[0], __ T4S, constant);
 6963     __ mov(len, zr);
 6964     __ add(len, len, 1024);
 6965 
 6966     __ BIND(L_loop);
 6967 
 6968     // load next 32 inputs
 6969     vs_ldpq_post(vs2, coeffs);
 6970     // mont mul by constant
 6971     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6972     // write next 32 results
 6973     vs_stpq_post(vs2, result);
 6974 
 6975     __ sub(len, len, 128);
 6976     __ cmp(len, (u1)128);
 6977     __ br(Assembler::GE, L_loop);
 6978 
 6979     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6980     __ mov(r0, zr); // return 0
 6981     __ ret(lr);
 6982 
 6983     return start;
 6984   }
 6985 
 6986   // Dilithium decompose poly.
 6987   // Implements the method
 6988   // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
 6989   // of the sun.security.provider.ML_DSA class
 6990   //
 6991   // input (int[256]) = c_rarg0
 6992   // lowPart (int[256]) = c_rarg1
 6993   // highPart (int[256]) = c_rarg2
 6994   // twoGamma2  (int) = c_rarg3
 6995   // multiplier (int) = c_rarg4
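         //
         // The loop below is a vectorized transcription of the per-element
         // Java statements quoted in the comments inside the loop: each input
         // is first reduced mod q and then, roughly, split as
         // r = r1 * twoGamma2 + r0 with -gamma2 < r0 <= gamma2 (the Decompose
         // step of FIPS 204), with the rplus - r0 == q - 1 boundary case
         // folded in.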
 6996   address generate_dilithiumDecomposePoly() {
 6997 
 6998     __ align(CodeEntryAlignment);
 6999     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7000     StubCodeMark mark(this, stub_id);
 7001     address start = __ pc();
 7002     Label L_loop;
 7003 
 7004     const Register input = c_rarg0;
 7005     const Register lowPart = c_rarg1;
 7006     const Register highPart = c_rarg2;
 7007     const Register twoGamma2 = c_rarg3;
 7008     const Register multiplier = c_rarg4;
 7009 
 7010     const Register len = r9;
 7011     const Register dilithiumConsts = r10;
 7012     const Register tmp = r11;
 7013 
 7014     // 6 independent sets of 4x4s values
 7015     VSeq<4> vs1(0), vs2(4), vs3(8);
 7016     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7017 
 7018     // 7 constants for cross-multiplying
 7019     VSeq<4> one(25, 0);
 7020     VSeq<4> qminus1(26, 0);
 7021     VSeq<4> g2(27, 0);
 7022     VSeq<4> twog2(28, 0);
 7023     VSeq<4> mult(29, 0);
 7024     VSeq<4> q(30, 0);
 7025     VSeq<4> qadd(31, 0);
 7026 
 7027     __ enter();
 7028 
 7029     __ lea(dilithiumConsts,
 7030              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7031 
 7032     // save callee-saved registers
 7033     __ stpd(v8, v9, __ pre(sp, -64));
 7034     __ stpd(v10, v11, Address(sp, 16));
 7035     __ stpd(v12, v13, Address(sp, 32));
 7036     __ stpd(v14, v15, Address(sp, 48));
 7037 
 7038     // populate constant registers
 7039     __ mov(tmp, zr);
 7040     __ add(tmp, tmp, 1);
 7041     __ dup(one[0], __ T4S, tmp); // 1
 7042     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7043     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7044     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7045     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7046     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7047     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7048 
 7049     __ mov(len, zr);
 7050     __ add(len, len, 1024);
 7051 
 7052     __ BIND(L_loop);
 7053 
 7054     // load next 4x4S inputs interleaved: rplus --> vs1
 7055     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7056 
 7057     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7058     vs_addv(vtmp, __ T4S, vs1, qadd);
 7059     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7060     vs_mulv(vtmp, __ T4S, vtmp, q);
 7061     vs_subv(vs1, __ T4S, vs1, vtmp);
 7062 
 7063     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7064     vs_sshr(vtmp, __ T4S, vs1, 31);
 7065     vs_andr(vtmp, vtmp, q);
 7066     vs_addv(vs1, __ T4S, vs1, vtmp);
 7067 
 7068     // quotient --> vs2
 7069     // int quotient = (rplus * multiplier) >> 22;
 7070     vs_mulv(vtmp, __ T4S, vs1, mult);
 7071     vs_sshr(vs2, __ T4S, vtmp, 22);
 7072 
 7073     // r0 --> vs3
 7074     // int r0 = rplus - quotient * twoGamma2;
 7075     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7076     vs_subv(vs3, __ T4S, vs1, vtmp);
 7077 
 7078     // mask --> vs4
 7079     // int mask = (twoGamma2 - r0) >> 22;
 7080     vs_subv(vtmp, __ T4S, twog2, vs3);
 7081     vs_sshr(vs4, __ T4S, vtmp, 22);
 7082 
 7083     // r0 -= (mask & twoGamma2);
 7084     vs_andr(vtmp, vs4, twog2);
 7085     vs_subv(vs3, __ T4S, vs3, vtmp);
 7086 
 7087     //  quotient += (mask & 1);
 7088     vs_andr(vtmp, vs4, one);
 7089     vs_addv(vs2, __ T4S, vs2, vtmp);
 7090 
 7091     // mask = (twoGamma2 / 2 - r0) >> 31;
 7092     vs_subv(vtmp, __ T4S, g2, vs3);
 7093     vs_sshr(vs4, __ T4S, vtmp, 31);
 7094 
 7095     // r0 -= (mask & twoGamma2);
 7096     vs_andr(vtmp, vs4, twog2);
 7097     vs_subv(vs3, __ T4S, vs3, vtmp);
 7098 
 7099     // quotient += (mask & 1);
 7100     vs_andr(vtmp, vs4, one);
 7101     vs_addv(vs2, __ T4S, vs2, vtmp);
 7102 
 7103     // r1 --> vs5
 7104     // int r1 = rplus - r0 - (dilithium_q - 1);
 7105     vs_subv(vtmp, __ T4S, vs1, vs3);
 7106     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7107 
 7108     // r1 --> vs1 (overwriting rplus)
 7109     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7110     vs_negr(vtmp, __ T4S, vs5);
 7111     vs_orr(vtmp, vs5, vtmp);
 7112     vs_sshr(vs1, __ T4S, vtmp, 31);
 7113 
 7114     // r0 += ~r1;
 7115     vs_notr(vtmp, vs1);
 7116     vs_addv(vs3, __ T4S, vs3, vtmp);
 7117 
 7118     // r1 = r1 & quotient;
 7119     vs_andr(vs1, vs2, vs1);
 7120 
  7121     // store results interleaved
 7122     // lowPart[m] = r0;
 7123     // highPart[m] = r1;
 7124     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7125     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7126 
 7127     __ sub(len, len, 64);
 7128     __ cmp(len, (u1)64);
 7129     __ br(Assembler::GE, L_loop);
 7130 
 7131     // restore callee-saved vector registers
 7132     __ ldpd(v14, v15, Address(sp, 48));
 7133     __ ldpd(v12, v13, Address(sp, 32));
 7134     __ ldpd(v10, v11, Address(sp, 16));
 7135     __ ldpd(v8, v9, __ post(sp, 64));
 7136 
 7137     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7138     __ mov(r0, zr); // return 0
 7139     __ ret(lr);
 7140 
 7141     return start;
 7142   }
 7143 
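         // Applies the chi step of a Keccak round to one 5-lane row held in
         // general purpose registers, i.e. for each lane index i (mod 5)
         //
         //   a[i] ^= ~a[i + 1] & a[i + 2]
         //
         // implemented with bic (and-not) plus eor, reading every input
         // before it is overwritten.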
 7144   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7145              Register tmp0, Register tmp1, Register tmp2) {
 7146     __ bic(tmp0, a2, a1); // for a0
 7147     __ bic(tmp1, a3, a2); // for a1
 7148     __ bic(tmp2, a4, a3); // for a2
 7149     __ eor(a2, a2, tmp2);
 7150     __ bic(tmp2, a0, a4); // for a3
 7151     __ eor(a3, a3, tmp2);
 7152     __ bic(tmp2, a1, a0); // for a4
 7153     __ eor(a0, a0, tmp0);
 7154     __ eor(a1, a1, tmp1);
 7155     __ eor(a4, a4, tmp2);
 7156   }
 7157 
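         // Performs one full Keccak-f[1600] round on a state held entirely in
         // general purpose registers: the eor3/rax1 block computes and applies
         // the theta column parities, the chain of rol instructions performs
         // rho and pi, bcax5 applies chi row by row, and the final eor with
         // the doubleword loaded (post-incremented) from rc applies iota.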
 7158   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7159                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7160                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7161                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7162                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7163                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7164                         Register tmp0, Register tmp1, Register tmp2) {
 7165     __ eor3(tmp1, a4, a9, a14);
 7166     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7167     __ eor3(tmp2, a1, a6, a11);
 7168     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7169     __ rax1(tmp2, tmp0, tmp1); // d0
 7170     {
 7171 
 7172       Register tmp3, tmp4;
 7173       if (can_use_fp && can_use_r18) {
 7174         tmp3 = rfp;
 7175         tmp4 = r18_tls;
 7176       } else {
 7177         tmp3 = a4;
 7178         tmp4 = a9;
 7179         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7180       }
 7181 
 7182       __ eor3(tmp3, a0, a5, a10);
 7183       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7184       __ eor(a0, a0, tmp2);
 7185       __ eor(a5, a5, tmp2);
 7186       __ eor(a10, a10, tmp2);
 7187       __ eor(a15, a15, tmp2);
 7188       __ eor(a20, a20, tmp2); // d0(tmp2)
 7189       __ eor3(tmp3, a2, a7, a12);
 7190       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7191       __ rax1(tmp3, tmp4, tmp2); // d1
 7192       __ eor(a1, a1, tmp3);
 7193       __ eor(a6, a6, tmp3);
 7194       __ eor(a11, a11, tmp3);
 7195       __ eor(a16, a16, tmp3);
 7196       __ eor(a21, a21, tmp3); // d1(tmp3)
 7197       __ rax1(tmp3, tmp2, tmp0); // d3
 7198       __ eor3(tmp2, a3, a8, a13);
 7199       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7200       __ eor(a3, a3, tmp3);
 7201       __ eor(a8, a8, tmp3);
 7202       __ eor(a13, a13, tmp3);
 7203       __ eor(a18, a18, tmp3);
 7204       __ eor(a23, a23, tmp3);
 7205       __ rax1(tmp2, tmp1, tmp0); // d2
 7206       __ eor(a2, a2, tmp2);
 7207       __ eor(a7, a7, tmp2);
 7208       __ eor(a12, a12, tmp2);
 7209       __ rax1(tmp0, tmp0, tmp4); // d4
 7210       if (!can_use_fp || !can_use_r18) {
 7211         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7212       }
 7213       __ eor(a17, a17, tmp2);
 7214       __ eor(a22, a22, tmp2);
 7215       __ eor(a4, a4, tmp0);
 7216       __ eor(a9, a9, tmp0);
 7217       __ eor(a14, a14, tmp0);
 7218       __ eor(a19, a19, tmp0);
 7219       __ eor(a24, a24, tmp0);
 7220     }
 7221 
 7222     __ rol(tmp0, a10, 3);
 7223     __ rol(a10, a1, 1);
 7224     __ rol(a1, a6, 44);
 7225     __ rol(a6, a9, 20);
 7226     __ rol(a9, a22, 61);
 7227     __ rol(a22, a14, 39);
 7228     __ rol(a14, a20, 18);
 7229     __ rol(a20, a2, 62);
 7230     __ rol(a2, a12, 43);
 7231     __ rol(a12, a13, 25);
 7232     __ rol(a13, a19, 8) ;
 7233     __ rol(a19, a23, 56);
 7234     __ rol(a23, a15, 41);
 7235     __ rol(a15, a4, 27);
 7236     __ rol(a4, a24, 14);
 7237     __ rol(a24, a21, 2);
 7238     __ rol(a21, a8, 55);
 7239     __ rol(a8, a16, 45);
 7240     __ rol(a16, a5, 36);
 7241     __ rol(a5, a3, 28);
 7242     __ rol(a3, a18, 21);
 7243     __ rol(a18, a17, 15);
 7244     __ rol(a17, a11, 10);
 7245     __ rol(a11, a7, 6);
 7246     __ mov(a7, tmp0);
 7247 
 7248     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7249     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7250     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7251     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7252     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7253 
 7254     __ ldr(tmp1, __ post(rc, 8));
 7255     __ eor(a0, a0, tmp1);
 7256 
 7257   }
 7258 
 7259   // Arguments:
 7260   //
 7261   // Inputs:
 7262   //   c_rarg0   - byte[]  source+offset
 7263   //   c_rarg1   - byte[]  SHA.state
 7264   //   c_rarg2   - int     block_size
 7265   //   c_rarg3   - int     offset
 7266   //   c_rarg4   - int     limit
 7267   //
 7268   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7269     bool multi_block;
 7270     switch (stub_id) {
 7271     case StubId::stubgen_sha3_implCompress_id:
 7272       multi_block = false;
 7273       break;
 7274     case StubId::stubgen_sha3_implCompressMB_id:
 7275       multi_block = true;
 7276       break;
 7277     default:
 7278       ShouldNotReachHere();
 7279     }
 7280 
 7281     static const uint64_t round_consts[24] = {
 7282       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7283       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7284       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7285       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7286       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7287       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7288       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7289       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7290     };
 7291 
 7292     __ align(CodeEntryAlignment);
 7293     StubCodeMark mark(this, stub_id);
 7294     address start = __ pc();
 7295 
 7296     Register buf           = c_rarg0;
 7297     Register state         = c_rarg1;
 7298     Register block_size    = c_rarg2;
 7299     Register ofs           = c_rarg3;
 7300     Register limit         = c_rarg4;
 7301 
  7302     // use r3..r17, r19..r28 to keep a0..a24.
 7303     // a0..a24 are respective locals from SHA3.java
 7304     Register a0 = r25,
 7305              a1 = r26,
 7306              a2 = r27,
 7307              a3 = r3,
 7308              a4 = r4,
 7309              a5 = r5,
 7310              a6 = r6,
 7311              a7 = r7,
 7312              a8 = rscratch1, // r8
 7313              a9 = rscratch2, // r9
 7314              a10 = r10,
 7315              a11 = r11,
 7316              a12 = r12,
 7317              a13 = r13,
 7318              a14 = r14,
 7319              a15 = r15,
 7320              a16 = r16,
 7321              a17 = r17,
 7322              a18 = r28,
 7323              a19 = r19,
 7324              a20 = r20,
 7325              a21 = r21,
 7326              a22 = r22,
 7327              a23 = r23,
 7328              a24 = r24;
 7329 
 7330     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7331 
 7332     Label sha3_loop, rounds24_preloop, loop_body;
 7333     Label sha3_512_or_sha3_384, shake128;
 7334 
 7335     bool can_use_r18 = false;
 7336 #ifndef R18_RESERVED
 7337     can_use_r18 = true;
 7338 #endif
 7339     bool can_use_fp = !PreserveFramePointer;
 7340 
 7341     __ enter();
 7342 
 7343     // save almost all yet unsaved gpr registers on stack
 7344     __ str(block_size, __ pre(sp, -128));
 7345     if (multi_block) {
 7346       __ stpw(ofs, limit, Address(sp, 8));
 7347     }
 7348     // 8 bytes at sp+16 will be used to keep buf
 7349     __ stp(r19, r20, Address(sp, 32));
 7350     __ stp(r21, r22, Address(sp, 48));
 7351     __ stp(r23, r24, Address(sp, 64));
 7352     __ stp(r25, r26, Address(sp, 80));
 7353     __ stp(r27, r28, Address(sp, 96));
 7354     if (can_use_r18 && can_use_fp) {
 7355       __ stp(r18_tls, state, Address(sp, 112));
 7356     } else {
 7357       __ str(state, Address(sp, 112));
 7358     }
 7359 
  7360     // begin sha3 calculations: loading a0..a24 from state array
 7361     __ ldp(a0, a1, state);
 7362     __ ldp(a2, a3, Address(state, 16));
 7363     __ ldp(a4, a5, Address(state, 32));
 7364     __ ldp(a6, a7, Address(state, 48));
 7365     __ ldp(a8, a9, Address(state, 64));
 7366     __ ldp(a10, a11, Address(state, 80));
 7367     __ ldp(a12, a13, Address(state, 96));
 7368     __ ldp(a14, a15, Address(state, 112));
 7369     __ ldp(a16, a17, Address(state, 128));
 7370     __ ldp(a18, a19, Address(state, 144));
 7371     __ ldp(a20, a21, Address(state, 160));
 7372     __ ldp(a22, a23, Address(state, 176));
 7373     __ ldr(a24, Address(state, 192));
 7374 
 7375     __ BIND(sha3_loop);
 7376 
 7377     // load input
 7378     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7379     __ eor(a0, a0, tmp3);
 7380     __ eor(a1, a1, tmp2);
 7381     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7382     __ eor(a2, a2, tmp3);
 7383     __ eor(a3, a3, tmp2);
 7384     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7385     __ eor(a4, a4, tmp3);
 7386     __ eor(a5, a5, tmp2);
 7387     __ ldr(tmp3, __ post(buf, 8));
 7388     __ eor(a6, a6, tmp3);
 7389 
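
           // The rate (block_size, in bytes) determines how many further state
           // lanes absorb input: 72 -> SHA3-512, 104 -> SHA3-384,
           // 136 -> SHA3-256/SHAKE256, 144 -> SHA3-224, 168 -> SHAKE128.
           // The bit tests below dispatch on those five values.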
 7390     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7391     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7392 
 7393     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7394     __ eor(a7, a7, tmp3);
 7395     __ eor(a8, a8, tmp2);
 7396     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7397     __ eor(a9, a9, tmp3);
 7398     __ eor(a10, a10, tmp2);
 7399     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7400     __ eor(a11, a11, tmp3);
 7401     __ eor(a12, a12, tmp2);
 7402     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7403     __ eor(a13, a13, tmp3);
 7404     __ eor(a14, a14, tmp2);
 7405     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7406     __ eor(a15, a15, tmp3);
 7407     __ eor(a16, a16, tmp2);
 7408 
 7409     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7410     __ andw(tmp2, block_size, 48);
 7411     __ cbzw(tmp2, rounds24_preloop);
 7412     __ tbnz(block_size, 5, shake128);
  7413     // block_size == 144, bit5 == 0, SHA3-224
 7414     __ ldr(tmp3, __ post(buf, 8));
 7415     __ eor(a17, a17, tmp3);
 7416     __ b(rounds24_preloop);
 7417 
 7418     __ BIND(shake128);
 7419     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7420     __ eor(a17, a17, tmp3);
 7421     __ eor(a18, a18, tmp2);
 7422     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7423     __ eor(a19, a19, tmp3);
 7424     __ eor(a20, a20, tmp2);
 7425     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7426 
 7427     __ BIND(sha3_512_or_sha3_384);
 7428     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7429     __ eor(a7, a7, tmp3);
 7430     __ eor(a8, a8, tmp2);
 7431     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7432 
 7433     // SHA3-384
 7434     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7435     __ eor(a9, a9, tmp3);
 7436     __ eor(a10, a10, tmp2);
 7437     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7438     __ eor(a11, a11, tmp3);
 7439     __ eor(a12, a12, tmp2);
 7440 
 7441     __ BIND(rounds24_preloop);
 7442     __ fmovs(v0, 24.0); // float loop counter,
 7443     __ fmovs(v1, 1.0);  // exact representation
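           // (essentially all general purpose registers are tied up holding the
           // state and temporaries, so the round counter lives in an FP
           // register; small integers such as 24 are exactly representable)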
 7444 
 7445     __ str(buf, Address(sp, 16));
 7446     __ lea(tmp3, ExternalAddress((address) round_consts));
 7447 
 7448     __ BIND(loop_body);
 7449     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7450                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7451                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7452                      tmp0, tmp1, tmp2);
 7453     __ fsubs(v0, v0, v1);
 7454     __ fcmps(v0, 0.0);
 7455     __ br(__ NE, loop_body);
 7456 
 7457     if (multi_block) {
 7458       __ ldrw(block_size, sp); // block_size
 7459       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7460       __ addw(tmp2, tmp2, block_size);
 7461       __ cmpw(tmp2, tmp1);
 7462       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7463       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7464       __ br(Assembler::LE, sha3_loop);
 7465       __ movw(c_rarg0, tmp2); // return offset
 7466     }
 7467     if (can_use_fp && can_use_r18) {
 7468       __ ldp(r18_tls, state, Address(sp, 112));
 7469     } else {
 7470       __ ldr(state, Address(sp, 112));
 7471     }
 7472     // save calculated sha3 state
 7473     __ stp(a0, a1, Address(state));
 7474     __ stp(a2, a3, Address(state, 16));
 7475     __ stp(a4, a5, Address(state, 32));
 7476     __ stp(a6, a7, Address(state, 48));
 7477     __ stp(a8, a9, Address(state, 64));
 7478     __ stp(a10, a11, Address(state, 80));
 7479     __ stp(a12, a13, Address(state, 96));
 7480     __ stp(a14, a15, Address(state, 112));
 7481     __ stp(a16, a17, Address(state, 128));
 7482     __ stp(a18, a19, Address(state, 144));
 7483     __ stp(a20, a21, Address(state, 160));
 7484     __ stp(a22, a23, Address(state, 176));
 7485     __ str(a24, Address(state, 192));
 7486 
 7487     // restore required registers from stack
 7488     __ ldp(r19, r20, Address(sp, 32));
 7489     __ ldp(r21, r22, Address(sp, 48));
 7490     __ ldp(r23, r24, Address(sp, 64));
 7491     __ ldp(r25, r26, Address(sp, 80));
 7492     __ ldp(r27, r28, Address(sp, 96));
 7493     if (can_use_fp && can_use_r18) {
 7494       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7495     } // else no need to recalculate rfp, since it wasn't changed
 7496 
 7497     __ leave();
 7498 
 7499     __ ret(lr);
 7500 
 7501     return start;
 7502   }
 7503 
 7504   /**
 7505    *  Arguments:
 7506    *
 7507    * Inputs:
 7508    *   c_rarg0   - int crc
 7509    *   c_rarg1   - byte* buf
 7510    *   c_rarg2   - int length
 7511    *
 7512    * Output:
  7513    *       r0   - int crc result
 7514    */
 7515   address generate_updateBytesCRC32() {
 7516     assert(UseCRC32Intrinsics, "what are we doing here?");
 7517 
 7518     __ align(CodeEntryAlignment);
 7519     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7520     StubCodeMark mark(this, stub_id);
 7521 
 7522     address start = __ pc();
 7523 
 7524     const Register crc   = c_rarg0;  // crc
 7525     const Register buf   = c_rarg1;  // source java byte array address
 7526     const Register len   = c_rarg2;  // length
 7527     const Register table0 = c_rarg3; // crc_table address
 7528     const Register table1 = c_rarg4;
 7529     const Register table2 = c_rarg5;
 7530     const Register table3 = c_rarg6;
 7531     const Register tmp3 = c_rarg7;
 7532 
 7533     BLOCK_COMMENT("Entry:");
 7534     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7535 
 7536     __ kernel_crc32(crc, buf, len,
 7537               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7538 
 7539     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7540     __ ret(lr);
 7541 
 7542     return start;
 7543   }
 7544 
 7545   /**
 7546    *  Arguments:
 7547    *
 7548    * Inputs:
 7549    *   c_rarg0   - int crc
 7550    *   c_rarg1   - byte* buf
 7551    *   c_rarg2   - int length
 7552    *   c_rarg3   - int* table
 7553    *
 7554    * Output:
 7555    *       r0   - int crc result
 7556    */
 7557   address generate_updateBytesCRC32C() {
 7558     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7559 
 7560     __ align(CodeEntryAlignment);
 7561     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7562     StubCodeMark mark(this, stub_id);
 7563 
 7564     address start = __ pc();
 7565 
 7566     const Register crc   = c_rarg0;  // crc
 7567     const Register buf   = c_rarg1;  // source java byte array address
 7568     const Register len   = c_rarg2;  // length
 7569     const Register table0 = c_rarg3; // crc_table address
 7570     const Register table1 = c_rarg4;
 7571     const Register table2 = c_rarg5;
 7572     const Register table3 = c_rarg6;
 7573     const Register tmp3 = c_rarg7;
 7574 
 7575     BLOCK_COMMENT("Entry:");
 7576     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7577 
 7578     __ kernel_crc32c(crc, buf, len,
 7579               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7580 
 7581     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7582     __ ret(lr);
 7583 
 7584     return start;
 7585   }
 7586 
  7587   /**
 7588    *  Arguments:
 7589    *
 7590    *  Inputs:
 7591    *   c_rarg0   - int   adler
 7592    *   c_rarg1   - byte* buff
 7593    *   c_rarg2   - int   len
 7594    *
 7595    * Output:
 7596    *   c_rarg0   - int adler result
 7597    */
 7598   address generate_updateBytesAdler32() {
 7599     __ align(CodeEntryAlignment);
 7600     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7601     StubCodeMark mark(this, stub_id);
 7602     address start = __ pc();
 7603 
 7604     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7605 
 7606     // Aliases
 7607     Register adler  = c_rarg0;
 7608     Register s1     = c_rarg0;
 7609     Register s2     = c_rarg3;
 7610     Register buff   = c_rarg1;
 7611     Register len    = c_rarg2;
 7612     Register nmax  = r4;
 7613     Register base  = r5;
 7614     Register count = r6;
 7615     Register temp0 = rscratch1;
 7616     Register temp1 = rscratch2;
 7617     FloatRegister vbytes = v0;
 7618     FloatRegister vs1acc = v1;
 7619     FloatRegister vs2acc = v2;
 7620     FloatRegister vtable = v3;
 7621 
 7622     // Max number of bytes we can process before having to take the mod
 7623     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7624     uint64_t BASE = 0xfff1;
 7625     uint64_t NMAX = 0x15B0;
 7626 
 7627     __ mov(base, BASE);
 7628     __ mov(nmax, NMAX);
 7629 
 7630     // Load accumulation coefficients for the upper 16 bits
 7631     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7632     __ ld1(vtable, __ T16B, Address(temp0));
 7633 
 7634     // s1 is initialized to the lower 16 bits of adler
 7635     // s2 is initialized to the upper 16 bits of adler
 7636     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7637     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7638 
 7639     // The pipelined loop needs at least 16 elements for 1 iteration
 7640     // It does check this, but it is more effective to skip to the cleanup loop
 7641     __ cmp(len, (u1)16);
 7642     __ br(Assembler::HS, L_nmax);
 7643     __ cbz(len, L_combine);
 7644 
 7645     __ bind(L_simple_by1_loop);
 7646     __ ldrb(temp0, Address(__ post(buff, 1)));
 7647     __ add(s1, s1, temp0);
 7648     __ add(s2, s2, s1);
 7649     __ subs(len, len, 1);
 7650     __ br(Assembler::HI, L_simple_by1_loop);
 7651 
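           // The reductions mod BASE below use the fact that 2^16 mod 65521 == 15:
           //
           //   x == (x >> 16) * 15 + (x & 0xffff)                       (mod BASE)
           //     == ((x >> 16) << 4) - (x >> 16) + (x & 0xffff)         (mod BASE)
           //
           // The fold is applied twice where the intermediate sum may still
           // exceed 16 bits, and is finished with a conditional subtraction
           // of BASE.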
 7652     // s1 = s1 % BASE
 7653     __ subs(temp0, s1, base);
 7654     __ csel(s1, temp0, s1, Assembler::HS);
 7655 
 7656     // s2 = s2 % BASE
 7657     __ lsr(temp0, s2, 16);
 7658     __ lsl(temp1, temp0, 4);
 7659     __ sub(temp1, temp1, temp0);
 7660     __ add(s2, temp1, s2, ext::uxth);
 7661 
 7662     __ subs(temp0, s2, base);
 7663     __ csel(s2, temp0, s2, Assembler::HS);
 7664 
 7665     __ b(L_combine);
 7666 
 7667     __ bind(L_nmax);
 7668     __ subs(len, len, nmax);
 7669     __ sub(count, nmax, 16);
 7670     __ br(Assembler::LO, L_by16);
 7671 
 7672     __ bind(L_nmax_loop);
 7673 
 7674     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7675                                       vbytes, vs1acc, vs2acc, vtable);
 7676 
 7677     __ subs(count, count, 16);
 7678     __ br(Assembler::HS, L_nmax_loop);
 7679 
 7680     // s1 = s1 % BASE
 7681     __ lsr(temp0, s1, 16);
 7682     __ lsl(temp1, temp0, 4);
 7683     __ sub(temp1, temp1, temp0);
 7684     __ add(temp1, temp1, s1, ext::uxth);
 7685 
 7686     __ lsr(temp0, temp1, 16);
 7687     __ lsl(s1, temp0, 4);
 7688     __ sub(s1, s1, temp0);
  7689     __ add(s1, s1, temp1, ext::uxth);
 7690 
 7691     __ subs(temp0, s1, base);
 7692     __ csel(s1, temp0, s1, Assembler::HS);
 7693 
 7694     // s2 = s2 % BASE
 7695     __ lsr(temp0, s2, 16);
 7696     __ lsl(temp1, temp0, 4);
 7697     __ sub(temp1, temp1, temp0);
 7698     __ add(temp1, temp1, s2, ext::uxth);
 7699 
 7700     __ lsr(temp0, temp1, 16);
 7701     __ lsl(s2, temp0, 4);
 7702     __ sub(s2, s2, temp0);
  7703     __ add(s2, s2, temp1, ext::uxth);
 7704 
 7705     __ subs(temp0, s2, base);
 7706     __ csel(s2, temp0, s2, Assembler::HS);
 7707 
 7708     __ subs(len, len, nmax);
 7709     __ sub(count, nmax, 16);
 7710     __ br(Assembler::HS, L_nmax_loop);
 7711 
 7712     __ bind(L_by16);
 7713     __ adds(len, len, count);
 7714     __ br(Assembler::LO, L_by1);
 7715 
 7716     __ bind(L_by16_loop);
 7717 
 7718     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7719                                       vbytes, vs1acc, vs2acc, vtable);
 7720 
 7721     __ subs(len, len, 16);
 7722     __ br(Assembler::HS, L_by16_loop);
 7723 
 7724     __ bind(L_by1);
 7725     __ adds(len, len, 15);
 7726     __ br(Assembler::LO, L_do_mod);
 7727 
 7728     __ bind(L_by1_loop);
 7729     __ ldrb(temp0, Address(__ post(buff, 1)));
 7730     __ add(s1, temp0, s1);
 7731     __ add(s2, s2, s1);
 7732     __ subs(len, len, 1);
 7733     __ br(Assembler::HS, L_by1_loop);
 7734 
 7735     __ bind(L_do_mod);
 7736     // s1 = s1 % BASE
 7737     __ lsr(temp0, s1, 16);
 7738     __ lsl(temp1, temp0, 4);
 7739     __ sub(temp1, temp1, temp0);
 7740     __ add(temp1, temp1, s1, ext::uxth);
 7741 
 7742     __ lsr(temp0, temp1, 16);
 7743     __ lsl(s1, temp0, 4);
 7744     __ sub(s1, s1, temp0);
  7745     __ add(s1, s1, temp1, ext::uxth);
 7746 
 7747     __ subs(temp0, s1, base);
 7748     __ csel(s1, temp0, s1, Assembler::HS);
 7749 
 7750     // s2 = s2 % BASE
 7751     __ lsr(temp0, s2, 16);
 7752     __ lsl(temp1, temp0, 4);
 7753     __ sub(temp1, temp1, temp0);
 7754     __ add(temp1, temp1, s2, ext::uxth);
 7755 
 7756     __ lsr(temp0, temp1, 16);
 7757     __ lsl(s2, temp0, 4);
 7758     __ sub(s2, s2, temp0);
  7759     __ add(s2, s2, temp1, ext::uxth);
 7760 
 7761     __ subs(temp0, s2, base);
 7762     __ csel(s2, temp0, s2, Assembler::HS);
 7763 
 7764     // Combine lower bits and higher bits
 7765     __ bind(L_combine);
 7766     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7767 
 7768     __ ret(lr);
 7769 
 7770     return start;
 7771   }
 7772 
 7773   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7774           Register temp0, Register temp1, FloatRegister vbytes,
 7775           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7776     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7777     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7778     // In non-vectorized code, we update s1 and s2 as:
 7779     //   s1 <- s1 + b1
 7780     //   s2 <- s2 + s1
 7781     //   s1 <- s1 + b2
 7782     //   s2 <- s2 + b1
 7783     //   ...
 7784     //   s1 <- s1 + b16
 7785     //   s2 <- s2 + s1
 7786     // Putting above assignments together, we have:
 7787     //   s1_new = s1 + b1 + b2 + ... + b16
 7788     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7789     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7790     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7791     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7792 
 7793     // s2 = s2 + s1 * 16
 7794     __ add(s2, s2, s1, Assembler::LSL, 4);
 7795 
 7796     // vs1acc = b1 + b2 + b3 + ... + b16
 7797     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7798     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7799     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7800     __ uaddlv(vs1acc, __ T16B, vbytes);
 7801     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7802 
 7803     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7804     __ fmovd(temp0, vs1acc);
 7805     __ fmovd(temp1, vs2acc);
 7806     __ add(s1, s1, temp0);
 7807     __ add(s2, s2, temp1);
 7808   }
 7809 
 7810   /**
 7811    *  Arguments:
 7812    *
 7813    *  Input:
 7814    *    c_rarg0   - x address
 7815    *    c_rarg1   - x length
 7816    *    c_rarg2   - y address
 7817    *    c_rarg3   - y length
 7818    *    c_rarg4   - z address
 7819    */
 7820   address generate_multiplyToLen() {
 7821     __ align(CodeEntryAlignment);
 7822     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7823     StubCodeMark mark(this, stub_id);
 7824 
 7825     address start = __ pc();
 7826  
 7827     if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
 7828       return start;
 7829     }
 7830     const Register x     = r0;
 7831     const Register xlen  = r1;
 7832     const Register y     = r2;
 7833     const Register ylen  = r3;
 7834     const Register z     = r4;
 7835 
 7836     const Register tmp0  = r5;
 7837     const Register tmp1  = r10;
 7838     const Register tmp2  = r11;
 7839     const Register tmp3  = r12;
 7840     const Register tmp4  = r13;
 7841     const Register tmp5  = r14;
 7842     const Register tmp6  = r15;
 7843     const Register tmp7  = r16;
 7844 
 7845     BLOCK_COMMENT("Entry:");
 7846     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7847     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7848     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7849     __ ret(lr);
 7850 
 7851     AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
 7852     return start;
 7853   }
 7854 
 7855   address generate_squareToLen() {
  7856     // The squareToLen algorithm for sizes 1..127, as described in the Java
  7857     // code, is faster than multiply_to_len on some CPUs and slower on others,
  7858     // but multiply_to_len shows slightly better results overall.
 7859     __ align(CodeEntryAlignment);
 7860     StubId stub_id = StubId::stubgen_squareToLen_id;
 7861     StubCodeMark mark(this, stub_id);
 7862     address start = __ pc();
 7863 
 7864     if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
 7865       return start;
 7866     }
 7867     const Register x     = r0;
 7868     const Register xlen  = r1;
 7869     const Register z     = r2;
 7870     const Register y     = r4; // == x
 7871     const Register ylen  = r5; // == xlen
 7872 
 7873     const Register tmp0  = r3;
 7874     const Register tmp1  = r10;
 7875     const Register tmp2  = r11;
 7876     const Register tmp3  = r12;
 7877     const Register tmp4  = r13;
 7878     const Register tmp5  = r14;
 7879     const Register tmp6  = r15;
 7880     const Register tmp7  = r16;
 7881 
 7882     RegSet spilled_regs = RegSet::of(y, ylen);
 7883     BLOCK_COMMENT("Entry:");
 7884     __ enter();
 7885     __ push(spilled_regs, sp);
 7886     __ mov(y, x);
 7887     __ mov(ylen, xlen);
 7888     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7889     __ pop(spilled_regs, sp);
 7890     __ leave();
 7891     __ ret(lr);
 7892 
 7893     AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
 7894     return start;
 7895   }
 7896 
 7897   address generate_mulAdd() {
 7898     __ align(CodeEntryAlignment);
 7899     StubId stub_id = StubId::stubgen_mulAdd_id;
 7900     StubCodeMark mark(this, stub_id);
 7901 
 7902     address start = __ pc();
 7903 
 7904     if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
 7905       return start;
 7906     }
 7907     const Register out     = r0;
 7908     const Register in      = r1;
 7909     const Register offset  = r2;
 7910     const Register len     = r3;
 7911     const Register k       = r4;
 7912 
 7913     BLOCK_COMMENT("Entry:");
 7914     __ enter();
 7915     __ mul_add(out, in, offset, len, k);
 7916     __ leave();
 7917     __ ret(lr);
 7918 
 7919     AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
 7920     return start;
 7921   }
 7922 
 7923   // Arguments:
 7924   //
 7925   // Input:
 7926   //   c_rarg0   - newArr address
 7927   //   c_rarg1   - oldArr address
 7928   //   c_rarg2   - newIdx
 7929   //   c_rarg3   - shiftCount
 7930   //   c_rarg4   - numIter
 7931   //
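         // The stub is, roughly, a vector/scalar rendering of the per-word loop
         //
         //   for (int i = 0; i < numIter; i++) {
         //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
         //                        | (oldArr[i] << (32 - shiftCount));
         //   }
         //
         // processed from the top index downwards in 4-word SIMD chunks with a
         // short scalar tail.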
 7932   address generate_bigIntegerRightShift() {
 7933     __ align(CodeEntryAlignment);
 7934     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7935     StubCodeMark mark(this, stub_id);
 7936     address start = __ pc();
 7937 
 7938     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7939 
 7940     Register newArr        = c_rarg0;
 7941     Register oldArr        = c_rarg1;
 7942     Register newIdx        = c_rarg2;
 7943     Register shiftCount    = c_rarg3;
 7944     Register numIter       = c_rarg4;
 7945     Register idx           = numIter;
 7946 
 7947     Register newArrCur     = rscratch1;
 7948     Register shiftRevCount = rscratch2;
 7949     Register oldArrCur     = r13;
 7950     Register oldArrNext    = r14;
 7951 
 7952     FloatRegister oldElem0        = v0;
 7953     FloatRegister oldElem1        = v1;
 7954     FloatRegister newElem         = v2;
 7955     FloatRegister shiftVCount     = v3;
 7956     FloatRegister shiftVRevCount  = v4;
 7957 
 7958     __ cbz(idx, Exit);
 7959 
 7960     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7961 
 7962     // left shift count
 7963     __ movw(shiftRevCount, 32);
 7964     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7965 
  7966     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7967     __ cmp(numIter, (u1)4);
 7968     __ br(Assembler::LT, ShiftThree);
 7969 
 7970     __ dup(shiftVCount,    __ T4S, shiftCount);
 7971     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7972     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7973 
 7974     __ BIND(ShiftSIMDLoop);
 7975 
 7976     // Calculate the load addresses
 7977     __ sub(idx, idx, 4);
 7978     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7979     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7980     __ add(oldArrCur,  oldArrNext, 4);
 7981 
 7982     // Load 4 words and process
 7983     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7984     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7985     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7986     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7987     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7988     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7989 
 7990     __ cmp(idx, (u1)4);
 7991     __ br(Assembler::LT, ShiftTwoLoop);
 7992     __ b(ShiftSIMDLoop);
 7993 
 7994     __ BIND(ShiftTwoLoop);
 7995     __ cbz(idx, Exit);
 7996     __ cmp(idx, (u1)1);
 7997     __ br(Assembler::EQ, ShiftOne);
 7998 
 7999     // Calculate the load addresses
 8000     __ sub(idx, idx, 2);
 8001     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8002     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8003     __ add(oldArrCur,  oldArrNext, 4);
 8004 
 8005     // Load 2 words and process
 8006     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8007     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8008     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8009     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8010     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8011     __ st1(newElem,   __ T2S, Address(newArrCur));
 8012     __ b(ShiftTwoLoop);
 8013 
 8014     __ BIND(ShiftThree);
 8015     __ tbz(idx, 1, ShiftOne);
 8016     __ tbz(idx, 0, ShiftTwo);
 8017     __ ldrw(r10,  Address(oldArr, 12));
 8018     __ ldrw(r11,  Address(oldArr, 8));
 8019     __ lsrvw(r10, r10, shiftCount);
 8020     __ lslvw(r11, r11, shiftRevCount);
 8021     __ orrw(r12,  r10, r11);
 8022     __ strw(r12,  Address(newArr, 8));
 8023 
 8024     __ BIND(ShiftTwo);
 8025     __ ldrw(r10,  Address(oldArr, 8));
 8026     __ ldrw(r11,  Address(oldArr, 4));
 8027     __ lsrvw(r10, r10, shiftCount);
 8028     __ lslvw(r11, r11, shiftRevCount);
 8029     __ orrw(r12,  r10, r11);
 8030     __ strw(r12,  Address(newArr, 4));
 8031 
 8032     __ BIND(ShiftOne);
 8033     __ ldrw(r10,  Address(oldArr, 4));
 8034     __ ldrw(r11,  Address(oldArr));
 8035     __ lsrvw(r10, r10, shiftCount);
 8036     __ lslvw(r11, r11, shiftRevCount);
 8037     __ orrw(r12,  r10, r11);
 8038     __ strw(r12,  Address(newArr));
 8039 
 8040     __ BIND(Exit);
 8041     __ ret(lr);
 8042 
 8043     return start;
 8044   }
 8045 
 8046   // Arguments:
 8047   //
 8048   // Input:
 8049   //   c_rarg0   - newArr address
 8050   //   c_rarg1   - oldArr address
 8051   //   c_rarg2   - newIdx
 8052   //   c_rarg3   - shiftCount
 8053   //   c_rarg4   - numIter
 8054   //
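         // The left-shift counterpart of the stub above; roughly
         //
         //   for (int i = 0; i < numIter; i++) {
         //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
         //                        | (oldArr[i + 1] >>> (32 - shiftCount));
         //   }
         //
         // processed from index 0 upwards in 4-word SIMD chunks with a short
         // scalar tail.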
 8055   address generate_bigIntegerLeftShift() {
 8056     __ align(CodeEntryAlignment);
 8057     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8058     StubCodeMark mark(this, stub_id);
 8059     address start = __ pc();
 8060 
 8061     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8062 
 8063     Register newArr        = c_rarg0;
 8064     Register oldArr        = c_rarg1;
 8065     Register newIdx        = c_rarg2;
 8066     Register shiftCount    = c_rarg3;
 8067     Register numIter       = c_rarg4;
 8068 
 8069     Register shiftRevCount = rscratch1;
 8070     Register oldArrNext    = rscratch2;
 8071 
 8072     FloatRegister oldElem0        = v0;
 8073     FloatRegister oldElem1        = v1;
 8074     FloatRegister newElem         = v2;
 8075     FloatRegister shiftVCount     = v3;
 8076     FloatRegister shiftVRevCount  = v4;
 8077 
 8078     __ cbz(numIter, Exit);
 8079 
 8080     __ add(oldArrNext, oldArr, 4);
 8081     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8082 
 8083     // right shift count
 8084     __ movw(shiftRevCount, 32);
 8085     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8086 
  8087     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 8088     __ cmp(numIter, (u1)4);
 8089     __ br(Assembler::LT, ShiftThree);
 8090 
 8091     __ dup(shiftVCount,     __ T4S, shiftCount);
 8092     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8093     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8094 
 8095     __ BIND(ShiftSIMDLoop);
 8096 
 8097     // load 4 words and process
 8098     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8099     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8100     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8101     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8102     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8103     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8104     __ sub(numIter,   numIter, 4);
 8105 
 8106     __ cmp(numIter, (u1)4);
 8107     __ br(Assembler::LT, ShiftTwoLoop);
 8108     __ b(ShiftSIMDLoop);
 8109 
 8110     __ BIND(ShiftTwoLoop);
 8111     __ cbz(numIter, Exit);
 8112     __ cmp(numIter, (u1)1);
 8113     __ br(Assembler::EQ, ShiftOne);
 8114 
 8115     // load 2 words and process
 8116     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8117     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8118     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8119     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8120     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8121     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8122     __ sub(numIter,   numIter, 2);
 8123     __ b(ShiftTwoLoop);
 8124 
 8125     __ BIND(ShiftThree);
 8126     __ ldrw(r10,  __ post(oldArr, 4));
 8127     __ ldrw(r11,  __ post(oldArrNext, 4));
 8128     __ lslvw(r10, r10, shiftCount);
 8129     __ lsrvw(r11, r11, shiftRevCount);
 8130     __ orrw(r12,  r10, r11);
 8131     __ strw(r12,  __ post(newArr, 4));
 8132     __ tbz(numIter, 1, Exit);
 8133     __ tbz(numIter, 0, ShiftOne);
 8134 
 8135     __ BIND(ShiftTwo);
 8136     __ ldrw(r10,  __ post(oldArr, 4));
 8137     __ ldrw(r11,  __ post(oldArrNext, 4));
 8138     __ lslvw(r10, r10, shiftCount);
 8139     __ lsrvw(r11, r11, shiftRevCount);
 8140     __ orrw(r12,  r10, r11);
 8141     __ strw(r12,  __ post(newArr, 4));
 8142 
 8143     __ BIND(ShiftOne);
 8144     __ ldrw(r10,  Address(oldArr));
 8145     __ ldrw(r11,  Address(oldArrNext));
 8146     __ lslvw(r10, r10, shiftCount);
 8147     __ lsrvw(r11, r11, shiftRevCount);
 8148     __ orrw(r12,  r10, r11);
 8149     __ strw(r12,  Address(newArr));
 8150 
 8151     __ BIND(Exit);
 8152     __ ret(lr);
 8153 
 8154     return start;
 8155   }
 8156 
 8157   address generate_count_positives(address &count_positives_long) {
 8158     const u1 large_loop_size = 64;
 8159     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8160     int dcache_line = VM_Version::dcache_line_size();
 8161 
 8162     Register ary1 = r1, len = r2, result = r0;
 8163 
 8164     __ align(CodeEntryAlignment);
 8165 
 8166     StubId stub_id = StubId::stubgen_count_positives_id;
 8167     StubCodeMark mark(this, stub_id);
 8168 
 8169     address entry = __ pc();
 8170 
 8171     __ enter();
 8172     // precondition: a copy of len is already in result
 8173     // __ mov(result, len);
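           // Hedged note on the contract (inferred from the adjustment code at the end of
           // this stub): the stub returns len when every byte of ary1[0..len) is
           // non-negative; otherwise it returns the number of leading bytes it verified as
           // non-negative, which - because the scan is chunked - may be less than the exact
           // index of the first negative byte, but never more.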
 8174 
 8175   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8176         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8177 
 8178   __ cmp(len, (u1)15);
 8179   __ br(Assembler::GT, LEN_OVER_15);
  8180   // Execution falls into this code only when the pointer is near the end of a
  8181   // memory page and we must avoid reading past it into the next page
 8182   __ add(ary1, ary1, len);
 8183   __ subs(len, len, 8);
 8184   __ br(Assembler::GT, LEN_OVER_8);
 8185   __ ldr(rscratch2, Address(ary1, -8));
 8186   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8187   __ lsrv(rscratch2, rscratch2, rscratch1);
 8188   __ tst(rscratch2, UPPER_BIT_MASK);
 8189   __ csel(result, zr, result, Assembler::NE);
 8190   __ leave();
 8191   __ ret(lr);
 8192   __ bind(LEN_OVER_8);
 8193   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  8194   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 8195   __ tst(rscratch2, UPPER_BIT_MASK);
 8196   __ br(Assembler::NE, RET_NO_POP);
 8197   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8198   __ lsrv(rscratch1, rscratch1, rscratch2);
 8199   __ tst(rscratch1, UPPER_BIT_MASK);
 8200   __ bind(RET_NO_POP);
 8201   __ csel(result, zr, result, Assembler::NE);
 8202   __ leave();
 8203   __ ret(lr);
 8204 
 8205   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8206   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8207 
 8208   count_positives_long = __ pc(); // 2nd entry point
 8209 
 8210   __ enter();
 8211 
 8212   __ bind(LEN_OVER_15);
 8213     __ push(spilled_regs, sp);
 8214     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8215     __ cbz(rscratch2, ALIGNED);
 8216     __ ldp(tmp6, tmp1, Address(ary1));
 8217     __ mov(tmp5, 16);
  8218     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
 8219     __ add(ary1, ary1, rscratch1);
 8220     __ orr(tmp6, tmp6, tmp1);
 8221     __ tst(tmp6, UPPER_BIT_MASK);
 8222     __ br(Assembler::NE, RET_ADJUST);
 8223     __ sub(len, len, rscratch1);
 8224 
 8225   __ bind(ALIGNED);
 8226     __ cmp(len, large_loop_size);
 8227     __ br(Assembler::LT, CHECK_16);
  8228     // Perform a 16-byte load in the pre-loop so we can return early when an
  8229     // initially aligned large array has negative values in its starting bytes;
  8230     // otherwise LARGE_LOOP would do up to 4 reads instead of 1 (in the worst
  8231     // case) before noticing, which is slower. Cases with negative bytes further
  8232     // ahead are barely affected; in fact they get faster due to the early loads
  8233     // and the fewer instructions and branches in LARGE_LOOP.
 8234     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8235     __ sub(len, len, 16);
 8236     __ orr(tmp6, tmp6, tmp1);
 8237     __ tst(tmp6, UPPER_BIT_MASK);
 8238     __ br(Assembler::NE, RET_ADJUST_16);
 8239     __ cmp(len, large_loop_size);
 8240     __ br(Assembler::LT, CHECK_16);
 8241 
 8242     if (SoftwarePrefetchHintDistance >= 0
 8243         && SoftwarePrefetchHintDistance >= dcache_line) {
 8244       // initial prefetch
 8245       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8246     }
 8247   __ bind(LARGE_LOOP);
 8248     if (SoftwarePrefetchHintDistance >= 0) {
 8249       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8250     }
  8251     // Issue the load instructions first, since that can save a few CPU/memory
  8252     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
  8253     // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
  8254     // saves 3 instructions and has fewer branches. The trade-off is that early
  8255     // return is disabled, so all 64 bytes are loaded and checked every time.
 8256     __ ldp(tmp2, tmp3, Address(ary1));
 8257     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8258     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8259     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8260     __ add(ary1, ary1, large_loop_size);
 8261     __ sub(len, len, large_loop_size);
 8262     __ orr(tmp2, tmp2, tmp3);
 8263     __ orr(tmp4, tmp4, tmp5);
 8264     __ orr(rscratch1, rscratch1, rscratch2);
 8265     __ orr(tmp6, tmp6, tmp1);
 8266     __ orr(tmp2, tmp2, tmp4);
 8267     __ orr(rscratch1, rscratch1, tmp6);
 8268     __ orr(tmp2, tmp2, rscratch1);
 8269     __ tst(tmp2, UPPER_BIT_MASK);
 8270     __ br(Assembler::NE, RET_ADJUST_LONG);
 8271     __ cmp(len, large_loop_size);
 8272     __ br(Assembler::GE, LARGE_LOOP);
 8273 
 8274   __ bind(CHECK_16); // small 16-byte load pre-loop
 8275     __ cmp(len, (u1)16);
 8276     __ br(Assembler::LT, POST_LOOP16);
 8277 
 8278   __ bind(LOOP16); // small 16-byte load loop
 8279     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8280     __ sub(len, len, 16);
 8281     __ orr(tmp2, tmp2, tmp3);
 8282     __ tst(tmp2, UPPER_BIT_MASK);
 8283     __ br(Assembler::NE, RET_ADJUST_16);
 8284     __ cmp(len, (u1)16);
 8285     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8286 
 8287   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8288     __ cmp(len, (u1)8);
 8289     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8290     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8291     __ tst(tmp3, UPPER_BIT_MASK);
 8292     __ br(Assembler::NE, RET_ADJUST);
 8293     __ sub(len, len, 8);
 8294 
 8295   __ bind(POST_LOOP16_LOAD_TAIL);
 8296     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8297     __ ldr(tmp1, Address(ary1));
 8298     __ mov(tmp2, 64);
 8299     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8300     __ lslv(tmp1, tmp1, tmp4);
 8301     __ tst(tmp1, UPPER_BIT_MASK);
 8302     __ br(Assembler::NE, RET_ADJUST);
 8303     // Fallthrough
 8304 
 8305   __ bind(RET_LEN);
 8306     __ pop(spilled_regs, sp);
 8307     __ leave();
 8308     __ ret(lr);
 8309 
  8310     // The difference (result - len) is the count of bytes guaranteed to be
  8311     // positive
 8312 
 8313   __ bind(RET_ADJUST_LONG);
 8314     __ add(len, len, (u1)(large_loop_size - 16));
 8315   __ bind(RET_ADJUST_16);
 8316     __ add(len, len, 16);
 8317   __ bind(RET_ADJUST);
 8318     __ pop(spilled_regs, sp);
 8319     __ leave();
 8320     __ sub(result, result, len);
 8321     __ ret(lr);
 8322 
 8323     return entry;
 8324   }
 8325 
 8326   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8327         bool usePrefetch, Label &NOT_EQUAL) {
 8328     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8329         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8330         tmp7 = r12, tmp8 = r13;
 8331     Label LOOP;
 8332 
 8333     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8334     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
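           // Software-pipelined compare loop: the two ldp's above pre-load the first
           // 16 bytes of each array, and inside the loop the loads for the next chunk
           // are issued before the eor/orr checks of the previously loaded chunk, so
           // memory latency overlaps with the comparison work.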
 8335     __ bind(LOOP);
 8336     if (usePrefetch) {
 8337       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8338       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8339     }
 8340     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8341     __ eor(tmp1, tmp1, tmp2);
 8342     __ eor(tmp3, tmp3, tmp4);
 8343     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8344     __ orr(tmp1, tmp1, tmp3);
 8345     __ cbnz(tmp1, NOT_EQUAL);
 8346     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8347     __ eor(tmp5, tmp5, tmp6);
 8348     __ eor(tmp7, tmp7, tmp8);
 8349     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8350     __ orr(tmp5, tmp5, tmp7);
 8351     __ cbnz(tmp5, NOT_EQUAL);
 8352     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8353     __ eor(tmp1, tmp1, tmp2);
 8354     __ eor(tmp3, tmp3, tmp4);
 8355     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8356     __ orr(tmp1, tmp1, tmp3);
 8357     __ cbnz(tmp1, NOT_EQUAL);
 8358     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8359     __ eor(tmp5, tmp5, tmp6);
 8360     __ sub(cnt1, cnt1, 8 * wordSize);
 8361     __ eor(tmp7, tmp7, tmp8);
 8362     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
  8363     // tmp6 is not used. MacroAssembler::subs is used here (rather than
  8364     // cmp) because subs allows an unlimited range of immediate operands.
 8365     __ subs(tmp6, cnt1, loopThreshold);
 8366     __ orr(tmp5, tmp5, tmp7);
 8367     __ cbnz(tmp5, NOT_EQUAL);
 8368     __ br(__ GE, LOOP);
 8369     // post-loop
 8370     __ eor(tmp1, tmp1, tmp2);
 8371     __ eor(tmp3, tmp3, tmp4);
 8372     __ orr(tmp1, tmp1, tmp3);
 8373     __ sub(cnt1, cnt1, 2 * wordSize);
 8374     __ cbnz(tmp1, NOT_EQUAL);
 8375   }
 8376 
 8377   void generate_large_array_equals_loop_simd(int loopThreshold,
 8378         bool usePrefetch, Label &NOT_EQUAL) {
 8379     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8380         tmp2 = rscratch2;
 8381     Label LOOP;
 8382 
 8383     __ bind(LOOP);
 8384     if (usePrefetch) {
 8385       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8386       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8387     }
 8388     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8389     __ sub(cnt1, cnt1, 8 * wordSize);
 8390     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8391     __ subs(tmp1, cnt1, loopThreshold);
 8392     __ eor(v0, __ T16B, v0, v4);
 8393     __ eor(v1, __ T16B, v1, v5);
 8394     __ eor(v2, __ T16B, v2, v6);
 8395     __ eor(v3, __ T16B, v3, v7);
 8396     __ orr(v0, __ T16B, v0, v1);
 8397     __ orr(v1, __ T16B, v2, v3);
 8398     __ orr(v0, __ T16B, v0, v1);
 8399     __ umov(tmp1, v0, __ D, 0);
 8400     __ umov(tmp2, v0, __ D, 1);
 8401     __ orr(tmp1, tmp1, tmp2);
 8402     __ cbnz(tmp1, NOT_EQUAL);
 8403     __ br(__ GE, LOOP);
 8404   }
 8405 
 8406   // a1 = r1 - array1 address
 8407   // a2 = r2 - array2 address
 8408   // result = r0 - return value. Already contains "false"
 8409   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8410   // r3-r5 are reserved temporary registers
 8411   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 8412   address generate_large_array_equals() {
 8413     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8414         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8415         tmp7 = r12, tmp8 = r13;
 8416     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8417         SMALL_LOOP, POST_LOOP;
 8418     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
  8419     // threshold that guarantees at least 32 of the prefetched bytes are actually used
 8420     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8421     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8422     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8423     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8424         tmp5, tmp6, tmp7, tmp8);
 8425 
 8426     __ align(CodeEntryAlignment);
 8427 
 8428     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8429     StubCodeMark mark(this, stub_id);
 8430 
 8431     address entry = __ pc();
 8432     __ enter();
 8433     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8434     // also advance pointers to use post-increment instead of pre-increment
 8435     __ add(a1, a1, wordSize);
 8436     __ add(a2, a2, wordSize);
 8437     if (AvoidUnalignedAccesses) {
  8438       // Both implementations (SIMD/non-SIMD) use relatively wide load
  8439       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
  8440       // time) on some CPUs when the address is not at least 16-byte aligned.
  8441       // Arrays are currently 8-byte aligned, so, if needed, we can do one extra
  8442       // 8-byte load for the first array and make its address 16-byte aligned.
 8443       Label ALIGNED16;
 8444       __ tbz(a1, 3, ALIGNED16);
 8445       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8446       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8447       __ sub(cnt1, cnt1, wordSize);
 8448       __ eor(tmp1, tmp1, tmp2);
 8449       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8450       __ bind(ALIGNED16);
 8451     }
 8452     if (UseSIMDForArrayEquals) {
 8453       if (SoftwarePrefetchHintDistance >= 0) {
 8454         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8455         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8456         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8457             /* prfm = */ true, NOT_EQUAL);
 8458         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8459         __ br(__ LT, TAIL);
 8460       }
 8461       __ bind(NO_PREFETCH_LARGE_LOOP);
 8462       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8463           /* prfm = */ false, NOT_EQUAL);
 8464     } else {
 8465       __ push(spilled_regs, sp);
 8466       if (SoftwarePrefetchHintDistance >= 0) {
 8467         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8468         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8469         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8470             /* prfm = */ true, NOT_EQUAL);
 8471         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8472         __ br(__ LT, TAIL);
 8473       }
 8474       __ bind(NO_PREFETCH_LARGE_LOOP);
 8475       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8476           /* prfm = */ false, NOT_EQUAL);
 8477     }
 8478     __ bind(TAIL);
 8479       __ cbz(cnt1, EQUAL);
 8480       __ subs(cnt1, cnt1, wordSize);
 8481       __ br(__ LE, POST_LOOP);
 8482     __ bind(SMALL_LOOP);
 8483       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8484       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8485       __ subs(cnt1, cnt1, wordSize);
 8486       __ eor(tmp1, tmp1, tmp2);
 8487       __ cbnz(tmp1, NOT_EQUAL);
 8488       __ br(__ GT, SMALL_LOOP);
 8489     __ bind(POST_LOOP);
 8490       __ ldr(tmp1, Address(a1, cnt1));
 8491       __ ldr(tmp2, Address(a2, cnt1));
 8492       __ eor(tmp1, tmp1, tmp2);
 8493       __ cbnz(tmp1, NOT_EQUAL);
 8494     __ bind(EQUAL);
 8495       __ mov(result, true);
 8496     __ bind(NOT_EQUAL);
 8497       if (!UseSIMDForArrayEquals) {
 8498         __ pop(spilled_regs, sp);
 8499       }
 8500     __ bind(NOT_EQUAL_NO_POP);
 8501     __ leave();
 8502     __ ret(lr);
 8503     return entry;
 8504   }
 8505 
 8506   // result = r0 - return value. Contains initial hashcode value on entry.
 8507   // ary = r1 - array address
 8508   // cnt = r2 - elements count
 8509   // Clobbers: v0-v13, rscratch1, rscratch2
 8510   address generate_large_arrays_hashcode(BasicType eltype) {
 8511     const Register result = r0, ary = r1, cnt = r2;
 8512     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8513     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8514     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8515     const FloatRegister vpowm = v13;
 8516 
 8517     ARRAYS_HASHCODE_REGISTERS;
 8518 
 8519     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8520 
 8521     unsigned int vf; // vectorization factor
 8522     bool multiply_by_halves;
 8523     Assembler::SIMD_Arrangement load_arrangement;
 8524     switch (eltype) {
 8525     case T_BOOLEAN:
 8526     case T_BYTE:
 8527       load_arrangement = Assembler::T8B;
 8528       multiply_by_halves = true;
 8529       vf = 8;
 8530       break;
 8531     case T_CHAR:
 8532     case T_SHORT:
 8533       load_arrangement = Assembler::T8H;
 8534       multiply_by_halves = true;
 8535       vf = 8;
 8536       break;
 8537     case T_INT:
 8538       load_arrangement = Assembler::T4S;
 8539       multiply_by_halves = false;
 8540       vf = 4;
 8541       break;
 8542     default:
 8543       ShouldNotReachHere();
 8544     }
 8545 
 8546     // Unroll factor
 8547     const unsigned uf = 4;
 8548 
 8549     // Effective vectorization factor
 8550     const unsigned evf = vf * uf;
 8551 
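           // A hedged restatement of the scalar hash this code is assumed to vectorize:
           //   h = h * 31^n + a[0] * 31^(n-1) + ... + a[n-1] * 31^0
           // Each of the four T4S accumulator lanes keeps a partial sum. Multiplying the
           // accumulator vector by the per-iteration power of 31 (held in vpowm) before
           // adding the freshly loaded elements preserves that invariant, and the
           // epilogues fold the lanes together using vpow = <31^3, 31^2, 31^1, 31^0>.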
 8552     __ align(CodeEntryAlignment);
 8553 
 8554     StubId stub_id;
 8555     switch (eltype) {
 8556     case T_BOOLEAN:
 8557       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8558       break;
 8559     case T_BYTE:
 8560       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8561       break;
 8562     case T_CHAR:
 8563       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8564       break;
 8565     case T_SHORT:
 8566       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8567       break;
 8568     case T_INT:
 8569       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8570       break;
 8571     default:
 8572       stub_id = StubId::NO_STUBID;
 8573       ShouldNotReachHere();
 8574     };
 8575 
 8576     StubCodeMark mark(this, stub_id);
 8577 
 8578     address entry = __ pc();
 8579     __ enter();
 8580 
  8581     // Put the 3rd down to the 0th powers of 31 together into a single SIMD register. The register will be used in
 8582     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
 8583     // value shouldn't change throughout both loops.
 8584     __ movw(rscratch1, intpow(31U, 3));
 8585     __ mov(vpow, Assembler::S, 0, rscratch1);
 8586     __ movw(rscratch1, intpow(31U, 2));
 8587     __ mov(vpow, Assembler::S, 1, rscratch1);
 8588     __ movw(rscratch1, intpow(31U, 1));
 8589     __ mov(vpow, Assembler::S, 2, rscratch1);
 8590     __ movw(rscratch1, intpow(31U, 0));
 8591     __ mov(vpow, Assembler::S, 3, rscratch1);
 8592 
 8593     __ mov(vmul0, Assembler::T16B, 0);
 8594     __ mov(vmul0, Assembler::S, 3, result);
 8595 
 8596     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8597     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8598 
 8599     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8600     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8601 
 8602     // SMALL LOOP
 8603     __ bind(SMALL_LOOP);
 8604 
 8605     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8606     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8607     __ subsw(rscratch2, rscratch2, vf);
 8608 
 8609     if (load_arrangement == Assembler::T8B) {
 8610       // Extend 8B to 8H to be able to use vector multiply
 8611       // instructions
 8612       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8613       if (is_signed_subword_type(eltype)) {
 8614         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8615       } else {
 8616         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8617       }
 8618     }
 8619 
 8620     switch (load_arrangement) {
 8621     case Assembler::T4S:
 8622       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8623       break;
 8624     case Assembler::T8B:
 8625     case Assembler::T8H:
 8626       assert(is_subword_type(eltype), "subword type expected");
 8627       if (is_signed_subword_type(eltype)) {
 8628         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8629       } else {
 8630         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8631       }
 8632       break;
 8633     default:
 8634       __ should_not_reach_here();
 8635     }
 8636 
 8637     // Process the upper half of a vector
 8638     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8639       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8640       if (is_signed_subword_type(eltype)) {
 8641         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8642       } else {
 8643         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8644       }
 8645     }
 8646 
 8647     __ br(Assembler::HI, SMALL_LOOP);
 8648 
  8649     // SMALL LOOP'S EPILOGUE
 8650     __ lsr(rscratch2, cnt, exact_log2(evf));
 8651     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8652 
 8653     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8654     __ addv(vmul0, Assembler::T4S, vmul0);
 8655     __ umov(result, vmul0, Assembler::S, 0);
 8656 
 8657     // TAIL
 8658     __ bind(TAIL);
 8659 
  8660     // The andr computes cnt % vf. The subtract, shifted by 3 (or 4 on Cortex-A53), moves the
  8661     // branch target past vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf pairs execute.
 8662     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8663     __ andr(rscratch2, cnt, vf - 1);
 8664     __ bind(TAIL_SHORTCUT);
 8665     __ adr(rscratch1, BR_BASE);
  8666     // For Cortex-A53 the shift is 4 because 2 nops are generated, giving 4 instructions per iteration.
 8667     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8668     __ movw(rscratch2, 0x1f);
 8669     __ br(rscratch1);
 8670 
 8671     for (size_t i = 0; i < vf - 1; ++i) {
 8672       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8673                                    eltype);
 8674       __ maddw(result, result, rscratch2, rscratch1);
 8675       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8676       // Generate 2nd nop to have 4 instructions per iteration.
 8677       if (VM_Version::supports_a53mac()) {
 8678         __ nop();
 8679       }
 8680     }
 8681     __ bind(BR_BASE);
 8682 
 8683     __ leave();
 8684     __ ret(lr);
 8685 
 8686     // LARGE LOOP
 8687     __ bind(LARGE_LOOP_PREHEADER);
 8688 
 8689     __ lsr(rscratch2, cnt, exact_log2(evf));
 8690 
 8691     if (multiply_by_halves) {
 8692       // 31^4 - multiplier between lower and upper parts of a register
 8693       __ movw(rscratch1, intpow(31U, vf / 2));
 8694       __ mov(vpowm, Assembler::S, 1, rscratch1);
  8695       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8696       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8697       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8698     } else {
 8699       // 31^16
 8700       __ movw(rscratch1, intpow(31U, evf));
 8701       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8702     }
 8703 
 8704     __ mov(vmul3, Assembler::T16B, 0);
 8705     __ mov(vmul2, Assembler::T16B, 0);
 8706     __ mov(vmul1, Assembler::T16B, 0);
 8707 
 8708     __ bind(LARGE_LOOP);
 8709 
 8710     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8711     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8712     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8713     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8714 
 8715     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8716            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8717 
 8718     if (load_arrangement == Assembler::T8B) {
 8719       // Extend 8B to 8H to be able to use vector multiply
 8720       // instructions
 8721       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8722       if (is_signed_subword_type(eltype)) {
 8723         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8724         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8725         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8726         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8727       } else {
 8728         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8729         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8730         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8731         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8732       }
 8733     }
 8734 
 8735     switch (load_arrangement) {
 8736     case Assembler::T4S:
 8737       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8738       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8739       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8740       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8741       break;
 8742     case Assembler::T8B:
 8743     case Assembler::T8H:
 8744       assert(is_subword_type(eltype), "subword type expected");
 8745       if (is_signed_subword_type(eltype)) {
 8746         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8747         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8748         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8749         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8750       } else {
 8751         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8752         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8753         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8754         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8755       }
 8756       break;
 8757     default:
 8758       __ should_not_reach_here();
 8759     }
 8760 
 8761     // Process the upper half of a vector
 8762     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8763       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8764       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8765       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8766       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8767       if (is_signed_subword_type(eltype)) {
 8768         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8769         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8770         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8771         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8772       } else {
 8773         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8774         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8775         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8776         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8777       }
 8778     }
 8779 
 8780     __ subsw(rscratch2, rscratch2, 1);
 8781     __ br(Assembler::HI, LARGE_LOOP);
 8782 
 8783     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8784     __ addv(vmul3, Assembler::T4S, vmul3);
 8785     __ umov(result, vmul3, Assembler::S, 0);
 8786 
 8787     __ mov(rscratch2, intpow(31U, vf));
 8788 
 8789     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8790     __ addv(vmul2, Assembler::T4S, vmul2);
 8791     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8792     __ maddw(result, result, rscratch2, rscratch1);
 8793 
 8794     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8795     __ addv(vmul1, Assembler::T4S, vmul1);
 8796     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8797     __ maddw(result, result, rscratch2, rscratch1);
 8798 
 8799     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8800     __ addv(vmul0, Assembler::T4S, vmul0);
 8801     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8802     __ maddw(result, result, rscratch2, rscratch1);
 8803 
 8804     __ andr(rscratch2, cnt, vf - 1);
 8805     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8806 
 8807     __ leave();
 8808     __ ret(lr);
 8809 
 8810     return entry;
 8811   }
 8812 
 8813   address generate_dsin_dcos(bool isCos) {
 8814     __ align(CodeEntryAlignment);
 8815     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8816     StubCodeMark mark(this, stub_id);
 8817     address start = __ pc();
 8818     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8819         (address)StubRoutines::aarch64::_two_over_pi,
 8820         (address)StubRoutines::aarch64::_pio2,
 8821         (address)StubRoutines::aarch64::_dsin_coef,
 8822         (address)StubRoutines::aarch64::_dcos_coef);
 8823     return start;
 8824   }
 8825 
  8826   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
 8827   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8828       Label &DIFF2) {
 8829     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8830     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
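           // As set up by the caller, tmp2 walks the Latin1 string and cnt1 walks the
           // UTF-16 string. The 16 Latin1 bytes loaded below are widened to UTF-16 by
           // zipping them with the zero register vtmpZ, and are then compared against
           // the UTF-16 data 8 bytes (4 characters) at a time.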
 8831 
 8832     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8833     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8834     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8835     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8836 
 8837     __ fmovd(tmpL, vtmp3);
 8838     __ eor(rscratch2, tmp3, tmpL);
 8839     __ cbnz(rscratch2, DIFF2);
 8840 
 8841     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8842     __ umov(tmpL, vtmp3, __ D, 1);
 8843     __ eor(rscratch2, tmpU, tmpL);
 8844     __ cbnz(rscratch2, DIFF1);
 8845 
 8846     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8847     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8848     __ fmovd(tmpL, vtmp);
 8849     __ eor(rscratch2, tmp3, tmpL);
 8850     __ cbnz(rscratch2, DIFF2);
 8851 
 8852     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8853     __ umov(tmpL, vtmp, __ D, 1);
 8854     __ eor(rscratch2, tmpU, tmpL);
 8855     __ cbnz(rscratch2, DIFF1);
 8856   }
 8857 
 8858   // r0  = result
 8859   // r1  = str1
 8860   // r2  = cnt1
 8861   // r3  = str2
 8862   // r4  = cnt2
 8863   // r10 = tmp1
 8864   // r11 = tmp2
 8865   address generate_compare_long_string_different_encoding(bool isLU) {
 8866     __ align(CodeEntryAlignment);
 8867     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8868     StubCodeMark mark(this, stub_id);
 8869     address entry = __ pc();
 8870     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8871         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8872         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8873     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8874         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8875     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8876     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8877 
 8878     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8879 
 8880     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
  8881     // cnt2 == number of characters left to compare
  8882     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8883     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8884     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8885     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8886     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
  8887     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
 8888     __ eor(rscratch2, tmp1, tmp2);
 8889     __ mov(rscratch1, tmp2);
 8890     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8891     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8892              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8893     __ push(spilled_regs, sp);
 8894     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8895     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8896 
 8897     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8898 
 8899     if (SoftwarePrefetchHintDistance >= 0) {
 8900       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8901       __ br(__ LT, NO_PREFETCH);
 8902       __ bind(LARGE_LOOP_PREFETCH);
 8903         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8904         __ mov(tmp4, 2);
 8905         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8906         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8907           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8908           __ subs(tmp4, tmp4, 1);
 8909           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8910           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8911           __ mov(tmp4, 2);
 8912         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8913           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8914           __ subs(tmp4, tmp4, 1);
 8915           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8916           __ sub(cnt2, cnt2, 64);
 8917           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8918           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8919     }
 8920     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8921     __ bind(NO_PREFETCH);
 8922     __ subs(cnt2, cnt2, 16);
 8923     __ br(__ LT, TAIL);
 8924     __ align(OptoLoopAlignment);
 8925     __ bind(SMALL_LOOP); // smaller loop
 8926       __ subs(cnt2, cnt2, 16);
 8927       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8928       __ br(__ GE, SMALL_LOOP);
 8929       __ cmn(cnt2, (u1)16);
 8930       __ br(__ EQ, LOAD_LAST);
 8931     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8932       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8933       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8934       __ ldr(tmp3, Address(cnt1, -8));
 8935       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8936       __ b(LOAD_LAST);
 8937     __ bind(DIFF2);
 8938       __ mov(tmpU, tmp3);
 8939     __ bind(DIFF1);
 8940       __ pop(spilled_regs, sp);
 8941       __ b(CALCULATE_DIFFERENCE);
 8942     __ bind(LOAD_LAST);
  8943       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
  8944       // so there is no need to load them again
 8945       __ mov(tmpU, tmp3);
 8946       __ pop(spilled_regs, sp);
 8947 
 8948       // tmp2 points to the address of the last 4 Latin1 characters right now
 8949       __ ldrs(vtmp, Address(tmp2));
 8950       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8951       __ fmovd(tmpL, vtmp);
 8952 
 8953       __ eor(rscratch2, tmpU, tmpL);
 8954       __ cbz(rscratch2, DONE);
 8955 
 8956     // Find the first different characters in the longwords and
 8957     // compute their difference.
 8958     __ bind(CALCULATE_DIFFERENCE);
 8959       __ rev(rscratch2, rscratch2);
 8960       __ clz(rscratch2, rscratch2);
 8961       __ andr(rscratch2, rscratch2, -16);
 8962       __ lsrv(tmp1, tmp1, rscratch2);
 8963       __ uxthw(tmp1, tmp1);
 8964       __ lsrv(rscratch1, rscratch1, rscratch2);
 8965       __ uxthw(rscratch1, rscratch1);
 8966       __ subw(result, tmp1, rscratch1);
 8967     __ bind(DONE);
 8968       __ ret(lr);
 8969     return entry;
 8970   }
 8971 
 8972   // r0 = input (float16)
 8973   // v0 = result (float)
 8974   // v1 = temporary float register
 8975   address generate_float16ToFloat() {
 8976     __ align(CodeEntryAlignment);
 8977     StubId stub_id = StubId::stubgen_hf2f_id;
 8978     StubCodeMark mark(this, stub_id);
 8979     address entry = __ pc();
 8980     BLOCK_COMMENT("Entry:");
 8981     __ flt16_to_flt(v0, r0, v1);
 8982     __ ret(lr);
 8983     return entry;
 8984   }
 8985 
 8986   // v0 = input (float)
 8987   // r0 = result (float16)
 8988   // v1 = temporary float register
 8989   address generate_floatToFloat16() {
 8990     __ align(CodeEntryAlignment);
 8991     StubId stub_id = StubId::stubgen_f2hf_id;
 8992     StubCodeMark mark(this, stub_id);
 8993     address entry = __ pc();
 8994     BLOCK_COMMENT("Entry:");
 8995     __ flt_to_flt16(r0, v0, v1);
 8996     __ ret(lr);
 8997     return entry;
 8998   }
 8999 
 9000   address generate_method_entry_barrier() {
 9001     __ align(CodeEntryAlignment);
 9002     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9003     StubCodeMark mark(this, stub_id);
 9004 
 9005     Label deoptimize_label;
 9006 
 9007     address start = __ pc();
 9008 
 9009     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9010 
 9011     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9012       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9013       // We can get here despite the nmethod being good, if we have not
 9014       // yet applied our cross modification fence (or data fence).
 9015       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9016       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9017       __ ldrw(rscratch2, rscratch2);
 9018       __ strw(rscratch2, thread_epoch_addr);
 9019       __ isb();
 9020       __ membar(__ LoadLoad);
 9021     }
 9022 
 9023     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9024 
 9025     __ enter();
 9026     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9027 
 9028     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9029 
 9030     __ push_call_clobbered_registers();
 9031 
 9032     __ mov(c_rarg0, rscratch2);
 9033     __ call_VM_leaf
 9034          (CAST_FROM_FN_PTR
 9035           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9036 
 9037     __ reset_last_Java_frame(true);
 9038 
 9039     __ mov(rscratch1, r0);
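           // A non-zero return value from the runtime call (checked by the cbnz below)
           // takes the deoptimize path, which reloads {sp, fp, lr, pc} from the four
           // words reserved above and continues at the address stored there.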
 9040 
 9041     __ pop_call_clobbered_registers();
 9042 
 9043     __ cbnz(rscratch1, deoptimize_label);
 9044 
 9045     __ leave();
 9046     __ ret(lr);
 9047 
 9048     __ BIND(deoptimize_label);
 9049 
 9050     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9051     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9052 
 9053     __ mov(sp, rscratch1);
 9054     __ br(rscratch2);
 9055 
 9056     return start;
 9057   }
 9058 
 9059   // r0  = result
 9060   // r1  = str1
 9061   // r2  = cnt1
 9062   // r3  = str2
 9063   // r4  = cnt2
 9064   // r10 = tmp1
 9065   // r11 = tmp2
 9066   address generate_compare_long_string_same_encoding(bool isLL) {
 9067     __ align(CodeEntryAlignment);
 9068     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9069     StubCodeMark mark(this, stub_id);
 9070     address entry = __ pc();
 9071     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9072         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9073 
 9074     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9075 
  9076     // exit the large loop when fewer than 64 bytes are left to read or we are about
  9077     // to prefetch memory beyond the array border
 9078     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
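           // cnt2 counts characters, so for UTF-16 (isLL == false) the byte-based
           // threshold above is halved to keep the comparison in character units.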
 9079 
  9080     // the caller pre-loads 8 bytes before jumping to the stub, so compare them directly
 9081     __ eor(rscratch2, tmp1, tmp2);
 9082     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9083 
 9084     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9085     // update pointers, because of previous read
 9086     __ add(str1, str1, wordSize);
 9087     __ add(str2, str2, wordSize);
 9088     if (SoftwarePrefetchHintDistance >= 0) {
 9089       __ align(OptoLoopAlignment);
 9090       __ bind(LARGE_LOOP_PREFETCH);
 9091         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9092         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9093 
 9094         for (int i = 0; i < 4; i++) {
 9095           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9096           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9097           __ cmp(tmp1, tmp2);
 9098           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9099           __ br(Assembler::NE, DIFF);
 9100         }
 9101         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9102         __ add(str1, str1, 64);
 9103         __ add(str2, str2, 64);
 9104         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9105         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9106         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9107     }
 9108 
 9109     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9110     __ br(Assembler::LE, LESS16);
 9111     __ align(OptoLoopAlignment);
 9112     __ bind(LOOP_COMPARE16);
 9113       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9114       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9115       __ cmp(tmp1, tmp2);
 9116       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9117       __ br(Assembler::NE, DIFF);
 9118       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9119       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9120       __ br(Assembler::LT, LESS16);
 9121 
 9122       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9123       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9124       __ cmp(tmp1, tmp2);
 9125       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9126       __ br(Assembler::NE, DIFF);
 9127       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9128       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9129       __ br(Assembler::GE, LOOP_COMPARE16);
 9130       __ cbz(cnt2, LENGTH_DIFF);
 9131 
 9132     __ bind(LESS16);
  9133       // compare 8 bytes at a time
 9134       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9135       __ br(Assembler::LE, LESS8);
 9136       __ ldr(tmp1, Address(__ post(str1, 8)));
 9137       __ ldr(tmp2, Address(__ post(str2, 8)));
 9138       __ eor(rscratch2, tmp1, tmp2);
 9139       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9140       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9141 
 9142     __ bind(LESS8); // directly load last 8 bytes
 9143       if (!isLL) {
 9144         __ add(cnt2, cnt2, cnt2);
 9145       }
 9146       __ ldr(tmp1, Address(str1, cnt2));
 9147       __ ldr(tmp2, Address(str2, cnt2));
 9148       __ eor(rscratch2, tmp1, tmp2);
 9149       __ cbz(rscratch2, LENGTH_DIFF);
 9150       __ b(CAL_DIFFERENCE);
 9151 
 9152     __ bind(DIFF);
 9153       __ cmp(tmp1, tmp2);
 9154       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9155       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9156       // reuse rscratch2 register for the result of eor instruction
 9157       __ eor(rscratch2, tmp1, tmp2);
 9158 
 9159     __ bind(CAL_DIFFERENCE);
 9160       __ rev(rscratch2, rscratch2);
 9161       __ clz(rscratch2, rscratch2);
 9162       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9163       __ lsrv(tmp1, tmp1, rscratch2);
 9164       __ lsrv(tmp2, tmp2, rscratch2);
 9165       if (isLL) {
 9166         __ uxtbw(tmp1, tmp1);
 9167         __ uxtbw(tmp2, tmp2);
 9168       } else {
 9169         __ uxthw(tmp1, tmp1);
 9170         __ uxthw(tmp2, tmp2);
 9171       }
 9172       __ subw(result, tmp1, tmp2);
 9173 
 9174     __ bind(LENGTH_DIFF);
 9175       __ ret(lr);
 9176     return entry;
 9177   }
 9178 
 9179   enum string_compare_mode {
 9180     LL,
 9181     LU,
 9182     UL,
 9183     UU,
 9184   };
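         // Encoding of str1/str2 respectively: L = Latin1 (one byte per character),
         // U = UTF-16 (two bytes per character).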
 9185 
 9186   // The following registers are declared in aarch64.ad
 9187   // r0  = result
 9188   // r1  = str1
 9189   // r2  = cnt1
 9190   // r3  = str2
 9191   // r4  = cnt2
 9192   // r10 = tmp1
 9193   // r11 = tmp2
 9194   // z0  = ztmp1
 9195   // z1  = ztmp2
 9196   // p0  = pgtmp1
 9197   // p1  = pgtmp2
 9198   address generate_compare_long_string_sve(string_compare_mode mode) {
 9199     StubId stub_id;
 9200     switch (mode) {
 9201       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9202       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9203       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9204       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9205       default: ShouldNotReachHere();
 9206     }
 9207 
 9208     __ align(CodeEntryAlignment);
 9209     address entry = __ pc();
 9210     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9211              tmp1 = r10, tmp2 = r11;
 9212 
 9213     Label LOOP, DONE, MISMATCH;
 9214     Register vec_len = tmp1;
 9215     Register idx = tmp2;
 9216     // The minimum of the string lengths has been stored in cnt2.
 9217     Register cnt = cnt2;
 9218     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9219     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9220 
 9221 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9222     switch (mode) {                                                            \
 9223       case LL:                                                                 \
 9224         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9225         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9226         break;                                                                 \
 9227       case LU:                                                                 \
 9228         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9229         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9230         break;                                                                 \
 9231       case UL:                                                                 \
 9232         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9233         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9234         break;                                                                 \
 9235       case UU:                                                                 \
 9236         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9237         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9238         break;                                                                 \
 9239       default:                                                                 \
 9240         ShouldNotReachHere();                                                  \
 9241     }
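           // The governing predicate pgtmp1 is produced by sve_whilelt(idx, cnt), so each
           // LOAD_PAIR only touches active lanes and never reads past the end of the
           // shorter string; the final partial vector after the loop is handled by
           // recomputing the predicate for the remaining elements.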
 9242 
 9243     StubCodeMark mark(this, stub_id);
 9244 
 9245     __ mov(idx, 0);
 9246     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9247 
 9248     if (mode == LL) {
 9249       __ sve_cntb(vec_len);
 9250     } else {
 9251       __ sve_cnth(vec_len);
 9252     }
 9253 
 9254     __ sub(rscratch1, cnt, vec_len);
 9255 
 9256     __ bind(LOOP);
 9257 
 9258       // main loop
 9259       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9260       __ add(idx, idx, vec_len);
 9261       // Compare strings.
 9262       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9263       __ br(__ NE, MISMATCH);
 9264       __ cmp(idx, rscratch1);
 9265       __ br(__ LT, LOOP);
 9266 
 9267     // post loop, last iteration
 9268     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9269 
 9270     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9271     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9272     __ br(__ EQ, DONE);
 9273 
 9274     __ bind(MISMATCH);
 9275 
 9276     // Crop the vector to find its location.
 9277     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9278     // Extract the first different characters of each string.
 9279     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9280     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9281 
 9282     // Compute the difference of the first different characters.
 9283     __ sub(result, rscratch1, rscratch2);
 9284 
 9285     __ bind(DONE);
 9286     __ ret(lr);
 9287 #undef LOAD_PAIR
 9288     return entry;
 9289   }
 9290 
 9291   void generate_compare_long_strings() {
 9292     if (UseSVE == 0) {
 9293       StubRoutines::aarch64::_compare_long_string_LL
 9294           = generate_compare_long_string_same_encoding(true);
 9295       StubRoutines::aarch64::_compare_long_string_UU
 9296           = generate_compare_long_string_same_encoding(false);
 9297       StubRoutines::aarch64::_compare_long_string_LU
 9298           = generate_compare_long_string_different_encoding(true);
 9299       StubRoutines::aarch64::_compare_long_string_UL
 9300           = generate_compare_long_string_different_encoding(false);
 9301     } else {
 9302       StubRoutines::aarch64::_compare_long_string_LL
 9303           = generate_compare_long_string_sve(LL);
 9304       StubRoutines::aarch64::_compare_long_string_UU
 9305           = generate_compare_long_string_sve(UU);
 9306       StubRoutines::aarch64::_compare_long_string_LU
 9307           = generate_compare_long_string_sve(LU);
 9308       StubRoutines::aarch64::_compare_long_string_UL
 9309           = generate_compare_long_string_sve(UL);
 9310     }
 9311   }
 9312 
 9313   // R0 = result
 9314   // R1 = str2
 9315   // R2 = cnt1
 9316   // R3 = str1
 9317   // R4 = cnt2
 9318   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9319   //
  9320   // This generic linear code uses a few additional ideas that make it faster:
  9321   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  9322   // in order to skip the initial load (helps on systems with 1 load pipeline)
  9323   // 2) we can use a "fast" algorithm to find the single character we search for
  9324   // as the first symbol, with fewer branches (1 branch per loaded register instead
  9325   // of a branch per symbol); this is where constants like 0x0101...01,
  9326   // 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the SWAR sketch below)
  9327   // 3) after loading and analyzing the 1st register of the source string, it can be
  9328   // reused to search for every occurrence of the 1st character, saving a few loads
  9329   // compared with a "simpler-but-slower" implementation
  9330   // 4) in order to avoid lots of push/pop operations, the code below heavily
  9331   // re-uses/re-initializes/compresses register values, which makes the code
  9332   // larger and a bit less readable; however, most of the extra operations are
  9333   // issued during loads or branches, so the penalty is minimal
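         // A hedged sketch of the SWAR first-character test used below (Latin1 case shown;
         // the UTF-16 case is analogous with 16-bit lanes and the 0x0001.../0x7fff... constants):
         //   x = chunk ^ (first_char replicated into every byte)   // a zero byte <=> a match
         //   m = (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)
         // m is non-zero iff some byte of x is zero, i.e. the chunk contains first_char,
         // and the least-significant matching byte always has its top bit set in m, which
         // is why the code below locates candidates with rbit + clz and verifies each one
         // with a full comparison.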
 9334   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9335     StubId stub_id;
 9336     if (str1_isL) {
 9337       if (str2_isL) {
 9338         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9339       } else {
 9340         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9341       }
 9342     } else {
 9343       if (str2_isL) {
 9344         ShouldNotReachHere();
 9345       } else {
 9346         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9347       }
 9348     }
 9349     __ align(CodeEntryAlignment);
 9350     StubCodeMark mark(this, stub_id);
 9351     address entry = __ pc();
 9352 
 9353     int str1_chr_size = str1_isL ? 1 : 2;
 9354     int str2_chr_size = str2_isL ? 1 : 2;
 9355     int str1_chr_shift = str1_isL ? 0 : 1;
 9356     int str2_chr_shift = str2_isL ? 0 : 1;
 9357     bool isL = str1_isL && str2_isL;
  9358     // parameters
 9359     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9360     // temporary registers
 9361     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9362     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9363     // redefinitions
 9364     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9365 
 9366     __ push(spilled_regs, sp);
 9367     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9368         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9369         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9370         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9371         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9372         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 9373     // Read whole register from str1. It is safe, because length >=8 here
 9374     __ ldr(ch1, Address(str1));
 9375     // Read whole register from str2. It is safe, because length >=8 here
 9376     __ ldr(ch2, Address(str2));
 9377     __ sub(cnt2, cnt2, cnt1);
 9378     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9379     if (str1_isL != str2_isL) {
 9380       __ eor(v0, __ T16B, v0, v0);
 9381     }
 9382     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9383     __ mul(first, first, tmp1);
  9384     // check if we have less than one register's worth of characters left to check
 9385     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9386     if (str1_isL != str2_isL) {
 9387       __ fmovd(v1, ch1);
 9388     }
 9389     __ br(__ LE, L_SMALL);
 9390     __ eor(ch2, first, ch2);
 9391     if (str1_isL != str2_isL) {
 9392       __ zip1(v1, __ T16B, v1, v0);
 9393     }
 9394     __ sub(tmp2, ch2, tmp1);
 9395     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9396     __ bics(tmp2, tmp2, ch2);
 9397     if (str1_isL != str2_isL) {
 9398       __ fmovd(ch1, v1);
 9399     }
 9400     __ br(__ NE, L_HAS_ZERO);
 9401     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9402     __ add(result, result, wordSize/str2_chr_size);
 9403     __ add(str2, str2, wordSize);
 9404     __ br(__ LT, L_POST_LOOP);
 9405     __ BIND(L_LOOP);
 9406       __ ldr(ch2, Address(str2));
 9407       __ eor(ch2, first, ch2);
 9408       __ sub(tmp2, ch2, tmp1);
 9409       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9410       __ bics(tmp2, tmp2, ch2);
 9411       __ br(__ NE, L_HAS_ZERO);
 9412     __ BIND(L_LOOP_PROCEED);
 9413       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9414       __ add(str2, str2, wordSize);
 9415       __ add(result, result, wordSize/str2_chr_size);
 9416       __ br(__ GE, L_LOOP);
 9417     __ BIND(L_POST_LOOP);
 9418       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9419       __ br(__ LE, NOMATCH);
 9420       __ ldr(ch2, Address(str2));
 9421       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9422       __ eor(ch2, first, ch2);
 9423       __ sub(tmp2, ch2, tmp1);
 9424       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9425       __ mov(tmp4, -1); // all bits set
 9426       __ b(L_SMALL_PROCEED);
 9427     __ align(OptoLoopAlignment);
 9428     __ BIND(L_SMALL);
 9429       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9430       __ eor(ch2, first, ch2);
 9431       if (str1_isL != str2_isL) {
 9432         __ zip1(v1, __ T16B, v1, v0);
 9433       }
 9434       __ sub(tmp2, ch2, tmp1);
 9435       __ mov(tmp4, -1); // all bits set
 9436       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9437       if (str1_isL != str2_isL) {
 9438         __ fmovd(ch1, v1); // move converted 4 symbols
 9439       }
 9440     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the useless bits
 9442       __ bic(tmp2, tmp2, ch2);
 9443       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9444       __ rbit(tmp2, tmp2);
 9445       __ br(__ EQ, NOMATCH);
 9446     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
 9448       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9449       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9450       if (str2_isL) { // LL
 9451         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9452         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9453         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9454         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9455         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9456       } else {
 9457         __ mov(ch2, 0xE); // all bits in byte set except last one
 9458         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9459         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9460         __ lslv(tmp2, tmp2, tmp4);
 9461         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9462         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9463         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9464         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9465       }
 9466       __ cmp(ch1, ch2);
 9467       __ mov(tmp4, wordSize/str2_chr_size);
 9468       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9469     __ BIND(L_SMALL_CMP_LOOP);
 9470       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9471                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9472       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9473                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9474       __ add(tmp4, tmp4, 1);
 9475       __ cmp(tmp4, cnt1);
 9476       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9477       __ cmp(first, ch2);
 9478       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9479     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9480       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9481       __ clz(tmp4, tmp2);
 9482       __ add(result, result, 1); // advance index
 9483       __ add(str2, str2, str2_chr_size); // advance pointer
 9484       __ b(L_SMALL_HAS_ZERO_LOOP);
 9485     __ align(OptoLoopAlignment);
 9486     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9487       __ cmp(first, ch2);
 9488       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9489       __ b(DONE);
 9490     __ align(OptoLoopAlignment);
 9491     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9492       if (str2_isL) { // LL
 9493         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9494         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9495         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9496         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9497         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9498       } else {
 9499         __ mov(ch2, 0xE); // all bits in byte set except last one
 9500         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9501         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9502         __ lslv(tmp2, tmp2, tmp4);
 9503         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9505         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9506         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9507       }
 9508       __ cmp(ch1, ch2);
 9509       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9510       __ b(DONE);
 9511     __ align(OptoLoopAlignment);
 9512     __ BIND(L_HAS_ZERO);
 9513       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be reused in this loop.
 9518       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9519       __ sub(result, result, 1);
 9520     __ BIND(L_HAS_ZERO_LOOP);
 9521       __ mov(cnt1, wordSize/str2_chr_size);
 9522       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9523       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9524       if (str2_isL) {
 9525         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9526         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9527         __ lslv(tmp2, tmp2, tmp4);
 9528         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9529         __ add(tmp4, tmp4, 1);
 9530         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9531         __ lsl(tmp2, tmp2, 1);
 9532         __ mov(tmp4, wordSize/str2_chr_size);
 9533       } else {
 9534         __ mov(ch2, 0xE);
 9535         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9536         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9537         __ lslv(tmp2, tmp2, tmp4);
 9538         __ add(tmp4, tmp4, 1);
 9539         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9540         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9541         __ lsl(tmp2, tmp2, 1);
 9542         __ mov(tmp4, wordSize/str2_chr_size);
 9543         __ sub(str2, str2, str2_chr_size);
 9544       }
 9545       __ cmp(ch1, ch2);
 9546       __ mov(tmp4, wordSize/str2_chr_size);
 9547       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9548     __ BIND(L_CMP_LOOP);
 9549       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9550                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9551       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9552                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9553       __ add(tmp4, tmp4, 1);
 9554       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9555       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9556       __ cmp(cnt1, ch2);
 9557       __ br(__ EQ, L_CMP_LOOP);
 9558     __ BIND(L_CMP_LOOP_NOMATCH);
      // we get here on a mismatch
 9560       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9561       __ clz(tmp4, tmp2);
 9562       __ add(str2, str2, str2_chr_size); // advance pointer
 9563       __ b(L_HAS_ZERO_LOOP);
 9564     __ align(OptoLoopAlignment);
 9565     __ BIND(L_CMP_LOOP_LAST_CMP);
 9566       __ cmp(cnt1, ch2);
 9567       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9568       __ b(DONE);
 9569     __ align(OptoLoopAlignment);
 9570     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9571       if (str2_isL) {
 9572         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9573         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9574         __ lslv(tmp2, tmp2, tmp4);
 9575         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9576         __ add(tmp4, tmp4, 1);
 9577         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9578         __ lsl(tmp2, tmp2, 1);
 9579       } else {
 9580         __ mov(ch2, 0xE);
 9581         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9582         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9583         __ lslv(tmp2, tmp2, tmp4);
 9584         __ add(tmp4, tmp4, 1);
 9585         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9586         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9587         __ lsl(tmp2, tmp2, 1);
 9588         __ sub(str2, str2, str2_chr_size);
 9589       }
 9590       __ cmp(ch1, ch2);
 9591       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9592       __ b(DONE);
 9593     __ align(OptoLoopAlignment);
 9594     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
      // the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP, so
      // result was increased by at most wordSize/str2_chr_size - 1 and the
      // respective high bits were not changed. L_LOOP_PROCEED will increase
      // result by the number of analyzed characters, so we can just reset the
      // lower bits of result here: clear 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
      // of the last analyzed substring inside the current octet, so str2 is at the
      // respective start address; we need to advance it to the next octet.
 9605       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9606       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9607       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9608       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
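      // writing the 32-bit view of cnt2 clears its upper half, dropping the
      // cnt1 value packed there and restoring the original cnt2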
 9609       __ movw(cnt2, cnt2);
 9610       __ b(L_LOOP_PROCEED);
 9611     __ align(OptoLoopAlignment);
 9612     __ BIND(NOMATCH);
 9613       __ mov(result, -1);
 9614     __ BIND(DONE);
 9615       __ pop(spilled_regs, sp);
 9616       __ ret(lr);
 9617     return entry;
 9618   }
 9619 
 9620   void generate_string_indexof_stubs() {
 9621     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9622     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9623     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9624   }
 9625 
 9626   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9627       FloatRegister src1, FloatRegister src2) {
 9628     Register dst = r1;
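    // zip1/zip2 against the zero register v0 interleave each source byte with
    // a zero byte, i.e. they zero-extend 8-bit Latin-1 characters to 16-bit
    // UTF-16 code units (little-endian); each 16-byte source register expands
    // into two 16-byte result registers.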
 9629     __ zip1(v1, __ T16B, src1, v0);
 9630     __ zip2(v2, __ T16B, src1, v0);
 9631     if (generatePrfm) {
 9632       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9633     }
 9634     __ zip1(v3, __ T16B, src2, v0);
 9635     __ zip2(v4, __ T16B, src2, v0);
 9636     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9637   }
 9638 
  // r0 = src
  // r1 = dst
  // r2 = len
  // r3 = len >> 3
  // v0 = 0
  // v1 = loaded 8 bytes
 9645   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9646   address generate_large_byte_array_inflate() {
 9647     __ align(CodeEntryAlignment);
 9648     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9649     StubCodeMark mark(this, stub_id);
 9650     address entry = __ pc();
 9651     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9652     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9653     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9654 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us combine the two halves into a single store.
 9657     __ ldrd(v2, __ post(src, 8));
 9658     __ sub(octetCounter, octetCounter, 2);
 9659     __ zip1(v1, __ T16B, v1, v0);
 9660     __ zip1(v2, __ T16B, v2, v0);
 9661     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9662     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9663     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9664     __ br(__ LE, LOOP_START);
 9665     __ b(LOOP_PRFM_START);
 9666     __ bind(LOOP_PRFM);
 9667       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9668     __ bind(LOOP_PRFM_START);
 9669       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9670       __ sub(octetCounter, octetCounter, 8);
 9671       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9672       inflate_and_store_2_fp_registers(true, v3, v4);
 9673       inflate_and_store_2_fp_registers(true, v5, v6);
 9674       __ br(__ GT, LOOP_PRFM);
 9675       __ cmp(octetCounter, (u1)8);
 9676       __ br(__ LT, DONE);
 9677     __ bind(LOOP);
 9678       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9679       __ bind(LOOP_START);
 9680       __ sub(octetCounter, octetCounter, 8);
 9681       __ cmp(octetCounter, (u1)8);
 9682       inflate_and_store_2_fp_registers(false, v3, v4);
 9683       inflate_and_store_2_fp_registers(false, v5, v6);
 9684       __ br(__ GE, LOOP);
 9685     __ bind(DONE);
 9686       __ ret(lr);
 9687     return entry;
 9688   }
 9689 
 9690   /**
 9691    *  Arguments:
 9692    *
 9693    *  Input:
 9694    *  c_rarg0   - current state address
 9695    *  c_rarg1   - H key address
 9696    *  c_rarg2   - data address
 9697    *  c_rarg3   - number of blocks
 9698    *
 9699    *  Output:
 9700    *  Updated state at c_rarg0
 9701    */
 9702   address generate_ghash_processBlocks() {
 9703     // Bafflingly, GCM uses little-endian for the byte order, but
 9704     // big-endian for the bit order.  For example, the polynomial 1 is
 9705     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9706     //
 9707     // So, we must either reverse the bytes in each word and do
 9708     // everything big-endian or reverse the bits in each byte and do
 9709     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9710     // the bits in each byte (we have an instruction, RBIT, to do
 9711     // that) and keep the data in little-endian bit order through the
 9712     // calculation, bit-reversing the inputs and outputs.
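    // Mathematically, each iteration of the loop below computes
    //
    //   state <- (state ^ data[i]) * H
    //
    // where the multiplication is a carry-less (polynomial) multiply in
    // GF(2^128), reduced modulo x^128 + x^7 + x^2 + x + 1. ghash_multiply
    // produces the 256-bit carry-less product and ghash_reduce folds it back
    // into 128 bits using the field polynomial loaded into v24 below.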
 9713 
 9714     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9715     StubCodeMark mark(this, stub_id);
 9716     Label polynomial; // local data generated at end of stub
 9717     __ align(CodeEntryAlignment);
 9718     address start = __ pc();
 9719 
 9720     Register state   = c_rarg0;
 9721     Register subkeyH = c_rarg1;
 9722     Register data    = c_rarg2;
 9723     Register blocks  = c_rarg3;
 9724 
 9725     FloatRegister vzr = v30;
 9726     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9727 
 9728     __ adr(rscratch1, polynomial);
 9729     __ ldrq(v24, rscratch1);    // The field polynomial
 9730 
 9731     __ ldrq(v0, Address(state));
 9732     __ ldrq(v1, Address(subkeyH));
 9733 
 9734     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9735     __ rbit(v0, __ T16B, v0);
 9736     __ rev64(v1, __ T16B, v1);
 9737     __ rbit(v1, __ T16B, v1);
 9738 
 9739     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9740     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9741 
 9742     {
 9743       Label L_ghash_loop;
 9744       __ bind(L_ghash_loop);
 9745 
 9746       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9747                                                  // reversing each byte
 9748       __ rbit(v2, __ T16B, v2);
 9749       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9750 
 9751       // Multiply state in v2 by subkey in v1
 9752       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9753                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9754                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9755       // Reduce v7:v5 by the field polynomial
 9756       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9757 
 9758       __ sub(blocks, blocks, 1);
 9759       __ cbnz(blocks, L_ghash_loop);
 9760     }
 9761 
 9762     // The bit-reversed result is at this point in v0
 9763     __ rev64(v0, __ T16B, v0);
 9764     __ rbit(v0, __ T16B, v0);
 9765 
 9766     __ st1(v0, __ T16B, state);
 9767     __ ret(lr);
 9768 
 9769     // bind label and generate local polynomial data
 9770     __ align(wordSize * 2);
 9771     __ bind(polynomial);
 9772     __ emit_int64(0x87);  // The low-order bits of the field
 9773                           // polynomial (i.e. p = z^7+z^2+z+1)
 9774                           // repeated in the low and high parts of a
 9775                           // 128-bit vector
 9776     __ emit_int64(0x87);
 9777 
 9778     return start;
 9779   }
 9780 
 9781   address generate_ghash_processBlocks_wide() {
 9782     address small = generate_ghash_processBlocks();
 9783 
 9784     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9785     StubCodeMark mark(this, stub_id);
 9786     Label polynomial;           // local data generated after stub
 9787     __ align(CodeEntryAlignment);
 9788     address start = __ pc();
 9789 
 9790     Register state   = c_rarg0;
 9791     Register subkeyH = c_rarg1;
 9792     Register data    = c_rarg2;
 9793     Register blocks  = c_rarg3;
 9794 
 9795     const int unroll = 4;
 9796 
 9797     __ cmp(blocks, (unsigned char)(unroll * 2));
 9798     __ br(__ LT, small);
 9799 
 9800     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
 9802       __ sub(sp, sp, 4 * 16);
 9803       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9804       __ sub(sp, sp, 4 * 16);
 9805       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9806     }
 9807 
 9808     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9809 
 9810     if (unroll > 1) {
 9811       // And restore state
 9812       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9813       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9814     }
 9815 
 9816     __ cmp(blocks, (unsigned char)0);
 9817     __ br(__ GT, small);
 9818 
 9819     __ ret(lr);
 9820 
 9821     // bind label and generate polynomial data
 9822     __ align(wordSize * 2);
 9823     __ bind(polynomial);
 9824     __ emit_int64(0x87);  // The low-order bits of the field
 9825                           // polynomial (i.e. p = z^7+z^2+z+1)
 9826                           // repeated in the low and high parts of a
 9827                           // 128-bit vector
 9828     __ emit_int64(0x87);
 9829 
 9830     return start;
 9832   }
 9833 
 9834   void generate_base64_encode_simdround(Register src, Register dst,
 9835         FloatRegister codec, u8 size) {
 9836 
 9837     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9838     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9839     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9840 
 9841     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9842 
 9843     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9844 
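    // For each 3-byte group (b0, b1, b2) the four 6-bit codec indices are, in
    // scalar terms (illustrative only):
    //
    //   ind0 =   b0 >> 2;
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
    //   ind3 =   b2 & 0x3f;
    //
    // The lane-wise shifts and ORs below compute the same values; the ld3
    // above has already de-interleaved the input so that in0/in1/in2 hold the
    // b0/b1/b2 bytes of all groups.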
 9845     __ ushr(ind0, arrangement, in0,  2);
 9846 
 9847     __ ushr(ind1, arrangement, in1,  2);
 9848     __ shl(in0,   arrangement, in0,  6);
 9849     __ orr(ind1,  arrangement, ind1, in0);
 9850     __ ushr(ind1, arrangement, ind1, 2);
 9851 
 9852     __ ushr(ind2, arrangement, in2,  4);
 9853     __ shl(in1,   arrangement, in1,  4);
 9854     __ orr(ind2,  arrangement, in1,  ind2);
 9855     __ ushr(ind2, arrangement, ind2, 2);
 9856 
 9857     __ shl(ind3,  arrangement, in2,  2);
 9858     __ ushr(ind3, arrangement, ind3, 2);
 9859 
 9860     __ tbl(out0,  arrangement, codec,  4, ind0);
 9861     __ tbl(out1,  arrangement, codec,  4, ind1);
 9862     __ tbl(out2,  arrangement, codec,  4, ind2);
 9863     __ tbl(out3,  arrangement, codec,  4, ind3);
 9864 
 9865     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9866   }
 9867 
  /**
 9869    *  Arguments:
 9870    *
 9871    *  Input:
 9872    *  c_rarg0   - src_start
 9873    *  c_rarg1   - src_offset
 9874    *  c_rarg2   - src_length
 9875    *  c_rarg3   - dest_start
 9876    *  c_rarg4   - dest_offset
 9877    *  c_rarg5   - isURL
 9878    *
 9879    */
 9880   address generate_base64_encodeBlock() {
 9881 
 9882     static const char toBase64[64] = {
 9883       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9884       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9885       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9886       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9887       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9888     };
 9889 
 9890     static const char toBase64URL[64] = {
 9891       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9892       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9893       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9894       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9895       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9896     };
 9897 
 9898     __ align(CodeEntryAlignment);
 9899     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9900     StubCodeMark mark(this, stub_id);
 9901     address start = __ pc();
 9902 
 9903     Register src   = c_rarg0;  // source array
 9904     Register soff  = c_rarg1;  // source start offset
 9905     Register send  = c_rarg2;  // source end offset
 9906     Register dst   = c_rarg3;  // dest array
 9907     Register doff  = c_rarg4;  // position for writing to dest array
 9908     Register isURL = c_rarg5;  // Base64 or URL character set
 9909 
 9910     // c_rarg6 and c_rarg7 are free to use as temps
 9911     Register codec  = c_rarg6;
 9912     Register length = c_rarg7;
 9913 
 9914     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9915 
 9916     __ add(src, src, soff);
 9917     __ add(dst, dst, doff);
 9918     __ sub(length, send, soff);
 9919 
 9920     // load the codec base address
 9921     __ lea(codec, ExternalAddress((address) toBase64));
 9922     __ cbz(isURL, ProcessData);
 9923     __ lea(codec, ExternalAddress((address) toBase64URL));
 9924 
 9925     __ BIND(ProcessData);
 9926 
    // too short to form a SIMD loop; fall back to the 3-byte scalar loop
 9928     __ cmp(length, (u1)24);
 9929     __ br(Assembler::LT, Process3B);
 9930 
 9931     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9932 
 9933     __ BIND(Process48B);
 9934     __ cmp(length, (u1)48);
 9935     __ br(Assembler::LT, Process24B);
 9936     generate_base64_encode_simdround(src, dst, v0, 16);
 9937     __ sub(length, length, 48);
 9938     __ b(Process48B);
 9939 
 9940     __ BIND(Process24B);
 9941     __ cmp(length, (u1)24);
 9942     __ br(Assembler::LT, SIMDExit);
 9943     generate_base64_encode_simdround(src, dst, v0, 8);
 9944     __ sub(length, length, 24);
 9945 
 9946     __ BIND(SIMDExit);
 9947     __ cbz(length, Exit);
 9948 
 9949     __ BIND(Process3B);
 9950     //  3 src bytes, 24 bits
 9951     __ ldrb(r10, __ post(src, 1));
 9952     __ ldrb(r11, __ post(src, 1));
 9953     __ ldrb(r12, __ post(src, 1));
 9954     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9955     __ orrw(r12, r12, r11, Assembler::LSL, 8);
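    // r12 now holds the 24-bit group (b0 << 16) | (b1 << 8) | b2; the ubfm/and
    // operations below extract its four 6-bit fields, most significant first,
    // as indices into the codec table.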
 9956     // codec index
 9957     __ ubfmw(r15, r12, 18, 23);
 9958     __ ubfmw(r14, r12, 12, 17);
 9959     __ ubfmw(r13, r12, 6,  11);
 9960     __ andw(r12,  r12, 63);
 9961     // get the code based on the codec
 9962     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9963     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9964     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9965     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9966     __ strb(r15, __ post(dst, 1));
 9967     __ strb(r14, __ post(dst, 1));
 9968     __ strb(r13, __ post(dst, 1));
 9969     __ strb(r12, __ post(dst, 1));
 9970     __ sub(length, length, 3);
 9971     __ cbnz(length, Process3B);
 9972 
 9973     __ BIND(Exit);
 9974     __ ret(lr);
 9975 
 9976     return start;
 9977   }
 9978 
 9979   void generate_base64_decode_simdround(Register src, Register dst,
 9980         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9981 
 9982     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9983     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9984 
 9985     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9986     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9987 
 9988     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9989 
 9990     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9991 
 9992     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 9993 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] maps to index 0 in the higher-half lookup (v27 holds 63).
 9996     __ uqsubv(decH0, __ T16B, in0, v27);
 9997     __ uqsubv(decH1, __ T16B, in1, v27);
 9998     __ uqsubv(decH2, __ T16B, in2, v27);
 9999     __ uqsubv(decH3, __ T16B, in3, v27);
10000 
10001     // lower half lookup
10002     __ tbl(decL0, arrangement, codecL, 4, in0);
10003     __ tbl(decL1, arrangement, codecL, 4, in1);
10004     __ tbl(decL2, arrangement, codecL, 4, in2);
10005     __ tbl(decL3, arrangement, codecL, 4, in3);
10006 
10007     // higher half lookup
10008     __ tbx(decH0, arrangement, codecH, 4, decH0);
10009     __ tbx(decH1, arrangement, codecH, 4, decH1);
10010     __ tbx(decH2, arrangement, codecH, 4, decH2);
10011     __ tbx(decH3, arrangement, codecH, 4, decH3);
10012 
10013     // combine lower and higher
10014     __ orr(decL0, arrangement, decL0, decH0);
10015     __ orr(decL1, arrangement, decL1, decH1);
10016     __ orr(decL2, arrangement, decL2, decH2);
10017     __ orr(decL3, arrangement, decL3, decH3);
10018 
10019     // check illegal inputs, value larger than 63 (maximum of 6 bits)
10020     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10021     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10022     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10023     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10024     __ orr(in0, arrangement, decH0, decH1);
10025     __ orr(in1, arrangement, decH2, decH3);
10026     __ orr(in2, arrangement, in0,   in1);
10027     __ umaxv(in3, arrangement, in2);
10028     __ umov(rscratch2, in3, __ B, 0);
10029 
10030     // get the data to output
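    // Scalar sketch of the recombination, per four decoded 6-bit values
    // d0..d3 (illustrative only; the 8-bit lanes drop the shifted-out bits,
    // which the explicit masks express here):
    //
    //   out0 = ( d0         << 2) | (d1 >> 4);
    //   out1 = ((d1 & 0x0f) << 4) | (d2 >> 2);
    //   out2 = ((d2 & 0x03) << 6) |  d3;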
10031     __ shl(out0,  arrangement, decL0, 2);
10032     __ ushr(out1, arrangement, decL1, 4);
10033     __ orr(out0,  arrangement, out0,  out1);
10034     __ shl(out1,  arrangement, decL1, 4);
10035     __ ushr(out2, arrangement, decL2, 2);
10036     __ orr(out1,  arrangement, out1,  out2);
10037     __ shl(out2,  arrangement, decL2, 6);
10038     __ orr(out2,  arrangement, out2,  decL3);
10039 
10040     __ cbz(rscratch2, NoIllegalData);
10041 
10042     // handle illegal input
10043     __ umov(r10, in2, __ D, 0);
10044     if (size == 16) {
10045       __ cbnz(r10, ErrorInLowerHalf);
10046 
10047       // illegal input is in higher half, store the lower half now.
10048       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10049 
10050       __ umov(r10, in2,  __ D, 1);
10051       __ umov(r11, out0, __ D, 1);
10052       __ umov(r12, out1, __ D, 1);
10053       __ umov(r13, out2, __ D, 1);
10054       __ b(StoreLegalData);
10055 
10056       __ BIND(ErrorInLowerHalf);
10057     }
10058     __ umov(r11, out0, __ D, 0);
10059     __ umov(r12, out1, __ D, 0);
10060     __ umov(r13, out2, __ D, 0);
10061 
10062     __ BIND(StoreLegalData);
10063     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10064     __ strb(r11, __ post(dst, 1));
10065     __ strb(r12, __ post(dst, 1));
10066     __ strb(r13, __ post(dst, 1));
10067     __ lsr(r10, r10, 8);
10068     __ lsr(r11, r11, 8);
10069     __ lsr(r12, r12, 8);
10070     __ lsr(r13, r13, 8);
10071     __ b(StoreLegalData);
10072 
10073     __ BIND(NoIllegalData);
10074     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10075   }
10076 
10077 
  /**
10079    *  Arguments:
10080    *
10081    *  Input:
10082    *  c_rarg0   - src_start
10083    *  c_rarg1   - src_offset
10084    *  c_rarg2   - src_length
10085    *  c_rarg3   - dest_start
10086    *  c_rarg4   - dest_offset
10087    *  c_rarg5   - isURL
10088    *  c_rarg6   - isMIME
10089    *
10090    */
10091   address generate_base64_decodeBlock() {
10092 
10093     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10094     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10095     // titled "Base64 decoding".
10096 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
    // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
    // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10100     static const uint8_t fromBase64ForNoSIMD[256] = {
10101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10102       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10103       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10104        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10105       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10106        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10107       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10108        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10109       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10110       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10111       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10112       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117     };
10118 
10119     static const uint8_t fromBase64URLForNoSIMD[256] = {
10120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10123        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10124       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10125        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10126       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10127        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10128       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10129       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10130       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10133       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10134       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10135       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136     };
10137 
    // A legal base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine them to get the decoded data. The first
    // table vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The second table vector lookup uses tbx: out-of-range
    // indices are left unchanged in the destination. Input values [64..126]
    // are mapped to indices [65, 127] in the second lookup. The value at index
    // 64 is set to 0, so we know the decoded data was already obtained by the
    // first lookup.
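    // Worked example: for the input byte 'A' (0x41 = 65) the low-table tbl
    // lookup is out of range (>= 64) and yields 0, while uqsub(65, 63) = 2
    // selects entry 2 of the high table, which holds 0 -- the decoded value
    // of 'A'. For '0' (0x30 = 48) the low table yields 52 directly and the
    // high table contributes 0, so OR-ing the two lookups gives the decoded
    // value in either case.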
10145     static const uint8_t fromBase64ForSIMD[128] = {
10146       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10147       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10148       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10149        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10150         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10151        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10152       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10153        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10154     };
10155 
10156     static const uint8_t fromBase64URLForSIMD[128] = {
10157       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10158       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10159       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10160        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10161         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10162        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10163        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10164        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10165     };
10166 
10167     __ align(CodeEntryAlignment);
10168     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10169     StubCodeMark mark(this, stub_id);
10170     address start = __ pc();
10171 
10172     Register src    = c_rarg0;  // source array
10173     Register soff   = c_rarg1;  // source start offset
10174     Register send   = c_rarg2;  // source end offset
10175     Register dst    = c_rarg3;  // dest array
10176     Register doff   = c_rarg4;  // position for writing to dest array
10177     Register isURL  = c_rarg5;  // Base64 or URL character set
10178     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10179 
10180     Register length = send;    // reuse send as length of source data to process
10181 
10182     Register simd_codec   = c_rarg6;
10183     Register nosimd_codec = c_rarg7;
10184 
10185     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10186 
10187     __ enter();
10188 
10189     __ add(src, src, soff);
10190     __ add(dst, dst, doff);
10191 
10192     __ mov(doff, dst);
10193 
10194     __ sub(length, send, soff);
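    // round the length down to a multiple of 4 (a whole base64 quantum):
    // bfm with zr clears the two low bits, i.e. length &= ~3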
10195     __ bfm(length, zr, 0, 1);
10196 
10197     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10198     __ cbz(isURL, ProcessData);
10199     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10200 
10201     __ BIND(ProcessData);
10202     __ mov(rscratch1, length);
10203     __ cmp(length, (u1)144); // 144 = 80 + 64
10204     __ br(Assembler::LT, Process4B);
10205 
10206     // In the MIME case, the line length cannot be more than 76
10207     // bytes (see RFC 2045). This is too short a block for SIMD
10208     // to be worthwhile, so we use non-SIMD here.
10209     __ movw(rscratch1, 79);
10210 
10211     __ BIND(Process4B);
10212     __ ldrw(r14, __ post(src, 4));
10213     __ ubfxw(r10, r14, 0,  8);
10214     __ ubfxw(r11, r14, 8,  8);
10215     __ ubfxw(r12, r14, 16, 8);
10216     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10218     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10219     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10220     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10221     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10222     // error detection, 255u indicates an illegal input
10223     __ orrw(r14, r10, r11);
10224     __ orrw(r15, r12, r13);
10225     __ orrw(r14, r14, r15);
10226     __ tbnz(r14, 7, Exit);
10227     // recover the data
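    // r10..r13 hold the decoded 6-bit values d0..d3. The output bytes are
    //   o0 = (d0 << 2) | (d1 >> 4)
    //   o1 = ((d1 & 0xf) << 4) | (d2 >> 2)
    //   o2 = ((d2 & 0x3) << 6) | d3
    // r14 is assembled as (o0 << 8) | o1 and byte-swapped by rev16w so the
    // strh below stores o0 then o1 in memory order; o2 is built in r13.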
10228     __ lslw(r14, r10, 10);
10229     __ bfiw(r14, r11, 4, 6);
10230     __ bfmw(r14, r12, 2, 5);
10231     __ rev16w(r14, r14);
10232     __ bfiw(r13, r12, 6, 2);
10233     __ strh(r14, __ post(dst, 2));
10234     __ strb(r13, __ post(dst, 1));
10235     // non-simd loop
10236     __ subsw(rscratch1, rscratch1, 4);
10237     __ br(Assembler::GT, Process4B);
10238 
    // if we are exiting from the 80-byte pre-processing pass (rscratch1 started
    // at 79), rscratch1 == -1 here; otherwise (pure non-SIMD path), rscratch1 == 0.
10241     __ cbzw(rscratch1, Exit);
10242     __ sub(length, length, 80);
10243 
10244     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10245     __ cbz(isURL, SIMDEnter);
10246     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10247 
10248     __ BIND(SIMDEnter);
10249     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10250     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
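    // replicate the constant 63 into every byte lane of v27; it is used both
    // for the saturating subtract (high-table index) and the illegal-input check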
10251     __ mov(rscratch1, 63);
10252     __ dup(v27, __ T16B, rscratch1);
10253 
10254     __ BIND(Process64B);
10255     __ cmp(length, (u1)64);
10256     __ br(Assembler::LT, Process32B);
10257     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10258     __ sub(length, length, 64);
10259     __ b(Process64B);
10260 
10261     __ BIND(Process32B);
10262     __ cmp(length, (u1)32);
10263     __ br(Assembler::LT, SIMDExit);
10264     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10265     __ sub(length, length, 32);
10266     __ b(Process32B);
10267 
10268     __ BIND(SIMDExit);
10269     __ cbz(length, Exit);
10270     __ movw(rscratch1, length);
10271     __ b(Process4B);
10272 
10273     __ BIND(Exit);
10274     __ sub(c_rarg0, dst, doff);
10275 
10276     __ leave();
10277     __ ret(lr);
10278 
10279     return start;
10280   }
10281 
10282   // Support for spin waits.
10283   address generate_spin_wait() {
10284     __ align(CodeEntryAlignment);
10285     StubId stub_id = StubId::stubgen_spin_wait_id;
10286     StubCodeMark mark(this, stub_id);
10287     address start = __ pc();
10288 
10289     __ spin_wait();
10290     __ ret(lr);
10291 
10292     return start;
10293   }
10294 
10295   void generate_lookup_secondary_supers_table_stub() {
10296     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10297     StubCodeMark mark(this, stub_id);
10298 
10299     const Register
10300       r_super_klass  = r0,
10301       r_array_base   = r1,
10302       r_array_length = r2,
10303       r_array_index  = r3,
10304       r_sub_klass    = r4,
10305       r_bitmap       = rscratch2,
10306       result         = r5;
10307     const FloatRegister
10308       vtemp          = v0;
10309 
10310     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10311       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10312       Label L_success;
10313       __ enter();
10314       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10315                                              r_array_base, r_array_length, r_array_index,
10316                                              vtemp, result, slot,
10317                                              /*stub_is_near*/true);
10318       __ leave();
10319       __ ret(lr);
10320     }
10321   }
10322 
10323   // Slow path implementation for UseSecondarySupersTable.
10324   address generate_lookup_secondary_supers_table_slow_path_stub() {
10325     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10326     StubCodeMark mark(this, stub_id);
10327 
10328     address start = __ pc();
10329     const Register
10330       r_super_klass  = r0,        // argument
10331       r_array_base   = r1,        // argument
10332       temp1          = r2,        // temp
10333       r_array_index  = r3,        // argument
10334       r_bitmap       = rscratch2, // argument
10335       result         = r5;        // argument
10336 
10337     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10338     __ ret(lr);
10339 
10340     return start;
10341   }
10342 
10343 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10344 
10345   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10346   //
10347   // If LSE is in use, generate LSE versions of all the stubs. The
10348   // non-LSE versions are in atomic_aarch64.S.
10349 
10350   // class AtomicStubMark records the entry point of a stub and the
10351   // stub pointer which will point to it. The stub pointer is set to
10352   // the entry point when ~AtomicStubMark() is called, which must be
10353   // after ICache::invalidate_range. This ensures safe publication of
10354   // the generated code.
10355   class AtomicStubMark {
10356     address _entry_point;
10357     aarch64_atomic_stub_t *_stub;
10358     MacroAssembler *_masm;
10359   public:
10360     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10361       _masm = masm;
10362       __ align(32);
10363       _entry_point = __ pc();
10364       _stub = stub;
10365     }
10366     ~AtomicStubMark() {
10367       *_stub = (aarch64_atomic_stub_t)_entry_point;
10368     }
10369   };
10370 
10371   // NB: For memory_order_conservative we need a trailing membar after
10372   // LSE atomic operations but not a leading membar.
10373   //
10374   // We don't need a leading membar because a clause in the Arm ARM
10375   // says:
10376   //
10377   //   Barrier-ordered-before
10378   //
10379   //   Barrier instructions order prior Memory effects before subsequent
10380   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10384   //   instruction with both Acquire and Release semantics.
10385   //
10386   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10387   // and Release semantics, therefore we don't need a leading
10388   // barrier. However, there is no corresponding Barrier-ordered-after
10389   // relationship, therefore we need a trailing membar to prevent a
10390   // later store or load from being reordered with the store in an
10391   // atomic instruction.
10392   //
10393   // This was checked by using the herd7 consistency model simulator
10394   // (http://diy.inria.fr/) with this test case:
10395   //
10396   // AArch64 LseCas
10397   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10398   // P0 | P1;
10399   // LDR W4, [X2] | MOV W3, #0;
10400   // DMB LD       | MOV W4, #1;
10401   // LDR W3, [X1] | CASAL W3, W4, [X1];
10402   //              | DMB ISH;
10403   //              | STR W4, [X2];
10404   // exists
10405   // (0:X3=0 /\ 0:X4=1)
10406   //
10407   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10408   // with the store to x in P1. Without the DMB in P1 this may happen.
10409   //
10410   // At the time of writing we don't know of any AArch64 hardware that
10411   // reorders stores in this way, but the Reference Manual permits it.
10412 
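  // gen_cas_entry below emits a stub with compare-and-swap semantics; as a
  // rough sketch (the C signature here is illustrative only -- the real entry
  // points follow the aarch64_atomic_stub_t convention):
  //
  //   T cas(T* ptr, T compare_val, T exchange_val) {
  //     T prev = *ptr;
  //     if (prev == compare_val) *ptr = exchange_val;  // done atomically by CAS/CASAL
  //     return prev;
  //   }
  //
  // with, for memory_order_conservative, a trailing full barrier as explained
  // above.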
10413   void gen_cas_entry(Assembler::operand_size size,
10414                      atomic_memory_order order) {
10415     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10416       exchange_val = c_rarg2;
10417     bool acquire, release;
10418     switch (order) {
10419       case memory_order_relaxed:
10420         acquire = false;
10421         release = false;
10422         break;
10423       case memory_order_release:
10424         acquire = false;
10425         release = true;
10426         break;
10427       default:
10428         acquire = true;
10429         release = true;
10430         break;
10431     }
10432     __ mov(prev, compare_val);
10433     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10434     if (order == memory_order_conservative) {
10435       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10436     }
10437     if (size == Assembler::xword) {
10438       __ mov(r0, prev);
10439     } else {
10440       __ movw(r0, prev);
10441     }
10442     __ ret(lr);
10443   }
10444 
10445   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10446     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10447     // If not relaxed, then default to conservative.  Relaxed is the only
10448     // case we use enough to be worth specializing.
10449     if (order == memory_order_relaxed) {
10450       __ ldadd(size, incr, prev, addr);
10451     } else {
10452       __ ldaddal(size, incr, prev, addr);
10453       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10454     }
10455     if (size == Assembler::xword) {
10456       __ mov(r0, prev);
10457     } else {
10458       __ movw(r0, prev);
10459     }
10460     __ ret(lr);
10461   }
10462 
10463   void gen_swpal_entry(Assembler::operand_size size) {
10464     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10465     __ swpal(size, incr, prev, addr);
10466     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10467     if (size == Assembler::xword) {
10468       __ mov(r0, prev);
10469     } else {
10470       __ movw(r0, prev);
10471     }
10472     __ ret(lr);
10473   }
10474 
10475   void generate_atomic_entry_points() {
10476     if (! UseLSE) {
10477       return;
10478     }
10479     __ align(CodeEntryAlignment);
10480     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10481     StubCodeMark mark(this, stub_id);
10482     address first_entry = __ pc();
10483 
10484     // ADD, memory_order_conservative
10485     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10486     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10487     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10488     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10489 
10490     // ADD, memory_order_relaxed
10491     AtomicStubMark mark_fetch_add_4_relaxed
10492       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10493     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10494     AtomicStubMark mark_fetch_add_8_relaxed
10495       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10496     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10497 
10498     // XCHG, memory_order_conservative
10499     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10500     gen_swpal_entry(Assembler::word);
10501     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10502     gen_swpal_entry(Assembler::xword);
10503 
10504     // CAS, memory_order_conservative
10505     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10506     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10507     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10508     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10509     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10510     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10511 
10512     // CAS, memory_order_relaxed
10513     AtomicStubMark mark_cmpxchg_1_relaxed
10514       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10515     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10516     AtomicStubMark mark_cmpxchg_4_relaxed
10517       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10518     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10519     AtomicStubMark mark_cmpxchg_8_relaxed
10520       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10521     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10522 
10523     AtomicStubMark mark_cmpxchg_4_release
10524       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10525     gen_cas_entry(MacroAssembler::word, memory_order_release);
10526     AtomicStubMark mark_cmpxchg_8_release
10527       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10528     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10529 
10530     AtomicStubMark mark_cmpxchg_4_seq_cst
10531       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10532     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10533     AtomicStubMark mark_cmpxchg_8_seq_cst
10534       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10535     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10536 
10537     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10538   }
10539 #endif // LINUX
10540 
10541   address generate_cont_thaw(Continuation::thaw_kind kind) {
10542     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10543     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10544 
10545     address start = __ pc();
10546 
10547     if (return_barrier) {
10548       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10549       __ mov(sp, rscratch1);
10550     }
10551     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10552 
10553     if (return_barrier) {
10554       // preserve possible return value from a method returning to the return barrier
10555       __ fmovd(rscratch1, v0);
10556       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10557     }
10558 
10559     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10560     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10561     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10562 
10563     if (return_barrier) {
10564       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10565       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10566       __ fmovd(v0, rscratch1);
10567     }
10568     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10569 
10571     Label thaw_success;
10572     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10573     __ cbnz(rscratch2, thaw_success);
10574     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10575     __ br(rscratch1);
10576     __ bind(thaw_success);
10577 
10578     // make room for the thawed frames
10579     __ sub(rscratch1, sp, rscratch2);
10580     __ andr(rscratch1, rscratch1, -16); // align
10581     __ mov(sp, rscratch1);
10582 
10583     if (return_barrier) {
10584       // save original return value -- again
10585       __ fmovd(rscratch1, v0);
10586       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10587     }
10588 
10589     // If we want, we can templatize thaw by kind, and have three different entries
10590     __ movw(c_rarg1, (uint32_t)kind);
10591 
10592     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10593     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10594 
10595     if (return_barrier) {
10596       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10597       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10598       __ fmovd(v0, rscratch1);
10599     } else {
10600       __ mov(r0, zr); // return 0 (success) from doYield
10601     }
10602 
    // we're now on the yield frame (which is at an address above us, because the sp has been pushed down)
10604     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10605     __ mov(rfp, sp);
10606 
10607     if (return_barrier_exception) {
10608       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10609       __ authenticate_return_address(c_rarg1);
10610       __ verify_oop(r0);
10611       // save return value containing the exception oop in callee-saved R19
10612       __ mov(r19, r0);
10613 
10614       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10615 
10616       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10617       // __ reinitialize_ptrue();
10618 
10619       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10620 
10621       __ mov(r1, r0); // the exception handler
10622       __ mov(r0, r19); // restore return value containing the exception oop
10623       __ verify_oop(r0);
10624 
10625       __ leave();
10626       __ mov(r3, lr);
10627       __ br(r1); // the exception handler
10628     } else {
10629       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10630       __ leave();
10631       __ ret(lr);
10632     }
10633 
10634     return start;
10635   }
10636 
10637   address generate_cont_thaw() {
10638     if (!Continuations::enabled()) return nullptr;
10639 
10640     StubId stub_id = StubId::stubgen_cont_thaw_id;
10641     StubCodeMark mark(this, stub_id);
10642     address start = __ pc();
10643     generate_cont_thaw(Continuation::thaw_top);
10644     return start;
10645   }
10646 
10647   address generate_cont_returnBarrier() {
10648     if (!Continuations::enabled()) return nullptr;
10649 
10650     // TODO: will probably need multiple return barriers depending on return type
10651     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10652     StubCodeMark mark(this, stub_id);
10653     address start = __ pc();
10654 
10655     generate_cont_thaw(Continuation::thaw_return_barrier);
10656 
10657     return start;
10658   }
10659 
10660   address generate_cont_returnBarrier_exception() {
10661     if (!Continuations::enabled()) return nullptr;
10662 
10663     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10664     StubCodeMark mark(this, stub_id);
10665     address start = __ pc();
10666 
10667     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10668 
10669     return start;
10670   }
10671 
10672   address generate_cont_preempt_stub() {
10673     if (!Continuations::enabled()) return nullptr;
10674     StubId stub_id = StubId::stubgen_cont_preempt_id;
10675     StubCodeMark mark(this, stub_id);
10676     address start = __ pc();
10677 
10678     __ reset_last_Java_frame(true);
10679 
10680     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10681     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10682     __ mov(sp, rscratch2);
10683 
10684     Label preemption_cancelled;
10685     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10686     __ cbnz(rscratch1, preemption_cancelled);
10687 
10688     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10689     SharedRuntime::continuation_enter_cleanup(_masm);
10690     __ leave();
10691     __ ret(lr);
10692 
10693     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10694     __ bind(preemption_cancelled);
10695     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10696     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10697     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10698     __ ldr(rscratch1, Address(rscratch1));
10699     __ br(rscratch1);
10700 
10701     return start;
10702   }
10703 
10704   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10705   // are represented as long[5], with BITS_PER_LIMB = 26.
10706   // Pack five 26-bit limbs into three 64-bit registers.
10707   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10708     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10709     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10710     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10711     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10712 
10713     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10714     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10715     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10716     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10717 
10718     if (dest2->is_valid()) {
10719       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10720     } else {
10721 #ifdef ASSERT
10722       Label OK;
10723       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10724       __ br(__ EQ, OK);
10725       __ stop("high bits of Poly1305 integer should be zero");
10726       __ should_not_reach_here();
10727       __ bind(OK);
10728 #endif
10729     }
10730   }
10731 
10732   // As above, but return only a 128-bit integer, packed into two
10733   // 64-bit registers.
10734   void pack_26(Register dest0, Register dest1, Register src) {
10735     pack_26(dest0, dest1, noreg, src);
10736   }
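
        // In C, approximately (a sketch of the packing above; limb[] is the
        // long[5] source with 26 significant bits per element):
        //
        //   dest0 = limb[0] | (limb[1] << 26) | (limb[2] << 52);
        //   dest1 = (limb[2] >> 12) | (limb[3] << 14) | (limb[4] << 40);
        //   dest2 = limb[4] >> 24;   // at most 2 bits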
10737 
10738   // Multiply and multiply-accumulate unsigned 64-bit registers.
10739   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10740     __ mul(prod_lo, n, m);
10741     __ umulh(prod_hi, n, m);
10742   }
10743   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10744     wide_mul(rscratch1, rscratch2, n, m);
10745     __ adds(sum_lo, sum_lo, rscratch1);
10746     __ adc(sum_hi, sum_hi, rscratch2);
10747   }
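
        // In C, approximately (a sketch): wide_mul forms the full 128-bit
        // product prod_hi:prod_lo, and wide_madd adds such a product into
        // the 128-bit pair sum_hi:sum_lo:
        //
        //   unsigned __int128 p = (unsigned __int128)n * m;               // wide_mul
        //   prod_lo = (julong)p;  prod_hi = (julong)(p >> 64);
        //   unsigned __int128 s = ((unsigned __int128)sum_hi << 64) | sum_lo;
        //   s += p;                                                       // wide_madd
        //   sum_lo = (julong)s;  sum_hi = (julong)(s >> 64);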
10748 
10749   // Poly1305, RFC 7539
10750 
10751   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10752   // description of the tricks used to simplify and accelerate this
10753   // computation.
10754 
10755   address generate_poly1305_processBlocks() {
10756     __ align(CodeEntryAlignment);
10757     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10758     StubCodeMark mark(this, stub_id);
10759     address start = __ pc();
10760     Label here;
10761     __ enter();
10762     RegSet callee_saved = RegSet::range(r19, r28);
10763     __ push(callee_saved, sp);
10764 
10765     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10766 
10767     // Arguments
10768     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10769 
10770     // R_n is the 128-bit randomly-generated key, packed into two
10771     // registers.  The caller passes this key to us as long[5], with
10772     // BITS_PER_LIMB = 26.
10773     const Register R_0 = *++regs, R_1 = *++regs;
10774     pack_26(R_0, R_1, r_start);
10775 
10776     // RR_n is (R_n >> 2) * 5
10777     const Register RR_0 = *++regs, RR_1 = *++regs;
10778     __ lsr(RR_0, R_0, 2);
10779     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10780     __ lsr(RR_1, R_1, 2);
10781     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10782 
10783     // U_n is the current checksum
10784     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10785     pack_26(U_0, U_1, U_2, acc_start);
10786 
10787     static constexpr int BLOCK_LENGTH = 16;
10788     Label DONE, LOOP;
10789 
10790     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10791     __ br(Assembler::LT, DONE); {
10792       __ bind(LOOP);
10793 
10794       // S_n is to be the sum of U_n and the next block of data
10795       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10796       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10797       __ adds(S_0, U_0, S_0);
10798       __ adcs(S_1, U_1, S_1);
10799       __ adc(S_2, U_2, zr);
10800       __ add(S_2, S_2, 1);
10801 
10802       const Register U_0HI = *++regs, U_1HI = *++regs;
10803 
10804       // NB: this logic depends on some of the special properties of
10805       // Poly1305 keys. In particular, because we know that the top
10806       // four bits of R_0 and R_1 are zero, we can add together
10807       // partial products without any risk of needing to propagate a
10808       // carry out.
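            // (Rough bound, as an aside: R_0 and R_1 are below 2^60 and RR_0,
            // RR_1 below 2^61, so each partial product's high half fits in 61
            // bits; the two or three high halves accumulated into U_0HI/U_1HI,
            // plus the carries from the low halves, stay well below 2^64.)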
10809       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10810       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10811       __ andr(U_2, R_0, 3);
10812       __ mul(U_2, S_2, U_2);
10813 
10814       // Recycle registers S_0, S_1, S_2
10815       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10816 
10817       // Partial reduction mod 2**130 - 5
10818       __ adds(U_1, U_0HI, U_1);
10819       __ adc(U_2, U_1HI, U_2);
10820       // Sum now in U_2:U_1:U_0.
10821       // Dead: U_0HI, U_1HI.
10822       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10823 
10824       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10825 
10826       // First, U_2:U_1:U_0 += (U_2 >> 2)
10827       __ lsr(rscratch1, U_2, 2);
10828       __ andr(U_2, U_2, (u8)3);
10829       __ adds(U_0, U_0, rscratch1);
10830       __ adcs(U_1, U_1, zr);
10831       __ adc(U_2, U_2, zr);
10832       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10833       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10834       __ adcs(U_1, U_1, zr);
10835       __ adc(U_2, U_2, zr);
10836 
10837       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10838       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10839       __ br(~ Assembler::LT, LOOP);
10840     }
10841 
10842     // Further reduce modulo 2^130 - 5
10843     __ lsr(rscratch1, U_2, 2);
10844     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10845     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10846     __ adcs(U_1, U_1, zr);
10847     __ andr(U_2, U_2, (u1)3);
10848     __ adc(U_2, U_2, zr);
10849 
10850     // Unpack the sum into five 26-bit limbs and write to memory.
10851     __ ubfiz(rscratch1, U_0, 0, 26);
10852     __ ubfx(rscratch2, U_0, 26, 26);
10853     __ stp(rscratch1, rscratch2, Address(acc_start));
10854     __ ubfx(rscratch1, U_0, 52, 12);
10855     __ bfi(rscratch1, U_1, 12, 14);
10856     __ ubfx(rscratch2, U_1, 14, 26);
10857     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10858     __ ubfx(rscratch1, U_1, 40, 24);
10859     __ bfi(rscratch1, U_2, 24, 3);
10860     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10861 
10862     __ bind(DONE);
10863     __ pop(callee_saved, sp);
10864     __ leave();
10865     __ ret(lr);
10866 
10867     return start;
10868   }
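
        // In C, approximately (a condensed sketch of the block loop above;
        // u128 stands for unsigned __int128, in points at the input as a
        // julong array, S_*/U_*/R_*/RR_* mirror the register names, and the
        // final write-back of the five 26-bit limbs is omitted):
        //
        //   while (length >= BLOCK_LENGTH) {
        //     // S = U + 16-byte block, plus the 2^128 padding bit
        //     u128 s = (u128)U_0 + in[0];              S_0 = (julong)s;
        //     s = (s >> 64) + U_1 + in[1];             S_1 = (julong)s;
        //     S_2 = U_2 + (julong)(s >> 64) + 1;
        //     // multiply by R, using 2^130 == 5 (mod 2^130 - 5) via RR_n = (R_n >> 2) * 5
        //     u128 p0 = (u128)S_0 * R_0 + (u128)S_1 * RR_1 + (u128)S_2 * RR_0;
        //     u128 p1 = (u128)S_0 * R_1 + (u128)S_1 * R_0  + (u128)S_2 * RR_1;
        //     julong p2 = S_2 * (R_0 & 3);
        //     U_0 = (julong)p0;
        //     u128 t = (p0 >> 64) + (julong)p1;        U_1 = (julong)t;
        //     U_2 = (julong)(p1 >> 64) + p2 + (julong)(t >> 64);
        //     // partial reduction: U += (U_2 >> 2) * 5, keeping only 2 bits in U_2
        //     julong c = U_2 >> 2;  U_2 &= 3;
        //     u128 r = (u128)U_0 + (u128)c * 5;        U_0 = (julong)r;
        //     r = (r >> 64) + U_1;                     U_1 = (julong)r;
        //     U_2 += (julong)(r >> 64);
        //     in += 2;  length -= BLOCK_LENGTH;
        //   }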
10869 
10870   // exception handler for upcall stubs
10871   address generate_upcall_stub_exception_handler() {
10872     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10873     StubCodeMark mark(this, stub_id);
10874     address start = __ pc();
10875 
10876     // Native caller has no idea how to handle exceptions,
10877     // so we just crash here. It is up to the callee to catch exceptions.
10878     __ verify_oop(r0);
10879     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10880     __ blr(rscratch1);
10881     __ should_not_reach_here();
10882 
10883     return start;
10884   }
10885 
10886   // load Method* target of MethodHandle
10887   // j_rarg0 = jobject receiver
10888   // rmethod = result
10889   address generate_upcall_stub_load_target() {
10890     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10891     StubCodeMark mark(this, stub_id);
10892     address start = __ pc();
10893 
10894     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10895     // Load target method from receiver
10896     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10897     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10898     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10899     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10900                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10901                       noreg, noreg);
10902     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10903 
10904     __ ret(lr);
10905 
10906     return start;
10907   }
10908 
10909 #undef __
10910 #define __ masm->
10911 
10912   class MontgomeryMultiplyGenerator : public MacroAssembler {
10913 
10914     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10915       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10916 
10917     RegSet _toSave;
10918     bool _squaring;
10919 
10920   public:
10921     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10922       : MacroAssembler(as->code()), _squaring(squaring) {
10923 
10924       // Register allocation
10925 
10926       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10927       Pa_base = *regs;       // Argument registers
10928       if (squaring)
10929         Pb_base = Pa_base;
10930       else
10931         Pb_base = *++regs;
10932       Pn_base = *++regs;
10933       Rlen = *++regs;
10934       inv = *++regs;
10935       Pm_base = *++regs;
10936 
10937                           // Working registers:
10938       Ra =  *++regs;        // The current digit of a, b, n, and m.
10939       Rb =  *++regs;
10940       Rm =  *++regs;
10941       Rn =  *++regs;
10942 
10943       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10944       Pb =  *++regs;
10945       Pm =  *++regs;
10946       Pn =  *++regs;
10947 
10948       t0 =  *++regs;        // Three registers which form a
10949       t1 =  *++regs;        // triple-precision accumulator.
10950       t2 =  *++regs;
10951 
10952       Ri =  *++regs;        // Inner and outer loop indexes.
10953       Rj =  *++regs;
10954 
10955       Rhi_ab = *++regs;     // Product registers: low and high parts
10956       Rlo_ab = *++regs;     // of a*b and m*n.
10957       Rhi_mn = *++regs;
10958       Rlo_mn = *++regs;
10959 
10960       // r19 and up are callee-saved.
10961       _toSave = RegSet::range(r19, *regs) + Pm_base;
10962     }
10963 
10964   private:
10965     void save_regs() {
10966       push(_toSave, sp);
10967     }
10968 
10969     void restore_regs() {
10970       pop(_toSave, sp);
10971     }
10972 
10973     template <typename T>
10974     void unroll_2(Register count, T block) {
10975       Label loop, end, odd;
10976       tbnz(count, 0, odd);
10977       cbz(count, end);
10978       align(16);
10979       bind(loop);
10980       (this->*block)();
10981       bind(odd);
10982       (this->*block)();
10983       subs(count, count, 2);
10984       br(Assembler::GT, loop);
10985       bind(end);
10986     }
10987 
10988     template <typename T>
10989     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10990       Label loop, end, odd;
10991       tbnz(count, 0, odd);
10992       cbz(count, end);
10993       align(16);
10994       bind(loop);
10995       (this->*block)(d, s, tmp);
10996       bind(odd);
10997       (this->*block)(d, s, tmp);
10998       subs(count, count, 2);
10999       br(Assembler::GT, loop);
11000       bind(end);
11001     }
11002 
11003     void pre1(RegisterOrConstant i) {
11004       block_comment("pre1");
11005       // Pa = Pa_base;
11006       // Pb = Pb_base + i;
11007       // Pm = Pm_base;
11008       // Pn = Pn_base + i;
11009       // Ra = *Pa;
11010       // Rb = *Pb;
11011       // Rm = *Pm;
11012       // Rn = *Pn;
11013       ldr(Ra, Address(Pa_base));
11014       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11015       ldr(Rm, Address(Pm_base));
11016       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11017       lea(Pa, Address(Pa_base));
11018       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11019       lea(Pm, Address(Pm_base));
11020       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11021 
11022       // Zero the m*n result.
11023       mov(Rhi_mn, zr);
11024       mov(Rlo_mn, zr);
11025     }
11026 
11027     // The core multiply-accumulate step of a Montgomery
11028     // multiplication.  The idea is to schedule operations as a
11029     // pipeline so that instructions with long latencies (loads and
11030     // multiplies) have time to complete before their results are
11031     // used.  This most benefits in-order implementations of the
11032     // architecture but out-of-order ones also benefit.
11033     void step() {
11034       block_comment("step");
11035       // MACC(Ra, Rb, t0, t1, t2);
11036       // Ra = *++Pa;
11037       // Rb = *--Pb;
11038       umulh(Rhi_ab, Ra, Rb);
11039       mul(Rlo_ab, Ra, Rb);
11040       ldr(Ra, pre(Pa, wordSize));
11041       ldr(Rb, pre(Pb, -wordSize));
11042       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11043                                        // previous iteration.
11044       // MACC(Rm, Rn, t0, t1, t2);
11045       // Rm = *++Pm;
11046       // Rn = *--Pn;
11047       umulh(Rhi_mn, Rm, Rn);
11048       mul(Rlo_mn, Rm, Rn);
11049       ldr(Rm, pre(Pm, wordSize));
11050       ldr(Rn, pre(Pn, -wordSize));
11051       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11052     }
11053 
11054     void post1() {
11055       block_comment("post1");
11056 
11057       // MACC(Ra, Rb, t0, t1, t2);
11058       // Ra = *++Pa;
11059       // Rb = *--Pb;
11060       umulh(Rhi_ab, Ra, Rb);
11061       mul(Rlo_ab, Ra, Rb);
11062       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11063       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11064 
11065       // *Pm = Rm = t0 * inv;
11066       mul(Rm, t0, inv);
11067       str(Rm, Address(Pm));
11068 
11069       // MACC(Rm, Rn, t0, t1, t2);
11070       // t0 = t1; t1 = t2; t2 = 0;
11071       umulh(Rhi_mn, Rm, Rn);
11072 
11073 #ifndef PRODUCT
11074       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11075       {
11076         mul(Rlo_mn, Rm, Rn);
11077         add(Rlo_mn, t0, Rlo_mn);
11078         Label ok;
11079         cbz(Rlo_mn, ok); {
11080           stop("broken Montgomery multiply");
11081         } bind(ok);
11082       }
11083 #endif
11084       // We have very carefully set things up so that
11085       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11086       // the lower half of Rm * Rn because we know the result already:
11087       // it must be -t0.  t0 + (-t0) must generate a carry iff
11088       // t0 != 0.  So, rather than do a mul and an adds we just set
11089       // the carry flag iff t0 is nonzero.
11090       //
11091       // mul(Rlo_mn, Rm, Rn);
11092       // adds(zr, t0, Rlo_mn);
11093       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11094       adcs(t0, t1, Rhi_mn);
11095       adc(t1, t2, zr);
11096       mov(t2, zr);
11097     }
11098 
11099     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11100       block_comment("pre2");
11101       // Pa = Pa_base + i-len;
11102       // Pb = Pb_base + len;
11103       // Pm = Pm_base + i-len;
11104       // Pn = Pn_base + len;
11105 
11106       if (i.is_register()) {
11107         sub(Rj, i.as_register(), len);
11108       } else {
11109         mov(Rj, i.as_constant());
11110         sub(Rj, Rj, len);
11111       }
11112       // Rj == i-len
11113 
11114       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11115       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11116       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11117       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11118 
11119       // Ra = *++Pa;
11120       // Rb = *--Pb;
11121       // Rm = *++Pm;
11122       // Rn = *--Pn;
11123       ldr(Ra, pre(Pa, wordSize));
11124       ldr(Rb, pre(Pb, -wordSize));
11125       ldr(Rm, pre(Pm, wordSize));
11126       ldr(Rn, pre(Pn, -wordSize));
11127 
11128       mov(Rhi_mn, zr);
11129       mov(Rlo_mn, zr);
11130     }
11131 
11132     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11133       block_comment("post2");
11134       if (i.is_constant()) {
11135         mov(Rj, i.as_constant()-len.as_constant());
11136       } else {
11137         sub(Rj, i.as_register(), len);
11138       }
11139 
11140       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11141 
11142       // As soon as we know the least significant digit of our result,
11143       // store it.
11144       // Pm_base[i-len] = t0;
11145       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11146 
11147       // t0 = t1; t1 = t2; t2 = 0;
11148       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11149       adc(t1, t2, zr);
11150       mov(t2, zr);
11151     }
11152 
11153     // A carry in t0 after Montgomery multiplication means that we
11154     // should subtract multiples of n from our result in m.  We'll
11155     // keep doing that until there is no carry.
11156     void normalize(RegisterOrConstant len) {
11157       block_comment("normalize");
11158       // while (t0)
11159       //   t0 = sub(Pm_base, Pn_base, t0, len);
11160       Label loop, post, again;
11161       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11162       cbz(t0, post); {
11163         bind(again); {
11164           mov(i, zr);
11165           mov(cnt, len);
11166           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11167           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11168           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11169           align(16);
11170           bind(loop); {
11171             sbcs(Rm, Rm, Rn);
11172             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11173             add(i, i, 1);
11174             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11175             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11176             sub(cnt, cnt, 1);
11177           } cbnz(cnt, loop);
11178           sbc(t0, t0, zr);
11179         } cbnz(t0, again);
11180       } bind(post);
11181     }
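
          // In C, approximately, the sub() referenced in the comment above
          // (and in the C sketches further below) is -- as a sketch only:
          //
          //   julong sub(julong Pm[], julong Pn[], julong borrow, int len) {
          //     int carry = 1;                       // i.e. no borrow
          //     for (int i = 0; i < len; i++) {
          //       unsigned __int128 d = (unsigned __int128)Pm[i] + ~Pn[i] + carry;
          //       Pm[i] = (julong)d;
          //       carry = (int)(d >> 64);
          //     }
          //     return borrow - (1 - carry);
          //   }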
11182 
11183     // Move memory at s to d, reversing words.
11184     //    Increments d to end of copied memory
11185     //    Destroys tmp1, tmp2
11186     //    Preserves len
11187     //    Leaves s pointing to the address which was in d at start
11188     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11189       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11190       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11191 
11192       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11193       mov(tmp1, len);
11194       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11195       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11196     }
11197     // where
11198     void reverse1(Register d, Register s, Register tmp) {
11199       ldr(tmp, pre(s, -wordSize));
11200       ror(tmp, tmp, 32);
11201       str(tmp, post(d, wordSize));
11202     }
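
          // In C, approximately (a sketch): len 64-bit words are copied from
          // s to d in reverse order, with the two 32-bit halves of each word
          // swapped (ror64 is a notional rotate-right-by-32 helper):
          //
          //   for (int i = 0; i < len; i++)
          //     d[i] = ror64(s[len - 1 - i], 32);
          //   d += len;       // d ends just past the copied words
          //   s = d - len;    // i.e. s ends up holding the original d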
11203 
11204     void step_squaring() {
11205       // An extra ACC
11206       step();
11207       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11208     }
11209 
11210     void last_squaring(RegisterOrConstant i) {
11211       Label dont;
11212       // if ((i & 1) == 0) {
11213       tbnz(i.as_register(), 0, dont); {
11214         // MACC(Ra, Rb, t0, t1, t2);
11215         // Ra = *++Pa;
11216         // Rb = *--Pb;
11217         umulh(Rhi_ab, Ra, Rb);
11218         mul(Rlo_ab, Ra, Rb);
11219         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11220       } bind(dont);
11221     }
11222 
11223     void extra_step_squaring() {
11224       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11225 
11226       // MACC(Rm, Rn, t0, t1, t2);
11227       // Rm = *++Pm;
11228       // Rn = *--Pn;
11229       umulh(Rhi_mn, Rm, Rn);
11230       mul(Rlo_mn, Rm, Rn);
11231       ldr(Rm, pre(Pm, wordSize));
11232       ldr(Rn, pre(Pn, -wordSize));
11233     }
11234 
11235     void post1_squaring() {
11236       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11237 
11238       // *Pm = Rm = t0 * inv;
11239       mul(Rm, t0, inv);
11240       str(Rm, Address(Pm));
11241 
11242       // MACC(Rm, Rn, t0, t1, t2);
11243       // t0 = t1; t1 = t2; t2 = 0;
11244       umulh(Rhi_mn, Rm, Rn);
11245 
11246 #ifndef PRODUCT
11247       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11248       {
11249         mul(Rlo_mn, Rm, Rn);
11250         add(Rlo_mn, t0, Rlo_mn);
11251         Label ok;
11252         cbz(Rlo_mn, ok); {
11253           stop("broken Montgomery multiply");
11254         } bind(ok);
11255       }
11256 #endif
11257       // We have very carefully set things up so that
11258       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11259       // the lower half of Rm * Rn because we know the result already:
11260       // it must be -t0.  t0 + (-t0) must generate a carry iff
11261       // t0 != 0.  So, rather than do a mul and an adds we just set
11262       // the carry flag iff t0 is nonzero.
11263       //
11264       // mul(Rlo_mn, Rm, Rn);
11265       // adds(zr, t0, Rlo_mn);
11266       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11267       adcs(t0, t1, Rhi_mn);
11268       adc(t1, t2, zr);
11269       mov(t2, zr);
11270     }
11271 
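          // t2:t1:t0 += Rhi:Rlo -- the accumulate half of the MACC step used
          // in the C sketches below.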
11272     void acc(Register Rhi, Register Rlo,
11273              Register t0, Register t1, Register t2) {
11274       adds(t0, t0, Rlo);
11275       adcs(t1, t1, Rhi);
11276       adc(t2, t2, zr);
11277     }
11278 
11279   public:
11280     /**
11281      * Fast Montgomery multiplication.  The derivation of the
11282      * algorithm is in A Cryptographic Library for the Motorola
11283      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11284      *
11285      * Arguments:
11286      *
11287      * Inputs for multiplication:
11288      *   c_rarg0   - int array elements a
11289      *   c_rarg1   - int array elements b
11290      *   c_rarg2   - int array elements n (the modulus)
11291      *   c_rarg3   - int length
11292      *   c_rarg4   - int inv
11293      *   c_rarg5   - int array elements m (the result)
11294      *
11295      * Inputs for squaring:
11296      *   c_rarg0   - int array elements a
11297      *   c_rarg1   - int array elements n (the modulus)
11298      *   c_rarg2   - int length
11299      *   c_rarg3   - int inv
11300      *   c_rarg4   - int array elements m (the result)
11301      *
11302      */
11303     address generate_multiply() {
11304       Label argh, nothing;
11305       bind(argh);
11306       stop("MontgomeryMultiply total_allocation must be <= 8192");
11307 
11308       align(CodeEntryAlignment);
11309       address entry = pc();
11310 
11311       cbzw(Rlen, nothing);
11312 
11313       enter();
11314 
11315       // Make room.
11316       cmpw(Rlen, 512);
11317       br(Assembler::HI, argh);
11318       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11319       andr(sp, Ra, -2 * wordSize);
11320 
11321       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11322 
11323       {
11324         // Copy input args, reversing as we go.  We use Ra as a
11325         // temporary variable.
11326         reverse(Ra, Pa_base, Rlen, t0, t1);
11327         if (!_squaring)
11328           reverse(Ra, Pb_base, Rlen, t0, t1);
11329         reverse(Ra, Pn_base, Rlen, t0, t1);
11330       }
11331 
11332       // Push all callee-saved registers and also Pm_base which we'll need
11333       // at the end.
11334       save_regs();
11335 
11336 #ifndef PRODUCT
11337       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11338       {
11339         ldr(Rn, Address(Pn_base, 0));
11340         mul(Rlo_mn, Rn, inv);
11341         subs(zr, Rlo_mn, -1);
11342         Label ok;
11343         br(EQ, ok); {
11344           stop("broken inverse in Montgomery multiply");
11345         } bind(ok);
11346       }
11347 #endif
11348 
11349       mov(Pm_base, Ra);
11350 
11351       mov(t0, zr);
11352       mov(t1, zr);
11353       mov(t2, zr);
11354 
11355       block_comment("for (int i = 0; i < len; i++) {");
11356       mov(Ri, zr); {
11357         Label loop, end;
11358         cmpw(Ri, Rlen);
11359         br(Assembler::GE, end);
11360 
11361         bind(loop);
11362         pre1(Ri);
11363 
11364         block_comment("  for (j = i; j; j--) {"); {
11365           movw(Rj, Ri);
11366           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11367         } block_comment("  } // j");
11368 
11369         post1();
11370         addw(Ri, Ri, 1);
11371         cmpw(Ri, Rlen);
11372         br(Assembler::LT, loop);
11373         bind(end);
11374         block_comment("} // i");
11375       }
11376 
11377       block_comment("for (int i = len; i < 2*len; i++) {");
11378       mov(Ri, Rlen); {
11379         Label loop, end;
11380         cmpw(Ri, Rlen, Assembler::LSL, 1);
11381         br(Assembler::GE, end);
11382 
11383         bind(loop);
11384         pre2(Ri, Rlen);
11385 
11386         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11387           lslw(Rj, Rlen, 1);
11388           subw(Rj, Rj, Ri);
11389           subw(Rj, Rj, 1);
11390           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11391         } block_comment("  } // j");
11392 
11393         post2(Ri, Rlen);
11394         addw(Ri, Ri, 1);
11395         cmpw(Ri, Rlen, Assembler::LSL, 1);
11396         br(Assembler::LT, loop);
11397         bind(end);
11398       }
11399       block_comment("} // i");
11400 
11401       normalize(Rlen);
11402 
11403       mov(Ra, Pm_base);  // Save Pm_base in Ra
11404       restore_regs();  // Restore caller's Pm_base
11405 
11406       // Copy our result into caller's Pm_base
11407       reverse(Pm_base, Ra, Rlen, t0, t1);
11408 
11409       leave();
11410       bind(nothing);
11411       ret(lr);
11412 
11413       return entry;
11414     }
11415     // In C, approximately:
11416 
11417     // void
11418     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11419     //                     julong Pn_base[], julong Pm_base[],
11420     //                     julong inv, int len) {
11421     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11422     //   julong *Pa, *Pb, *Pn, *Pm;
11423     //   julong Ra, Rb, Rn, Rm;
11424 
11425     //   int i;
11426 
11427     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11428 
11429     //   for (i = 0; i < len; i++) {
11430     //     int j;
11431 
11432     //     Pa = Pa_base;
11433     //     Pb = Pb_base + i;
11434     //     Pm = Pm_base;
11435     //     Pn = Pn_base + i;
11436 
11437     //     Ra = *Pa;
11438     //     Rb = *Pb;
11439     //     Rm = *Pm;
11440     //     Rn = *Pn;
11441 
11442     //     int iters = i;
11443     //     for (j = 0; iters--; j++) {
11444     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11445     //       MACC(Ra, Rb, t0, t1, t2);
11446     //       Ra = *++Pa;
11447     //       Rb = *--Pb;
11448     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11449     //       MACC(Rm, Rn, t0, t1, t2);
11450     //       Rm = *++Pm;
11451     //       Rn = *--Pn;
11452     //     }
11453 
11454     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11455     //     MACC(Ra, Rb, t0, t1, t2);
11456     //     *Pm = Rm = t0 * inv;
11457     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11458     //     MACC(Rm, Rn, t0, t1, t2);
11459 
11460     //     assert(t0 == 0, "broken Montgomery multiply");
11461 
11462     //     t0 = t1; t1 = t2; t2 = 0;
11463     //   }
11464 
11465     //   for (i = len; i < 2*len; i++) {
11466     //     int j;
11467 
11468     //     Pa = Pa_base + i-len;
11469     //     Pb = Pb_base + len;
11470     //     Pm = Pm_base + i-len;
11471     //     Pn = Pn_base + len;
11472 
11473     //     Ra = *++Pa;
11474     //     Rb = *--Pb;
11475     //     Rm = *++Pm;
11476     //     Rn = *--Pn;
11477 
11478     //     int iters = len*2-i-1;
11479     //     for (j = i-len+1; iters--; j++) {
11480     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11481     //       MACC(Ra, Rb, t0, t1, t2);
11482     //       Ra = *++Pa;
11483     //       Rb = *--Pb;
11484     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11485     //       MACC(Rm, Rn, t0, t1, t2);
11486     //       Rm = *++Pm;
11487     //       Rn = *--Pn;
11488     //     }
11489 
11490     //     Pm_base[i-len] = t0;
11491     //     t0 = t1; t1 = t2; t2 = 0;
11492     //   }
11493 
11494     //   while (t0)
11495     //     t0 = sub(Pm_base, Pn_base, t0, len);
11496     // }
11497 
11498     /**
11499      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11500      * multiplies than Montgomery multiplication so it should be up to
11501      * 25% faster.  However, its loop control is more complex and it
11502      * may actually run slower on some machines.
11503      *
11504      * Arguments:
11505      *
11506      * Inputs:
11507      *   c_rarg0   - int array elements a
11508      *   c_rarg1   - int array elements n (the modulus)
11509      *   c_rarg2   - int length
11510      *   c_rarg3   - int inv
11511      *   c_rarg4   - int array elements m (the result)
11512      *
11513      */
11514     address generate_square() {
11515       Label argh;
11516       bind(argh);
11517       stop("MontgomeryMultiply total_allocation must be <= 8192");
11518 
11519       align(CodeEntryAlignment);
11520       address entry = pc();
11521 
11522       enter();
11523 
11524       // Make room.
11525       cmpw(Rlen, 512);
11526       br(Assembler::HI, argh);
11527       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11528       andr(sp, Ra, -2 * wordSize);
11529 
11530       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11531 
11532       {
11533         // Copy input args, reversing as we go.  We use Ra as a
11534         // temporary variable.
11535         reverse(Ra, Pa_base, Rlen, t0, t1);
11536         reverse(Ra, Pn_base, Rlen, t0, t1);
11537       }
11538 
11539       // Push all callee-saved registers and also Pm_base which we'll need
11540       // at the end.
11541       save_regs();
11542 
11543       mov(Pm_base, Ra);
11544 
11545       mov(t0, zr);
11546       mov(t1, zr);
11547       mov(t2, zr);
11548 
11549       block_comment("for (int i = 0; i < len; i++) {");
11550       mov(Ri, zr); {
11551         Label loop, end;
11552         bind(loop);
11553         cmp(Ri, Rlen);
11554         br(Assembler::GE, end);
11555 
11556         pre1(Ri);
11557 
11558         block_comment("for (j = (i+1)/2; j; j--) {"); {
11559           add(Rj, Ri, 1);
11560           lsr(Rj, Rj, 1);
11561           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11562         } block_comment("  } // j");
11563 
11564         last_squaring(Ri);
11565 
11566         block_comment("  for (j = i/2; j; j--) {"); {
11567           lsr(Rj, Ri, 1);
11568           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11569         } block_comment("  } // j");
11570 
11571         post1_squaring();
11572         add(Ri, Ri, 1);
11573         cmp(Ri, Rlen);
11574         br(Assembler::LT, loop);
11575 
11576         bind(end);
11577         block_comment("} // i");
11578       }
11579 
11580       block_comment("for (int i = len; i < 2*len; i++) {");
11581       mov(Ri, Rlen); {
11582         Label loop, end;
11583         bind(loop);
11584         cmp(Ri, Rlen, Assembler::LSL, 1);
11585         br(Assembler::GE, end);
11586 
11587         pre2(Ri, Rlen);
11588 
11589         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11590           lsl(Rj, Rlen, 1);
11591           sub(Rj, Rj, Ri);
11592           sub(Rj, Rj, 1);
11593           lsr(Rj, Rj, 1);
11594           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11595         } block_comment("  } // j");
11596 
11597         last_squaring(Ri);
11598 
11599         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11600           lsl(Rj, Rlen, 1);
11601           sub(Rj, Rj, Ri);
11602           lsr(Rj, Rj, 1);
11603           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11604         } block_comment("  } // j");
11605 
11606         post2(Ri, Rlen);
11607         add(Ri, Ri, 1);
11608         cmp(Ri, Rlen, Assembler::LSL, 1);
11609 
11610         br(Assembler::LT, loop);
11611         bind(end);
11612         block_comment("} // i");
11613       }
11614 
11615       normalize(Rlen);
11616 
11617       mov(Ra, Pm_base);  // Save Pm_base in Ra
11618       restore_regs();  // Restore caller's Pm_base
11619 
11620       // Copy our result into caller's Pm_base
11621       reverse(Pm_base, Ra, Rlen, t0, t1);
11622 
11623       leave();
11624       ret(lr);
11625 
11626       return entry;
11627     }
11628     // In C, approximately:
11629 
11630     // void
11631     // montgomery_square(julong Pa_base[], julong Pn_base[],
11632     //                   julong Pm_base[], julong inv, int len) {
11633     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11634     //   julong *Pa, *Pb, *Pn, *Pm;
11635     //   julong Ra, Rb, Rn, Rm;
11636 
11637     //   int i;
11638 
11639     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11640 
11641     //   for (i = 0; i < len; i++) {
11642     //     int j;
11643 
11644     //     Pa = Pa_base;
11645     //     Pb = Pa_base + i;
11646     //     Pm = Pm_base;
11647     //     Pn = Pn_base + i;
11648 
11649     //     Ra = *Pa;
11650     //     Rb = *Pb;
11651     //     Rm = *Pm;
11652     //     Rn = *Pn;
11653 
11654     //     int iters = (i+1)/2;
11655     //     for (j = 0; iters--; j++) {
11656     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11657     //       MACC2(Ra, Rb, t0, t1, t2);
11658     //       Ra = *++Pa;
11659     //       Rb = *--Pb;
11660     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11661     //       MACC(Rm, Rn, t0, t1, t2);
11662     //       Rm = *++Pm;
11663     //       Rn = *--Pn;
11664     //     }
11665     //     if ((i & 1) == 0) {
11666     //       assert(Ra == Pa_base[j], "must be");
11667     //       MACC(Ra, Ra, t0, t1, t2);
11668     //     }
11669     //     iters = i/2;
11670     //     assert(iters == i-j, "must be");
11671     //     for (; iters--; j++) {
11672     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11673     //       MACC(Rm, Rn, t0, t1, t2);
11674     //       Rm = *++Pm;
11675     //       Rn = *--Pn;
11676     //     }
11677 
11678     //     *Pm = Rm = t0 * inv;
11679     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11680     //     MACC(Rm, Rn, t0, t1, t2);
11681 
11682     //     assert(t0 == 0, "broken Montgomery multiply");
11683 
11684     //     t0 = t1; t1 = t2; t2 = 0;
11685     //   }
11686 
11687     //   for (i = len; i < 2*len; i++) {
11688     //     int start = i-len+1;
11689     //     int end = start + (len - start)/2;
11690     //     int j;
11691 
11692     //     Pa = Pa_base + i-len;
11693     //     Pb = Pa_base + len;
11694     //     Pm = Pm_base + i-len;
11695     //     Pn = Pn_base + len;
11696 
11697     //     Ra = *++Pa;
11698     //     Rb = *--Pb;
11699     //     Rm = *++Pm;
11700     //     Rn = *--Pn;
11701 
11702     //     int iters = (2*len-i-1)/2;
11703     //     assert(iters == end-start, "must be");
11704     //     for (j = start; iters--; j++) {
11705     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11706     //       MACC2(Ra, Rb, t0, t1, t2);
11707     //       Ra = *++Pa;
11708     //       Rb = *--Pb;
11709     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11710     //       MACC(Rm, Rn, t0, t1, t2);
11711     //       Rm = *++Pm;
11712     //       Rn = *--Pn;
11713     //     }
11714     //     if ((i & 1) == 0) {
11715     //       assert(Ra == Pa_base[j], "must be");
11716     //       MACC(Ra, Ra, t0, t1, t2);
11717     //     }
11718     //     iters =  (2*len-i)/2;
11719     //     assert(iters == len-j, "must be");
11720     //     for (; iters--; j++) {
11721     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11722     //       MACC(Rm, Rn, t0, t1, t2);
11723     //       Rm = *++Pm;
11724     //       Rn = *--Pn;
11725     //     }
11726     //     Pm_base[i-len] = t0;
11727     //     t0 = t1; t1 = t2; t2 = 0;
11728     //   }
11729 
11730     //   while (t0)
11731     //     t0 = sub(Pm_base, Pn_base, t0, len);
11732     // }
11733   };
11734 
11735   // Initialization
11736   void generate_preuniverse_stubs() {
11737     // preuniverse stubs are not needed for aarch64
11738   }
11739 
11740   void generate_initial_stubs() {
11741     // Generate initial stubs and initialize the entry points
11742 
11743     // Entry points that exist on all platforms. Note: this is code
11744     // that could be shared among different platforms; however, the
11745     // benefit seems to be smaller than the disadvantage of having a
11746     // much more complicated generator structure. See also the comment
11747     // in stubRoutines.hpp.
11748 
11749     StubRoutines::_forward_exception_entry = generate_forward_exception();
11750 
11751     StubRoutines::_call_stub_entry =
11752       generate_call_stub(StubRoutines::_call_stub_return_address);
11753 
11754     // is referenced by megamorphic call
11755     StubRoutines::_catch_exception_entry = generate_catch_exception();
11756 
11757     // Initialize table for copy memory (arraycopy) check.
11758     if (UnsafeMemoryAccess::_table == nullptr) {
11759       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11760     }
11761 
11762     if (UseCRC32Intrinsics) {
11763       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11764     }
11765 
11766     if (UseCRC32CIntrinsics) {
11767       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11768     }
11769 
11770     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11771       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11772     }
11773 
11774     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11775       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11776     }
11777 
11778     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11779         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11780       StubRoutines::_hf2f = generate_float16ToFloat();
11781       StubRoutines::_f2hf = generate_floatToFloat16();
11782     }
11783   }
11784 
11785   void generate_continuation_stubs() {
11786     // Continuation stubs:
11787     StubRoutines::_cont_thaw          = generate_cont_thaw();
11788     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11789     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11790     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11791   }
11792 
11793   void generate_final_stubs() {
11794     // support for verify_oop (must happen after universe_init)
11795     if (VerifyOops) {
11796       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11797     }
11798 
11799     // arraycopy stubs used by compilers
11800     generate_arraycopy_stubs();
11801 
11802     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11803 
11804     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11805 
11806     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11807     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11808 
11809 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11810 
11811     generate_atomic_entry_points();
11812 
11813 #endif // LINUX
11814 
11815 #ifdef COMPILER2
11816     if (UseSecondarySupersTable) {
11817       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11818       if (!InlineSecondarySupersTest) {
11819         generate_lookup_secondary_supers_table_stub();
11820       }
11821     }
11822 #endif
11823 
11824     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11825 
11826     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11827   }
11828 
11829   void generate_compiler_stubs() {
11830 #if COMPILER2_OR_JVMCI
11831 
11832     if (UseSVE == 0) {
11833       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11834     }
11835 
11836     // array equals stub for large arrays.
11837     if (!UseSimpleArrayEquals) {
11838       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11839     }
11840 
11841     // arrays_hashcode stub for large arrays.
11842     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11843     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11844     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11845     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11846     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11847 
11848     // byte_array_inflate stub for large arrays.
11849     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11850 
11851     // countPositives stub for large arrays.
11852     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11853 
11854     generate_compare_long_strings();
11855 
11856     generate_string_indexof_stubs();
11857 
11858 #ifdef COMPILER2
11859     if (UseMultiplyToLenIntrinsic) {
11860       StubRoutines::_multiplyToLen = generate_multiplyToLen();
11861     }
11862 
11863     if (UseSquareToLenIntrinsic) {
11864       StubRoutines::_squareToLen = generate_squareToLen();
11865     }
11866 
11867     if (UseMulAddIntrinsic) {
11868       StubRoutines::_mulAdd = generate_mulAdd();
11869     }
11870 
11871     if (UseSIMDForBigIntegerShiftIntrinsics) {
11872       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11873       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
11874     }
11875 
11876     if (UseMontgomeryMultiplyIntrinsic) {
11877       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11878       StubCodeMark mark(this, stub_id);
11879       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11880       StubRoutines::_montgomeryMultiply = g.generate_multiply();
11881     }
11882 
11883     if (UseMontgomerySquareIntrinsic) {
11884       StubId stub_id = StubId::stubgen_montgomerySquare_id;
11885       StubCodeMark mark(this, stub_id);
11886       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11887       // We use generate_multiply() rather than generate_square()
11888       // because it's faster for the sizes of modulus we care about.
11889       StubRoutines::_montgomerySquare = g.generate_multiply();
11890     }
11891 
11892 #endif // COMPILER2
11893 
11894     if (UseChaCha20Intrinsics) {
11895       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11896     }
11897 
11898     if (UseKyberIntrinsics) {
11899       StubRoutines::_kyberNtt = generate_kyberNtt();
11900       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11901       StubRoutines::_kyberNttMult = generate_kyberNttMult();
11902       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11903       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11904       StubRoutines::_kyber12To16 = generate_kyber12To16();
11905       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11906     }
11907 
11908     if (UseDilithiumIntrinsics) {
11909       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11910       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11911       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11912       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11913       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11914     }
11915 
11916     if (UseBASE64Intrinsics) {
11917       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11918       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11919     }
11920 
11921     // data cache line writeback
11922     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11923     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11924 
11925     if (UseAESIntrinsics) {
11926       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11927       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11928       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11929       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11930       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11931     }
11932     if (UseGHASHIntrinsics) {
11933       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11934       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11935     }
11936     if (UseAESIntrinsics && UseGHASHIntrinsics) {
11937       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11938     }
11939 
11940     if (UseMD5Intrinsics) {
11941       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11942       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11943     }
11944     if (UseSHA1Intrinsics) {
11945       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11946       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11947     }
11948     if (UseSHA256Intrinsics) {
11949       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11950       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11951     }
11952     if (UseSHA512Intrinsics) {
11953       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11954       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11955     }
11956     if (UseSHA3Intrinsics) {
11957 
11958       StubRoutines::_double_keccak         = generate_double_keccak();
11959       if (UseSIMDForSHA3Intrinsic) {
11960         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11961         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11962       } else {
11963         StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11964         StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11965       }
11966     }
11967 
11968     if (UsePoly1305Intrinsics) {
11969       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11970     }
11971 
11972     // generate Adler32 intrinsics code
11973     if (UseAdler32Intrinsics) {
11974       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11975     }
11976 
11977 #endif // COMPILER2_OR_JVMCI
11978   }
11979 
11980  public:
11981   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11982     switch (blob_id) {
11983     case BlobId::stubgen_preuniverse_id:
11984       generate_preuniverse_stubs();
11985       break;
11986     case BlobId::stubgen_initial_id:
11987       generate_initial_stubs();
11988       break;
11989     case BlobId::stubgen_continuation_id:
11990       generate_continuation_stubs();
11991       break;
11992     case BlobId::stubgen_compiler_id:
11993       generate_compiler_stubs();
11994       break;
11995     case BlobId::stubgen_final_id:
11996       generate_final_stubs();
11997       break;
11998     default:
11999       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12000       break;
12001     };
12002   }
12003 }; // end class declaration
12004 
12005 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12006   StubGenerator g(code, blob_id);
12007 }
12008 
12009 
12010 #if defined (LINUX)
12011 
12012 // Define pointers to atomic stubs and initialize them to point to the
12013 // code in atomic_aarch64.S.
12014 
12015 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12016   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12017     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12018   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12019     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12020 
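      // For example, the first use below, DEFAULT_ATOMIC_OP(fetch_add, 4, ),
      // expands approximately to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
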
12021 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12022 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12023 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12024 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12025 DEFAULT_ATOMIC_OP(xchg, 4, )
12026 DEFAULT_ATOMIC_OP(xchg, 8, )
12027 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12028 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12029 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12030 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12031 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12032 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12033 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12034 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12035 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12036 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12037 
12038 #undef DEFAULT_ATOMIC_OP
12039 
12040 #endif // LINUX