1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomicAccess.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
   114   // link r29 (fp) below it as the frame pointer, installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
   144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
   145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubId stub_id = StubId::stubgen_call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
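           // sp_after_call_off is negative, so this drops sp 28 words below the new
           // fp, reserving the whole register save area laid out above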
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
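           // FPCR bit layout: RMode occupies bits 23:22, FZ is bit 24 and DN bit 25;
           // bits 8..12 are the IOE/DZE/OFE/UFE/IXE trap-enable bits.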
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
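           // round down to a 16-byte boundary; sp must stay 16-byte aligned on AArch64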
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
   307     // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
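             // Here r0 holds an InlineKlass* tagged with a set low bit rather than an
             // oop; the tag is cleared below before loading the pack handler.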
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
   435   // not the case if the callee is compiled code => need to set up the
   436   // sp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubId stub_id = StubId::stubgen_catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
   494   // so it just needs to be generated code with no prolog
  495 
  496   address generate_forward_exception() {
  497     StubId stub_id = StubId::stubgen_forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
   543     // frame and then call into the VM, and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubId stub_id = StubId::stubgen_verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
  636   address generate_iota_indices(StubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubId stub_id = StubId::stubgen_zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
  688       // Ensure ZVA length can be divided by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
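             // If base is only 8-byte aligned, zero a single leading word so the
             // remainder is 16-byte aligned for the block-zeroing code below.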
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
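             // low_limit is in bytes but cnt is in words, hence the >> 3 below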
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case StubId::stubgen_copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case StubId::stubgen_copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case StubId::stubgen_copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case StubId::stubgen_copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case StubId::stubgen_copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case StubId::stubgen_copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
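           // For a forwards copy, s and d are pre-biased downwards by this amount so
           // that the loop's loads/stores at offsets 2 * unit (4 * unit with SIMD)
           // start at the original addresses.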
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     address start = __ pc();
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
   891       __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
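           // 8 words have been loaded above; if fewer than 16 words remained in
           // total, skip the main loop and just store (drain) what was loaded.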
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913       use_stride = prefetch > 256;
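             // prfm's unscaled immediate offset only reaches -256, so a larger
             // backwards prefetch distance has to be kept in a register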
  914       prefetch = -prefetch;
  915       if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
   987       // source address is even-word (16-byte) aligned, target is odd-word aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   998       // zero offset, we adjust the destination by -1, which means we
  999       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 1000       //
  1001       // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045         use_stride = prefetch > 256;
 1046         prefetch = -prefetch;
 1047         if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056         // allowing for the offset of -8 the store instructions place
 1057         // registers into the target 64 bit block at the following
 1058         // offsets
 1059         //
 1060         // t0 at offset 0
 1061         // t1 at offset 8,  t2 at offset 16
 1062         // t3 at offset 24, t4 at offset 32
 1063         // t5 at offset 40, t6 at offset 48
 1064         // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076         // d was not offset when we started so the registers are
 1077         // written into the 64 bit block preceding d with the following
 1078         // offsets
 1079         //
 1080         // t1 at offset -8
 1081         // t3 at offset -24, t0 at offset -16
  1082         // t5 at offset -40, t2 at offset -32
 1083         // t7 at offset -56, t4 at offset -48
 1084         //                   t6 at offset -64
 1085         //
 1086         // note that this matches the offsets previously noted for the
 1087         // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
  1122       // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128         // this is the same as above but copying only 4 longs hence
 1129         // with only one intervening stp between the str instructions
 1130         // but note that the offsets and registers still follow the
 1131         // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146         // this is the same as above but copying only 2 longs hence
 1147         // there is no intervening stp between the str instructions
 1148         // but note that the offset and register patterns are still
 1149         // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160         // for forwards copy we need to re-adjust the offsets we
  1161         // applied so that s and d follow the last words written
 1162 
 1163         if (direction == copy_forwards) {
 1164           __ add(s, s, 16);
 1165           __ add(d, d, 8);
 1166         }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171     }
 1172 
 1173     return start;
 1174   }
 1175 
 1176   // Small copy: less than 16 bytes.
 1177   //
 1178   // NB: Ignores all of the bits of count which represent more than 15
 1179   // bytes, so a caller doesn't have to mask them.
 1180 
 1181   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1182     bool is_backwards = step < 0;
 1183     size_t granularity = g_uabs(step);
 1184     int direction = is_backwards ? -1 : 1;
 1185 
 1186     Label Lword, Lint, Lshort, Lbyte;
 1187 
 1188     assert(granularity
 1189            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1190 
 1191     const Register t0 = r3;
 1192     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1193     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1194 
 1195     // ??? I don't know if this bit-test-and-branch is the right thing
 1196     // to do.  It does a lot of jumping, resulting in several
 1197     // mispredicted branches.  It might make more sense to do this
 1198     // with something like Duff's device with a single computed branch.
 1199 
 1200     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1201     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1202     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1203     __ bind(Lword);
 1204 
 1205     if (granularity <= sizeof (jint)) {
 1206       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1207       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1208       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1209       __ bind(Lint);
 1210     }
 1211 
 1212     if (granularity <= sizeof (jshort)) {
 1213       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1214       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1215       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1216       __ bind(Lshort);
 1217     }
 1218 
 1219     if (granularity <= sizeof (jbyte)) {
 1220       __ tbz(count, 0, Lbyte);
 1221       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1222       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1223       __ bind(Lbyte);
 1224     }
 1225   }
 1226 
 1227   // All-singing all-dancing memory copy.
 1228   //
 1229   // Copy count units of memory from s to d.  The size of a unit is
 1230   // step, which can be positive or negative depending on the direction
 1231   // of copy.  If is_aligned is false, we align the source address.
 1232   //
 1233 
 1234   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1235                    Register s, Register d, Register count, int step) {
 1236     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1237     bool is_backwards = step < 0;
 1238     unsigned int granularity = g_uabs(step);
 1239     const Register t0 = r3, t1 = r4;
 1240 
  1241     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
 1242     // load all the data before writing anything
 1243     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1244     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1245     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1246     const Register send = r17, dend = r16;
 1247     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1248     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1249     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1250 
 1251     if (PrefetchCopyIntervalInBytes > 0)
 1252       __ prfm(Address(s, 0), PLDL1KEEP);
 1253     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1254     __ br(Assembler::HI, copy_big);
 1255 
 1256     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1257     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
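           // send/dend point one element past the end of the source/destination so
           // that the tail of each range can be addressed with negative offsets.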
 1258 
 1259     __ cmp(count, u1(16/granularity));
 1260     __ br(Assembler::LS, copy16);
 1261 
 1262     __ cmp(count, u1(64/granularity));
 1263     __ br(Assembler::HI, copy80);
 1264 
 1265     __ cmp(count, u1(32/granularity));
 1266     __ br(Assembler::LS, copy32);
 1267 
 1268     // 33..64 bytes
 1269     if (UseSIMDForMemoryOps) {
 1270       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1271       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1272       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1273       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1274     } else {
 1275       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1276       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1277       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1278       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1279 
 1280       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1281       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1282       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1283       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1284     }
 1285     __ b(finish);
 1286 
 1287     // 17..32 bytes
 1288     __ bind(copy32);
 1289     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1290     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1291 
 1292     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1293     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1294     __ b(finish);
 1295 
 1296     // 65..80/96 bytes
  1297     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1298     __ bind(copy80);
 1299     if (UseSIMDForMemoryOps) {
 1300       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1301       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1302       // Unaligned pointers can be an issue for copying.
  1303       // The issue is more likely to occur when the granularity of the data is
  1304       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
  1305       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
  1306       // The largest performance drop has been seen for the range 65-80 bytes.
  1307       // For such cases using a pair of ldp/stp instead of the third pair of
  1308       // ldpq/stpq fixes the performance issue.
 1309       if (granularity < sizeof (jint)) {
 1310         Label copy96;
 1311         __ cmp(count, u1(80/granularity));
 1312         __ br(Assembler::HI, copy96);
 1313         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1314 
 1315         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1316         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1317 
 1318         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1319         __ b(finish);
 1320 
 1321         __ bind(copy96);
 1322       }
 1323       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1324 
 1325       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1326       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1327 
 1328       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1329     } else {
 1330       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1331       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1332       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1333       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1334       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1335 
 1336       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1337       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1338       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1339       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1340       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1341     }
 1342     __ b(finish);
 1343 
 1344     // 0..16 bytes
 1345     __ bind(copy16);
 1346     __ cmp(count, u1(8/granularity));
 1347     __ br(Assembler::LO, copy8);
 1348 
 1349     // 8..16 bytes
 1350     bs.copy_load_at_8(t0, Address(s, 0));
 1351     bs.copy_load_at_8(t1, Address(send, -8));
 1352     bs.copy_store_at_8(Address(d, 0), t0);
 1353     bs.copy_store_at_8(Address(dend, -8), t1);
 1354     __ b(finish);
 1355 
 1356     if (granularity < 8) {
 1357       // 4..7 bytes
 1358       __ bind(copy8);
 1359       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1360       __ ldrw(t0, Address(s, 0));
 1361       __ ldrw(t1, Address(send, -4));
 1362       __ strw(t0, Address(d, 0));
 1363       __ strw(t1, Address(dend, -4));
 1364       __ b(finish);
 1365       if (granularity < 4) {
 1366         // 0..3 bytes
 1367         __ bind(copy4);
 1368         __ cbz(count, finish); // get rid of 0 case
 1369         if (granularity == 2) {
 1370           __ ldrh(t0, Address(s, 0));
 1371           __ strh(t0, Address(d, 0));
 1372         } else { // granularity == 1
 1373           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1374           // the first and last byte.
 1375           // Handle the 3 byte case by loading and storing base + count/2
 1376           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1377           // This does mean in the 1 byte case we load/store the same
 1378           // byte 3 times.
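                // Worked example (illustrative, not generated code), after count
                // has been halved below:
                //   count == 1: s[0] is loaded and stored three times into d[0]
                //   count == 2: t0 = s[0], t1 = s[1], t2 = s[1]
                //   count == 3: t0 = s[0], t1 = s[2], t2 = s[1] (the middle byte)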
 1379           __ lsr(count, count, 1);
 1380           __ ldrb(t0, Address(s, 0));
 1381           __ ldrb(t1, Address(send, -1));
 1382           __ ldrb(t2, Address(s, count));
 1383           __ strb(t0, Address(d, 0));
 1384           __ strb(t1, Address(dend, -1));
 1385           __ strb(t2, Address(d, count));
 1386         }
 1387         __ b(finish);
 1388       }
 1389     }
 1390 
 1391     __ bind(copy_big);
 1392     if (is_backwards) {
 1393       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1394       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1395     }
 1396 
 1397     // Now we've got the small case out of the way we can align the
 1398     // source address on a 2-word boundary.
 1399 
 1400     // Here we materialize a count in r15, which is used by copy_memory_small
 1401     // and by the various generate_copy_longs stubs that perform the bulk copy
 1402     // of 2-word-aligned data. Up until here we have used t9, which aliases r15,
 1403     // but from here on that register cannot be used as a temp, as it holds the count.
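          // Illustrative example (assumed values, not generated code): for a
          // forward, unaligned jshort copy with s ending in 0x06, the else
          // branch below computes
          //   r15 = (-s) & (2 * wordSize - 1) = 10 bytes = 5 elements,
          // which copy_memory_small copies before the bulk loop takes over.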
 1404 
 1405     Label aligned;
 1406 
 1407     if (is_aligned) {
 1408       // We may have to adjust by 1 word to get s 2-word-aligned.
 1409       __ tbz(s, exact_log2(wordSize), aligned);
 1410       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1411       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1412       __ sub(count, count, wordSize/granularity);
 1413     } else {
 1414       if (is_backwards) {
 1415         __ andr(r15, s, 2 * wordSize - 1);
 1416       } else {
 1417         __ neg(r15, s);
 1418         __ andr(r15, r15, 2 * wordSize - 1);
 1419       }
 1420       // r15 is the byte adjustment needed to align s.
 1421       __ cbz(r15, aligned);
 1422       int shift = exact_log2(granularity);
 1423       if (shift > 0) {
 1424         __ lsr(r15, r15, shift);
 1425       }
 1426       __ sub(count, count, r15);
 1427 
 1428 #if 0
 1429       // ?? This code is only correct for a disjoint copy.  It may or
 1430       // may not make sense to use it in that case.
 1431 
 1432       // Copy the first pair; s and d may not be aligned.
 1433       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1434       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1435 
 1436       // Align s and d, adjust count
 1437       if (is_backwards) {
 1438         __ sub(s, s, r15);
 1439         __ sub(d, d, r15);
 1440       } else {
 1441         __ add(s, s, r15);
 1442         __ add(d, d, r15);
 1443       }
 1444 #else
 1445       copy_memory_small(decorators, type, s, d, r15, step);
 1446 #endif
 1447     }
 1448 
 1449     __ bind(aligned);
 1450 
 1451     // s is now 2-word-aligned.
 1452 
 1453     // We have a count of units and some trailing bytes. Adjust the
 1454     // count and do a bulk copy of words. If the shift is zero
 1455     // perform a move instead to benefit from zero latency moves.
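          // Illustrative example (assumed values, not generated code): for a
          // jshort copy of 37 elements, shift == 2, so r15 = 37 >> 2 = 9
          // double-words go through the bulk copy stub and the remaining
          // 37 - 9*4 = 1 element is picked up by the trailing copy_memory_small.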
 1456     int shift = exact_log2(wordSize/granularity);
 1457     if (shift > 0) {
 1458       __ lsr(r15, count, shift);
 1459     } else {
 1460       __ mov(r15, count);
 1461     }
 1462     if (direction == copy_forwards) {
 1463       if (type != T_OBJECT) {
 1464         __ bl(StubRoutines::aarch64::copy_byte_f());
 1465       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1466         __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
 1467       } else {
 1468         __ bl(StubRoutines::aarch64::copy_oop_f());
 1469       }
 1470     } else {
 1471       if (type != T_OBJECT) {
 1472         __ bl(StubRoutines::aarch64::copy_byte_b());
 1473       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1474         __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
 1475       } else {
 1476         __ bl(StubRoutines::aarch64::copy_oop_b());
 1477       }
 1478     }
 1479 
 1480     // And the tail.
 1481     copy_memory_small(decorators, type, s, d, count, step);
 1482 
 1483     if (granularity >= 8) __ bind(copy8);
 1484     if (granularity >= 4) __ bind(copy4);
 1485     __ bind(finish);
 1486   }
 1487 
 1488 
 1489   void clobber_registers() {
 1490 #ifdef ASSERT
 1491     RegSet clobbered
 1492       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1493     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1494     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
 1495     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1496       __ mov(*it, rscratch1);
 1497     }
 1498 #endif
 1499 
 1500   }
 1501 
 1502   // Scan over the array at a for count oops, verifying each one.
 1503   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
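        //
        // Equivalent pseudo-code (sketch only):
        //   for (i = 0; i < count; i++) {
        //     temp = a[i];              // narrow load + decode if compressed
        //     verify_oop(temp);
        //   }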
 1504   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1505     Label loop, end;
 1506     __ mov(rscratch1, a);
 1507     __ mov(rscratch2, zr);
 1508     __ bind(loop);
 1509     __ cmp(rscratch2, count);
 1510     __ br(Assembler::HS, end);
 1511     if (size == wordSize) {
 1512       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1513       __ verify_oop(temp);
 1514     } else {
 1515       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1516       __ decode_heap_oop(temp); // calls verify_oop
 1517     }
 1518     __ add(rscratch2, rscratch2, 1);
 1519     __ b(loop);
 1520     __ bind(end);
 1521   }
 1522 
 1523   // Arguments:
 1524   //   stub_id - is used to name the stub and identify all details of
 1525   //             how to perform the copy.
 1526   //
 1527   //   nopush_entry - is assigned the stub's post-push entry point
 1528   //           unless it is null
 1529   //
 1530   // Inputs:
 1531   //   c_rarg0   - source array address
 1532   //   c_rarg1   - destination array address
 1533   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1534   //
 1535   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1536   // the hardware handle it.  The two dwords within qwords that span
 1537   // cache line boundaries will still be loaded and stored atomically.
 1538   //
 1539   // Side Effects: nopush_entry is set to the (post push) entry point
 1540   //               so it can be used by the corresponding conjoint
 1541   //               copy method
 1542   //
 1543   address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
 1544     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1545     RegSet saved_reg = RegSet::of(s, d, count);
 1546     int size;
 1547     bool aligned;
 1548     bool is_oop;
 1549     bool dest_uninitialized;
 1550     switch (stub_id) {
 1551     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
 1552       size = sizeof(jbyte);
 1553       aligned = false;
 1554       is_oop = false;
 1555       dest_uninitialized = false;
 1556       break;
 1557     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
 1558       size = sizeof(jbyte);
 1559       aligned = true;
 1560       is_oop = false;
 1561       dest_uninitialized = false;
 1562       break;
 1563     case StubId::stubgen_jshort_disjoint_arraycopy_id:
 1564       size = sizeof(jshort);
 1565       aligned = false;
 1566       is_oop = false;
 1567       dest_uninitialized = false;
 1568       break;
 1569     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
 1570       size = sizeof(jshort);
 1571       aligned = true;
 1572       is_oop = false;
 1573       dest_uninitialized = false;
 1574       break;
 1575     case StubId::stubgen_jint_disjoint_arraycopy_id:
 1576       size = sizeof(jint);
 1577       aligned = false;
 1578       is_oop = false;
 1579       dest_uninitialized = false;
 1580       break;
 1581     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
 1582       size = sizeof(jint);
 1583       aligned = true;
 1584       is_oop = false;
 1585       dest_uninitialized = false;
 1586       break;
 1587     case StubId::stubgen_jlong_disjoint_arraycopy_id:
 1588       // since this is always aligned we can (should!) use the same
 1589       // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
 1590       ShouldNotReachHere();
 1591       break;
 1592     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
 1593       size = sizeof(jlong);
 1594       aligned = true;
 1595       is_oop = false;
 1596       dest_uninitialized = false;
 1597       break;
 1598     case StubId::stubgen_oop_disjoint_arraycopy_id:
 1599       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1600       aligned = !UseCompressedOops;
 1601       is_oop = true;
 1602       dest_uninitialized = false;
 1603       break;
 1604     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
 1605       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1606       aligned = !UseCompressedOops;
 1607       is_oop = true;
 1608       dest_uninitialized = false;
 1609       break;
 1610     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
 1611       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1612       aligned = !UseCompressedOops;
 1613       is_oop = true;
 1614       dest_uninitialized = true;
 1615       break;
 1616     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
 1617       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1618       aligned = !UseCompressedOops;
 1619       is_oop = true;
 1620       dest_uninitialized = true;
 1621       break;
 1622     default:
 1623       ShouldNotReachHere();
 1624       break;
 1625     }
 1626 
 1627     __ align(CodeEntryAlignment);
 1628     StubCodeMark mark(this, stub_id);
 1629     address start = __ pc();
 1630     __ enter();
 1631 
 1632     if (nopush_entry != nullptr) {
 1633       *nopush_entry = __ pc();
 1634       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1635       BLOCK_COMMENT("Entry:");
 1636     }
 1637 
 1638     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1639     if (dest_uninitialized) {
 1640       decorators |= IS_DEST_UNINITIALIZED;
 1641     }
 1642     if (aligned) {
 1643       decorators |= ARRAYCOPY_ALIGNED;
 1644     }
 1645 
 1646     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1647     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1648 
 1649     if (is_oop) {
 1650       // save regs before copy_memory
 1651       __ push(RegSet::of(d, count), sp);
 1652     }
 1653     {
 1654       // UnsafeMemoryAccess page error: continue after unsafe access
 1655       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1656       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1657       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1658     }
 1659 
 1660     if (is_oop) {
 1661       __ pop(RegSet::of(d, count), sp);
 1662       if (VerifyOops)
 1663         verify_oop_array(size, d, count, r16);
 1664     }
 1665 
 1666     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1667 
 1668     __ leave();
 1669     __ mov(r0, zr); // return 0
 1670     __ ret(lr);
 1671     return start;
 1672   }
 1673 
 1674   // Arguments:
 1675   //   stub_id - is used to name the stub and identify all details of
 1676   //             how to perform the copy.
 1677   //
 1678   //   nooverlap_target - identifies the (post push) entry for the
 1679   //             corresponding disjoint copy routine which can be
 1680   //             jumped to if the ranges do not actually overlap
 1681   //
 1682   //   nopush_entry - is assigned the stub's post-push entry point
 1683   //           unless it is null
 1684   //
 1685   //
 1686   // Inputs:
 1687   //   c_rarg0   - source array address
 1688   //   c_rarg1   - destination array address
 1689   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1690   //
 1691   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1692   // the hardware handle it.  The two dwords within qwords that span
 1693   // cache line boundaries will still be loaded and stored atomically.
 1694   //
 1695   // Side Effects:
 1696   //   nopush_entry is set to the no-overlap entry point so it can be
 1697   //   used by some other conjoint copy method
 1698   //
 1699   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
 1700     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1701     RegSet saved_regs = RegSet::of(s, d, count);
 1702     int size;
 1703     bool aligned;
 1704     bool is_oop;
 1705     bool dest_uninitialized;
 1706     switch (stub_id) {
 1707     case StubId::stubgen_jbyte_arraycopy_id:
 1708       size = sizeof(jbyte);
 1709       aligned = false;
 1710       is_oop = false;
 1711       dest_uninitialized = false;
 1712       break;
 1713     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
 1714       size = sizeof(jbyte);
 1715       aligned = true;
 1716       is_oop = false;
 1717       dest_uninitialized = false;
 1718       break;
 1719     case StubId::stubgen_jshort_arraycopy_id:
 1720       size = sizeof(jshort);
 1721       aligned = false;
 1722       is_oop = false;
 1723       dest_uninitialized = false;
 1724       break;
 1725     case StubId::stubgen_arrayof_jshort_arraycopy_id:
 1726       size = sizeof(jshort);
 1727       aligned = true;
 1728       is_oop = false;
 1729       dest_uninitialized = false;
 1730       break;
 1731     case StubId::stubgen_jint_arraycopy_id:
 1732       size = sizeof(jint);
 1733       aligned = false;
 1734       is_oop = false;
 1735       dest_uninitialized = false;
 1736       break;
 1737     case StubId::stubgen_arrayof_jint_arraycopy_id:
 1738       size = sizeof(jint);
 1739       aligned = true;
 1740       is_oop = false;
 1741       dest_uninitialized = false;
 1742       break;
 1743     case StubId::stubgen_jlong_arraycopy_id:
 1744       // since this is always aligned we can (should!) use the same
 1745       // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
 1746       ShouldNotReachHere();
 1747       break;
 1748     case StubId::stubgen_arrayof_jlong_arraycopy_id:
 1749       size = sizeof(jlong);
 1750       aligned = true;
 1751       is_oop = false;
 1752       dest_uninitialized = false;
 1753       break;
 1754     case StubId::stubgen_oop_arraycopy_id:
 1755       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1756       aligned = !UseCompressedOops;
 1757       is_oop = true;
 1758       dest_uninitialized = false;
 1759       break;
 1760     case StubId::stubgen_arrayof_oop_arraycopy_id:
 1761       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1762       aligned = !UseCompressedOops;
 1763       is_oop = true;
 1764       dest_uninitialized = false;
 1765       break;
 1766     case StubId::stubgen_oop_arraycopy_uninit_id:
 1767       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1768       aligned = !UseCompressedOops;
 1769       is_oop = true;
 1770       dest_uninitialized = true;
 1771       break;
 1772     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
 1773       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1774       aligned = !UseCompressedOops;
 1775       is_oop = true;
 1776       dest_uninitialized = true;
 1777       break;
 1778     default:
 1779       ShouldNotReachHere();
 1780     }
 1781 
 1782     StubCodeMark mark(this, stub_id);
 1783     address start = __ pc();
 1784     __ enter();
 1785 
 1786     if (nopush_entry != nullptr) {
 1787       *nopush_entry = __ pc();
 1788       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1789       BLOCK_COMMENT("Entry:");
 1790     }
 1791 
 1792     // use fwd copy when (d-s) above_equal (count*size)
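          // Equivalently (sketch): tail-call the disjoint stub when a forward
          // copy cannot clobber not-yet-read source data, i.e. when
          //   (uintptr_t)(d - s) >= (uintptr_t)count << exact_log2(size)
          // (an unsigned compare, so d < s also takes the forward path).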
 1793     Label L_overlapping;
 1794     __ sub(rscratch1, d, s);
 1795     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1796     __ br(Assembler::LO, L_overlapping);
 1797     __ b(RuntimeAddress(nooverlap_target));
 1798     __ bind(L_overlapping);
 1799 
 1800     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1801     if (dest_uninitialized) {
 1802       decorators |= IS_DEST_UNINITIALIZED;
 1803     }
 1804     if (aligned) {
 1805       decorators |= ARRAYCOPY_ALIGNED;
 1806     }
 1807 
 1808     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1809     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1810 
 1811     if (is_oop) {
 1812       // save regs before copy_memory
 1813       __ push(RegSet::of(d, count), sp);
 1814     }
 1815     {
 1816       // UnsafeMemoryAccess page error: continue after unsafe access
 1817       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1818       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1819       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1820     }
 1821     if (is_oop) {
 1822       __ pop(RegSet::of(d, count), sp);
 1823       if (VerifyOops)
 1824         verify_oop_array(size, d, count, r16);
 1825     }
 1826     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1827     __ leave();
 1828     __ mov(r0, zr); // return 0
 1829     __ ret(lr);
 1830     return start;
 1831   }
 1832 
 1833   // Helper for generating a dynamic type check.
 1834   // Smashes rscratch1, rscratch2.
 1835   void generate_type_check(Register sub_klass,
 1836                            Register super_check_offset,
 1837                            Register super_klass,
 1838                            Register temp1,
 1839                            Register temp2,
 1840                            Register result,
 1841                            Label& L_success) {
 1842     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1843 
 1844     BLOCK_COMMENT("type_check:");
 1845 
 1846     Label L_miss;
 1847 
 1848     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1849                                      super_check_offset);
 1850     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1851 
 1852     // Fall through on failure!
 1853     __ BIND(L_miss);
 1854   }
 1855 
 1856   //
 1857   //  Generate checkcasting array copy stub
 1858   //
 1859   //  Input:
 1860   //    c_rarg0   - source array address
 1861   //    c_rarg1   - destination array address
 1862   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1863   //    c_rarg3   - size_t ckoff (super_check_offset)
 1864   //    c_rarg4   - oop ckval (super_klass)
 1865   //
 1866   //  Output:
 1867   //    r0 ==  0  -  success
 1868   //    r0 == -1^K - failure, where K is partial transfer count
 1869   //
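        //  e.g. (illustrative) if 2 of 5 elements are copied before a type
        //  check fails, K == 2 and the stub returns ~2 == -3; the caller can
        //  recover K as ~r0.
        //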
 1870   address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
 1871     bool dest_uninitialized;
 1872     switch (stub_id) {
 1873     case StubId::stubgen_checkcast_arraycopy_id:
 1874       dest_uninitialized = false;
 1875       break;
 1876     case StubId::stubgen_checkcast_arraycopy_uninit_id:
 1877       dest_uninitialized = true;
 1878       break;
 1879     default:
 1880       ShouldNotReachHere();
 1881     }
 1882 
 1883     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1884 
 1885     // Input registers (after setup_arg_regs)
 1886     const Register from        = c_rarg0;   // source array address
 1887     const Register to          = c_rarg1;   // destination array address
 1888     const Register count       = c_rarg2;   // elements count
 1889     const Register ckoff       = c_rarg3;   // super_check_offset
 1890     const Register ckval       = c_rarg4;   // super_klass
 1891 
 1892     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1893     RegSet wb_post_saved_regs = RegSet::of(count);
 1894 
 1895     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1896     const Register copied_oop  = r22;       // actual oop copied
 1897     const Register count_save  = r21;       // orig elements count
 1898     const Register start_to    = r20;       // destination array start address
 1899     const Register r19_klass   = r19;       // oop._klass
 1900 
 1901     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1902     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1903 
 1904     //---------------------------------------------------------------
 1905     // Assembler stub will be used for this call to arraycopy
 1906     // if the two arrays are subtypes of Object[] but the
 1907     // destination array type is not equal to or a supertype
 1908     // of the source type.  Each element must be separately
 1909     // checked.
 1910 
 1911     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1912                                copied_oop, r19_klass, count_save);
 1913 
 1914     __ align(CodeEntryAlignment);
 1915     StubCodeMark mark(this, stub_id);
 1916     address start = __ pc();
 1917 
 1918     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1919 
 1920 #ifdef ASSERT
 1921     // caller guarantees that the arrays really are different
 1922     // otherwise, we would have to make conjoint checks
 1923     { Label L;
 1924       __ b(L);                  // conjoint check not yet implemented
 1925       __ stop("checkcast_copy within a single array");
 1926       __ bind(L);
 1927     }
 1928 #endif //ASSERT
 1929 
 1930     // Caller of this entry point must set up the argument registers.
 1931     if (nopush_entry != nullptr) {
 1932       *nopush_entry = __ pc();
 1933       BLOCK_COMMENT("Entry:");
 1934     }
 1935 
 1936     // Empty array: nothing to do.
 1937     __ cbz(count, L_done);
 1938     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1939 
 1940 #ifdef ASSERT
 1941     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1942     // The ckoff and ckval must be mutually consistent,
 1943     // even though caller generates both.
 1944     { Label L;
 1945       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1946       __ ldrw(start_to, Address(ckval, sco_offset));
 1947       __ cmpw(ckoff, start_to);
 1948       __ br(Assembler::EQ, L);
 1949       __ stop("super_check_offset inconsistent");
 1950       __ bind(L);
 1951     }
 1952 #endif //ASSERT
 1953 
 1954     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1955     bool is_oop = true;
 1956     int element_size = UseCompressedOops ? 4 : 8;
 1957     if (dest_uninitialized) {
 1958       decorators |= IS_DEST_UNINITIALIZED;
 1959     }
 1960 
 1961     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1962     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1963 
 1964     // save the original count
 1965     __ mov(count_save, count);
 1966 
 1967     // Copy from low to high addresses
 1968     __ mov(start_to, to);              // Save destination array start address
 1969     __ b(L_load_element);
 1970 
 1971     // ======== begin loop ========
 1972     // (Loop is rotated; its entry is L_load_element.)
 1973     // Loop control:
 1974     //   for (; count != 0; count--) {
 1975     //     copied_oop = load_heap_oop(from++);
 1976     //     ... generate_type_check ...;
 1977     //     store_heap_oop(to++, copied_oop);
 1978     //   }
 1979     __ align(OptoLoopAlignment);
 1980 
 1981     __ BIND(L_store_element);
 1982     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1983                       __ post(to, element_size), copied_oop, noreg,
 1984                       gct1, gct2, gct3);
 1985     __ sub(count, count, 1);
 1986     __ cbz(count, L_do_card_marks);
 1987 
 1988     // ======== loop entry is here ========
 1989     __ BIND(L_load_element);
 1990     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1991                      copied_oop, noreg, __ post(from, element_size),
 1992                      gct1);
 1993     __ cbz(copied_oop, L_store_element);
 1994 
 1995     __ load_klass(r19_klass, copied_oop);// query the object klass
 1996 
 1997     BLOCK_COMMENT("type_check:");
 1998     generate_type_check(/*sub_klass*/r19_klass,
 1999                         /*super_check_offset*/ckoff,
 2000                         /*super_klass*/ckval,
 2001                         /*r_array_base*/gct1,
 2002                         /*temp2*/gct2,
 2003                         /*result*/r10, L_store_element);
 2004 
 2005     // Fall through on failure!
 2006 
 2007     // ======== end loop ========
 2008 
 2009     // It was a real error; we must depend on the caller to finish the job.
 2010     // Register count = remaining oops, count_orig = total oops.
 2011     // Emit GC store barriers for the oops we have copied and report
 2012     // their number to the caller.
 2013 
 2014     __ subs(count, count_save, count);     // K = partially copied oop count
 2015     __ eon(count, count, zr);              // report (-1^K) to caller
 2016     __ br(Assembler::EQ, L_done_pop);
 2017 
 2018     __ BIND(L_do_card_marks);
 2019     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2020 
 2021     __ bind(L_done_pop);
 2022     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2023     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2024 
 2025     __ bind(L_done);
 2026     __ mov(r0, count);
 2027     __ leave();
 2028     __ ret(lr);
 2029 
 2030     return start;
 2031   }
 2032 
 2033   // Perform range checks on the proposed arraycopy.
 2034   // Kills temp, but nothing else.
 2035   // Also, clean the sign bits of src_pos and dst_pos.
 2036   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2037                               Register src_pos, // source position (c_rarg1)
 2038                               Register dst,     // destination array oop (c_rarg2)
 2039                               Register dst_pos, // destination position (c_rarg3)
 2040                               Register length,
 2041                               Register temp,
 2042                               Label& L_failed) {
 2043     BLOCK_COMMENT("arraycopy_range_checks:");
 2044 
 2045     assert_different_registers(rscratch1, temp);
 2046 
 2047     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2048     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2049     __ addw(temp, length, src_pos);
 2050     __ cmpw(temp, rscratch1);
 2051     __ br(Assembler::HI, L_failed);
 2052 
 2053     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2054     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2055     __ addw(temp, length, dst_pos);
 2056     __ cmpw(temp, rscratch1);
 2057     __ br(Assembler::HI, L_failed);
 2058 
 2059     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2060     __ movw(src_pos, src_pos);
 2061     __ movw(dst_pos, dst_pos);
 2062 
 2063     BLOCK_COMMENT("arraycopy_range_checks done");
 2064   }
 2065 
 2066   // These stubs get called from some dumb test routine.
 2067   // I'll write them properly when they're called from
 2068   // something that's actually doing something.
 2069   static void fake_arraycopy_stub(address src, address dst, int count) {
 2070     assert(count == 0, "huh?");
 2071   }
 2072 
 2073 
 2074   //
 2075   //  Generate 'unsafe' array copy stub
 2076   //  Though just as safe as the other stubs, it takes an unscaled
 2077   //  size_t argument instead of an element count.
 2078   //
 2079   //  Input:
 2080   //    c_rarg0   - source array address
 2081   //    c_rarg1   - destination array address
 2082   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2083   //
 2084   // Examines the alignment of the operands and dispatches
 2085   // to a long, int, short, or byte copy loop.
 2086   //
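        //  e.g. (illustrative, assumed values) s = 0x...1004, d = 0x...2008,
        //  count = 24: (s | d | count) has non-zero low 3 bits but zero low
        //  2 bits, so the stub scales count to 24 / 4 = 6 ints and branches
        //  to int_copy_entry.
        //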
 2087   address generate_unsafe_copy(address byte_copy_entry,
 2088                                address short_copy_entry,
 2089                                address int_copy_entry,
 2090                                address long_copy_entry) {
 2091     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
 2092 
 2093     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2094     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2095 
 2096     __ align(CodeEntryAlignment);
 2097     StubCodeMark mark(this, stub_id);
 2098     address start = __ pc();
 2099     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2100 
 2101     // bump this on entry, not on exit:
 2102     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2103 
 2104     __ orr(rscratch1, s, d);
 2105     __ orr(rscratch1, rscratch1, count);
 2106 
 2107     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2108     __ cbz(rscratch1, L_long_aligned);
 2109     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2110     __ cbz(rscratch1, L_int_aligned);
 2111     __ tbz(rscratch1, 0, L_short_aligned);
 2112     __ b(RuntimeAddress(byte_copy_entry));
 2113 
 2114     __ BIND(L_short_aligned);
 2115     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2116     __ b(RuntimeAddress(short_copy_entry));
 2117     __ BIND(L_int_aligned);
 2118     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2119     __ b(RuntimeAddress(int_copy_entry));
 2120     __ BIND(L_long_aligned);
 2121     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2122     __ b(RuntimeAddress(long_copy_entry));
 2123 
 2124     return start;
 2125   }
 2126 
 2127   //
 2128   //  Generate generic array copy stubs
 2129   //
 2130   //  Input:
 2131   //    c_rarg0    -  src oop
 2132   //    c_rarg1    -  src_pos (32-bits)
 2133   //    c_rarg2    -  dst oop
 2134   //    c_rarg3    -  dst_pos (32-bits)
 2135   //    c_rarg4    -  element count (32-bits)
 2136   //
 2137   //  Output:
 2138   //    r0 ==  0  -  success
 2139   //    r0 == -1^K - failure, where K is partial transfer count
 2140   //
 2141   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2142                                 address int_copy_entry, address oop_copy_entry,
 2143                                 address long_copy_entry, address checkcast_copy_entry) {
 2144     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
 2145 
 2146     Label L_failed, L_objArray;
 2147     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2148 
 2149     // Input registers
 2150     const Register src        = c_rarg0;  // source array oop
 2151     const Register src_pos    = c_rarg1;  // source position
 2152     const Register dst        = c_rarg2;  // destination array oop
 2153     const Register dst_pos    = c_rarg3;  // destination position
 2154     const Register length     = c_rarg4;
 2155 
 2156 
 2157     // Registers used as temps
 2158     const Register dst_klass  = c_rarg5;
 2159 
 2160     __ align(CodeEntryAlignment);
 2161 
 2162     StubCodeMark mark(this, stub_id);
 2163 
 2164     address start = __ pc();
 2165 
 2166     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2167 
 2168     // bump this on entry, not on exit:
 2169     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2170 
 2171     //-----------------------------------------------------------------------
 2172     // Assembler stub will be used for this call to arraycopy
 2173     // if the following conditions are met:
 2174     //
 2175     // (1) src and dst must not be null.
 2176     // (2) src_pos must not be negative.
 2177     // (3) dst_pos must not be negative.
 2178     // (4) length  must not be negative.
 2179     // (5) src klass and dst klass should be the same and not null.
 2180     // (6) src and dst should be arrays.
 2181     // (7) src_pos + length must not exceed length of src.
 2182     // (8) dst_pos + length must not exceed length of dst.
 2183     //
 2184 
 2185     //  if (src == nullptr) return -1;
 2186     __ cbz(src, L_failed);
 2187 
 2188     //  if (src_pos < 0) return -1;
 2189     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2190 
 2191     //  if (dst == nullptr) return -1;
 2192     __ cbz(dst, L_failed);
 2193 
 2194     //  if (dst_pos < 0) return -1;
 2195     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2196 
 2197     // registers used as temp
 2198     const Register scratch_length    = r16; // elements count to copy
 2199     const Register scratch_src_klass = r17; // array klass
 2200     const Register lh                = r15; // layout helper
 2201 
 2202     //  if (length < 0) return -1;
 2203     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2204     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2205 
 2206     __ load_klass(scratch_src_klass, src);
 2207 #ifdef ASSERT
 2208     //  assert(src->klass() != nullptr);
 2209     {
 2210       BLOCK_COMMENT("assert klasses not null {");
 2211       Label L1, L2;
 2212       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2213       __ bind(L1);
 2214       __ stop("broken null klass");
 2215       __ bind(L2);
 2216       __ load_klass(rscratch1, dst);
 2217       __ cbz(rscratch1, L1);     // this would be broken also
 2218       BLOCK_COMMENT("} assert klasses not null done");
 2219     }
 2220 #endif
 2221 
 2222     // Load layout helper (32-bits)
 2223     //
 2224     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2225     // 32        30    24            16              8     2                 0
 2226     //
 2227     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2228     //
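          // For any array the tag bits make the layout helper negative, which
          // is why the "!src->is_Array()" test further below only needs to
          // check the sign bit (bit 31) of lh.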
 2229 
 2230     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2231 
 2232     // Handle objArrays completely differently...
 2233     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2234     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2235     __ movw(rscratch1, objArray_lh);
 2236     __ eorw(rscratch2, lh, rscratch1);
 2237     __ cbzw(rscratch2, L_objArray);
 2238 
 2239     //  if (src->klass() != dst->klass()) return -1;
 2240     __ load_klass(rscratch2, dst);
 2241     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2242     __ cbnz(rscratch2, L_failed);
 2243 
 2244     // Check for flat inline type array -> return -1
 2245     __ test_flat_array_oop(src, rscratch2, L_failed);
 2246 
 2247     // Check for null-free (non-flat) inline type array -> handle as object array
 2248     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2249 
 2250     //  if (!src->is_Array()) return -1;
 2251     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2252 
 2253     // At this point, it is known to be a typeArray (array_tag 0x3).
 2254 #ifdef ASSERT
 2255     {
 2256       BLOCK_COMMENT("assert primitive array {");
 2257       Label L;
 2258       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2259       __ cmpw(lh, rscratch2);
 2260       __ br(Assembler::GE, L);
 2261       __ stop("must be a primitive array");
 2262       __ bind(L);
 2263       BLOCK_COMMENT("} assert primitive array done");
 2264     }
 2265 #endif
 2266 
 2267     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2268                            rscratch2, L_failed);
 2269 
 2270     // TypeArrayKlass
 2271     //
 2272     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2273     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2274     //
 2275 
 2276     const Register rscratch1_offset = rscratch1;    // array offset
 2277     const Register r15_elsize = lh; // element size
 2278 
 2279     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2280            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2281     __ add(src, src, rscratch1_offset);           // src array offset
 2282     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2283     BLOCK_COMMENT("choose copy loop based on element size");
 2284 
 2285     // next registers should be set before the jump to corresponding stub
 2286     const Register from     = c_rarg0;  // source array address
 2287     const Register to       = c_rarg1;  // destination array address
 2288     const Register count    = c_rarg2;  // elements count
 2289 
 2290     // 'from', 'to', 'count' registers should be set in such order
 2291     // since they are the same as 'src', 'src_pos', 'dst'.
 2292 
 2293     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2294 
 2295     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2296     // size in bytes).  We do a simple bitwise binary search.
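          // Decision table for the tbnz tests below (illustrative):
          //   elsize == 0 (byte) : bit 1 clear, bit 0 clear -> fall through to byte copy
          //   elsize == 1 (short): bit 1 clear, bit 0 set   -> L_copy_shorts
          //   elsize == 2 (int)  : bit 1 set,   bit 0 clear -> L_copy_ints
          //   elsize == 3 (long) : bit 1 set,   bit 0 set   -> L_copy_ints, then L_copy_longs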
 2297   __ BIND(L_copy_bytes);
 2298     __ tbnz(r15_elsize, 1, L_copy_ints);
 2299     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2300     __ lea(from, Address(src, src_pos));// src_addr
 2301     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2302     __ movw(count, scratch_length); // length
 2303     __ b(RuntimeAddress(byte_copy_entry));
 2304 
 2305   __ BIND(L_copy_shorts);
 2306     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2307     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2308     __ movw(count, scratch_length); // length
 2309     __ b(RuntimeAddress(short_copy_entry));
 2310 
 2311   __ BIND(L_copy_ints);
 2312     __ tbnz(r15_elsize, 0, L_copy_longs);
 2313     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2314     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2315     __ movw(count, scratch_length); // length
 2316     __ b(RuntimeAddress(int_copy_entry));
 2317 
 2318   __ BIND(L_copy_longs);
 2319 #ifdef ASSERT
 2320     {
 2321       BLOCK_COMMENT("assert long copy {");
 2322       Label L;
 2323       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2324       __ cmpw(r15_elsize, LogBytesPerLong);
 2325       __ br(Assembler::EQ, L);
 2326       __ stop("must be long copy, but elsize is wrong");
 2327       __ bind(L);
 2328       BLOCK_COMMENT("} assert long copy done");
 2329     }
 2330 #endif
 2331     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2332     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2333     __ movw(count, scratch_length); // length
 2334     __ b(RuntimeAddress(long_copy_entry));
 2335 
 2336     // ObjArrayKlass
 2337   __ BIND(L_objArray);
 2338     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2339 
 2340     Label L_plain_copy, L_checkcast_copy;
 2341     //  test array classes for subtyping
 2342     __ load_klass(r15, dst);
 2343     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2344     __ br(Assembler::NE, L_checkcast_copy);
 2345 
 2346     // Identically typed arrays can be copied without element-wise checks.
 2347     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2348                            rscratch2, L_failed);
 2349 
 2350     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2351     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2352     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2353     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2354     __ movw(count, scratch_length); // length
 2355   __ BIND(L_plain_copy);
 2356     __ b(RuntimeAddress(oop_copy_entry));
 2357 
 2358   __ BIND(L_checkcast_copy);
 2359     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2360     {
 2361       // Before looking at dst.length, make sure dst is also an objArray.
 2362       __ ldrw(rscratch1, Address(r15, lh_offset));
 2363       __ movw(rscratch2, objArray_lh);
 2364       __ eorw(rscratch1, rscratch1, rscratch2);
 2365       __ cbnzw(rscratch1, L_failed);
 2366 
 2367       // It is safe to examine both src.length and dst.length.
 2368       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2369                              r15, L_failed);
 2370 
 2371       __ load_klass(dst_klass, dst); // reload
 2372 
 2373       // Marshal the base address arguments now, freeing registers.
 2374       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2375       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2376       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2377       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2378       __ movw(count, length);           // length (reloaded)
 2379       Register sco_temp = c_rarg3;      // this register is free now
 2380       assert_different_registers(from, to, count, sco_temp,
 2381                                  dst_klass, scratch_src_klass);
 2382       // assert_clean_int(count, sco_temp);
 2383 
 2384       // Generate the type check.
 2385       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2386       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2387 
 2388       // Smashes rscratch1, rscratch2
 2389       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2390                           L_plain_copy);
 2391 
 2392       // Fetch destination element klass from the ObjArrayKlass header.
 2393       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2394       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2395       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2396 
 2397       // the checkcast_copy loop needs two extra arguments:
 2398       assert(c_rarg3 == sco_temp, "#3 already in place");
 2399       // Set up arguments for checkcast_copy_entry.
 2400       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2401       __ b(RuntimeAddress(checkcast_copy_entry));
 2402     }
 2403 
 2404   __ BIND(L_failed);
 2405     __ mov(r0, -1);
 2406     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2407     __ ret(lr);
 2408 
 2409     return start;
 2410   }
 2411 
 2412   //
 2413   // Generate stub for array fill. If "aligned" is true, the
 2414   // "to" address is assumed to be heapword aligned.
 2415   //
 2416   // Arguments for generated stub:
 2417   //   to:    c_rarg0
 2418   //   value: c_rarg1
 2419   //   count: c_rarg2 treated as signed
 2420   //
 2421   address generate_fill(StubId stub_id) {
 2422     BasicType t;
 2423     bool aligned;
 2424 
 2425     switch (stub_id) {
 2426     case StubId::stubgen_jbyte_fill_id:
 2427       t = T_BYTE;
 2428       aligned = false;
 2429       break;
 2430     case StubId::stubgen_jshort_fill_id:
 2431       t = T_SHORT;
 2432       aligned = false;
 2433       break;
 2434     case StubId::stubgen_jint_fill_id:
 2435       t = T_INT;
 2436       aligned = false;
 2437       break;
 2438     case StubId::stubgen_arrayof_jbyte_fill_id:
 2439       t = T_BYTE;
 2440       aligned = true;
 2441       break;
 2442     case StubId::stubgen_arrayof_jshort_fill_id:
 2443       t = T_SHORT;
 2444       aligned = true;
 2445       break;
 2446     case StubId::stubgen_arrayof_jint_fill_id:
 2447       t = T_INT;
 2448       aligned = true;
 2449       break;
 2450     default:
 2451       ShouldNotReachHere();
 2452     };
 2453 
 2454     __ align(CodeEntryAlignment);
 2455     StubCodeMark mark(this, stub_id);
 2456     address start = __ pc();
 2457 
 2458     BLOCK_COMMENT("Entry:");
 2459 
 2460     const Register to        = c_rarg0;  // destination array address
 2461     const Register value     = c_rarg1;  // value
 2462     const Register count     = c_rarg2;  // elements count
 2463 
 2464     const Register bz_base = r10;        // base for block_zero routine
 2465     const Register cnt_words = r11;      // temp register
 2466 
 2467     __ enter();
 2468 
 2469     Label L_fill_elements, L_exit1;
 2470 
 2471     int shift = -1;
 2472     switch (t) {
 2473       case T_BYTE:
 2474         shift = 0;
 2475         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2476         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2477         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2478         __ br(Assembler::LO, L_fill_elements);
 2479         break;
 2480       case T_SHORT:
 2481         shift = 1;
 2482         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2483         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2484         __ br(Assembler::LO, L_fill_elements);
 2485         break;
 2486       case T_INT:
 2487         shift = 2;
 2488         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2489         __ br(Assembler::LO, L_fill_elements);
 2490         break;
 2491       default: ShouldNotReachHere();
 2492     }
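          // At this point 'value' holds the fill pattern replicated to 32 bits,
          // e.g. (illustrative) a byte fill of 0xAB gives value == 0xABABABAB;
          // the bfi(value, value, 32, 32) further below widens it to 64 bits.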
 2493 
 2494     // Align the destination address on an 8-byte boundary.
 2495     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2496     if (!aligned) {
 2497       switch (t) {
 2498         case T_BYTE:
 2499           // One byte misalignment happens only for byte arrays.
 2500           __ tbz(to, 0, L_skip_align1);
 2501           __ strb(value, Address(__ post(to, 1)));
 2502           __ subw(count, count, 1);
 2503           __ bind(L_skip_align1);
 2504           // Fallthrough
 2505         case T_SHORT:
 2506           // Two bytes misalignment happens only for byte and short (char) arrays.
 2507           __ tbz(to, 1, L_skip_align2);
 2508           __ strh(value, Address(__ post(to, 2)));
 2509           __ subw(count, count, 2 >> shift);
 2510           __ bind(L_skip_align2);
 2511           // Fallthrough
 2512         case T_INT:
 2513           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2514           __ tbz(to, 2, L_skip_align4);
 2515           __ strw(value, Address(__ post(to, 4)));
 2516           __ subw(count, count, 4 >> shift);
 2517           __ bind(L_skip_align4);
 2518           break;
 2519         default: ShouldNotReachHere();
 2520       }
 2521     }
 2522 
 2523     //
 2524     //  Fill large chunks
 2525     //
 2526     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2527     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2528     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
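          // Illustrative example (assumed values): a T_SHORT fill with 35
          // elements left at this point gives cnt_words = 35 >> 2 = 8 (64 bytes)
          // and leaves count = 35 - 8*4 = 3 elements for the tail code below.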
 2529     if (UseBlockZeroing) {
 2530       Label non_block_zeroing, rest;
 2531       // If the fill value is zero we can use the fast zero_words().
 2532       __ cbnz(value, non_block_zeroing);
 2533       __ mov(bz_base, to);
 2534       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2535       address tpc = __ zero_words(bz_base, cnt_words);
 2536       if (tpc == nullptr) {
 2537         fatal("CodeCache is full at generate_fill");
 2538       }
 2539       __ b(rest);
 2540       __ bind(non_block_zeroing);
 2541       __ fill_words(to, cnt_words, value);
 2542       __ bind(rest);
 2543     } else {
 2544       __ fill_words(to, cnt_words, value);
 2545     }
 2546 
 2547     // Remaining count is less than 8 bytes. Fill it by a single store.
 2548     // Note that the total length is no less than 8 bytes.
 2549     if (t == T_BYTE || t == T_SHORT) {
 2550       Label L_exit1;
 2551       __ cbzw(count, L_exit1);
 2552       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2553       __ str(value, Address(to, -8));    // overwrite some elements
 2554       __ bind(L_exit1);
 2555       __ leave();
 2556       __ ret(lr);
 2557     }
 2558 
 2559     // Handle fills of less than 8 bytes.
 2560     Label L_fill_2, L_fill_4, L_exit2;
 2561     __ bind(L_fill_elements);
 2562     switch (t) {
 2563       case T_BYTE:
 2564         __ tbz(count, 0, L_fill_2);
 2565         __ strb(value, Address(__ post(to, 1)));
 2566         __ bind(L_fill_2);
 2567         __ tbz(count, 1, L_fill_4);
 2568         __ strh(value, Address(__ post(to, 2)));
 2569         __ bind(L_fill_4);
 2570         __ tbz(count, 2, L_exit2);
 2571         __ strw(value, Address(to));
 2572         break;
 2573       case T_SHORT:
 2574         __ tbz(count, 0, L_fill_4);
 2575         __ strh(value, Address(__ post(to, 2)));
 2576         __ bind(L_fill_4);
 2577         __ tbz(count, 1, L_exit2);
 2578         __ strw(value, Address(to));
 2579         break;
 2580       case T_INT:
 2581         __ cbzw(count, L_exit2);
 2582         __ strw(value, Address(to));
 2583         break;
 2584       default: ShouldNotReachHere();
 2585     }
 2586     __ bind(L_exit2);
 2587     __ leave();
 2588     __ ret(lr);
 2589     return start;
 2590   }
 2591 
 2592   address generate_unsafecopy_common_error_exit() {
 2593     address start_pc = __ pc();
 2594     __ leave();
 2595     __ mov(r0, 0);
 2596     __ ret(lr);
 2597     return start_pc;
 2598   }
 2599 
 2600   //
 2601   //  Generate 'unsafe' set memory stub
 2602   //  Though just as safe as the other stubs, it takes an unscaled
 2603   //  size_t (# bytes) argument instead of an element count.
 2604   //
 2605   //  This fill operation is atomicity preserving: as long as the
 2606   //  address supplied is sufficiently aligned, all writes of up to 64
 2607   //  bits in size are single-copy atomic.
 2608   //
 2609   //  Input:
 2610   //    c_rarg0   - destination array address
 2611   //    c_rarg1   - byte count (size_t)
 2612   //    c_rarg2   - byte value
 2613   //
 2614   address generate_unsafe_setmemory() {
 2615     __ align(CodeEntryAlignment);
 2616     StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
 2617     address start = __ pc();
 2618 
 2619     Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
 2620     Label tail;
 2621 
 2622     UnsafeMemoryAccessMark umam(this, true, false);
 2623 
 2624     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2625 
 2626     __ dup(v0, __ T16B, value);
 2627 
 2628     if (AvoidUnalignedAccesses) {
 2629       __ cmp(count, (u1)16);
 2630       __ br(__ LO, tail);
 2631 
 2632       __ mov(rscratch1, 16);
 2633       __ andr(rscratch2, dest, 15);
 2634       __ sub(rscratch1, rscratch1, rscratch2);  // Bytes needed to 16-align dest
 2635       __ strq(v0, Address(dest));
 2636       __ sub(count, count, rscratch1);
 2637       __ add(dest, dest, rscratch1);
 2638     }
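          // Note: when dest was not 16-byte aligned, the strq above and the
          // first aligned store overlap by up to 15 bytes; that is harmless
          // because both stores write the same replicated value.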
 2639 
 2640     __ subs(count, count, (u1)64);
 2641     __ br(__ LO, tail);
 2642     {
 2643       Label again;
 2644       __ bind(again);
 2645       __ stpq(v0, v0, Address(dest));
 2646       __ stpq(v0, v0, Address(dest, 32));
 2647 
 2648       __ subs(count, count, 64);
 2649       __ add(dest, dest, 64);
 2650       __ br(__ HS, again);
 2651     }
 2652 
 2653     __ bind(tail);
 2654     // The count of bytes is off by 64, but we don't need to correct
 2655     // it because we're only going to use the least-significant few
 2656     // count bits from here on.
 2657     // __ add(count, count, 64);
 2658 
 2659     {
 2660       Label dont;
 2661       __ tbz(count, exact_log2(32), dont);
 2662       __ stpq(v0, v0, __ post(dest, 32));
 2663       __ bind(dont);
 2664     }
 2665     {
 2666       Label dont;
 2667       __ tbz(count, exact_log2(16), dont);
 2668       __ strq(v0, __ post(dest, 16));
 2669       __ bind(dont);
 2670     }
 2671     {
 2672       Label dont;
 2673       __ tbz(count, exact_log2(8), dont);
 2674       __ strd(v0, __ post(dest, 8));
 2675       __ bind(dont);
 2676     }
 2677 
 2678     Label finished;
 2679     __ tst(count, 7);
 2680     __ br(__ EQ, finished);
 2681 
 2682     {
 2683       Label dont;
 2684       __ tbz(count, exact_log2(4), dont);
 2685       __ strs(v0, __ post(dest, 4));
 2686       __ bind(dont);
 2687     }
 2688     {
 2689       Label dont;
 2690       __ tbz(count, exact_log2(2), dont);
 2691       __ bfi(value, value, 8, 8);
 2692       __ strh(value, __ post(dest, 2));
 2693       __ bind(dont);
 2694     }
 2695     {
 2696       Label dont;
 2697       __ tbz(count, exact_log2(1), dont);
 2698       __ strb(value, Address(dest));
 2699       __ bind(dont);
 2700     }
 2701 
 2702     __ bind(finished);
 2703     __ leave();
 2704     __ ret(lr);
 2705 
 2706     return start;
 2707   }
 2708 
 2709   address generate_data_cache_writeback() {
 2710     const Register line        = c_rarg0;  // address of line to write back
 2711 
 2712     __ align(CodeEntryAlignment);
 2713 
 2714     StubId stub_id = StubId::stubgen_data_cache_writeback_id;
 2715     StubCodeMark mark(this, stub_id);
 2716 
 2717     address start = __ pc();
 2718     __ enter();
 2719     __ cache_wb(Address(line, 0));
 2720     __ leave();
 2721     __ ret(lr);
 2722 
 2723     return start;
 2724   }
 2725 
 2726   address generate_data_cache_writeback_sync() {
 2727     const Register is_pre     = c_rarg0;  // pre or post sync
 2728 
 2729     __ align(CodeEntryAlignment);
 2730 
 2731     StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
 2732     StubCodeMark mark(this, stub_id);
 2733 
 2734     // pre wbsync is a no-op
 2735     // post wbsync translates to an sfence
 2736 
 2737     Label skip;
 2738     address start = __ pc();
 2739     __ enter();
 2740     __ cbnz(is_pre, skip);
 2741     __ cache_wbsync(false);
 2742     __ bind(skip);
 2743     __ leave();
 2744     __ ret(lr);
 2745 
 2746     return start;
 2747   }
 2748 
 2749   void generate_arraycopy_stubs() {
 2750     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
 2751     // entry immediately following their stack push. This can be used
 2752     // as a post-push branch target by compatible stubs when they
 2753     // identify a special case that can be handled by the fallback
 2754     // stub, e.g. a disjoint copy stub may be used as a special case
 2755     // fallback for its compatible conjoint copy stub.
 2756     //
 2757     // A nopush entry is always returned in the following local and
 2758     // then published by assigning to the appropriate entry field in
 2759     // class StubRoutines. The entry value is then passed to the
 2760     // generator of the compatible stub. That means the entry must be
 2761     // listed when saving to/restoring from the AOT cache, ensuring
 2762     // that the inter-stub jumps are noted at AOT-cache save and
 2763     // relocated at AOT-cache load.
 2764     address nopush_entry;
 2765 
 2766     // generate the common exit first so later stubs can rely on it if
 2767     // they want an UnsafeMemoryAccess exit non-local to the stub
 2768     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
 2769     // register the stub as the default exit with class UnsafeMemoryAccess
 2770     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
 2771 
 2772     // generate and publish aarch64-specific bulk copy routines first
 2773     // so we can call them from other copy stubs
 2774     StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2775     StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2776 
 2777     StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2778     StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
 2779 
 2780     StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2781     StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
 2782 
 2783     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2784 
 2785     //*** jbyte
 2786     // Always need aligned and unaligned versions
 2787     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2788     // disjoint nopush entry is needed by conjoint copy
 2789     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2790     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
 2791     // conjoint nopush entry is needed by generic/unsafe copy
 2792     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
 2793     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
 2794     // disjoint arrayof nopush entry is needed by conjoint copy
 2795     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
 2796     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
 2797 
 2798     //*** jshort
 2799     // Always need aligned and unaligned versions
 2800     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
 2801     // disjoint nopush entry is needed by conjoint copy
 2802     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
 2803     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
 2804     // conjoint nopush entry is used by generic/unsafe copy
 2805     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
 2806     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
 2807     // disjoint arrayof nopush entry is needed by conjoint copy
 2808     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
 2809     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
 2810 
 2811     //*** jint
 2812     // Aligned versions
 2813     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
 2814     // disjoint arrayof nopush entry is needed by conjoint copy
 2815     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
 2816     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
 2817     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
 2818     // jint_arraycopy_nopush always points to the unaligned version.
 2819     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
 2820     // disjoint nopush entry is needed by conjoint copy
 2821     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
 2822     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
 2823     // conjoint nopush entry is needed by generic/unsafe copy
 2824     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
 2825 
 2826     //*** jlong
 2827     // It is always aligned
 2828     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
 2829     // disjoint arrayof nopush entry is needed by conjoint copy
 2830     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
 2831     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
 2832     // conjoint nopush entry is needed by generic/unsafe copy
 2833     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
 2834     // disjoint normal/nopush and conjoint normal entries are not
 2835     // generated since the arrayof versions are the same
 2836     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2837     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
 2838     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2839 
 2840     //*** oops
 2841     {
 2842       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2843         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
 2844       // disjoint arrayof nopush entry is needed by conjoint copy
 2845       StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
 2846       StubRoutines::_arrayof_oop_arraycopy
 2847         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
 2848       // conjoint arrayof nopush entry is needed by generic/unsafe copy
 2849       StubRoutines::_oop_arraycopy_nopush = nopush_entry;
 2850       // Aligned versions without pre-barriers
 2851       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2852         = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
 2853       // disjoint arrayof+uninit nopush entry is needed by conjoint copy
 2854       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
 2855       // note that we don't need a returned nopush entry because the
 2856       // generic/unsafe copy does not cater for uninit arrays.
 2857       StubRoutines::_arrayof_oop_arraycopy_uninit
 2858         = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
 2859     }
 2860 
 2861     // for oop copies reuse arrayof entries for non-arrayof cases
 2862     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2863     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
 2864     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2865     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2866     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
 2867     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2868 
 2869     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
 2870     // checkcast nopush entry is needed by generic copy
 2871     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
 2872     // note that we don't need a returned nopush entry because the
 2873     // generic copy does not cater for uninit arrays.
 2874     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
 2875 
 2876     // unsafe arraycopy may fallback on conjoint stubs
 2877     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2878                                                               StubRoutines::_jshort_arraycopy_nopush,
 2879                                                               StubRoutines::_jint_arraycopy_nopush,
 2880                                                               StubRoutines::_jlong_arraycopy_nopush);
 2881 
 2882     // generic arraycopy may fallback on conjoint stubs
 2883     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
 2884                                                                StubRoutines::_jshort_arraycopy_nopush,
 2885                                                                StubRoutines::_jint_arraycopy_nopush,
 2886                                                                StubRoutines::_oop_arraycopy_nopush,
 2887                                                                StubRoutines::_jlong_arraycopy_nopush,
 2888                                                                StubRoutines::_checkcast_arraycopy_nopush);
 2889 
 2890     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
 2891     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
 2892     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
 2893     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
 2894     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
 2895     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
 2896   }
 2897 
 2898   void generate_math_stubs() { Unimplemented(); }
 2899 
 2900   // Arguments:
 2901   //
 2902   // Inputs:
 2903   //   c_rarg0   - source byte array address
 2904   //   c_rarg1   - destination byte array address
 2905   //   c_rarg2   - K (key) in little endian int array
 2906   //
 2907   address generate_aescrypt_encryptBlock() {
 2908     __ align(CodeEntryAlignment);
 2909     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
 2910     StubCodeMark mark(this, stub_id);
 2911 
 2912     const Register from        = c_rarg0;  // source array address
 2913     const Register to          = c_rarg1;  // destination array address
 2914     const Register key         = c_rarg2;  // key array address
 2915     const Register keylen      = rscratch1;
 2916 
 2917     address start = __ pc();
 2918     __ enter();
 2919 
 2920     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2921 
 2922     __ aesenc_loadkeys(key, keylen);
 2923     __ aesecb_encrypt(from, to, keylen);
 2924 
 2925     __ mov(r0, 0);
 2926 
 2927     __ leave();
 2928     __ ret(lr);
 2929 
 2930     return start;
 2931   }
 2932 
 2933   // Arguments:
 2934   //
 2935   // Inputs:
 2936   //   c_rarg0   - source byte array address
 2937   //   c_rarg1   - destination byte array address
 2938   //   c_rarg2   - K (key) in little endian int array
 2939   //
 2940   address generate_aescrypt_decryptBlock() {
 2941     assert(UseAES, "need AES cryptographic extension support");
 2942     __ align(CodeEntryAlignment);
 2943     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
 2944     StubCodeMark mark(this, stub_id);
 2945     Label L_doLast;
 2946 
 2947     const Register from        = c_rarg0;  // source array address
 2948     const Register to          = c_rarg1;  // destination array address
 2949     const Register key         = c_rarg2;  // key array address
 2950     const Register keylen      = rscratch1;
 2951 
 2952     address start = __ pc();
 2953     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2954 
 2955     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2956 
 2957     __ aesecb_decrypt(from, to, key, keylen);
 2958 
 2959     __ mov(r0, 0);
 2960 
 2961     __ leave();
 2962     __ ret(lr);
 2963 
 2964     return start;
 2965   }
 2966 
 2967   // Arguments:
 2968   //
 2969   // Inputs:
 2970   //   c_rarg0   - source byte array address
 2971   //   c_rarg1   - destination byte array address
 2972   //   c_rarg2   - K (key) in little endian int array
 2973   //   c_rarg3   - r vector byte array address
 2974   //   c_rarg4   - input length
 2975   //
 2976   // Output:
 2977   //   x0        - input length
 2978   //
 2979   address generate_cipherBlockChaining_encryptAESCrypt() {
 2980     assert(UseAES, "need AES cryptographic extension support");
 2981     __ align(CodeEntryAlignment);
 2982     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
 2983     StubCodeMark mark(this, stub_id);
 2984 
 2985     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2986 
 2987     const Register from        = c_rarg0;  // source array address
 2988     const Register to          = c_rarg1;  // destination array address
 2989     const Register key         = c_rarg2;  // key array address
 2990     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 2991                                            // and left with the results of the last encryption block
 2992     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2993     const Register keylen      = rscratch1;
 2994 
 2995     address start = __ pc();
 2996 
 2997       __ enter();
 2998 
 2999       __ movw(rscratch2, len_reg);
 3000 
 3001       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3002 
 3003       __ ld1(v0, __ T16B, rvec);
 3004 
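              // Informational note: the expanded AES key held in 'key' has
              // 4 * (rounds + 1) ints, i.e. 44 / 52 / 60 for AES-128 / -192 / -256
              // (10 / 12 / 14 rounds), which is why the code below branches on a
              // comparison of keylen with 52.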
 3005       __ cmpw(keylen, 52);
 3006       __ br(Assembler::CC, L_loadkeys_44);
 3007       __ br(Assembler::EQ, L_loadkeys_52);
 3008 
 3009       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3010       __ rev32(v17, __ T16B, v17);
 3011       __ rev32(v18, __ T16B, v18);
 3012     __ BIND(L_loadkeys_52);
 3013       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3014       __ rev32(v19, __ T16B, v19);
 3015       __ rev32(v20, __ T16B, v20);
 3016     __ BIND(L_loadkeys_44);
 3017       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3018       __ rev32(v21, __ T16B, v21);
 3019       __ rev32(v22, __ T16B, v22);
 3020       __ rev32(v23, __ T16B, v23);
 3021       __ rev32(v24, __ T16B, v24);
 3022       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3023       __ rev32(v25, __ T16B, v25);
 3024       __ rev32(v26, __ T16B, v26);
 3025       __ rev32(v27, __ T16B, v27);
 3026       __ rev32(v28, __ T16B, v28);
 3027       __ ld1(v29, v30, v31, __ T16B, key);
 3028       __ rev32(v29, __ T16B, v29);
 3029       __ rev32(v30, __ T16B, v30);
 3030       __ rev32(v31, __ T16B, v31);
 3031 
 3032     __ BIND(L_aes_loop);
 3033       __ ld1(v1, __ T16B, __ post(from, 16));
 3034       __ eor(v0, __ T16B, v0, v1);
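              // Informational: CBC encryption computes C[i] = E(P[i] ^ C[i-1]);
              // v0 carries the running chaining value (the IV, loaded from rvec,
              // for the first block) and v1 the plaintext block just loaded.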
 3035 
 3036       __ br(Assembler::CC, L_rounds_44);
 3037       __ br(Assembler::EQ, L_rounds_52);
 3038 
 3039       __ aese(v0, v17); __ aesmc(v0, v0);
 3040       __ aese(v0, v18); __ aesmc(v0, v0);
 3041     __ BIND(L_rounds_52);
 3042       __ aese(v0, v19); __ aesmc(v0, v0);
 3043       __ aese(v0, v20); __ aesmc(v0, v0);
 3044     __ BIND(L_rounds_44);
 3045       __ aese(v0, v21); __ aesmc(v0, v0);
 3046       __ aese(v0, v22); __ aesmc(v0, v0);
 3047       __ aese(v0, v23); __ aesmc(v0, v0);
 3048       __ aese(v0, v24); __ aesmc(v0, v0);
 3049       __ aese(v0, v25); __ aesmc(v0, v0);
 3050       __ aese(v0, v26); __ aesmc(v0, v0);
 3051       __ aese(v0, v27); __ aesmc(v0, v0);
 3052       __ aese(v0, v28); __ aesmc(v0, v0);
 3053       __ aese(v0, v29); __ aesmc(v0, v0);
 3054       __ aese(v0, v30);
 3055       __ eor(v0, __ T16B, v0, v31);
 3056 
 3057       __ st1(v0, __ T16B, __ post(to, 16));
 3058 
 3059       __ subw(len_reg, len_reg, 16);
 3060       __ cbnzw(len_reg, L_aes_loop);
 3061 
 3062       __ st1(v0, __ T16B, rvec);
 3063 
 3064       __ mov(r0, rscratch2);
 3065 
 3066       __ leave();
 3067       __ ret(lr);
 3068 
 3069       return start;
 3070   }
 3071 
 3072   // Arguments:
 3073   //
 3074   // Inputs:
 3075   //   c_rarg0   - source byte array address
 3076   //   c_rarg1   - destination byte array address
 3077   //   c_rarg2   - K (key) in little endian int array
 3078   //   c_rarg3   - r vector byte array address
 3079   //   c_rarg4   - input length
 3080   //
 3081   // Output:
 3082   //   r0        - input length
 3083   //
 3084   address generate_cipherBlockChaining_decryptAESCrypt() {
 3085     assert(UseAES, "need AES cryptographic extension support");
 3086     __ align(CodeEntryAlignment);
 3087     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
 3088     StubCodeMark mark(this, stub_id);
 3089 
 3090     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 3091 
 3092     const Register from        = c_rarg0;  // source array address
 3093     const Register to          = c_rarg1;  // destination array address
 3094     const Register key         = c_rarg2;  // key array address
 3095     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
 3096                                            // and left with the results of the last encryption block
 3097     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 3098     const Register keylen      = rscratch1;
 3099 
 3100     address start = __ pc();
 3101 
 3102       __ enter();
 3103 
 3104       __ movw(rscratch2, len_reg);
 3105 
 3106       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3107 
 3108       __ ld1(v2, __ T16B, rvec);
 3109 
 3110       __ ld1(v31, __ T16B, __ post(key, 16));
 3111       __ rev32(v31, __ T16B, v31);
 3112 
 3113       __ cmpw(keylen, 52);
 3114       __ br(Assembler::CC, L_loadkeys_44);
 3115       __ br(Assembler::EQ, L_loadkeys_52);
 3116 
 3117       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 3118       __ rev32(v17, __ T16B, v17);
 3119       __ rev32(v18, __ T16B, v18);
 3120     __ BIND(L_loadkeys_52);
 3121       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 3122       __ rev32(v19, __ T16B, v19);
 3123       __ rev32(v20, __ T16B, v20);
 3124     __ BIND(L_loadkeys_44);
 3125       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 3126       __ rev32(v21, __ T16B, v21);
 3127       __ rev32(v22, __ T16B, v22);
 3128       __ rev32(v23, __ T16B, v23);
 3129       __ rev32(v24, __ T16B, v24);
 3130       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 3131       __ rev32(v25, __ T16B, v25);
 3132       __ rev32(v26, __ T16B, v26);
 3133       __ rev32(v27, __ T16B, v27);
 3134       __ rev32(v28, __ T16B, v28);
 3135       __ ld1(v29, v30, __ T16B, key);
 3136       __ rev32(v29, __ T16B, v29);
 3137       __ rev32(v30, __ T16B, v30);
 3138 
 3139     __ BIND(L_aes_loop);
 3140       __ ld1(v0, __ T16B, __ post(from, 16));
 3141       __ orr(v1, __ T16B, v0, v0);
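              // Informational: CBC decryption computes P[i] = D(C[i]) ^ C[i-1], so
              // the ciphertext block just loaded is copied into v1 here and moved
              // to v2 after the store, where it becomes the next chaining value.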
 3142 
 3143       __ br(Assembler::CC, L_rounds_44);
 3144       __ br(Assembler::EQ, L_rounds_52);
 3145 
 3146       __ aesd(v0, v17); __ aesimc(v0, v0);
 3147       __ aesd(v0, v18); __ aesimc(v0, v0);
 3148     __ BIND(L_rounds_52);
 3149       __ aesd(v0, v19); __ aesimc(v0, v0);
 3150       __ aesd(v0, v20); __ aesimc(v0, v0);
 3151     __ BIND(L_rounds_44);
 3152       __ aesd(v0, v21); __ aesimc(v0, v0);
 3153       __ aesd(v0, v22); __ aesimc(v0, v0);
 3154       __ aesd(v0, v23); __ aesimc(v0, v0);
 3155       __ aesd(v0, v24); __ aesimc(v0, v0);
 3156       __ aesd(v0, v25); __ aesimc(v0, v0);
 3157       __ aesd(v0, v26); __ aesimc(v0, v0);
 3158       __ aesd(v0, v27); __ aesimc(v0, v0);
 3159       __ aesd(v0, v28); __ aesimc(v0, v0);
 3160       __ aesd(v0, v29); __ aesimc(v0, v0);
 3161       __ aesd(v0, v30);
 3162       __ eor(v0, __ T16B, v0, v31);
 3163       __ eor(v0, __ T16B, v0, v2);
 3164 
 3165       __ st1(v0, __ T16B, __ post(to, 16));
 3166       __ orr(v2, __ T16B, v1, v1);
 3167 
 3168       __ subw(len_reg, len_reg, 16);
 3169       __ cbnzw(len_reg, L_aes_loop);
 3170 
 3171       __ st1(v2, __ T16B, rvec);
 3172 
 3173       __ mov(r0, rscratch2);
 3174 
 3175       __ leave();
 3176       __ ret(lr);
 3177 
 3178     return start;
 3179   }
 3180 
 3181   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3182   // Inputs: in (128 bits) is preserved; the least-significant 64-bit
 3183   // word is in the upper dword of each vector.
 3184   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
 3185   // Output: result
 3186   void be_add_128_64(FloatRegister result, FloatRegister in,
 3187                      FloatRegister inc, FloatRegister tmp) {
 3188     assert_different_registers(result, tmp, inc);
 3189 
 3190     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3191                                            // input
 3192     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
 3193     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3194                                            // MSD == 0 (must be!) to LSD
 3195     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3196   }
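
        // Scalar sketch of the operation above (illustration only, not generated
        // code): treating the counter as hi:lo with a 64-bit increment,
        //
        //   void be_add_128_64(uint64_t& hi, uint64_t& lo, uint64_t inc) {
        //     uint64_t old_lo = lo;
        //     lo += inc;
        //     if (lo < old_lo) {   // carry out of the low dword
        //       hi += 1;
        //     }
        //   }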
 3197 
 3198   // CTR AES crypt.
 3199   // Arguments:
 3200   //
 3201   // Inputs:
 3202   //   c_rarg0   - source byte array address
 3203   //   c_rarg1   - destination byte array address
 3204   //   c_rarg2   - K (key) in little endian int array
 3205   //   c_rarg3   - counter vector byte array address
 3206   //   c_rarg4   - input length
 3207   //   c_rarg5   - saved encryptedCounter start
 3208   //   c_rarg6   - saved used length
 3209   //
 3210   // Output:
 3211   //   r0       - input length
 3212   //
 3213   address generate_counterMode_AESCrypt() {
 3214     const Register in = c_rarg0;
 3215     const Register out = c_rarg1;
 3216     const Register key = c_rarg2;
 3217     const Register counter = c_rarg3;
 3218     const Register saved_len = c_rarg4, len = r10;
 3219     const Register saved_encrypted_ctr = c_rarg5;
 3220     const Register used_ptr = c_rarg6, used = r12;
 3221 
 3222     const Register offset = r7;
 3223     const Register keylen = r11;
 3224 
 3225     const unsigned char block_size = 16;
 3226     const int bulk_width = 4;
 3227     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
 3228     // performance with larger data sizes, but it also means that the
 3229     // fast path isn't used until you have at least 8 blocks, and up
 3230     // to 127 bytes of data will be executed on the slow path. For
 3231     // that reason, and also so as not to blow away too much icache, 4
 3232     // blocks seems like a sensible compromise.
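          // Worked example (informational, assuming block_size == 16): with
          // bulk_width == 4 the bulk path needs len >= 4 * 16 == 64 bytes, so at
          // most 63 bytes take the block-at-a-time path; with bulk_width == 8 the
          // threshold is 128 bytes, leaving up to 127 bytes on the slow path.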
 3233 
 3234     // Algorithm:
 3235     //
 3236     //    if (len == 0) {
 3237     //        goto DONE;
 3238     //    }
 3239     //    int result = len;
 3240     //    do {
 3241     //        if (used >= blockSize) {
 3242     //            if (len >= bulk_width * blockSize) {
 3243     //                CTR_large_block();
 3244     //                if (len == 0)
 3245     //                    goto DONE;
 3246     //            }
 3247     //            for (;;) {
 3248     //                16ByteVector v0 = counter;
 3249     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3250     //                used = 0;
 3251     //                if (len < blockSize)
 3252     //                    break;    /* goto NEXT */
 3253     //                16ByteVector v1 = load16Bytes(in, offset);
 3254     //                v1 = v1 ^ encryptedCounter;
 3255     //                store16Bytes(out, offset);
 3256     //                used = blockSize;
 3257     //                offset += blockSize;
 3258     //                len -= blockSize;
 3259     //                if (len == 0)
 3260     //                    goto DONE;
 3261     //            }
 3262     //        }
 3263     //      NEXT:
 3264     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3265     //        len--;
 3266     //    } while (len != 0);
 3267     //  DONE:
 3268     //    return result;
 3269     //
 3270     // CTR_large_block()
 3271     //    Wide bulk encryption of whole blocks.
 3272 
 3273     __ align(CodeEntryAlignment);
 3274     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
 3275     StubCodeMark mark(this, stub_id);
 3276     const address start = __ pc();
 3277     __ enter();
 3278 
 3279     Label DONE, CTR_large_block, large_block_return;
 3280     __ ldrw(used, Address(used_ptr));
 3281     __ cbzw(saved_len, DONE);
 3282 
 3283     __ mov(len, saved_len);
 3284     __ mov(offset, 0);
 3285 
 3286     // Compute #rounds for AES based on the length of the key array
 3287     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3288 
 3289     __ aesenc_loadkeys(key, keylen);
 3290 
 3291     {
 3292       Label L_CTR_loop, NEXT;
 3293 
 3294       __ bind(L_CTR_loop);
 3295 
 3296       __ cmp(used, block_size);
 3297       __ br(__ LO, NEXT);
 3298 
 3299       // Maybe we have a lot of data
 3300       __ subsw(rscratch1, len, bulk_width * block_size);
 3301       __ br(__ HS, CTR_large_block);
 3302       __ BIND(large_block_return);
 3303       __ cbzw(len, DONE);
 3304 
 3305       // Setup the counter
 3306       __ movi(v4, __ T4S, 0);
 3307       __ movi(v5, __ T4S, 1);
 3308       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3309 
 3310       // 128-bit big-endian increment
 3311       __ ld1(v0, __ T16B, counter);
 3312       __ rev64(v16, __ T16B, v0);
 3313       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3314       __ rev64(v16, __ T16B, v16);
 3315       __ st1(v16, __ T16B, counter);
 3316       // Previous counter value is in v0
 3317       // v4 contains { 0, 1 }
 3318 
 3319       {
 3320         // We have fewer than bulk_width blocks of data left. Encrypt
 3321         // them one by one until there is less than a full block
 3322         // remaining, being careful to save both the encrypted counter
 3323         // and the counter.
 3324 
 3325         Label inner_loop;
 3326         __ bind(inner_loop);
 3327         // Counter to encrypt is in v0
 3328         __ aesecb_encrypt(noreg, noreg, keylen);
 3329         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3330 
 3331         // Do we have a remaining full block?
 3332 
 3333         __ mov(used, 0);
 3334         __ cmp(len, block_size);
 3335         __ br(__ LO, NEXT);
 3336 
 3337         // Yes, we have a full block
 3338         __ ldrq(v1, Address(in, offset));
 3339         __ eor(v1, __ T16B, v1, v0);
 3340         __ strq(v1, Address(out, offset));
 3341         __ mov(used, block_size);
 3342         __ add(offset, offset, block_size);
 3343 
 3344         __ subw(len, len, block_size);
 3345         __ cbzw(len, DONE);
 3346 
 3347         // Increment the counter, store it back
 3348         __ orr(v0, __ T16B, v16, v16);
 3349         __ rev64(v16, __ T16B, v16);
 3350         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3351         __ rev64(v16, __ T16B, v16);
 3352         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3353 
 3354         __ b(inner_loop);
 3355       }
 3356 
 3357       __ BIND(NEXT);
 3358 
 3359       // Encrypt a single byte, and loop.
 3360       // We expect this to be a rare event.
 3361       __ ldrb(rscratch1, Address(in, offset));
 3362       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3363       __ eor(rscratch1, rscratch1, rscratch2);
 3364       __ strb(rscratch1, Address(out, offset));
 3365       __ add(offset, offset, 1);
 3366       __ add(used, used, 1);
 3367       __ subw(len, len, 1);
 3368       __ cbnzw(len, L_CTR_loop);
 3369     }
 3370 
 3371     __ bind(DONE);
 3372     __ strw(used, Address(used_ptr));
 3373     __ mov(r0, saved_len);
 3374 
 3375     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3376     __ ret(lr);
 3377 
 3378     // Bulk encryption
 3379 
 3380     __ BIND (CTR_large_block);
 3381     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3382 
 3383     if (bulk_width == 8) {
 3384       __ sub(sp, sp, 4 * 16);
 3385       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3386     }
 3387     __ sub(sp, sp, 4 * 16);
 3388     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3389     RegSet saved_regs = (RegSet::of(in, out, offset)
 3390                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3391     __ push(saved_regs, sp);
 3392     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3393     __ add(in, in, offset);
 3394     __ add(out, out, offset);
 3395 
 3396     // Keys should already be loaded into the correct registers
 3397 
 3398     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3399     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3400 
 3401     // AES/CTR loop
 3402     {
 3403       Label L_CTR_loop;
 3404       __ BIND(L_CTR_loop);
 3405 
 3406       // Setup the counters
 3407       __ movi(v8, __ T4S, 0);
 3408       __ movi(v9, __ T4S, 1);
 3409       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3410 
 3411       for (int i = 0; i < bulk_width; i++) {
 3412         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3413         __ rev64(v0_ofs, __ T16B, v16);
 3414         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3415       }
 3416 
 3417       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3418 
 3419       // Encrypt the counters
 3420       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3421 
 3422       if (bulk_width == 8) {
 3423         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3424       }
 3425 
 3426       // XOR the encrypted counters with the inputs
 3427       for (int i = 0; i < bulk_width; i++) {
 3428         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3429         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3430         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3431       }
 3432 
 3433       // Write the encrypted data
 3434       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3435       if (bulk_width == 8) {
 3436         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3437       }
 3438 
 3439       __ subw(len, len, 16 * bulk_width);
 3440       __ cbnzw(len, L_CTR_loop);
 3441     }
 3442 
 3443     // Save the counter back where it goes
 3444     __ rev64(v16, __ T16B, v16);
 3445     __ st1(v16, __ T16B, counter);
 3446 
 3447     __ pop(saved_regs, sp);
 3448 
 3449     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3450     if (bulk_width == 8) {
 3451       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3452     }
 3453 
 3454     __ andr(rscratch1, len, -16 * bulk_width);
 3455     __ sub(len, len, rscratch1);
 3456     __ add(offset, offset, rscratch1);
 3457     __ mov(used, 16);
 3458     __ strw(used, Address(used_ptr));
 3459     __ b(large_block_return);
 3460 
 3461     return start;
 3462   }
 3463 
 3464   // Vector AES Galois Counter Mode implementation. Parameters:
 3465   //
 3466   // in = c_rarg0
 3467   // len = c_rarg1
 3468   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3469   // out = c_rarg3
 3470   // key = c_rarg4
 3471   // state = c_rarg5 - GHASH.state
 3472   // subkeyHtbl = c_rarg6 - powers of H
 3473   // counter = c_rarg7 - 16 bytes of CTR
 3474   // return - number of processed bytes
 3475   address generate_galoisCounterMode_AESCrypt() {
 3476     Label ghash_polynomial; // local data generated after code
 3477 
 3478     __ align(CodeEntryAlignment);
 3479     StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
 3480     StubCodeMark mark(this, stub_id);
 3481     address start = __ pc();
 3482     __ enter();
 3483 
 3484     const Register in = c_rarg0;
 3485     const Register len = c_rarg1;
 3486     const Register ct = c_rarg2;
 3487     const Register out = c_rarg3;
 3488     // and updated with the incremented counter in the end
 3489 
 3490     const Register key = c_rarg4;
 3491     const Register state = c_rarg5;
 3492 
 3493     const Register subkeyHtbl = c_rarg6;
 3494 
 3495     const Register counter = c_rarg7;
 3496 
 3497     const Register keylen = r10;
 3498     // Save state before entering routine
 3499     __ sub(sp, sp, 4 * 16);
 3500     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3501     __ sub(sp, sp, 4 * 16);
 3502     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3503 
 3504     // __ andr(len, len, -512);
 3505     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3506     __ str(len, __ pre(sp, -2 * wordSize));
 3507 
 3508     Label DONE;
 3509     __ cbz(len, DONE);
 3510 
 3511     // Compute #rounds for AES based on the length of the key array
 3512     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3513 
 3514     __ aesenc_loadkeys(key, keylen);
 3515     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3516     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3517 
 3518     // AES/CTR loop
 3519     {
 3520       Label L_CTR_loop;
 3521       __ BIND(L_CTR_loop);
 3522 
 3523       // Setup the counters
 3524       __ movi(v8, __ T4S, 0);
 3525       __ movi(v9, __ T4S, 1);
 3526       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3527 
 3528       assert(v0->encoding() < v8->encoding(), "");
 3529       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3530         FloatRegister f = as_FloatRegister(i);
 3531         __ rev32(f, __ T16B, v16);
 3532         __ addv(v16, __ T4S, v16, v8);
 3533       }
 3534 
 3535       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3536 
 3537       // Encrypt the counters
 3538       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3539 
 3540       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3541 
 3542       // XOR the encrypted counters with the inputs
 3543       for (int i = 0; i < 8; i++) {
 3544         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3545         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3546         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3547       }
 3548       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3549       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3550 
 3551       __ subw(len, len, 16 * 8);
 3552       __ cbnzw(len, L_CTR_loop);
 3553     }
 3554 
 3555     __ rev32(v16, __ T16B, v16);
 3556     __ st1(v16, __ T16B, counter);
 3557 
 3558     __ ldr(len, Address(sp));
 3559     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3560 
 3561     // GHASH/CTR loop
 3562     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3563                                 len, /*unrolls*/4);
 3564 
 3565 #ifdef ASSERT
 3566     { Label L;
 3567       __ cmp(len, (unsigned char)0);
 3568       __ br(Assembler::EQ, L);
 3569       __ stop("stubGenerator: abort");
 3570       __ bind(L);
 3571     }
 3572 #endif
 3573 
 3574     __ bind(DONE);
 3575     // Return the number of bytes processed
 3576     __ ldr(r0, __ post(sp, 2 * wordSize));
 3577 
 3578     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3579     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3580 
 3581     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3582     __ ret(lr);
 3583 
 3584     // bind label and generate polynomial data
 3585     __ align(wordSize * 2);
 3586     __ bind(ghash_polynomial);
 3587     __ emit_int64(0x87);  // The low-order bits of the field
 3588                           // polynomial (i.e. p = z^7+z^2+z+1)
 3589                           // repeated in the low and high parts of a
 3590                           // 128-bit vector
 3591     __ emit_int64(0x87);
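          // Reference (informational): GHASH multiplies in GF(2^128) modulo
          // x^128 + x^7 + x^2 + z + 1 written with z == x, i.e. the reduction
          // polynomial x^128 + x^7 + x^2 + x + 1; the 0x87 words above encode the
          // low-order terms x^7 + x^2 + x + 1 used during reduction.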
 3592 
 3593     return start;
 3594   }
 3595 
 3596   class Cached64Bytes {
 3597   private:
 3598     MacroAssembler *_masm;
 3599     Register _regs[8];
 3600 
 3601   public:
 3602     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3603       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3604       auto it = rs.begin();
 3605       for (auto &r: _regs) {
 3606         r = *it;
 3607         ++it;
 3608       }
 3609     }
 3610 
 3611     void gen_loads(Register base) {
 3612       for (int i = 0; i < 8; i += 2) {
 3613         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3614       }
 3615     }
 3616 
 3617     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3618     void extract_u32(Register dest, int i) {
 3619       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3620     }
 3621   };
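
        // Usage sketch (illustration only): after gen_loads(base) has read 64
        // bytes into the eight cached 64-bit registers, extract_u32(dst, i)
        // yields the i-th little-endian 32-bit word, conceptually:
        //
        //   uint32_t extract_u32(const uint64_t regs[8], int i) {
        //     return (uint32_t)(regs[i / 2] >> (32 * (i % 2)));
        //   }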
 3622 
 3623   // Utility routines for md5.
 3624   // Clobbers r10 and r11.
 3625   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3626               int k, int s, int t) {
 3627     Register rscratch3 = r10;
 3628     Register rscratch4 = r11;
 3629 
 3630     __ eorw(rscratch3, r3, r4);
 3631     __ movw(rscratch2, t);
 3632     __ andw(rscratch3, rscratch3, r2);
 3633     __ addw(rscratch4, r1, rscratch2);
 3634     reg_cache.extract_u32(rscratch1, k);
 3635     __ eorw(rscratch3, rscratch3, r4);
 3636     __ addw(rscratch4, rscratch4, rscratch1);
 3637     __ addw(rscratch3, rscratch3, rscratch4);
 3638     __ rorw(rscratch2, rscratch3, 32 - s);
 3639     __ addw(r1, rscratch2, r2);
 3640   }
 3641 
 3642   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3643               int k, int s, int t) {
 3644     Register rscratch3 = r10;
 3645     Register rscratch4 = r11;
 3646 
 3647     reg_cache.extract_u32(rscratch1, k);
 3648     __ movw(rscratch2, t);
 3649     __ addw(rscratch4, r1, rscratch2);
 3650     __ addw(rscratch4, rscratch4, rscratch1);
 3651     __ bicw(rscratch2, r3, r4);
 3652     __ andw(rscratch3, r2, r4);
 3653     __ addw(rscratch2, rscratch2, rscratch4);
 3654     __ addw(rscratch2, rscratch2, rscratch3);
 3655     __ rorw(rscratch2, rscratch2, 32 - s);
 3656     __ addw(r1, rscratch2, r2);
 3657   }
 3658 
 3659   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3660               int k, int s, int t) {
 3661     Register rscratch3 = r10;
 3662     Register rscratch4 = r11;
 3663 
 3664     __ eorw(rscratch3, r3, r4);
 3665     __ movw(rscratch2, t);
 3666     __ addw(rscratch4, r1, rscratch2);
 3667     reg_cache.extract_u32(rscratch1, k);
 3668     __ eorw(rscratch3, rscratch3, r2);
 3669     __ addw(rscratch4, rscratch4, rscratch1);
 3670     __ addw(rscratch3, rscratch3, rscratch4);
 3671     __ rorw(rscratch2, rscratch3, 32 - s);
 3672     __ addw(r1, rscratch2, r2);
 3673   }
 3674 
 3675   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3676               int k, int s, int t) {
 3677     Register rscratch3 = r10;
 3678     Register rscratch4 = r11;
 3679 
 3680     __ movw(rscratch3, t);
 3681     __ ornw(rscratch2, r2, r4);
 3682     __ addw(rscratch4, r1, rscratch3);
 3683     reg_cache.extract_u32(rscratch1, k);
 3684     __ eorw(rscratch3, rscratch2, r3);
 3685     __ addw(rscratch4, rscratch4, rscratch1);
 3686     __ addw(rscratch3, rscratch3, rscratch4);
 3687     __ rorw(rscratch2, rscratch3, 32 - s);
 3688     __ addw(r1, rscratch2, r2);
 3689   }
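
        // Reference (informational): the four helpers above implement the
        // standard MD5 step  a = b + rotl32(a + f(b,c,d) + x[k] + t, s)  with
        //   F(b,c,d) = (b & c) | (~b & d)   (written here as ((c ^ d) & b) ^ d)
        //   G(b,c,d) = (b & d) | (c & ~d)
        //   H(b,c,d) = b ^ c ^ d
        //   I(b,c,d) = c ^ (b | ~d)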
 3690 
 3691   // Arguments:
 3692   //
 3693   // Inputs:
 3694   //   c_rarg0   - byte[]  source+offset
 3695   //   c_rarg1   - int[]   SHA.state
 3696   //   c_rarg2   - int     offset
 3697   //   c_rarg3   - int     limit
 3698   //
 3699   address generate_md5_implCompress(StubId stub_id) {
 3700     bool multi_block;
 3701     switch (stub_id) {
 3702     case StubId::stubgen_md5_implCompress_id:
 3703       multi_block = false;
 3704       break;
 3705     case StubId::stubgen_md5_implCompressMB_id:
 3706       multi_block = true;
 3707       break;
 3708     default:
 3709       ShouldNotReachHere();
 3710     }
 3711     __ align(CodeEntryAlignment);
 3712 
 3713     StubCodeMark mark(this, stub_id);
 3714     address start = __ pc();
 3715 
 3716     Register buf       = c_rarg0;
 3717     Register state     = c_rarg1;
 3718     Register ofs       = c_rarg2;
 3719     Register limit     = c_rarg3;
 3720     Register a         = r4;
 3721     Register b         = r5;
 3722     Register c         = r6;
 3723     Register d         = r7;
 3724     Register rscratch3 = r10;
 3725     Register rscratch4 = r11;
 3726 
 3727     Register state_regs[2] = { r12, r13 };
 3728     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3729     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3730 
 3731     __ push(saved_regs, sp);
 3732 
 3733     __ ldp(state_regs[0], state_regs[1], Address(state));
 3734     __ ubfx(a, state_regs[0],  0, 32);
 3735     __ ubfx(b, state_regs[0], 32, 32);
 3736     __ ubfx(c, state_regs[1],  0, 32);
 3737     __ ubfx(d, state_regs[1], 32, 32);
 3738 
 3739     Label md5_loop;
 3740     __ BIND(md5_loop);
 3741 
 3742     reg_cache.gen_loads(buf);
 3743 
 3744     // Round 1
 3745     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3746     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3747     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3748     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3749     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3750     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3751     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3752     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3753     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3754     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3755     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3756     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3757     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3758     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3759     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3760     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3761 
 3762     // Round 2
 3763     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3764     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3765     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3766     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3767     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3768     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3769     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3770     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3771     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3772     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3773     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3774     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3775     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3776     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3777     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3778     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3779 
 3780     // Round 3
 3781     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3782     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3783     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3784     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3785     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3786     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3787     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3788     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3789     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3790     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3791     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3792     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3793     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3794     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3795     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3796     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3797 
 3798     // Round 4
 3799     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3800     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3801     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3802     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3803     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3804     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3805     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3806     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3807     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3808     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3809     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3810     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3811     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3812     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3813     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3814     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3815 
 3816     __ addw(a, state_regs[0], a);
 3817     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3818     __ addw(b, rscratch2, b);
 3819     __ addw(c, state_regs[1], c);
 3820     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3821     __ addw(d, rscratch4, d);
 3822 
 3823     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3824     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3825 
 3826     if (multi_block) {
 3827       __ add(buf, buf, 64);
 3828       __ add(ofs, ofs, 64);
 3829       __ cmp(ofs, limit);
 3830       __ br(Assembler::LE, md5_loop);
 3831       __ mov(c_rarg0, ofs); // return ofs
 3832     }
 3833 
 3834     // write hash values back in the correct order
 3835     __ stp(state_regs[0], state_regs[1], Address(state));
 3836 
 3837     __ pop(saved_regs, sp);
 3838 
 3839     __ ret(lr);
 3840 
 3841     return start;
 3842   }
 3843 
 3844   // Arguments:
 3845   //
 3846   // Inputs:
 3847   //   c_rarg0   - byte[]  source+offset
 3848   //   c_rarg1   - int[]   SHA.state
 3849   //   c_rarg2   - int     offset
 3850   //   c_rarg3   - int     limit
 3851   //
 3852   address generate_sha1_implCompress(StubId stub_id) {
 3853     bool multi_block;
 3854     switch (stub_id) {
 3855     case StubId::stubgen_sha1_implCompress_id:
 3856       multi_block = false;
 3857       break;
 3858     case StubId::stubgen_sha1_implCompressMB_id:
 3859       multi_block = true;
 3860       break;
 3861     default:
 3862       ShouldNotReachHere();
 3863     }
 3864 
 3865     __ align(CodeEntryAlignment);
 3866 
 3867     StubCodeMark mark(this, stub_id);
 3868     address start = __ pc();
 3869 
 3870     Register buf   = c_rarg0;
 3871     Register state = c_rarg1;
 3872     Register ofs   = c_rarg2;
 3873     Register limit = c_rarg3;
 3874 
 3875     Label keys;
 3876     Label sha1_loop;
 3877 
 3878     // load the keys into v0..v3
 3879     __ adr(rscratch1, keys);
 3880     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3881     // load 5 words state into v6, v7
 3882     __ ldrq(v6, Address(state, 0));
 3883     __ ldrs(v7, Address(state, 16));
 3884 
 3885 
 3886     __ BIND(sha1_loop);
 3887     // load 64 bytes of data into v16..v19
 3888     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3889     __ rev32(v16, __ T16B, v16);
 3890     __ rev32(v17, __ T16B, v17);
 3891     __ rev32(v18, __ T16B, v18);
 3892     __ rev32(v19, __ T16B, v19);
 3893 
 3894     // do the sha1
 3895     __ addv(v4, __ T4S, v16, v0);
 3896     __ orr(v20, __ T16B, v6, v6);
 3897 
 3898     FloatRegister d0 = v16;
 3899     FloatRegister d1 = v17;
 3900     FloatRegister d2 = v18;
 3901     FloatRegister d3 = v19;
 3902 
 3903     for (int round = 0; round < 20; round++) {
 3904       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3905       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3906       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3907       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3908       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3909 
 3910       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3911       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3912       __ sha1h(tmp2, __ T4S, v20);
 3913       if (round < 5)
 3914         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3915       else if (round < 10 || round >= 15)
 3916         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3917       else
 3918         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3919       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3920 
 3921       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3922     }
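
          // Reference (informational): the ARMv8 instructions used above map onto
          // the standard SHA-1 round functions
          //   rounds  0-19: Ch(b,c,d)     = (b & c) | (~b & d)          -> sha1c
          //   rounds 20-39: Parity(b,c,d) = b ^ c ^ d                   -> sha1p
          //   rounds 40-59: Maj(b,c,d)    = (b & c) | (b & d) | (c & d) -> sha1m
          //   rounds 60-79: Parity(b,c,d) = b ^ c ^ d                   -> sha1p
          // with each loop iteration retiring four of the 80 rounds.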
 3923 
 3924     __ addv(v7, __ T2S, v7, v21);
 3925     __ addv(v6, __ T4S, v6, v20);
 3926 
 3927     if (multi_block) {
 3928       __ add(ofs, ofs, 64);
 3929       __ cmp(ofs, limit);
 3930       __ br(Assembler::LE, sha1_loop);
 3931       __ mov(c_rarg0, ofs); // return ofs
 3932     }
 3933 
 3934     __ strq(v6, Address(state, 0));
 3935     __ strs(v7, Address(state, 16));
 3936 
 3937     __ ret(lr);
 3938 
 3939     __ bind(keys);
 3940     __ emit_int32(0x5a827999);
 3941     __ emit_int32(0x6ed9eba1);
 3942     __ emit_int32(0x8f1bbcdc);
 3943     __ emit_int32(0xca62c1d6);
 3944 
 3945     return start;
 3946   }
 3947 
 3948 
 3949   // Arguments:
 3950   //
 3951   // Inputs:
 3952   //   c_rarg0   - byte[]  source+offset
 3953   //   c_rarg1   - int[]   SHA.state
 3954   //   c_rarg2   - int     offset
 3955   //   c_rarg3   - int     limit
 3956   //
 3957   address generate_sha256_implCompress(StubId stub_id) {
 3958     bool multi_block;
 3959     switch (stub_id) {
 3960     case StubId::stubgen_sha256_implCompress_id:
 3961       multi_block = false;
 3962       break;
 3963     case StubId::stubgen_sha256_implCompressMB_id:
 3964       multi_block = true;
 3965       break;
 3966     default:
 3967       ShouldNotReachHere();
 3968     }
 3969 
 3970     static const uint32_t round_consts[64] = {
 3971       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3972       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3973       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3974       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3975       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3976       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3977       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3978       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3979       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3980       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3981       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3982       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3983       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3984       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3985       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3986       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3987     };
 3988 
 3989     __ align(CodeEntryAlignment);
 3990 
 3991     StubCodeMark mark(this, stub_id);
 3992     address start = __ pc();
 3993 
 3994     Register buf   = c_rarg0;
 3995     Register state = c_rarg1;
 3996     Register ofs   = c_rarg2;
 3997     Register limit = c_rarg3;
 3998 
 3999     Label sha1_loop;
 4000 
 4001     __ stpd(v8, v9, __ pre(sp, -32));
 4002     __ stpd(v10, v11, Address(sp, 16));
 4003 
 4004 // dga == v0
 4005 // dgb == v1
 4006 // dg0 == v2
 4007 // dg1 == v3
 4008 // dg2 == v4
 4009 // t0 == v6
 4010 // t1 == v7
 4011 
 4012     // load 16 keys to v16..v31
 4013     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4014     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 4015     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 4016     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 4017     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 4018 
 4019     // load 8 words (256 bits) state
 4020     __ ldpq(v0, v1, state);
 4021 
 4022     __ BIND(sha1_loop);
 4023     // load 64 bytes of data into v8..v11
 4024     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 4025     __ rev32(v8, __ T16B, v8);
 4026     __ rev32(v9, __ T16B, v9);
 4027     __ rev32(v10, __ T16B, v10);
 4028     __ rev32(v11, __ T16B, v11);
 4029 
 4030     __ addv(v6, __ T4S, v8, v16);
 4031     __ orr(v2, __ T16B, v0, v0);
 4032     __ orr(v3, __ T16B, v1, v1);
 4033 
 4034     FloatRegister d0 = v8;
 4035     FloatRegister d1 = v9;
 4036     FloatRegister d2 = v10;
 4037     FloatRegister d3 = v11;
 4038 
 4039 
 4040     for (int round = 0; round < 16; round++) {
 4041       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 4042       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 4043       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 4044       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 4045 
 4046       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 4047        __ orr(v4, __ T16B, v2, v2);
 4048       if (round < 15)
 4049         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 4050       __ sha256h(v2, __ T4S, v3, tmp2);
 4051       __ sha256h2(v3, __ T4S, v4, tmp2);
 4052       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 4053 
 4054       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 4055     }
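
          // Reference (informational): sha256h/sha256h2 implement the standard
          // SHA-256 compression step built from
          //   Ch(e,f,g)  = (e & f) ^ (~e & g)
          //   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
          //   Sigma0(a)  = ror(a,2) ^ ror(a,13) ^ ror(a,22)
          //   Sigma1(e)  = ror(e,6) ^ ror(e,11) ^ ror(e,25)
          // and each iteration of the loop above retires four of the 64 rounds.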
 4056 
 4057     __ addv(v0, __ T4S, v0, v2);
 4058     __ addv(v1, __ T4S, v1, v3);
 4059 
 4060     if (multi_block) {
 4061       __ add(ofs, ofs, 64);
 4062       __ cmp(ofs, limit);
 4063       __ br(Assembler::LE, sha1_loop);
 4064       __ mov(c_rarg0, ofs); // return ofs
 4065     }
 4066 
 4067     __ ldpd(v10, v11, Address(sp, 16));
 4068     __ ldpd(v8, v9, __ post(sp, 32));
 4069 
 4070     __ stpq(v0, v1, state);
 4071 
 4072     __ ret(lr);
 4073 
 4074     return start;
 4075   }
 4076 
 4077   // Double rounds for sha512.
 4078   void sha512_dround(int dr,
 4079                      FloatRegister vi0, FloatRegister vi1,
 4080                      FloatRegister vi2, FloatRegister vi3,
 4081                      FloatRegister vi4, FloatRegister vrc0,
 4082                      FloatRegister vrc1, FloatRegister vin0,
 4083                      FloatRegister vin1, FloatRegister vin2,
 4084                      FloatRegister vin3, FloatRegister vin4) {
 4085       if (dr < 36) {
 4086         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 4087       }
 4088       __ addv(v5, __ T2D, vrc0, vin0);
 4089       __ ext(v6, __ T16B, vi2, vi3, 8);
 4090       __ ext(v5, __ T16B, v5, v5, 8);
 4091       __ ext(v7, __ T16B, vi1, vi2, 8);
 4092       __ addv(vi3, __ T2D, vi3, v5);
 4093       if (dr < 32) {
 4094         __ ext(v5, __ T16B, vin3, vin4, 8);
 4095         __ sha512su0(vin0, __ T2D, vin1);
 4096       }
 4097       __ sha512h(vi3, __ T2D, v6, v7);
 4098       if (dr < 32) {
 4099         __ sha512su1(vin0, __ T2D, vin2, v5);
 4100       }
 4101       __ addv(vi4, __ T2D, vi1, vi3);
 4102       __ sha512h2(vi3, __ T2D, vi1, vi0);
 4103   }
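
        // Reference (informational): each sha512_dround call retires two of
        // SHA-512's 80 rounds; sha512su0/sha512su1 extend the message schedule
        // while sha512h/sha512h2 apply the compression function to the state
        // carried in v0..v4.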
 4104 
 4105   // Arguments:
 4106   //
 4107   // Inputs:
 4108   //   c_rarg0   - byte[]  source+offset
 4109   //   c_rarg1   - int[]   SHA.state
 4110   //   c_rarg2   - int     offset
 4111   //   c_rarg3   - int     limit
 4112   //
 4113   address generate_sha512_implCompress(StubId stub_id) {
 4114     bool multi_block;
 4115     switch (stub_id) {
 4116     case StubId::stubgen_sha512_implCompress_id:
 4117       multi_block = false;
 4118       break;
 4119     case StubId::stubgen_sha512_implCompressMB_id:
 4120       multi_block = true;
 4121       break;
 4122     default:
 4123       ShouldNotReachHere();
 4124     }
 4125 
 4126     static const uint64_t round_consts[80] = {
 4127       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 4128       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 4129       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 4130       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 4131       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 4132       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 4133       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 4134       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 4135       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 4136       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 4137       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 4138       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 4139       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 4140       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 4141       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 4142       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 4143       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 4144       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 4145       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 4146       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 4147       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 4148       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 4149       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 4150       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 4151       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 4152       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 4153       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 4154     };
 4155 
 4156     __ align(CodeEntryAlignment);
 4157 
 4158     StubCodeMark mark(this, stub_id);
 4159     address start = __ pc();
 4160 
 4161     Register buf   = c_rarg0;
 4162     Register state = c_rarg1;
 4163     Register ofs   = c_rarg2;
 4164     Register limit = c_rarg3;
 4165 
 4166     __ stpd(v8, v9, __ pre(sp, -64));
 4167     __ stpd(v10, v11, Address(sp, 16));
 4168     __ stpd(v12, v13, Address(sp, 32));
 4169     __ stpd(v14, v15, Address(sp, 48));
 4170 
 4171     Label sha512_loop;
 4172 
 4173     // load state
 4174     __ ld1(v8, v9, v10, v11, __ T2D, state);
 4175 
 4176     // load first 4 round constants
 4177     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4178     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4179 
 4180     __ BIND(sha512_loop);
 4181     // load 128B of data into v12..v19
 4182     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4183     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4184     __ rev64(v12, __ T16B, v12);
 4185     __ rev64(v13, __ T16B, v13);
 4186     __ rev64(v14, __ T16B, v14);
 4187     __ rev64(v15, __ T16B, v15);
 4188     __ rev64(v16, __ T16B, v16);
 4189     __ rev64(v17, __ T16B, v17);
 4190     __ rev64(v18, __ T16B, v18);
 4191     __ rev64(v19, __ T16B, v19);
 4192 
 4193     __ mov(rscratch2, rscratch1);
 4194 
 4195     __ mov(v0, __ T16B, v8);
 4196     __ mov(v1, __ T16B, v9);
 4197     __ mov(v2, __ T16B, v10);
 4198     __ mov(v3, __ T16B, v11);
 4199 
 4200     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4201     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4202     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4203     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4204     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4205     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4206     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4207     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4208     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4209     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4210     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4211     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4212     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4213     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4214     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4215     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4216     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4217     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4218     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4219     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4220     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4221     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4222     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4223     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4224     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4225     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4226     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4227     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4228     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4229     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4230     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4231     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4232     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4233     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4234     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4235     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4236     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4237     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4238     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4239     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4240 
 4241     __ addv(v8, __ T2D, v8, v0);
 4242     __ addv(v9, __ T2D, v9, v1);
 4243     __ addv(v10, __ T2D, v10, v2);
 4244     __ addv(v11, __ T2D, v11, v3);
 4245 
 4246     if (multi_block) {
 4247       __ add(ofs, ofs, 128);
 4248       __ cmp(ofs, limit);
 4249       __ br(Assembler::LE, sha512_loop);
 4250       __ mov(c_rarg0, ofs); // return ofs
 4251     }
 4252 
 4253     __ st1(v8, v9, v10, v11, __ T2D, state);
 4254 
 4255     __ ldpd(v14, v15, Address(sp, 48));
 4256     __ ldpd(v12, v13, Address(sp, 32));
 4257     __ ldpd(v10, v11, Address(sp, 16));
 4258     __ ldpd(v8, v9, __ post(sp, 64));
 4259 
 4260     __ ret(lr);
 4261 
 4262     return start;
 4263   }
 4264 
 4265   // Execute one round of keccak of two computations in parallel.
 4266   // One of the states should be loaded into the lower halves of
 4267   // the vector registers v0-v24, the other should be loaded into
 4268   // the upper halves of those registers. The ld1r instruction loads
 4269   // the round constant into both halves of register v31.
  4270   // Intermediate results c0...c4 and d0...d4 are computed
 4271   // in registers v25...v30.
 4272   // All vector instructions that are used operate on both register
 4273   // halves in parallel.
  4274   // If only a single computation is needed, one can load just the lower halves.
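         //
         // For reference, a scalar sketch of one round (assuming the standard
         // FIPS 202 formulation with lanes indexed a[x + 5*y]):
         //   theta:  c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
         //           d[x] = c[(x+4)%5] ^ rol64(c[(x+1)%5], 1)
         //           a[x + 5*y] ^= d[x]
         //   rho/pi: each lane is rotated by its fixed offset and moved to its
         //           new position; the xar instructions fold the theta xor and
         //           the rotation into one step
         //   chi:    a[x + 5*y] ^= ~a[(x+1)%5 + 5*y] & a[(x+2)%5 + 5*y]
         //   iota:   a[0] ^= round_constant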
 4275   void keccak_round(Register rscratch1) {
 4276   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
  4277   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
  4278   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4279   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4280   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4281   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4282   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4283   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4284   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4285   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4286 
 4287   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4288   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4289   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4290   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4291   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4292 
 4293   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4294   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
  4295   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4296   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4297   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4298   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4299   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4300   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4301   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4302   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4303   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4304   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4305   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4306   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4307   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4308   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4309   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4310   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4311   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4312   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4313   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4314   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4315   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4316   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4317   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4318 
 4319   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4320   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4321   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4322   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4323   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4324 
 4325   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4326 
 4327   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4328   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4329   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4330   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4331   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4332 
 4333   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4334   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4335   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4336   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4337   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4338 
 4339   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4340   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4341   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4342   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4343   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4344 
 4345   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4346   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4347   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4348   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4349   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4350 
 4351   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4352   }
 4353 
 4354   // Arguments:
 4355   //
 4356   // Inputs:
 4357   //   c_rarg0   - byte[]  source+offset
 4358   //   c_rarg1   - byte[]  SHA.state
 4359   //   c_rarg2   - int     block_size
 4360   //   c_rarg3   - int     offset
 4361   //   c_rarg4   - int     limit
 4362   //
 4363   address generate_sha3_implCompress(StubId stub_id) {
 4364     bool multi_block;
 4365     switch (stub_id) {
 4366     case StubId::stubgen_sha3_implCompress_id:
 4367       multi_block = false;
 4368       break;
 4369     case StubId::stubgen_sha3_implCompressMB_id:
 4370       multi_block = true;
 4371       break;
 4372     default:
 4373       ShouldNotReachHere();
 4374     }
 4375 
 4376     static const uint64_t round_consts[24] = {
 4377       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4378       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4379       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4380       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4381       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4382       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4383       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4384       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4385     };
 4386 
 4387     __ align(CodeEntryAlignment);
 4388 
 4389     StubCodeMark mark(this, stub_id);
 4390     address start = __ pc();
 4391 
 4392     Register buf           = c_rarg0;
 4393     Register state         = c_rarg1;
 4394     Register block_size    = c_rarg2;
 4395     Register ofs           = c_rarg3;
 4396     Register limit         = c_rarg4;
 4397 
 4398     Label sha3_loop, rounds24_loop;
 4399     Label sha3_512_or_sha3_384, shake128;
 4400 
 4401     __ stpd(v8, v9, __ pre(sp, -64));
 4402     __ stpd(v10, v11, Address(sp, 16));
 4403     __ stpd(v12, v13, Address(sp, 32));
 4404     __ stpd(v14, v15, Address(sp, 48));
 4405 
 4406     // load state
 4407     __ add(rscratch1, state, 32);
 4408     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4409     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4410     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4411     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4412     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4413     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4414     __ ld1(v24, __ T1D, rscratch1);
 4415 
 4416     __ BIND(sha3_loop);
 4417 
 4418     // 24 keccak rounds
 4419     __ movw(rscratch2, 24);
 4420 
 4421     // load round_constants base
 4422     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4423 
 4424     // load input
 4425     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4426     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4427     __ eor(v0, __ T8B, v0, v25);
 4428     __ eor(v1, __ T8B, v1, v26);
 4429     __ eor(v2, __ T8B, v2, v27);
 4430     __ eor(v3, __ T8B, v3, v28);
 4431     __ eor(v4, __ T8B, v4, v29);
 4432     __ eor(v5, __ T8B, v5, v30);
 4433     __ eor(v6, __ T8B, v6, v31);
 4434 
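           // The sponge rate (block_size, in bytes) identifies the digest
           // flavour: 72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256/SHAKE256,
           // 144 -> SHA3-224, 168 -> SHAKE128. The tests on bits 7, 5 and 4 of
           // block_size below dispatch on these values so that the remaining
           // rate bytes are absorbed for each case.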
 4435     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4436     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4437 
 4438     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4439     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4440     __ eor(v7, __ T8B, v7, v25);
 4441     __ eor(v8, __ T8B, v8, v26);
 4442     __ eor(v9, __ T8B, v9, v27);
 4443     __ eor(v10, __ T8B, v10, v28);
 4444     __ eor(v11, __ T8B, v11, v29);
 4445     __ eor(v12, __ T8B, v12, v30);
 4446     __ eor(v13, __ T8B, v13, v31);
 4447 
 4448     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4449     __ eor(v14, __ T8B, v14, v25);
 4450     __ eor(v15, __ T8B, v15, v26);
 4451     __ eor(v16, __ T8B, v16, v27);
 4452 
 4453     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4454     __ andw(c_rarg5, block_size, 48);
 4455     __ cbzw(c_rarg5, rounds24_loop);
 4456 
 4457     __ tbnz(block_size, 5, shake128);
 4458     // block_size == 144, bit5 == 0, SHA3-224
 4459     __ ldrd(v28, __ post(buf, 8));
 4460     __ eor(v17, __ T8B, v17, v28);
 4461     __ b(rounds24_loop);
 4462 
 4463     __ BIND(shake128);
 4464     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4465     __ eor(v17, __ T8B, v17, v28);
 4466     __ eor(v18, __ T8B, v18, v29);
 4467     __ eor(v19, __ T8B, v19, v30);
 4468     __ eor(v20, __ T8B, v20, v31);
 4469     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4470 
 4471     __ BIND(sha3_512_or_sha3_384);
 4472     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4473     __ eor(v7, __ T8B, v7, v25);
 4474     __ eor(v8, __ T8B, v8, v26);
 4475     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4476 
 4477     // SHA3-384
 4478     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4479     __ eor(v9,  __ T8B, v9,  v27);
 4480     __ eor(v10, __ T8B, v10, v28);
 4481     __ eor(v11, __ T8B, v11, v29);
 4482     __ eor(v12, __ T8B, v12, v30);
 4483 
 4484     __ BIND(rounds24_loop);
 4485     __ subw(rscratch2, rscratch2, 1);
 4486 
 4487     keccak_round(rscratch1);
 4488 
 4489     __ cbnzw(rscratch2, rounds24_loop);
 4490 
 4491     if (multi_block) {
 4492       __ add(ofs, ofs, block_size);
 4493       __ cmp(ofs, limit);
 4494       __ br(Assembler::LE, sha3_loop);
 4495       __ mov(c_rarg0, ofs); // return ofs
 4496     }
 4497 
 4498     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4499     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4500     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4501     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4502     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4503     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4504     __ st1(v24, __ T1D, state);
 4505 
 4506     // restore callee-saved registers
 4507     __ ldpd(v14, v15, Address(sp, 48));
 4508     __ ldpd(v12, v13, Address(sp, 32));
 4509     __ ldpd(v10, v11, Address(sp, 16));
 4510     __ ldpd(v8, v9, __ post(sp, 64));
 4511 
 4512     __ ret(lr);
 4513 
 4514     return start;
 4515   }
 4516 
 4517   // Inputs:
 4518   //   c_rarg0   - long[]  state0
 4519   //   c_rarg1   - long[]  state1
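         //
         // The two 25-lane states are processed in lock-step: state0 is loaded
         // into the low 64-bit lane of v0..v24 and state1 into the high lane,
         // so each keccak_round() call advances both permutations at once.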
 4520   address generate_double_keccak() {
 4521     static const uint64_t round_consts[24] = {
 4522       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4523       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4524       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4525       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4526       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4527       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4528       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4529       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4530     };
 4531 
 4532     // Implements the double_keccak() method of the
  4533     // sun.security.provider.SHA3Parallel class
 4534     __ align(CodeEntryAlignment);
 4535     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4536     address start = __ pc();
 4537     __ enter();
 4538 
 4539     Register state0        = c_rarg0;
 4540     Register state1        = c_rarg1;
 4541 
 4542     Label rounds24_loop;
 4543 
 4544     // save callee-saved registers
 4545     __ stpd(v8, v9, __ pre(sp, -64));
 4546     __ stpd(v10, v11, Address(sp, 16));
 4547     __ stpd(v12, v13, Address(sp, 32));
 4548     __ stpd(v14, v15, Address(sp, 48));
 4549 
 4550     // load states
 4551     __ add(rscratch1, state0, 32);
 4552     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4553     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4554     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4555     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4556     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4557     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4558     __ ld1(v24, __ D, 0, rscratch1);
 4559     __ add(rscratch1, state1, 32);
 4560     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4561     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4562     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4563     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4564     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4565     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4566     __ ld1(v24, __ D, 1, rscratch1);
 4567 
 4568     // 24 keccak rounds
 4569     __ movw(rscratch2, 24);
 4570 
 4571     // load round_constants base
 4572     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4573 
 4574     __ BIND(rounds24_loop);
 4575     __ subw(rscratch2, rscratch2, 1);
 4576     keccak_round(rscratch1);
 4577     __ cbnzw(rscratch2, rounds24_loop);
 4578 
 4579     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4580     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4581     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4582     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4583     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4584     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4585     __ st1(v24, __ D, 0, state0);
 4586     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4587     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4588     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4589     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4590     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4591     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4592     __ st1(v24, __ D, 1, state1);
 4593 
 4594     // restore callee-saved vector registers
 4595     __ ldpd(v14, v15, Address(sp, 48));
 4596     __ ldpd(v12, v13, Address(sp, 32));
 4597     __ ldpd(v10, v11, Address(sp, 16));
 4598     __ ldpd(v8, v9, __ post(sp, 64));
 4599 
 4600     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4601     __ mov(r0, zr); // return 0
 4602     __ ret(lr);
 4603 
 4604     return start;
 4605   }
 4606 
 4607   // ChaCha20 block function.  This version parallelizes the 32-bit
 4608   // state elements on each of 16 vectors, producing 4 blocks of
 4609   // keystream at a time.
 4610   //
 4611   // state (int[16]) = c_rarg0
 4612   // keystream (byte[256]) = c_rarg1
 4613   // return - number of bytes of produced keystream (always 256)
 4614   //
 4615   // This implementation takes each 32-bit integer from the state
 4616   // array and broadcasts it across all 4 32-bit lanes of a vector register
 4617   // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
 4618   // of v5, etc.).  Once all 16 elements have been broadcast onto 16 vectors,
 4619   // the quarter round schedule is implemented as outlined in RFC 7539 section
 4620   // 2.3.  However, instead of sequentially processing the 3 quarter round
 4621   // operations represented by one QUARTERROUND function, we instead stack all
 4622   // the adds, xors and left-rotations from the first 4 quarter rounds together
 4623   // and then do the same for the second set of 4 quarter rounds.  This removes
 4624   // some latency that would otherwise be incurred by waiting for an add to
 4625   // complete before performing an xor (which depends on the result of the
 4626   // add), etc. An adjustment happens between the first and second groups of 4
 4627   // quarter rounds, but this is done only in the inputs to the macro functions
 4628   // that generate the assembly instructions - these adjustments themselves are
 4629   // not part of the resulting assembly.
 4630   // The 4 registers v0-v3 are used during the quarter round operations as
 4631   // scratch registers.  Once the 20 rounds are complete, these 4 scratch
 4632   // registers become the vectors involved in adding the start state back onto
 4633   // the post-QR working state.  After the adds are complete, each of the 16
 4634   // vectors write their first lane back to the keystream buffer, followed
 4635   // by the second lane from all vectors and so on.
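         //
         // For reference, one scalar quarter round (RFC 7539, section 2.1) on
         // 32-bit words is:
         //   a += b; d ^= a; d = rol32(d, 16);
         //   c += d; b ^= c; b = rol32(b, 12);
         //   a += b; d ^= a; d = rol32(d, 8);
         //   c += d; b ^= c; b = rol32(b, 7);
         // The code below applies each of these add/xor/rotate steps across a
         // set of four vector registers, i.e. four quarter rounds at a time.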
 4636   address generate_chacha20Block_blockpar() {
 4637     Label L_twoRounds, L_cc20_const;
 4638     __ align(CodeEntryAlignment);
 4639     StubId stub_id = StubId::stubgen_chacha20Block_id;
 4640     StubCodeMark mark(this, stub_id);
 4641     address start = __ pc();
 4642     __ enter();
 4643 
 4644     int i, j;
 4645     const Register state = c_rarg0;
 4646     const Register keystream = c_rarg1;
 4647     const Register loopCtr = r10;
 4648     const Register tmpAddr = r11;
 4649     const FloatRegister ctrAddOverlay = v28;
 4650     const FloatRegister lrot8Tbl = v29;
 4651 
 4652     // Organize SIMD registers in an array that facilitates
 4653     // putting repetitive opcodes into loop structures.  It is
 4654     // important that each grouping of 4 registers is monotonically
 4655     // increasing to support the requirements of multi-register
 4656     // instructions (e.g. ld4r, st4, etc.)
 4657     const FloatRegister workSt[16] = {
 4658          v4,  v5,  v6,  v7, v16, v17, v18, v19,
 4659         v20, v21, v22, v23, v24, v25, v26, v27
 4660     };
 4661 
 4662     // Pull in constant data.  The first 16 bytes are the add overlay
 4663     // which is applied to the vector holding the counter (state[12]).
 4664     // The second 16 bytes is the index register for the 8-bit left
 4665     // rotation tbl instruction.
 4666     __ adr(tmpAddr, L_cc20_const);
 4667     __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
 4668 
 4669     // Load from memory and interlace across 16 SIMD registers,
 4670     // With each word from memory being broadcast to all lanes of
 4671     // each successive SIMD register.
 4672     //      Addr(0) -> All lanes in workSt[i]
 4673     //      Addr(4) -> All lanes workSt[i + 1], etc.
 4674     __ mov(tmpAddr, state);
 4675     for (i = 0; i < 16; i += 4) {
 4676       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
 4677           __ post(tmpAddr, 16));
 4678     }
 4679     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4680 
 4681     // Before entering the loop, create 5 4-register arrays.  These
 4682     // will hold the 4 registers that represent the a/b/c/d fields
 4683     // in the quarter round operation.  For instance the "b" field
 4684     // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
 4685     // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
 4686     // since it is part of a diagonal organization.  The aSet and scratch
 4687     // register sets are defined at declaration time because they do not change
 4688     // organization at any point during the 20-round processing.
 4689     FloatRegister aSet[4] = { v4, v5, v6, v7 };
 4690     FloatRegister bSet[4];
 4691     FloatRegister cSet[4];
 4692     FloatRegister dSet[4];
 4693     FloatRegister scratch[4] = { v0, v1, v2, v3 };
 4694 
 4695     // Set up the 10 iteration loop and perform all 8 quarter round ops
 4696     __ mov(loopCtr, 10);
 4697     __ BIND(L_twoRounds);
 4698 
 4699     // Set to columnar organization and do the following 4 quarter-rounds:
 4700     // QUARTERROUND(0, 4, 8, 12)
 4701     // QUARTERROUND(1, 5, 9, 13)
 4702     // QUARTERROUND(2, 6, 10, 14)
 4703     // QUARTERROUND(3, 7, 11, 15)
 4704     __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
 4705     __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
 4706     __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
 4707 
 4708     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4709     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4710     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4711 
 4712     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4713     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4714     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4715 
 4716     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4717     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4718     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4719 
 4720     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4721     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4722     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4723 
 4724     // Set to diagonal organization and do the next 4 quarter-rounds:
 4725     // QUARTERROUND(0, 5, 10, 15)
 4726     // QUARTERROUND(1, 6, 11, 12)
 4727     // QUARTERROUND(2, 7, 8, 13)
 4728     // QUARTERROUND(3, 4, 9, 14)
 4729     __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
 4730     __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
 4731     __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
 4732 
 4733     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4734     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4735     __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16
 4736 
 4737     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4738     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
 4739     __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12
 4740 
 4741     __ cc20_qr_add4(aSet, bSet);                    // a += b
 4742     __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
 4743     __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8
 4744 
 4745     __ cc20_qr_add4(cSet, dSet);                    // c += d
 4746     __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
  4747     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
 4748 
 4749     // Decrement and iterate
 4750     __ sub(loopCtr, loopCtr, 1);
 4751     __ cbnz(loopCtr, L_twoRounds);
 4752 
 4753     __ mov(tmpAddr, state);
 4754 
 4755     // Add the starting state back to the post-loop keystream
 4756     // state.  We read/interlace the state array from memory into
 4757     // 4 registers similar to what we did in the beginning.  Then
 4758     // add the counter overlay onto workSt[12] at the end.
 4759     for (i = 0; i < 16; i += 4) {
 4760       __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
 4761       __ addv(workSt[i], __ T4S, workSt[i], v0);
 4762       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
 4763       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
 4764       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
 4765     }
 4766     __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
 4767 
 4768     // Write working state into the keystream buffer.  This is accomplished
 4769     // by taking the lane "i" from each of the four vectors and writing
 4770     // it to consecutive 4-byte offsets, then post-incrementing by 16 and
 4771     // repeating with the next 4 vectors until all 16 vectors have been used.
 4772     // Then move to the next lane and repeat the process until all lanes have
 4773     // been written.
 4774     for (i = 0; i < 4; i++) {
 4775       for (j = 0; j < 16; j += 4) {
 4776         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
 4777             __ post(keystream, 16));
 4778       }
 4779     }
 4780 
 4781     __ mov(r0, 256);             // Return length of output keystream
 4782     __ leave();
 4783     __ ret(lr);
 4784 
 4785     // bind label and generate local constant data used by this stub
 4786     // The constant data is broken into two 128-bit segments to be loaded
 4787     // onto FloatRegisters.  The first 128 bits are a counter add overlay
 4788     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
 4789     // The second 128-bits is a table constant used for 8-bit left rotations.
 4790     __ BIND(L_cc20_const);
 4791     __ emit_int64(0x0000000100000000UL);
 4792     __ emit_int64(0x0000000300000002UL);
 4793     __ emit_int64(0x0605040702010003UL);
 4794     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4795 
 4796     return start;
 4797   }
 4798 
 4799   // Helpers to schedule parallel operation bundles across vector
 4800   // register sequences of size 2, 4 or 8.
 4801 
 4802   // Implement various primitive computations across vector sequences
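         //
         // For example (an illustrative sketch: VSeq<4> vs1(0) names v0..v3 and
         // VSeq<4> vs2(4) names v4..v7), the single call
         //   vs_addv(vs1, __ T8H, vs1, vs2);
         // emits four addv instructions, adding v4..v7 into v0..v3 lane-wise.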
 4803 
 4804   template<int N>
 4805   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4806                const VSeq<N>& v1, const VSeq<N>& v2) {
 4807     // output must not be constant
 4808     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4809     // output cannot overwrite pending inputs
 4810     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4811     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4812     for (int i = 0; i < N; i++) {
 4813       __ addv(v[i], T, v1[i], v2[i]);
 4814     }
 4815   }
 4816 
 4817   template<int N>
 4818   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4819                const VSeq<N>& v1, const VSeq<N>& v2) {
 4820     // output must not be constant
 4821     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4822     // output cannot overwrite pending inputs
 4823     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4824     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4825     for (int i = 0; i < N; i++) {
 4826       __ subv(v[i], T, v1[i], v2[i]);
 4827     }
 4828   }
 4829 
 4830   template<int N>
 4831   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4832                const VSeq<N>& v1, const VSeq<N>& v2) {
 4833     // output must not be constant
 4834     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4835     // output cannot overwrite pending inputs
 4836     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4837     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4838     for (int i = 0; i < N; i++) {
 4839       __ mulv(v[i], T, v1[i], v2[i]);
 4840     }
 4841   }
 4842 
 4843   template<int N>
 4844   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4845     // output must not be constant
 4846     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4847     // output cannot overwrite pending inputs
 4848     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4849     for (int i = 0; i < N; i++) {
 4850       __ negr(v[i], T, v1[i]);
 4851     }
 4852   }
 4853 
 4854   template<int N>
 4855   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4856                const VSeq<N>& v1, int shift) {
 4857     // output must not be constant
 4858     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4859     // output cannot overwrite pending inputs
 4860     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4861     for (int i = 0; i < N; i++) {
 4862       __ sshr(v[i], T, v1[i], shift);
 4863     }
 4864   }
 4865 
 4866   template<int N>
 4867   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4868     // output must not be constant
 4869     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4870     // output cannot overwrite pending inputs
 4871     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4872     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4873     for (int i = 0; i < N; i++) {
 4874       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4875     }
 4876   }
 4877 
 4878   template<int N>
 4879   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4880     // output must not be constant
 4881     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4882     // output cannot overwrite pending inputs
 4883     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4884     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4885     for (int i = 0; i < N; i++) {
 4886       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4887     }
 4888   }
 4889 
 4890   template<int N>
 4891   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4892     // output must not be constant
 4893     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4894     // output cannot overwrite pending inputs
 4895     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4896     for (int i = 0; i < N; i++) {
 4897       __ notr(v[i], __ T16B, v1[i]);
 4898     }
 4899   }
 4900 
 4901   template<int N>
 4902   void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
 4903     // output must not be constant
 4904     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4905     // output cannot overwrite pending inputs
 4906     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4907     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4908     for (int i = 0; i < N; i++) {
 4909       __ sqdmulh(v[i], T, v1[i], v2[i]);
 4910     }
 4911   }
 4912 
 4913   template<int N>
 4914   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
 4915     // output must not be constant
 4916     assert(N == 1  || !v.is_constant(), "cannot output multiple values to a constant vector");
 4917     // output cannot overwrite pending inputs
 4918     assert(!vs_write_before_read(v, v1), "output overwrites input");
 4919     assert(!vs_write_before_read(v, v2), "output overwrites input");
 4920     for (int i = 0; i < N; i++) {
 4921       __ mlsv(v[i], T, v1[i], v2[i]);
 4922     }
 4923   }
 4924 
 4925   // load N/2 successive pairs of quadword values from memory in order
 4926   // into N successive vector registers of the sequence via the
 4927   // address supplied in base.
 4928   template<int N>
 4929   void vs_ldpq(const VSeq<N>& v, Register base) {
 4930     for (int i = 0; i < N; i += 2) {
 4931       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4932     }
 4933   }
 4934 
 4935   // load N/2 successive pairs of quadword values from memory in order
 4936   // into N vector registers of the sequence via the address supplied
 4937   // in base using post-increment addressing
 4938   template<int N>
 4939   void vs_ldpq_post(const VSeq<N>& v, Register base) {
  4940     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4941     for (int i = 0; i < N; i += 2) {
 4942       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4943     }
 4944   }
 4945 
 4946   // store N successive vector registers of the sequence into N/2
 4947   // successive pairs of quadword memory locations via the address
 4948   // supplied in base using post-increment addressing
 4949   template<int N>
 4950   void vs_stpq_post(const VSeq<N>& v, Register base) {
  4951     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4952     for (int i = 0; i < N; i += 2) {
 4953       __ stpq(v[i], v[i+1], __ post(base, 32));
 4954     }
 4955   }
 4956 
 4957   // load N/2 pairs of quadword values from memory de-interleaved into
 4958   // N vector registers 2 at a time via the address supplied in base
 4959   // using post-increment addressing.
 4960   template<int N>
 4961   void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
  4962     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4963     for (int i = 0; i < N; i += 2) {
 4964       __ ld2(v[i], v[i+1], T, __ post(base, 32));
 4965     }
 4966   }
 4967 
 4968   // store N vector registers interleaved into N/2 pairs of quadword
 4969   // memory locations via the address supplied in base using
 4970   // post-increment addressing.
 4971   template<int N>
 4972   void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
  4973     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
 4974     for (int i = 0; i < N; i += 2) {
 4975       __ st2(v[i], v[i+1], T, __ post(base, 32));
 4976     }
 4977   }
 4978 
 4979   // load N quadword values from memory de-interleaved into N vector
 4980   // registers 3 elements at a time via the address supplied in base.
 4981   template<int N>
 4982   void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4983     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4984     for (int i = 0; i < N; i += 3) {
 4985       __ ld3(v[i], v[i+1], v[i+2], T, base);
 4986     }
 4987   }
 4988 
 4989   // load N quadword values from memory de-interleaved into N vector
 4990   // registers 3 elements at a time via the address supplied in base
 4991   // using post-increment addressing.
 4992   template<int N>
 4993   void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
 4994     static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
 4995     for (int i = 0; i < N; i += 3) {
 4996       __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
 4997     }
 4998   }
 4999 
 5000   // load N/2 pairs of quadword values from memory into N vector
 5001   // registers via the address supplied in base with each pair indexed
  5002   // using the start offset plus the corresponding entry in the
 5003   // offsets array
 5004   template<int N>
 5005   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5006     for (int i = 0; i < N/2; i++) {
 5007       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5008     }
 5009   }
 5010 
 5011   // store N vector registers into N/2 pairs of quadword memory
 5012   // locations via the address supplied in base with each pair indexed
  5013   // using the start offset plus the corresponding entry in the
 5014   // offsets array
 5015   template<int N>
  5016   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 5017     for (int i = 0; i < N/2; i++) {
 5018       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 5019     }
 5020   }
 5021 
 5022   // load N single quadword values from memory into N vector registers
 5023   // via the address supplied in base with each value indexed using
  5024   // the start offset plus the corresponding entry in the offsets
 5025   // array
 5026   template<int N>
 5027   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5028                       int start, int (&offsets)[N]) {
 5029     for (int i = 0; i < N; i++) {
 5030       __ ldr(v[i], T, Address(base, start + offsets[i]));
 5031     }
 5032   }
 5033 
 5034   // store N vector registers into N single quadword memory locations
 5035   // via the address supplied in base with each value indexed using
  5036   // the start offset plus the corresponding entry in the offsets
 5037   // array
 5038   template<int N>
 5039   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 5040                       int start, int (&offsets)[N]) {
 5041     for (int i = 0; i < N; i++) {
 5042       __ str(v[i], T, Address(base, start + offsets[i]));
 5043     }
 5044   }
 5045 
 5046   // load N/2 pairs of quadword values from memory de-interleaved into
 5047   // N vector registers 2 at a time via the address supplied in base
  5048   // with each pair indexed using the start offset plus the
 5049   // corresponding entry in the offsets array
 5050   template<int N>
 5051   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5052                       Register tmp, int start, int (&offsets)[N/2]) {
 5053     for (int i = 0; i < N/2; i++) {
 5054       __ add(tmp, base, start + offsets[i]);
 5055       __ ld2(v[2*i], v[2*i+1], T, tmp);
 5056     }
 5057   }
 5058 
 5059   // store N vector registers 2 at a time interleaved into N/2 pairs
 5060   // of quadword memory locations via the address supplied in base
  5061   // with each pair indexed using the start offset plus the
 5062   // corresponding entry in the offsets array
 5063   template<int N>
 5064   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 5065                       Register tmp, int start, int (&offsets)[N/2]) {
 5066     for (int i = 0; i < N/2; i++) {
 5067       __ add(tmp, base, start + offsets[i]);
 5068       __ st2(v[2*i], v[2*i+1], T, tmp);
 5069     }
 5070   }
 5071 
 5072   // Helper routines for various flavours of Montgomery multiply
 5073 
 5074   // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
 5075   // multiplications in parallel
 5076   //
 5077 
 5078   // See the montMul() method of the sun.security.provider.ML_DSA
 5079   // class.
 5080   //
  5081   // Computes 4x4S results or 4x8H results
 5082   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
 5083   // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
 5084   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
 5085   // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
 5086   // Outputs: va - 4x4S or 4x8H vector register sequences
 5087   // vb, vc, vtmp and vq must all be disjoint
 5088   // va must be disjoint from all other inputs/temps or must equal vc
 5089   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5090   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
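         //
         // For reference, a scalar sketch of the reduction per lane (16-bit
         // case; assumes MONT_Q_INV_MOD_R is the inverse of q modulo 2^16 with
         // matching sign conventions):
         //   int32_t prod = (int32_t) b * c;
         //   int16_t m    = (int16_t) ((int16_t) prod * q_inv);        // mod R
         //   int16_t a    = (int16_t) ((prod - (int32_t) m * q) >> 16);
         // The sqdmulh/mulv/shsubv sequence below computes the same value from
         // the doubled high halves, with the final halving-subtract supplying
         // the division by 2.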
 5091   void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5092                    Assembler::SIMD_Arrangement T,
 5093                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5094     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5095     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5096     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5097     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5098 
 5099     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5100     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5101 
 5102     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5103 
 5104     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5105     assert(vs_disjoint(va, vb), "va and vb overlap");
 5106     assert(vs_disjoint(va, vq), "va and vq overlap");
 5107     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5108     assert(!va.is_constant(), "output vector must identify 4 different registers");
 5109 
 5110     // schedule 4 streams of instructions across the vector sequences
 5111     for (int i = 0; i < 4; i++) {
 5112       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5113       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5114     }
 5115 
 5116     for (int i = 0; i < 4; i++) {
 5117       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5118     }
 5119 
 5120     for (int i = 0; i < 4; i++) {
 5121       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5122     }
 5123 
 5124     for (int i = 0; i < 4; i++) {
 5125       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5126     }
 5127   }
 5128 
  5129   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
 5130   // multiplications in parallel
 5131   //
 5132 
 5133   // See the montMul() method of the sun.security.provider.ML_DSA
 5134   // class.
 5135   //
  5136   // Computes 2x4S results or 2x8H results
  5137   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  5138   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  5139   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  5140   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  5141   // Outputs: va - 2x4S or 2x8H vector register sequences
 5142   // vb, vc, vtmp and vq must all be disjoint
 5143   // va must be disjoint from all other inputs/temps or must equal vc
 5144   // va must have a non-zero delta i.e. it must not be a constant vseq.
 5145   // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
 5146   void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5147                    Assembler::SIMD_Arrangement T,
 5148                    const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5149     assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
 5150     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5151     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5152     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5153 
 5154     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5155     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5156 
 5157     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5158 
 5159     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5160     assert(vs_disjoint(va, vb), "va and vb overlap");
 5161     assert(vs_disjoint(va, vq), "va and vq overlap");
 5162     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5163     assert(!va.is_constant(), "output vector must identify 2 different registers");
 5164 
 5165     // schedule 2 streams of instructions across the vector sequences
 5166     for (int i = 0; i < 2; i++) {
 5167       __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 5168       __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
 5169     }
 5170 
 5171     for (int i = 0; i < 2; i++) {
 5172       __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
 5173     }
 5174 
 5175     for (int i = 0; i < 2; i++) {
 5176       __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
 5177     }
 5178 
 5179     for (int i = 0; i < 2; i++) {
 5180       __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 5181     }
 5182   }
 5183 
 5184   // Perform 16 16-bit Montgomery multiplications in parallel.
 5185   void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
 5186                        const VSeq<2>& vtmp, const VSeq<2>& vq) {
 5187     // Use the helper routine to schedule a 2x8H Montgomery multiply.
 5188     // It will assert that the register use is valid
 5189     vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
 5190   }
 5191 
 5192   // Perform 32 16-bit Montgomery multiplications in parallel.
 5193   void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 5194                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5195     // Use the helper routine to schedule a 4x8H Montgomery multiply.
 5196     // It will assert that the register use is valid
 5197     vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
 5198   }
 5199 
 5200   // Perform 64 16-bit Montgomery multiplications in parallel.
 5201   void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 5202                        const VSeq<4>& vtmp, const VSeq<2>& vq) {
 5203     // Schedule two successive 4x8H multiplies via the montmul helper
 5204     // on the front and back halves of va, vb and vc. The helper will
 5205     // assert that the register use has no overlap conflicts on each
 5206     // individual call but we also need to ensure that the necessary
 5207     // disjoint/equality constraints are met across both calls.
 5208 
 5209     // vb, vc, vtmp and vq must be disjoint. va must either be
 5210     // disjoint from all other registers or equal vc
 5211 
 5212     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 5213     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 5214     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 5215 
 5216     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 5217     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 5218 
 5219     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 5220 
 5221     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 5222     assert(vs_disjoint(va, vb), "va and vb overlap");
 5223     assert(vs_disjoint(va, vq), "va and vq overlap");
 5224     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 5225 
 5226     // we multiply the front and back halves of each sequence 4 at a
 5227     // time because
 5228     //
 5229     // 1) we are currently only able to get 4-way instruction
 5230     // parallelism at best
 5231     //
 5232     // 2) we need registers for the constants in vq and temporary
 5233     // scratch registers to hold intermediate results so vtmp can only
 5234     // be a VSeq<4> which means we only have 4 scratch slots
 5235 
 5236     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
 5237     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
 5238   }
 5239 
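         // Montgomery-multiply va1 by vc and then butterfly the result against
         // va0 (multiply-then-add/sub, i.e. the forward-NTT style butterfly).
         // Note that vc is clobbered with the product.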
 5240   void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
 5241                                const VSeq<4>& vc,
 5242                                const VSeq<4>& vtmp,
 5243                                const VSeq<2>& vq) {
 5244     // compute a = montmul(a1, c)
 5245     kyber_montmul32(vc, va1, vc, vtmp, vq);
  5246     // output a1 = a0 - a
 5247     vs_subv(va1, __ T8H, va0, vc);
 5248     //    and a0 = a0 + a
 5249     vs_addv(va0, __ T8H, va0, vc);
 5250   }
 5251 
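         // Butterfly va0/va1 first and then Montgomery-multiply the difference
         // by vb (add/sub-then-multiply, i.e. the inverse-NTT style butterfly).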
 5252   void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
 5253                                const VSeq<4>& vb,
 5254                                const VSeq<4>& vtmp1,
 5255                                const VSeq<4>& vtmp2,
 5256                                const VSeq<2>& vq) {
 5257     // compute c = a0 - a1
 5258     vs_subv(vtmp1, __ T8H, va0, va1);
 5259     // output a0 = a0 + a1
 5260     vs_addv(va0, __ T8H, va0, va1);
 5261     // output a1 = b montmul c
 5262     kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
 5263   }
 5264 
 5265   void load64shorts(const VSeq<8>& v, Register shorts) {
 5266     vs_ldpq_post(v, shorts);
 5267   }
 5268 
 5269   void load32shorts(const VSeq<4>& v, Register shorts) {
 5270     vs_ldpq_post(v, shorts);
 5271   }
 5272 
 5273   void store64shorts(VSeq<8> v, Register tmpAddr) {
 5274     vs_stpq_post(v, tmpAddr);
 5275   }
 5276 
 5277   // Kyber NTT function.
 5278   // Implements
 5279   // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
 5280   //
 5281   // coeffs (short[256]) = c_rarg0
 5282   // ntt_zetas (short[256]) = c_rarg1
 5283   address generate_kyberNtt() {
 5284 
 5285     __ align(CodeEntryAlignment);
 5286     StubId stub_id = StubId::stubgen_kyberNtt_id;
 5287     StubCodeMark mark(this, stub_id);
 5288     address start = __ pc();
 5289     __ enter();
 5290 
 5291     const Register coeffs = c_rarg0;
 5292     const Register zetas = c_rarg1;
 5293 
 5294     const Register kyberConsts = r10;
 5295     const Register tmpAddr = r11;
 5296 
 5297     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5298     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5299     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5300 
 5301     __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5302     // load the montmul constants
 5303     vs_ldpq(vq, kyberConsts);
 5304 
 5305     // Each level corresponds to an iteration of the outermost loop of the
 5306     // Java method seilerNTT(int[] coeffs). There are some differences
 5307     // from what is done in the seilerNTT() method, though:
  5308     // 1. The computation uses 16-bit signed values; we do not convert
  5309     // them to ints here.
  5310     // 2. The zetas are delivered in a bigger array: 128 zetas are stored
  5311     // in this array for each level, which makes it easier to fill up the
  5312     // vector registers.
  5313     // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
  5314     // multiplications (so that there is no overflow during the inverse
  5315     // NTT computation), whereas here we use R = 2^16 so that we can use
  5316     // the 16-bit arithmetic in the vector unit.
 5317     //
 5318     // On each level, we fill up the vector registers in such a way that the
 5319     // array elements that need to be multiplied by the zetas go into one
 5320     // set of vector registers while the corresponding ones that don't need to
 5321     // be multiplied, go into another set.
 5322     // We can do 32 Montgomery multiplications in parallel, using 12 vector
 5323     // registers interleaving the steps of 4 identical computations,
 5324     // each done on 8 16-bit values per register.
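
          // As an aside, montmul(a, b) here denotes Montgomery multiplication
          // with R = 2^16, i.e. it produces a value congruent to a * b * R^-1
          // mod q. A rough scalar sketch of one such reduction (illustrative
          // only, not the exact instruction sequence emitted by the montmul
          // helpers):
          //
          //   int32_t t = (int32_t)a * b;
          //   int16_t m = (int16_t)(t * qinv);                  // low 16 bits
          //   int16_t r = (int16_t)((t - (int32_t)m * q) >> 16);
          //
          // The vector helpers schedule 32 such reductions at a time across
          // 8H lanes.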
 5325 
 5326     // At levels 0-3 the coefficients that are multiplied by the zetas, and
 5327     // those that are added to or subtracted from the resulting products,
 5328     // occur in discrete blocks whose size is some multiple of 32.
 5329 
 5330     // level 0
 5331     __ add(tmpAddr, coeffs, 256);
 5332     load64shorts(vs1, tmpAddr);
 5333     load64shorts(vs2, zetas);
 5334     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5335     __ add(tmpAddr, coeffs, 0);
 5336     load64shorts(vs1, tmpAddr);
 5337     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5338     vs_addv(vs1, __ T8H, vs1, vs2);
 5339     __ add(tmpAddr, coeffs, 0);
 5340     vs_stpq_post(vs1, tmpAddr);
 5341     __ add(tmpAddr, coeffs, 256);
 5342     vs_stpq_post(vs3, tmpAddr);
 5343     // restore montmul constants
 5344     vs_ldpq(vq, kyberConsts);
 5345     load64shorts(vs1, tmpAddr);
 5346     load64shorts(vs2, zetas);
 5347     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5348     __ add(tmpAddr, coeffs, 128);
 5349     load64shorts(vs1, tmpAddr);
 5350     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5351     vs_addv(vs1, __ T8H, vs1, vs2);
 5352     __ add(tmpAddr, coeffs, 128);
 5353     store64shorts(vs1, tmpAddr);
 5354     __ add(tmpAddr, coeffs, 384);
 5355     store64shorts(vs3, tmpAddr);
 5356 
 5357     // level 1
 5358     // restore montmul constants
 5359     vs_ldpq(vq, kyberConsts);
 5360     __ add(tmpAddr, coeffs, 128);
 5361     load64shorts(vs1, tmpAddr);
 5362     load64shorts(vs2, zetas);
 5363     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5364     __ add(tmpAddr, coeffs, 0);
 5365     load64shorts(vs1, tmpAddr);
 5366     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5367     vs_addv(vs1, __ T8H, vs1, vs2);
 5368     __ add(tmpAddr, coeffs, 0);
 5369     store64shorts(vs1, tmpAddr);
 5370     store64shorts(vs3, tmpAddr);
 5371     vs_ldpq(vq, kyberConsts);
 5372     __ add(tmpAddr, coeffs, 384);
 5373     load64shorts(vs1, tmpAddr);
 5374     load64shorts(vs2, zetas);
 5375     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5376     __ add(tmpAddr, coeffs, 256);
 5377     load64shorts(vs1, tmpAddr);
 5378     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5379     vs_addv(vs1, __ T8H, vs1, vs2);
 5380     __ add(tmpAddr, coeffs, 256);
 5381     store64shorts(vs1, tmpAddr);
 5382     store64shorts(vs3, tmpAddr);
 5383 
 5384     // level 2
 5385     vs_ldpq(vq, kyberConsts);
 5386     int offsets1[4] = { 0, 32, 128, 160 };
 5387     vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
 5388     load64shorts(vs2, zetas);
 5389     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5390     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5392     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5393     vs_addv(vs1, __ T8H, vs1, vs2);
 5394     __ add(tmpAddr, coeffs, 0);
 5395     vs_stpq_post(vs_front(vs1), tmpAddr);
 5396     vs_stpq_post(vs_front(vs3), tmpAddr);
 5397     vs_stpq_post(vs_back(vs1), tmpAddr);
 5398     vs_stpq_post(vs_back(vs3), tmpAddr);
 5399     vs_ldpq(vq, kyberConsts);
 5400     vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
 5401     load64shorts(vs2, zetas);
 5402     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5403     vs_ldpq_indexed(vs1,  coeffs, 256, offsets1);
 5405     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5406     vs_addv(vs1, __ T8H, vs1, vs2);
 5407     __ add(tmpAddr, coeffs, 256);
 5408     vs_stpq_post(vs_front(vs1), tmpAddr);
 5409     vs_stpq_post(vs_front(vs3), tmpAddr);
 5410     vs_stpq_post(vs_back(vs1), tmpAddr);
 5411     vs_stpq_post(vs_back(vs3), tmpAddr);
 5412 
 5413     // level 3
 5414     vs_ldpq(vq, kyberConsts);
 5415     int offsets2[4] = { 0, 64, 128, 192 };
 5416     vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
 5417     load64shorts(vs2, zetas);
 5418     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5419     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5420     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5421     vs_addv(vs1, __ T8H, vs1, vs2);
 5422     vs_stpq_indexed(vs1, coeffs, 0, offsets2);
 5423     vs_stpq_indexed(vs3, coeffs, 32, offsets2);
 5424 
 5425     vs_ldpq(vq, kyberConsts);
 5426     vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
 5427     load64shorts(vs2, zetas);
 5428     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5429     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5430     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5431     vs_addv(vs1, __ T8H, vs1, vs2);
 5432     vs_stpq_indexed(vs1, coeffs, 256, offsets2);
 5433     vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
 5434 
 5435     // level 4
 5436     // At level 4 coefficients occur in 8 discrete blocks of size 16
 5437     // so they are loaded using an ldr at 8 distinct offsets.
 5438 
 5439     vs_ldpq(vq, kyberConsts);
 5440     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5441     vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
 5442     load64shorts(vs2, zetas);
 5443     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5444     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5445     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5446     vs_addv(vs1, __ T8H, vs1, vs2);
 5447     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5448     vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
 5449 
 5450     vs_ldpq(vq, kyberConsts);
 5451     vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
 5452     load64shorts(vs2, zetas);
 5453     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5454     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5455     vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5456     vs_addv(vs1, __ T8H, vs1, vs2);
 5457     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5458     vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
 5459 
 5460     // level 5
 5461     // At level 5 related coefficients occur in discrete blocks of size 8, so
 5462     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5463 
 5464     vs_ldpq(vq, kyberConsts);
 5465     int offsets4[4] = { 0, 32, 64, 96 };
 5466     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5467     load32shorts(vs_front(vs2), zetas);
 5468     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5469     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5470     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5471     load32shorts(vs_front(vs2), zetas);
 5472     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5473     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5474     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5475     load32shorts(vs_front(vs2), zetas);
 5476     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5477     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5478 
 5479     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5480     load32shorts(vs_front(vs2), zetas);
 5481     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5482     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5483 
 5484     // level 6
 5485     // At level 6 related coefficients occur in discrete blocks of size 4, so
 5486     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5487 
 5488     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5489     load32shorts(vs_front(vs2), zetas);
 5490     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5491     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5492     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5494     load32shorts(vs_front(vs2), zetas);
 5495     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5496     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5497 
 5498     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5499     load32shorts(vs_front(vs2), zetas);
 5500     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5501     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5502 
 5503     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5504     load32shorts(vs_front(vs2), zetas);
 5505     kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
 5506     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5507 
 5508     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5509     __ mov(r0, zr); // return 0
 5510     __ ret(lr);
 5511 
 5512     return start;
 5513   }
 5514 
 5515   // Kyber Inverse NTT function
 5516   // Implements
 5517   // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
 5518   //
 5519   // coeffs (short[256]) = c_rarg0
 5520   // ntt_zetas (short[256]) = c_rarg1
 5521   address generate_kyberInverseNtt() {
 5522 
 5523     __ align(CodeEntryAlignment);
 5524     StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
 5525     StubCodeMark mark(this, stub_id);
 5526     address start = __ pc();
 5527     __ enter();
 5528 
 5529     const Register coeffs = c_rarg0;
 5530     const Register zetas = c_rarg1;
 5531 
 5532     const Register kyberConsts = r10;
 5533     const Register tmpAddr = r11;
 5534     const Register tmpAddr2 = c_rarg2;
 5535 
 5536     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
 5537     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5538     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5539 
 5540     __ lea(kyberConsts,
 5541              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5542 
 5543     // level 0
 5544     // At level 0 related coefficients occur in discrete blocks of size 4, so
 5545     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
 5546 
 5547     vs_ldpq(vq, kyberConsts);
 5548     int offsets4[4] = { 0, 32, 64, 96 };
 5549     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5550     load32shorts(vs_front(vs2), zetas);
 5551     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5552                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5553     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
 5554     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5555     load32shorts(vs_front(vs2), zetas);
 5556     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5557                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5558     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
 5559     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5560     load32shorts(vs_front(vs2), zetas);
 5561     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5562                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5563     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
 5564     vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5565     load32shorts(vs_front(vs2), zetas);
 5566     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5567                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5568     vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
 5569 
 5570     // level 1
 5571     // At level 1 related coefficients occur in discrete blocks of size 8, so
 5572     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
 5573 
 5574     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5575     load32shorts(vs_front(vs2), zetas);
 5576     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5577                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5578     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
 5579     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5580     load32shorts(vs_front(vs2), zetas);
 5581     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5582                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5583     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
 5584 
 5585     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5586     load32shorts(vs_front(vs2), zetas);
 5587     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5588                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5589     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
 5590     vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5591     load32shorts(vs_front(vs2), zetas);
 5592     kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
 5593                             vs_front(vs2), vs_back(vs2), vtmp, vq);
 5594     vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
 5595 
 5596     // level 2
 5597     // At level 2 coefficients occur in 8 discrete blocks of size 16
 5598     // so they are loaded using an ldr at 8 distinct offsets.
 5599 
 5600     int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5601     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5602     vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5603     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5604     vs_subv(vs1, __ T8H, vs1, vs2);
 5605     vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
 5606     load64shorts(vs2, zetas);
 5607     vs_ldpq(vq, kyberConsts);
 5608     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5609     vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
 5610 
 5611     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5612     vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5613     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5614     vs_subv(vs1, __ T8H, vs1, vs2);
 5615     vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
 5616     load64shorts(vs2, zetas);
 5617     vs_ldpq(vq, kyberConsts);
 5618     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5619     vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
 5620 
 5621     // Barrett reduction at indexes where overflow may happen
 5622 
 5623     // load q and the multiplier for the Barrett reduction
 5624     __ add(tmpAddr, kyberConsts, 16);
 5625     vs_ldpq(vq, tmpAddr);
 5626 
 5627     VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
 5628     VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
 5629     VSeq<8> vq3 = VSeq<8>(v29, 0);   // 3rd sequence for const montmul
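
          // A scalar sketch of the Barrett step applied below to each 16-bit
          // coefficient c (illustrative only; the multiplier is the per-lane
          // constant loaded into vq2 above, roughly 2^26 / q):
          //
          //   int16_t t = (int16_t)(((int32_t)c * multiplier) >> 26);
          //   c = c - t * q;
          //
          // sqdmulh + sshr #11 compute the >> 26 scaling and mlsv performs the
          // final multiply-subtract, 8H lanes at a time.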
 5630     vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5631     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5632     vs_sshr(vs2, __ T8H, vs2, 11);
 5633     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5634     vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
 5635     vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5636     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5637     vs_sshr(vs2, __ T8H, vs2, 11);
 5638     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5639     vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
 5640 
 5641     // level 3
 5642     // From level 3 upwards coefficients occur in discrete blocks whose size is
 5643     // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
 5644 
 5645     int offsets2[4] = { 0, 64, 128, 192 };
 5646     vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
 5647     vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
 5648     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5649     vs_subv(vs1, __ T8H, vs1, vs2);
 5650     vs_stpq_indexed(vs3, coeffs, 0, offsets2);
 5651     load64shorts(vs2, zetas);
 5652     vs_ldpq(vq, kyberConsts);
 5653     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5654     vs_stpq_indexed(vs2, coeffs, 32, offsets2);
 5655 
 5656     vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
 5657     vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5658     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5659     vs_subv(vs1, __ T8H, vs1, vs2);
 5660     vs_stpq_indexed(vs3, coeffs, 256, offsets2);
 5661     load64shorts(vs2, zetas);
 5662     vs_ldpq(vq, kyberConsts);
 5663     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5664     vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
 5665 
 5666     // level 4
 5667 
 5668     int offsets1[4] = { 0, 32, 128, 160 };
 5669     vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
 5670     vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
 5671     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5672     vs_subv(vs1, __ T8H, vs1, vs2);
 5673     vs_stpq_indexed(vs3, coeffs, 0, offsets1);
 5674     load64shorts(vs2, zetas);
 5675     vs_ldpq(vq, kyberConsts);
 5676     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5677     vs_stpq_indexed(vs2, coeffs, 64, offsets1);
 5678 
 5679     vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
 5680     vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5681     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5682     vs_subv(vs1, __ T8H, vs1, vs2);
 5683     vs_stpq_indexed(vs3, coeffs, 256, offsets1);
 5684     load64shorts(vs2, zetas);
 5685     vs_ldpq(vq, kyberConsts);
 5686     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5687     vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
 5688 
 5689     // level 5
 5690 
 5691     __ add(tmpAddr, coeffs, 0);
 5692     load64shorts(vs1, tmpAddr);
 5693     __ add(tmpAddr, coeffs, 128);
 5694     load64shorts(vs2, tmpAddr);
 5695     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5696     vs_subv(vs1, __ T8H, vs1, vs2);
 5697     __ add(tmpAddr, coeffs, 0);
 5698     store64shorts(vs3, tmpAddr);
 5699     load64shorts(vs2, zetas);
 5700     vs_ldpq(vq, kyberConsts);
 5701     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5702     __ add(tmpAddr, coeffs, 128);
 5703     store64shorts(vs2, tmpAddr);
 5704 
 5705     load64shorts(vs1, tmpAddr);
 5706     __ add(tmpAddr, coeffs, 384);
 5707     load64shorts(vs2, tmpAddr);
 5708     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5709     vs_subv(vs1, __ T8H, vs1, vs2);
 5710     __ add(tmpAddr, coeffs, 256);
 5711     store64shorts(vs3, tmpAddr);
 5712     load64shorts(vs2, zetas);
 5713     vs_ldpq(vq, kyberConsts);
 5714     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5715     __ add(tmpAddr, coeffs, 384);
 5716     store64shorts(vs2, tmpAddr);
 5717 
 5718     // Barrett reduction at indexes where overflow may happen
 5719 
 5720     // load q and the multiplier for the Barrett reduction
 5721     __ add(tmpAddr, kyberConsts, 16);
 5722     vs_ldpq(vq, tmpAddr);
 5723 
 5724     int offsets0[2] = { 0, 256 };
 5725     vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5726     vs_sqdmulh(vs2, __ T8H, vs1, vq2);
 5727     vs_sshr(vs2, __ T8H, vs2, 11);
 5728     vs_mlsv(vs1, __ T8H, vs2, vq1);
 5729     vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
 5730 
 5731     // level 6
 5732 
 5733     __ add(tmpAddr, coeffs, 0);
 5734     load64shorts(vs1, tmpAddr);
 5735     __ add(tmpAddr, coeffs, 256);
 5736     load64shorts(vs2, tmpAddr);
 5737     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5738     vs_subv(vs1, __ T8H, vs1, vs2);
 5739     __ add(tmpAddr, coeffs, 0);
 5740     store64shorts(vs3, tmpAddr);
 5741     load64shorts(vs2, zetas);
 5742     vs_ldpq(vq, kyberConsts);
 5743     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5744     __ add(tmpAddr, coeffs, 256);
 5745     store64shorts(vs2, tmpAddr);
 5746 
 5747     __ add(tmpAddr, coeffs, 128);
 5748     load64shorts(vs1, tmpAddr);
 5749     __ add(tmpAddr, coeffs, 384);
 5750     load64shorts(vs2, tmpAddr);
 5751     vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
 5752     vs_subv(vs1, __ T8H, vs1, vs2);
 5753     __ add(tmpAddr, coeffs, 128);
 5754     store64shorts(vs3, tmpAddr);
 5755     load64shorts(vs2, zetas);
 5756     vs_ldpq(vq, kyberConsts);
 5757     kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
 5758     __ add(tmpAddr, coeffs, 384);
 5759     store64shorts(vs2, tmpAddr);
 5760 
 5761     // multiply by 2^-n
 5762 
 5763     // load toMont(2^-n mod q)
 5764     __ add(tmpAddr, kyberConsts, 48);
 5765     __ ldr(v29, __ Q, tmpAddr);
 5766 
 5767     vs_ldpq(vq, kyberConsts);
 5768     __ add(tmpAddr, coeffs, 0);
 5769     load64shorts(vs1, tmpAddr);
 5770     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5771     __ add(tmpAddr, coeffs, 0);
 5772     store64shorts(vs2, tmpAddr);
 5773 
 5774     // tmpAddr now contains coeffs + 128 because store64shorts post-incremented it
 5775     load64shorts(vs1, tmpAddr);
 5776     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5777     __ add(tmpAddr, coeffs, 128);
 5778     store64shorts(vs2, tmpAddr);
 5779 
 5780     // now tmpAddr contains coeffs + 256
 5781     load64shorts(vs1, tmpAddr);
 5782     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5783     __ add(tmpAddr, coeffs, 256);
 5784     store64shorts(vs2, tmpAddr);
 5785 
 5786     // now tmpAddr contains coeffs + 384
 5787     load64shorts(vs1, tmpAddr);
 5788     kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
 5789     __ add(tmpAddr, coeffs, 384);
 5790     store64shorts(vs2, tmpAddr);
 5791 
 5792     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5793     __ mov(r0, zr); // return 0
 5794     __ ret(lr);
 5795 
 5796     return start;
 5797   }
 5798 
 5799   // Kyber multiply polynomials in the NTT domain.
 5800   // Implements
 5801   // static int implKyberNttMult(
 5802   //              short[] result, short[] ntta, short[] nttb, short[] zetas) {}
 5803   //
 5804   // result (short[256]) = c_rarg0
 5805   // ntta (short[256]) = c_rarg1
 5806   // nttb (short[256]) = c_rarg2
 5807   // zetas (short[128]) = c_rarg3
 5808   address generate_kyberNttMult() {
 5809 
 5810     __ align(CodeEntryAlignment);
 5811     StubId stub_id = StubId::stubgen_kyberNttMult_id;
 5812     StubCodeMark mark(this, stub_id);
 5813     address start = __ pc();
 5814     __ enter();
 5815 
 5816     const Register result = c_rarg0;
 5817     const Register ntta = c_rarg1;
 5818     const Register nttb = c_rarg2;
 5819     const Register zetas = c_rarg3;
 5820 
 5821     const Register kyberConsts = r10;
 5822     const Register limit = r11;
 5823 
 5824     VSeq<4> vs1(0), vs2(4);  // 4 sets of 8x8H inputs/outputs/tmps
 5825     VSeq<4> vs3(16), vs4(20);
 5826     VSeq<2> vq(30);          // pair of constants for montmul: q, qinv
 5827     VSeq<2> vz(28);          // pair of zetas
 5828     VSeq<4> vc(27, 0);       // constant sequence for montmul: montRSquareModQ
 5829 
 5830     __ lea(kyberConsts,
 5831              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5832 
 5833     Label kyberNttMult_loop;
 5834 
 5835     __ add(limit, result, 512);
 5836 
 5837     // load q and qinv
 5838     vs_ldpq(vq, kyberConsts);
 5839 
 5840     // load R^2 mod q (to convert back from Montgomery representation)
 5841     __ add(kyberConsts, kyberConsts, 64);
 5842     __ ldr(v27, __ Q, kyberConsts);
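
          // The loop below implements the ML-KEM base multiplication: each pair
          // of adjacent coefficients is treated as a degree-1 polynomial and the
          // product is taken modulo (X^2 - zeta). As an illustrative identity
          // (not generated code):
          //
          //   (a0 + a1*X) * (b0 + b1*X) mod (X^2 - zeta)
          //     = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
          //
          // with the products computed as Montgomery multiplications and a final
          // montmul by montRSquareModQ to convert back from the Montgomery
          // representation.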
 5843 
 5844     __ BIND(kyberNttMult_loop);
 5845 
 5846     // load 16 zetas
 5847     vs_ldpq_post(vz, zetas);
 5848 
 5849     // load 2 sets of 32 coefficients from the two input arrays
 5850     // interleaved as shorts, i.e. pairs of shorts adjacent in memory
 5851     // are striped across pairs of vector registers
 5852     vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
 5853     vs_ld2_post(vs_back(vs1), __ T8H, nttb);  // <b0, b1> x 8H
 5854     vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
 5855     vs_ld2_post(vs_back(vs4), __ T8H, nttb);  // <b2, b3> x 8H
 5856 
 5857     // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
 5858     // i.e. montmul the first and second halves of vs1 in order and
 5859     // then with one sequence reversed storing the two results in vs3
 5860     //
 5861     // vs3[0] <- montmul(a0, b0)
 5862     // vs3[1] <- montmul(a1, b1)
 5863     // vs3[2] <- montmul(a0, b1)
 5864     // vs3[3] <- montmul(a1, b0)
 5865     kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
 5866     kyber_montmul16(vs_back(vs3),
 5867                     vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
 5868 
 5869     // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
 5870     // i.e. montmul the first and second halves of vs4 in order and
 5871     // then with one sequence reversed storing the two results in vs1
 5872     //
 5873     // vs1[0] <- montmul(a2, b2)
 5874     // vs1[1] <- montmul(a3, b3)
 5875     // vs1[2] <- montmul(a2, b3)
 5876     // vs1[3] <- montmul(a3, b2)
 5877     kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
 5878     kyber_montmul16(vs_back(vs1),
 5879                     vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
 5880 
 5881     // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3), by a zeta.
 5882     // We can schedule two montmuls at a time if we use a suitable vector
 5883     // sequence <vs3[1], vs1[1]>.
 5884     int delta = vs1[1]->encoding() - vs3[1]->encoding();
 5885     VSeq<2> vs5(vs3[1], delta);
 5886 
 5887     // vs3[1] <- montmul(montmul(a1, b1), z0)
 5888     // vs1[1] <- montmul(montmul(a3, b3), z1)
 5889     kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
 5890 
 5891     // add results in pairs storing in vs3
 5892     // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
 5893     // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
 5894     vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
 5895 
 5896     // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
 5897     // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
 5898     vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
 5899 
 5900     // vs1 <- montmul(vs3, montRSquareModQ)
 5901     kyber_montmul32(vs1, vs3, vc, vs2, vq);
 5902 
 5903     // store back the two pairs of result vectors de-interleaved as 8H elements
 5904     // i.e. storing each pairs of shorts striped across a register pair adjacent
 5905     // in memory
 5906     vs_st2_post(vs1, __ T8H, result);
 5907 
 5908     __ cmp(result, limit);
 5909     __ br(Assembler::NE, kyberNttMult_loop);
 5910 
 5911     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5912     __ mov(r0, zr); // return 0
 5913     __ ret(lr);
 5914 
 5915     return start;
 5916   }
 5917 
 5918   // Kyber add 2 polynomials.
 5919   // Implements
 5920   // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
 5921   //
 5922   // result (short[256]) = c_rarg0
 5923   // a (short[256]) = c_rarg1
 5924   // b (short[256]) = c_rarg2
 5925   address generate_kyberAddPoly_2() {
 5926 
 5927     __ align(CodeEntryAlignment);
 5928     StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
 5929     StubCodeMark mark(this, stub_id);
 5930     address start = __ pc();
 5931     __ enter();
 5932 
 5933     const Register result = c_rarg0;
 5934     const Register a = c_rarg1;
 5935     const Register b = c_rarg2;
 5936 
 5937     const Register kyberConsts = r11;
 5938 
 5939     // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
 5940     // So, we can load, add and store the data in 3 groups of 11,
 5941     // 11 and 10 at a time i.e. we need to map sets of 10 or 11
 5942     // registers. A further constraint is that the mapping needs
 5943     // to skip callee saves. So, we allocate the register
 5944     // sequences using two 8 sequences, two 2 sequences and two
 5945     // single registers.
 5946     VSeq<8> vs1_1(0);
 5947     VSeq<2> vs1_2(16);
 5948     FloatRegister vs1_3 = v28;
 5949     VSeq<8> vs2_1(18);
 5950     VSeq<2> vs2_2(26);
 5951     FloatRegister vs2_3 = v29;
 5952 
 5953     // two constant vector sequences
 5954     VSeq<8> vc_1(31, 0);
 5955     VSeq<2> vc_2(31, 0);
 5956 
 5957     FloatRegister vc_3 = v31;
 5958     __ lea(kyberConsts,
 5959              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 5960 
 5961     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
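          // Scalar equivalent of the loop below (illustrative): for each index i
          //   result[i] = a[i] + b[i] + q
          // where q is the constant loaded above and broadcast via vc_1/vc_2/vc_3.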
 5962     for (int i = 0; i < 3; i++) {
 5963       // load 80 or 88 values from a into vs1_1/2/3
 5964       vs_ldpq_post(vs1_1, a);
 5965       vs_ldpq_post(vs1_2, a);
 5966       if (i < 2) {
 5967         __ ldr(vs1_3, __ Q, __ post(a, 16));
 5968       }
 5969       // load 80 or 88 values from b into vs2_1/2/3
 5970       vs_ldpq_post(vs2_1, b);
 5971       vs_ldpq_post(vs2_2, b);
 5972       if (i < 2) {
 5973         __ ldr(vs2_3, __ Q, __ post(b, 16));
 5974       }
 5975       // sum 80 or 88 values across vs1 and vs2 into vs1
 5976       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 5977       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 5978       if (i < 2) {
 5979         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 5980       }
 5981       // add constant to all 80 or 88 results
 5982       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 5983       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 5984       if (i < 2) {
 5985         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 5986       }
 5987       // store 80 or 88 values
 5988       vs_stpq_post(vs1_1, result);
 5989       vs_stpq_post(vs1_2, result);
 5990       if (i < 2) {
 5991         __ str(vs1_3, __ Q, __ post(result, 16));
 5992       }
 5993     }
 5994 
 5995     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5996     __ mov(r0, zr); // return 0
 5997     __ ret(lr);
 5998 
 5999     return start;
 6000   }
 6001 
 6002   // Kyber add 3 polynomials.
 6003   // Implements
 6004   // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
 6005   //
 6006   // result (short[256]) = c_rarg0
 6007   // a (short[256]) = c_rarg1
 6008   // b (short[256]) = c_rarg2
 6009   // c (short[256]) = c_rarg3
 6010   address generate_kyberAddPoly_3() {
 6011 
 6012     __ align(CodeEntryAlignment);
 6013     StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
 6014     StubCodeMark mark(this, stub_id);
 6015     address start = __ pc();
 6016     __ enter();
 6017 
 6018     const Register result = c_rarg0;
 6019     const Register a = c_rarg1;
 6020     const Register b = c_rarg2;
 6021     const Register c = c_rarg3;
 6022 
 6023     const Register kyberConsts = r11;
 6024 
 6025     // As above we sum 256 sets of values in total i.e. 32 x 8H
 6026     // quadwords.  So, we can load, add and store the data in 3
 6027     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6028     // of 10 or 11 registers. A further constraint is that the
 6029     // mapping needs to skip callee saves. So, we allocate the
 6030     // register sequences using two 8 sequences, two 2 sequences
 6031     // and two single registers.
 6032     VSeq<8> vs1_1(0);
 6033     VSeq<2> vs1_2(16);
 6034     FloatRegister vs1_3 = v28;
 6035     VSeq<8> vs2_1(18);
 6036     VSeq<2> vs2_2(26);
 6037     FloatRegister vs2_3 = v29;
 6038 
 6039     // two constant vector sequences
 6040     VSeq<8> vc_1(31, 0);
 6041     VSeq<2> vc_2(31, 0);
 6042 
 6043     FloatRegister vc_3 = v31;
 6044 
 6045     __ lea(kyberConsts,
 6046              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6047 
 6048     __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
 6049     for (int i = 0; i < 3; i++) {
 6050       // load 80 or 88 values from a into vs1_1/2/3
 6051       vs_ldpq_post(vs1_1, a);
 6052       vs_ldpq_post(vs1_2, a);
 6053       if (i < 2) {
 6054         __ ldr(vs1_3, __ Q, __ post(a, 16));
 6055       }
 6056       // load 80 or 88 values from b into vs2_1/2/3
 6057       vs_ldpq_post(vs2_1, b);
 6058       vs_ldpq_post(vs2_2, b);
 6059       if (i < 2) {
 6060         __ ldr(vs2_3, __ Q, __ post(b, 16));
 6061       }
 6062       // sum 80 or 88 values across vs1 and vs2 into vs1
 6063       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6064       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6065       if (i < 2) {
 6066         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6067       }
 6068       // load 80 or 88 values from c into vs2_1/2/3
 6069       vs_ldpq_post(vs2_1, c);
 6070       vs_ldpq_post(vs2_2, c);
 6071       if (i < 2) {
 6072         __ ldr(vs2_3, __ Q, __ post(c, 16));
 6073       }
 6074       // sum 80 or 88 values across vs1 and vs2 into vs1
 6075       vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
 6076       vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
 6077       if (i < 2) {
 6078         __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
 6079       }
 6080       // add constant to all 80 or 88 results
 6081       vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
 6082       vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
 6083       if (i < 2) {
 6084         __ addv(vs1_3, __ T8H, vs1_3, vc_3);
 6085       }
 6086       // store 80 or 88 values
 6087       vs_stpq_post(vs1_1, result);
 6088       vs_stpq_post(vs1_2, result);
 6089       if (i < 2) {
 6090         __ str(vs1_3, __ Q, __ post(result, 16));
 6091       }
 6092     }
 6093 
 6094     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6095     __ mov(r0, zr); // return 0
 6096     __ ret(lr);
 6097 
 6098     return start;
 6099   }
 6100 
 6101   // Kyber parse XOF output to polynomial coefficient candidates
 6102   // or decodePoly(12, ...).
 6103   // Implements
 6104   // static int implKyber12To16(
 6105   //         byte[] condensed, int index, short[] parsed, int parsedLength) {}
 6106   //
 6107   // (parsedLength or (parsedLength - 48) must be divisible by 64.)
 6108   //
 6109   // condensed (byte[]) = c_rarg0
 6110   // condensedIndex = c_rarg1
 6111   // parsed (short[112 or 256]) = c_rarg2
 6112   // parsedLength (112 or 256) = c_rarg3
 6113   address generate_kyber12To16() {
 6114     Label L_F00, L_loop, L_end;
 6115 
 6116     __ align(CodeEntryAlignment);
 6117     StubId stub_id = StubId::stubgen_kyber12To16_id;
 6118     StubCodeMark mark(this, stub_id);
 6119     address start = __ pc();
 6120     __ enter();
 6121 
 6122     const Register condensed = c_rarg0;
 6123     const Register condensedOffs = c_rarg1;
 6124     const Register parsed = c_rarg2;
 6125     const Register parsedLength = c_rarg3;
 6126 
 6127     const Register tmpAddr = r11;
 6128 
 6129     // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
 6130     // quadwords so we need a 6 vector sequence for the inputs.
 6131     // Parsing produces 64 shorts, employing two 8 vector
 6132     // sequences to store and combine the intermediate data.
 6133     VSeq<6> vin(24);
 6134     VSeq<8> va(0), vb(16);
 6135 
 6136     __ adr(tmpAddr, L_F00);
 6137     __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
 6138     __ add(condensed, condensed, condensedOffs);
 6139 
 6140     __ BIND(L_loop);
 6141     // load 96 (6 x 16B) byte values
 6142     vs_ld3_post(vin, __ T16B, condensed);
 6143 
 6144     // The front half of sequence vin (vin[0], vin[1] and vin[2])
 6145     // holds 48 (16x3) contiguous bytes from memory striped
 6146     // horizontally across each of the 16 byte lanes. Equivalently,
 6147     // that is 16 pairs of 12-bit integers. Likewise the back half
 6148     // holds the next 48 bytes in the same arrangement.
 6149 
 6150     // Each vector in the front half can also be viewed as a vertical
 6151     // strip across the 16 pairs of 12 bit integers. Each byte in
 6152     // vin[0] stores the low 8 bits of the first int in a pair. Each
 6153     // byte in vin[1] stores the high 4 bits of the first int and the
 6154     // low 4 bits of the second int. Each byte in vin[2] stores the
 6155     // high 8 bits of the second int. Likewise the vectors in second
 6156     // half.
 6157 
 6158     // Converting the data to 16-bit shorts requires first of all
 6159     // expanding each of the 6 x 16B vectors into 6 corresponding
 6160     // pairs of 8H vectors. Mask, shift and add operations on the
 6161     // resulting vector pairs can be used to combine 4 and 8 bit
 6162     // parts of related 8H vector elements.
 6163     //
 6164     // The middle vectors (vin[1] and vin[4]) are actually expanded
 6165     // twice, one copy manipulated to provide the lower 4 bits
 6166     // belonging to the first short in a pair and another copy
 6167     // manipulated to provide the higher 4 bits belonging to the
 6168     // second short in a pair. This is why the vector sequences va
 6169     // and vb used to hold the expanded 8H elements are of length 8.
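
          // As an illustrative scalar reference (not generated code), each
          // 3-byte group (b0, b1, b2) of the condensed input decodes to two
          // 12-bit values
          //
          //   s0 = b0 | ((b1 & 0x0f) << 8);
          //   s1 = (b1 >> 4) | (b2 << 4);
          //
          // The expands, shifts, masks and adds below compute these two sums
          // for 64 shorts at a time.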
 6170 
 6171     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6172     // n.b. target elements 2 and 3 duplicate elements 4 and 5
 6173     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6174     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6175     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6176     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6177     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6178     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6179 
 6180     // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
 6181     // and vb[4:5]
 6182     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6183     __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
 6184     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6185     __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
 6186     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6187     __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
 6188 
 6189     // shift lo byte of copy 1 of the middle stripe into the high byte
 6190     __ shl(va[2], __ T8H, va[2], 8);
 6191     __ shl(va[3], __ T8H, va[3], 8);
 6192     __ shl(vb[2], __ T8H, vb[2], 8);
 6193     __ shl(vb[3], __ T8H, vb[3], 8);
 6194 
 6195     // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
 6196     // time pre-shifted by 4 to ensure top bits of input 12-bit int
 6197     // are in bit positions [4..11].
 6198     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6199     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6200     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6201     __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
 6202 
 6203     // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
 6204     // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
 6205     // copy2
 6206     __ andr(va[2], __ T16B, va[2], v31);
 6207     __ andr(va[3], __ T16B, va[3], v31);
 6208     __ ushr(va[4], __ T8H, va[4], 4);
 6209     __ ushr(va[5], __ T8H, va[5], 4);
 6210     __ andr(vb[2], __ T16B, vb[2], v31);
 6211     __ andr(vb[3], __ T16B, vb[3], v31);
 6212     __ ushr(vb[4], __ T8H, vb[4], 4);
 6213     __ ushr(vb[5], __ T8H, vb[5], 4);
 6214 
 6215     // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
 6216     // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
 6217     // n.b. the ordering ensures: i) inputs are consumed before they
 6218     // are overwritten ii) the order of 16-bit results across successive
 6219     // pairs of vectors in va and then vb reflects the order of the
 6220     // corresponding 12-bit inputs
 6221     __ addv(va[0], __ T8H, va[0], va[2]);
 6222     __ addv(va[2], __ T8H, va[1], va[3]);
 6223     __ addv(va[1], __ T8H, va[4], va[6]);
 6224     __ addv(va[3], __ T8H, va[5], va[7]);
 6225     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6226     __ addv(vb[2], __ T8H, vb[1], vb[3]);
 6227     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6228     __ addv(vb[3], __ T8H, vb[5], vb[7]);
 6229 
 6230     // store 64 results interleaved as shorts
 6231     vs_st2_post(vs_front(va), __ T8H, parsed);
 6232     vs_st2_post(vs_front(vb), __ T8H, parsed);
 6233 
 6234     __ sub(parsedLength, parsedLength, 64);
 6235     __ cmp(parsedLength, (u1)64);
 6236     __ br(Assembler::GE, L_loop);
 6237     __ cbz(parsedLength, L_end);
 6238 
 6239     // If anything is left it should be a final 72 bytes of input,
 6240     // i.e. a final 48 12-bit values. We handle this by loading
 6241     // 48 bytes into all 16B lanes of front(vin) and only 24
 6242     // bytes into the lower 8B halves of back(vin)
 6243     vs_ld3_post(vs_front(vin), __ T16B, condensed);
 6244     vs_ld3(vs_back(vin), __ T8B, condensed);
 6245 
 6246     // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
 6247     // n.b. target elements 2 and 3 of va duplicate elements 4 and
 6248     // 5 and target element 2 of vb duplicates element 4.
 6249     __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
 6250     __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
 6251     __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
 6252     __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
 6253     __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
 6254     __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
 6255 
 6256     // This time expand just the lower 8 lanes
 6257     __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
 6258     __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
 6259     __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
 6260 
 6261     // shift lo byte of copy 1 of the middle stripe into the high byte
 6262     __ shl(va[2], __ T8H, va[2], 8);
 6263     __ shl(va[3], __ T8H, va[3], 8);
 6264     __ shl(vb[2], __ T8H, vb[2], 8);
 6265 
 6266     // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
 6267     // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
 6268     // int are in bit positions [4..11].
 6269     __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
 6270     __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
 6271     __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
 6272 
 6273     // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
 6274     // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
 6275     // copy2
 6276     __ andr(va[2], __ T16B, va[2], v31);
 6277     __ andr(va[3], __ T16B, va[3], v31);
 6278     __ ushr(va[4], __ T8H, va[4], 4);
 6279     __ ushr(va[5], __ T8H, va[5], 4);
 6280     __ andr(vb[2], __ T16B, vb[2], v31);
 6281     __ ushr(vb[4], __ T8H, vb[4], 4);
 6282 
 6285     // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
 6286     // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
 6287 
 6288     // n.b. ordering ensures: i) inputs are consumed before they are
 6289     // overwritten ii) order of 16-bit results across successive
 6290     // pairs of vectors in va and then lower half of vb reflects order
 6291     // of corresponding 12-bit inputs
 6292     __ addv(va[0], __ T8H, va[0], va[2]);
 6293     __ addv(va[2], __ T8H, va[1], va[3]);
 6294     __ addv(va[1], __ T8H, va[4], va[6]);
 6295     __ addv(va[3], __ T8H, va[5], va[7]);
 6296     __ addv(vb[0], __ T8H, vb[0], vb[2]);
 6297     __ addv(vb[1], __ T8H, vb[4], vb[6]);
 6298 
 6299     // store 48 results interleaved as shorts
 6300     vs_st2_post(vs_front(va), __ T8H, parsed);
 6301     vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
 6302 
 6303     __ BIND(L_end);
 6304 
 6305     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6306     __ mov(r0, zr); // return 0
 6307     __ ret(lr);
 6308 
 6309     // bind label and generate constant data used by this stub
 6310     __ BIND(L_F00);
 6311     __ emit_int64(0x0f000f000f000f00);
 6312     __ emit_int64(0x0f000f000f000f00);
 6313 
 6314     return start;
 6315   }
 6316 
 6317   // Kyber Barrett reduce function.
 6318   // Implements
 6319   // static int implKyberBarrettReduce(short[] coeffs) {}
 6320   //
 6321   // coeffs (short[256]) = c_rarg0
 6322   address generate_kyberBarrettReduce() {
 6323 
 6324     __ align(CodeEntryAlignment);
 6325     StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
 6326     StubCodeMark mark(this, stub_id);
 6327     address start = __ pc();
 6328     __ enter();
 6329 
 6330     const Register coeffs = c_rarg0;
 6331 
 6332     const Register kyberConsts = r10;
 6333     const Register result = r11;
 6334 
 6335     // As above we process 256 sets of values in total i.e. 32 x
 6336     // 8H quadwords. So, we can load, add and store the data in 3
 6337     // groups of 11, 11 and 10 at a time i.e. we need to map sets
 6338     // of 10 or 11 registers. A further constraint is that the
 6339     // mapping needs to skip callee saves. So, we allocate the
 6340     // register sequences using two 8 sequences, two 2 sequences
 6341     // and two single registers.
 6342     VSeq<8> vs1_1(0);
 6343     VSeq<2> vs1_2(16);
 6344     FloatRegister vs1_3 = v28;
 6345     VSeq<8> vs2_1(18);
 6346     VSeq<2> vs2_2(26);
 6347     FloatRegister vs2_3 = v29;
 6348 
 6349     // we also need a pair of corresponding constant sequences
 6350 
 6351     VSeq<8> vc1_1(30, 0);
 6352     VSeq<2> vc1_2(30, 0);
 6353     FloatRegister vc1_3 = v30; // for kyber_q
 6354 
 6355     VSeq<8> vc2_1(31, 0);
 6356     VSeq<2> vc2_2(31, 0);
 6357     FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
 6358 
 6359     __ add(result, coeffs, 0);
 6360     __ lea(kyberConsts,
 6361              ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
 6362 
 6363     // load q and the multiplier for the Barrett reduction
 6364     __ add(kyberConsts, kyberConsts, 16);
 6365     __ ldpq(vc1_3, vc2_3, kyberConsts);
 6366 
 6367     for (int i = 0; i < 3; i++) {
 6368       // load 80 or 88 coefficients
 6369       vs_ldpq_post(vs1_1, coeffs);
 6370       vs_ldpq_post(vs1_2, coeffs);
 6371       if (i < 2) {
 6372         __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
 6373       }
 6374 
 6375       // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
 6376       vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
 6377       vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
 6378       if (i < 2) {
 6379         __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
 6380       }
 6381 
 6382       // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
 6383       vs_sshr(vs2_1, __ T8H, vs2_1, 11);
 6384       vs_sshr(vs2_2, __ T8H, vs2_2, 11);
 6385       if (i < 2) {
 6386         __ sshr(vs2_3, __ T8H, vs2_3, 11);
 6387       }
 6388 
 6389       // vs1 <- vs1 - vs2 * kyber_q
 6390       vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
 6391       vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
 6392       if (i < 2) {
 6393         __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
 6394       }
 6395 
 6396       vs_stpq_post(vs1_1, result);
 6397       vs_stpq_post(vs1_2, result);
 6398       if (i < 2) {
 6399         __ str(vs1_3, __ Q, __ post(result, 16));
 6400       }
 6401     }
 6402 
 6403     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6404     __ mov(r0, zr); // return 0
 6405     __ ret(lr);
 6406 
 6407     return start;
 6408   }
 6409 
 6410 
 6411   // Dilithium-specific montmul helper routines that generate parallel
 6412   // code for, respectively, a single 4x4s vector sequence montmul or
 6413   // two such multiplies in a row.
 6414 
 6415   // Perform 16 32-bit Montgomery multiplications in parallel
 6416   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 6417                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6418     // Use the helper routine to schedule a 4x4S Montgomery multiply.
 6419     // It will assert that the register use is valid
 6420     vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
 6421   }
 6422 
 6423   // Perform 2x16 32-bit Montgomery multiplications in parallel
 6424   void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 6425                            const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6426     // Schedule two successive 4x4S multiplies via the montmul helper
 6427     // on the front and back halves of va, vb and vc. The helper will
 6428     // assert that the register use has no overlap conflicts on each
 6429     // individual call but we also need to ensure that the necessary
 6430     // disjoint/equality constraints are met across both calls.
 6431 
 6432     // vb, vc, vtmp and vq must be disjoint. va must either be
 6433     // disjoint from all other registers or equal vc
 6434 
 6435     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 6436     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 6437     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 6438 
 6439     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 6440     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 6441 
 6442     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 6443 
 6444     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 6445     assert(vs_disjoint(va, vb), "va and vb overlap");
 6446     assert(vs_disjoint(va, vq), "va and vq overlap");
 6447     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 6448 
 6449     // We multiply the front and back halves of each sequence 4 at a
 6450     // time because
 6451     //
 6452     // 1) we are currently only able to get 4-way instruction
 6453     // parallelism at best
 6454     //
 6455     // 2) we need registers for the constants in vq and temporary
 6456     // scratch registers to hold intermediate results so vtmp can only
 6457     // be a VSeq<4> which means we only have 4 scratch slots.
 6458 
 6459     vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
 6460     vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
 6461   }
 6462 
 6463   // Perform combined montmul then add/sub on 4x4S vectors.
 6464   void dilithium_montmul16_sub_add(
 6465           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 6466           const VSeq<4>& vtmp, const VSeq<2>& vq) {
 6467     // compute a = montmul(a1, c)
 6468     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 6469     // output a1 = a0 - a
 6470     vs_subv(va1, __ T4S, va0, vc);
 6471     //    and a0 = a0 + a
 6472     vs_addv(va0, __ T4S, va0, vc);
 6473   }
 6474 
 6475   // Perform combined add/sub then montmul on 4x4S vectors.
 6476   void dilithium_sub_add_montmul16(
 6477           const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 6478           const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 6479     // compute c = a0 - a1
 6480     vs_subv(vtmp1, __ T4S, va0, va1);
 6481     // output a0 = a0 + a1
 6482     vs_addv(va0, __ T4S, va0, va1);
 6483     // output a1 = b montmul c
 6484     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 6485   }
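
        // These two helpers mirror the Kyber butterfly helpers above but
        // operate on 4S (32-bit) lanes: the first (multiply, then add/sub) is
        // the forward-NTT butterfly form, the second (add/sub, then multiply)
        // the inverse-NTT form.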
 6486 
 6487   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6488   // in the Java implementation come in sequences of at least 8, so we
 6489   // can use ldpq to collect the corresponding data into pairs of vector
 6490   // registers.
 6491   // We collect the coefficients corresponding to the 'j+l' indexes into
 6492   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 6493   // then we do the (Montgomery) multiplications by the zetas in parallel
 6494   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 6495   // v0-v7, then do the additions into v24-v31 and the subtractions into
 6496   // v0-v7 and finally save the results back to the coeffs array.
 6497   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 6498     const Register coeffs, const Register zetas) {
 6499     int c1 = 0;
 6500     int c2 = 512;
 6501     int startIncr;
 6502     // don't use callee save registers v8 - v15
 6503     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4S inputs/outputs
 6504     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6505     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6506     int offsets[4] = { 0, 32, 64, 96 };
 6507 
 6508     for (int level = 0; level < 5; level++) {
 6509       int c1Start = c1;
 6510       int c2Start = c2;
 6511       if (level == 3) {
 6512         offsets[1] = 32;
 6513         offsets[2] = 128;
 6514         offsets[3] = 160;
 6515       } else if (level == 4) {
 6516         offsets[1] = 64;
 6517         offsets[2] = 128;
 6518         offsets[3] = 192;
 6519       }
 6520 
 6521       // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
 6522       // time at 4 different offsets and multiply them in order by the
 6523       // next set of input values. So we employ indexed load and store
 6524       // pair instructions with arrangement 4S.
 6525       for (int i = 0; i < 4; i++) {
 6526         // reload q and qinv
 6527         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6528         // load 8x4S coefficients via second start pos == c2
 6529         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 6530         // load next 8x4S inputs == b
 6531         vs_ldpq_post(vs2, zetas);
 6532         // compute a == c2 * b mod MONT_Q
 6533         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6534         // load 8x4s coefficients via first start pos == c1
 6535         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6536         // compute a1 =  c1 + a
 6537         vs_addv(vs3, __ T4S, vs1, vs2);
 6538         // compute a2 =  c1 - a
 6539         vs_subv(vs1, __ T4S, vs1, vs2);
 6540         // output a1 and a2
 6541         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6542         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 6543 
 6544         int k = 4 * level + i;
 6545 
 6546         if (k > 7) {
 6547           startIncr = 256;
 6548         } else if (k == 5) {
 6549           startIncr = 384;
 6550         } else {
 6551           startIncr = 128;
 6552         }
 6553 
 6554         c1Start += startIncr;
 6555         c2Start += startIncr;
 6556       }
 6557 
 6558       c2 /= 2;
 6559     }
 6560   }
 6561 
 6562   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 6563   // Implements the method
 6564   // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
 6565   // of the Java class sun.security.provider
 6566   //
 6567   // coeffs (int[256]) = c_rarg0
 6568   // zetas (int[256]) = c_rarg1
 6569   address generate_dilithiumAlmostNtt() {
 6570 
 6571     __ align(CodeEntryAlignment);
 6572     StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
 6573     StubCodeMark mark(this, stub_id);
 6574     address start = __ pc();
 6575     __ enter();
 6576 
 6577     const Register coeffs = c_rarg0;
 6578     const Register zetas = c_rarg1;
 6579 
 6580     const Register tmpAddr = r9;
 6581     const Register dilithiumConsts = r10;
 6582     const Register result = r11;
 6583     // don't use callee save registers v8 - v15
 6584     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4S inputs/outputs
 6585     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6586     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6587     int offsets[4] = { 0, 32, 64, 96};
 6588     int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6589     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6590     __ add(result, coeffs, 0);
 6591     __ lea(dilithiumConsts,
 6592              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6593 
 6594     // Each level represents one iteration of the outer for loop of the Java version.
 6595 
 6596     // level 0-4
 6597     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 6598 
 6599     // level 5
 6600 
 6601     // At level 5 the coefficients we need to combine with the zetas
 6602     // are grouped in memory in blocks of size 4. So, for both sets of
 6603     // coefficients we load 4 adjacent values at 8 different offsets
 6604     // using an indexed ldr with register variant Q and multiply them
 6605     // in sequence order by the next set of inputs. Likewise we store
 6606     // the results using an indexed str with register variant Q.
 6607     for (int i = 0; i < 1024; i += 256) {
 6608       // reload constants q, qinv each iteration as they get clobbered later
 6609       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6610       // load 32 (8x4S) coefficients via first offsets = c1
 6611       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6612       // load next 32 (8x4S) inputs = b
 6613       vs_ldpq_post(vs2, zetas);
 6614       // a = b montmul c1
 6615       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6616       // load 32 (8x4S) coefficients via second offsets = c2
 6617       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 6618       // add/sub with result of multiply
 6619       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 6620       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 6621       // write back new coefficients using same offsets
 6622       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 6623       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 6624     }
 6625 
 6626     // level 6
 6627     // At level 6 the coefficients we need to combine with the zetas
 6628     // are grouped in memory in pairs, the first two being montmul
 6629     // inputs and the second add/sub inputs. We can still implement
 6630     // the montmul+sub+add using 4-way parallelism but only if we
 6631     // combine the coefficients with the zetas 16 at a time. We load 8
 6632     // adjacent values at 4 different offsets using an ld2 load with
 6633     // arrangement 2D. That interleaves the lower and upper halves of
 6634     // each pair of quadwords into successive vector registers. We
 6635     // then need to montmul the 4 even elements of the coefficients
 6636     // register sequence by the zetas in order and then add/sub the 4
 6637     // odd elements of the coefficients register sequence. We use an
 6638     // equivalent st2 operation to store the results back into memory
 6639     // de-interleaved.
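      //
      // Scalar equivalent of one level-6 butterfly (l == 2); the ld2/st2
      // with arrangement 2D only reorders lanes so that 16 such butterflies
      // can be computed per loop iteration:
      //   int t         = montMul(zeta, coeffs[j + 2]);
      //   coeffs[j + 2] = coeffs[j] - t;
      //   coeffs[j]     = coeffs[j] + t;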
 6640     for (int i = 0; i < 1024; i += 128) {
 6641       // reload constants q, qinv each iteration as they get clobbered later
 6642       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6643       // load interleaved 16 (4x2D) coefficients via offsets
 6644       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6645       // load next 16 (4x4S) inputs
 6646       vs_ldpq_post(vs_front(vs2), zetas);
 6647       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6648       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6649                                   vs_front(vs2), vtmp, vq);
 6650       // store interleaved 16 (4x2D) coefficients via offsets
 6651       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6652     }
 6653 
 6654     // level 7
 6655     // At level 7 the coefficients we need to combine with the zetas
 6656     // occur singly with montmul inputs alternating with add/sub
 6657     // inputs. Once again we can use 4-way parallelism to combine 16
 6658     // zetas at a time. However, we have to load 8 adjacent values at
 6659     // 4 different offsets using an ld2 load with arrangement 4S. That
 6660     // interleaves the odd words of each pair into one
 6661     // coefficients vector register and the even words of the pair
 6662     // into the next register. We then need to montmul the 4 even
 6663     // elements of the coefficients register sequence by the zetas in
 6664     // order and then add/sub the 4 odd elements of the coefficients
 6665     // register sequence. We use an equivalent st2 operation to store
 6666     // the results back into memory de-interleaved.
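      //
      // Scalar equivalent of one level-7 butterfly (l == 1, adjacent pairs):
      //   int t         = montMul(zeta, coeffs[j + 1]);
      //   coeffs[j + 1] = coeffs[j] - t;
      //   coeffs[j]     = coeffs[j] + t;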
 6667 
 6668     for (int i = 0; i < 1024; i += 128) {
 6669       // reload constants q, qinv each iteration as they get clobbered later
 6670       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6671       // load interleaved 16 (4x4S) coefficients via offsets
 6672       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6673       // load next 16 (4x4S) inputs
 6674       vs_ldpq_post(vs_front(vs2), zetas);
 6675       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 6676       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 6677                                   vs_front(vs2), vtmp, vq);
 6678       // store interleaved 16 (4x4S) coefficients via offsets
 6679       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6680     }
 6681     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6682     __ mov(r0, zr); // return 0
 6683     __ ret(lr);
 6684 
 6685     return start;
 6686   }
 6687 
 6688   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 6689   // in the Java implementation come in sequences of at least 8, so we
 6690   // can use ldpq to collect the corresponding data into pairs of vector
 6691   // registers
 6692   // We collect the coefficients that correspond to the 'j's into vs1
 6693   // and the coefficients that correspond to the 'j+l's into vs2, then
 6694   // do the additions into vs3 and the subtractions into vs1, then
 6695   // save the result of the additions, load the zetas into vs2,
 6696   // do the (Montgomery) multiplications by zeta in parallel into vs2
 6697   // and finally save the results back to the coeffs array.
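        //
        // In scalar terms each butterfly performed here is roughly the
        // following (illustrative pseudocode, montMul standing for the
        // Montgomery multiplication modulo q):
        //   int a0 = coeffs[j] + coeffs[j + l];
        //   int a1 = coeffs[j] - coeffs[j + l];
        //   coeffs[j]     = a0;
        //   coeffs[j + l] = montMul(zeta, a1);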
 6698   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 6699     const Register coeffs, const Register zetas) {
 6700     int c1 = 0;
 6701     int c2 = 32;
 6702     int startIncr;
 6703     int offsets[4];
 6704     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6705     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6706     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6707 
 6708     offsets[0] = 0;
 6709 
 6710     for (int level = 3; level < 8; level++) {
 6711       int c1Start = c1;
 6712       int c2Start = c2;
 6713       if (level == 3) {
 6714         offsets[1] = 64;
 6715         offsets[2] = 128;
 6716         offsets[3] = 192;
 6717       } else if (level == 4) {
 6718         offsets[1] = 32;
 6719         offsets[2] = 128;
 6720         offsets[3] = 160;
 6721       } else {
 6722         offsets[1] = 32;
 6723         offsets[2] = 64;
 6724         offsets[3] = 96;
 6725       }
 6726 
 6727       // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
 6728       // time at 4 different offsets and multiply them in order by the
 6729       // next set of input values. So we employ indexed load and store
 6730       // pair instructions with arrangement 4S.
 6731       for (int i = 0; i < 4; i++) {
 6732         // load v1 32 (8x4S) coefficients relative to first start index
 6733         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 6734         // load v2 32 (8x4S) coefficients relative to second start index
 6735         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 6736         // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
 6737         vs_addv(vs3, __ T4S, vs1, vs2);
 6738         // a1 = v1 - v2
 6739         vs_subv(vs1, __ T4S, vs1, vs2);
 6740         // save a0 relative to first start index
 6741         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 6742         // load constants q, qinv each iteration as they get clobbered above
 6743         vs_ldpq(vq, dilithiumConsts); // qInv, q
 6744         // load b next 32 (8x4S) inputs
 6745         vs_ldpq_post(vs2, zetas);
 6746         // a = a1 montmul b
 6747         dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6748         // save a relative to second start index
 6749         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 6750 
 6751         int k = 4 * level + i;
 6752 
 6753         if (k < 24) {
 6754           startIncr = 256;
 6755         } else if (k == 25) {
 6756           startIncr = 384;
 6757         } else {
 6758           startIncr = 128;
 6759         }
 6760 
 6761         c1Start += startIncr;
 6762         c2Start += startIncr;
 6763       }
 6764 
 6765       c2 *= 2;
 6766     }
 6767   }
 6768 
 6769   // Dilithium Inverse NTT function except the final mod Q division by 2^256.
 6770   // Implements the method
 6771   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 6772   // the sun.security.provider.ML_DSA class.
 6773   //
 6774   // coeffs (int[256]) = c_rarg0
 6775   // zetas (int[256]) = c_rarg1
 6776   address generate_dilithiumAlmostInverseNtt() {
 6777 
 6778     __ align(CodeEntryAlignment);
 6779     StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
 6780     StubCodeMark mark(this, stub_id);
 6781     address start = __ pc();
 6782     __ enter();
 6783 
 6784     const Register coeffs = c_rarg0;
 6785     const Register zetas = c_rarg1;
 6786 
 6787     const Register tmpAddr = r9;
 6788     const Register dilithiumConsts = r10;
 6789     const Register result = r11;
 6790     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6791     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 6792     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6793     int offsets[4] = { 0, 32, 64, 96 };
 6794     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 6795     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 6796 
 6797     __ add(result, coeffs, 0);
 6798     __ lea(dilithiumConsts,
 6799              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6800 
 6801     // Each level represents one iteration of the outer for loop of the Java version
 6802 
 6803     // level 0
 6804     // At level 0 we need to interleave adjacent quartets of
 6805     // coefficients before we multiply and add/sub by the next 16
 6806     // zetas just as we did for level 7 in the multiply code. So we
 6807     // load and store the values using an ld2/st2 with arrangement 4S.
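      //
      // Scalar equivalent of one level-0 inverse butterfly (l == 1), with
      // montMul the Montgomery multiplication modulo q:
      //   int a0 = coeffs[j] + coeffs[j + 1];
      //   int a1 = montMul(zeta, coeffs[j] - coeffs[j + 1]);
      //   coeffs[j]     = a0;
      //   coeffs[j + 1] = a1;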
 6808     for (int i = 0; i < 1024; i += 128) {
 6809       // load constants q, qinv
 6810       // n.b. this can be moved out of the loop as they do not get
 6811       // clobbered by first two loops
 6812       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6813       // a0/a1 load interleaved 32 (8x4S) coefficients
 6814       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6815       // b load next 32 (8x4S) inputs
 6816       vs_ldpq_post(vs_front(vs2), zetas);
 6817       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6818       // n.b. second half of vs2 provides temporary register storage
 6819       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6820                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6821       // a0/a1 store interleaved 32 (8x4S) coefficients
 6822       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 6823     }
 6824 
 6825     // level 1
 6826     // At level 1 we need to interleave pairs of adjacent pairs of
 6827     // coefficients before we multiply by the next 16 zetas just as we
 6828     // did for level 6 in the multiply code. So we load and store the
 6829     // values using an ld2/st2 with arrangement 2D.
 6830     for (int i = 0; i < 1024; i += 128) {
 6831       // a0/a1 load interleaved 32 (8x2D) coefficients
 6832       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6833       // b load next 16 (4x4S) inputs
 6834       vs_ldpq_post(vs_front(vs2), zetas);
 6835       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 6836       // n.b. second half of vs2 provides temporary register storage
 6837       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 6838                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 6839       // a0/a1 store interleaved 32 (8x2D) coefficients
 6840       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 6841     }
 6842 
 6843     // level 2
 6844     // At level 2 coefficients come in blocks of 4. So, we load 4
 6845     // adjacent coefficients at 8 distinct offsets for both the first
 6846     // and second coefficient sequences, using an ldr with register
 6847     // variant Q then combine them with next set of 32 zetas. Likewise
 6848     // we store the results using an str with register variant Q.
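      //
      // Scalar equivalent of one level-2 inverse butterfly (l == 4):
      //   int a0 = coeffs[j] + coeffs[j + 4];
      //   int a1 = montMul(zeta, coeffs[j] - coeffs[j + 4]);
      //   coeffs[j]     = a0;
      //   coeffs[j + 4] = a1;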
 6849     for (int i = 0; i < 1024; i += 256) {
 6850       // c0 load 32 (8x4S) coefficients via first offsets
 6851       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 6852       // c1 load 32 (8x4S) coefficients via second offsets
 6853       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 6854       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 6855       vs_addv(vs3, __ T4S, vs1, vs2);
 6856       // c = c0 - c1
 6857       vs_subv(vs1, __ T4S, vs1, vs2);
 6858       // store a0 32 (8x4S) coefficients via first offsets
 6859       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 6860       // b load 32 (8x4S) next inputs
 6861       vs_ldpq_post(vs2, zetas);
 6862       // reload constants q, qinv -- they were clobbered earlier
 6863       vs_ldpq(vq, dilithiumConsts); // qInv, q
 6864       // compute a1 = b montmul c
 6865       dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6866       // store a1 32 (8x4S) coefficients via second offsets
 6867       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 6868     }
 6869 
 6870     // level 3-7
 6871     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 6872 
 6873     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6874     __ mov(r0, zr); // return 0
 6875     __ ret(lr);
 6876 
 6877     return start;
 6878   }
 6879 
 6880   // Dilithium multiply polynomials in the NTT domain.
 6881   // Straightforward implementation of the method
 6882   // static int implDilithiumNttMult(
 6883   //              int[] result, int[] ntta, int[] nttb) {} of
 6884   // the sun.security.provider.ML_DSA class.
 6885   //
 6886   // result (int[256]) = c_rarg0
 6887   // poly1 (int[256]) = c_rarg1
 6888   // poly2 (int[256]) = c_rarg2
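        //
        // In scalar terms the stub computes, for each of the 256 coefficients
        // (illustrative sketch; montMul is the Montgomery multiplication
        // modulo q and rSquare the constant loaded from the dilithiumConsts
        // table at offset 48):
        //   result[i] = montMul(rSquare, montMul(poly1[i], poly2[i]));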
 6889   address generate_dilithiumNttMult() {
 6890 
 6891     __ align(CodeEntryAlignment);
 6892     StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
 6893     StubCodeMark mark(this, stub_id);
 6894     address start = __ pc();
 6895     __ enter();
 6896 
 6897     Label L_loop;
 6898 
 6899     const Register result = c_rarg0;
 6900     const Register poly1 = c_rarg1;
 6901     const Register poly2 = c_rarg2;
 6902 
 6903     const Register dilithiumConsts = r10;
 6904     const Register len = r11;
 6905 
 6906     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6907     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 6908     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6909     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 6910 
 6911     __ lea(dilithiumConsts,
 6912              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6913 
 6914     // load constants q, qinv
 6915     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6916     // load constant rSquare into v29
 6917     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 6918 
 6919     __ mov(len, zr);
 6920     __ add(len, len, 1024);
 6921 
 6922     __ BIND(L_loop);
 6923 
 6924     // b load 32 (8x4S) next inputs from poly1
 6925     vs_ldpq_post(vs1, poly1);
 6926     // c load 32 (8x4S) next inputs from poly2
 6927     vs_ldpq_post(vs2, poly2);
 6928     // compute a = b montmul c
 6929     dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
 6930     // compute a = rsquare montmul a
 6931     dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 6932     // save a 32 (8x4S) results
 6933     vs_stpq_post(vs2, result);
 6934 
 6935     __ sub(len, len, 128);
 6936     __ cmp(len, (u1)128);
 6937     __ br(Assembler::GE, L_loop);
 6938 
 6939     __ leave(); // required for proper stackwalking of RuntimeStub frame
 6940     __ mov(r0, zr); // return 0
 6941     __ ret(lr);
 6942 
 6943     return start;
 6944   }
 6945 
 6946   // Dilithium Montgomery multiply an array by a constant.
 6947   // A straightforward implementation of the method
 6948   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 6949   // of the sun.security.provider.ML_DSA class
 6950   //
 6951   // coeffs (int[256]) = c_rarg0
 6952   // constant (int) = c_rarg1
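        //
        // Scalar view (illustrative): every coefficient is Montgomery
        // multiplied in place by the caller-supplied constant:
        //   coeffs[i] = montMul(constant, coeffs[i]);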
 6953   address generate_dilithiumMontMulByConstant() {
 6954 
 6955     __ align(CodeEntryAlignment);
 6956     StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
 6957     StubCodeMark mark(this, stub_id);
 6958     address start = __ pc();
 6959     __ enter();
 6960 
 6961     Label L_loop;
 6962 
 6963     const Register coeffs = c_rarg0;
 6964     const Register constant = c_rarg1;
 6965 
 6966     const Register dilithiumConsts = r10;
 6967     const Register result = r11;
 6968     const Register len = r12;
 6969 
 6970     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 6971     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 6972     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 6973     VSeq<8> vconst(29, 0);             // for montmul by constant
 6974 
 6975     // results track inputs
 6976     __ add(result, coeffs, 0);
 6977     __ lea(dilithiumConsts,
 6978              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 6979 
 6980     // load constants q, qinv -- they do not get clobbered by first two loops
 6981     vs_ldpq(vq, dilithiumConsts); // qInv, q
 6982     // copy caller supplied constant across vconst
 6983     __ dup(vconst[0], __ T4S, constant);
 6984     __ mov(len, zr);
 6985     __ add(len, len, 1024);
 6986 
 6987     __ BIND(L_loop);
 6988 
 6989     // load next 32 inputs
 6990     vs_ldpq_post(vs2, coeffs);
 6991     // mont mul by constant
 6992     dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
 6993     // write next 32 results
 6994     vs_stpq_post(vs2, result);
 6995 
 6996     __ sub(len, len, 128);
 6997     __ cmp(len, (u1)128);
 6998     __ br(Assembler::GE, L_loop);
 6999 
 7000     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7001     __ mov(r0, zr); // return 0
 7002     __ ret(lr);
 7003 
 7004     return start;
 7005   }
 7006 
 7007   // Dilithium decompose poly.
 7008   // Implements the method
 7009   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
 7010   // of the sun.security.provider.ML_DSA class
 7011   //
 7012   // input (int[256]) = c_rarg0
 7013   // lowPart (int[256]) = c_rarg1
 7014   // highPart (int[256]) = c_rarg2
 7015   // twoGamma2  (int) = c_rarg3
 7016   // multiplier (int) = c_rarg4
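        //
        // Conceptually (illustrative sketch mirroring the per-element
        // pseudocode in the loop comments below), each input coefficient r
        // is first reduced into [0, q) and then split as in the ML-DSA
        // Decompose function:
        //   int r0 = r centered-mod twoGamma2;   // r0 in (-gamma2, gamma2]
        //   int r1;
        //   if (r - r0 == q - 1) { r1 = 0; r0 = r0 - 1; }
        //   else                 { r1 = (r - r0) / twoGamma2; }
        //   lowPart[m] = r0; highPart[m] = r1;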
 7017   address generate_dilithiumDecomposePoly() {
 7018 
 7019     __ align(CodeEntryAlignment);
 7020     StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
 7021     StubCodeMark mark(this, stub_id);
 7022     address start = __ pc();
 7023     Label L_loop;
 7024 
 7025     const Register input = c_rarg0;
 7026     const Register lowPart = c_rarg1;
 7027     const Register highPart = c_rarg2;
 7028     const Register twoGamma2 = c_rarg3;
 7029     const Register multiplier = c_rarg4;
 7030 
 7031     const Register len = r9;
 7032     const Register dilithiumConsts = r10;
 7033     const Register tmp = r11;
 7034 
 7035     // 6 independent sets of 4x4s values
 7036     VSeq<4> vs1(0), vs2(4), vs3(8);
 7037     VSeq<4> vs4(12), vs5(16), vtmp(20);
 7038 
 7039     // 7 constants for cross-multiplying
 7040     VSeq<4> one(25, 0);
 7041     VSeq<4> qminus1(26, 0);
 7042     VSeq<4> g2(27, 0);
 7043     VSeq<4> twog2(28, 0);
 7044     VSeq<4> mult(29, 0);
 7045     VSeq<4> q(30, 0);
 7046     VSeq<4> qadd(31, 0);
 7047 
 7048     __ enter();
 7049 
 7050     __ lea(dilithiumConsts,
 7051              ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 7052 
 7053     // save callee-saved registers
 7054     __ stpd(v8, v9, __ pre(sp, -64));
 7055     __ stpd(v10, v11, Address(sp, 16));
 7056     __ stpd(v12, v13, Address(sp, 32));
 7057     __ stpd(v14, v15, Address(sp, 48));
 7058 
 7059     // populate constant registers
 7060     __ mov(tmp, zr);
 7061     __ add(tmp, tmp, 1);
 7062     __ dup(one[0], __ T4S, tmp); // 1
 7063     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 7064     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 7065     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 7066     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 7067     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 7068     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 7069 
 7070     __ mov(len, zr);
 7071     __ add(len, len, 1024);
 7072 
 7073     __ BIND(L_loop);
 7074 
 7075     // load next 4x4S inputs interleaved: rplus --> vs1
 7076     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 7077 
 7078     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 7079     vs_addv(vtmp, __ T4S, vs1, qadd);
 7080     vs_sshr(vtmp, __ T4S, vtmp, 23);
 7081     vs_mulv(vtmp, __ T4S, vtmp, q);
 7082     vs_subv(vs1, __ T4S, vs1, vtmp);
 7083 
 7084     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 7085     vs_sshr(vtmp, __ T4S, vs1, 31);
 7086     vs_andr(vtmp, vtmp, q);
 7087     vs_addv(vs1, __ T4S, vs1, vtmp);
 7088 
 7089     // quotient --> vs2
 7090     // int quotient = (rplus * multiplier) >> 22;
 7091     vs_mulv(vtmp, __ T4S, vs1, mult);
 7092     vs_sshr(vs2, __ T4S, vtmp, 22);
 7093 
 7094     // r0 --> vs3
 7095     // int r0 = rplus - quotient * twoGamma2;
 7096     vs_mulv(vtmp, __ T4S, vs2, twog2);
 7097     vs_subv(vs3, __ T4S, vs1, vtmp);
 7098 
 7099     // mask --> vs4
 7100     // int mask = (twoGamma2 - r0) >> 22;
 7101     vs_subv(vtmp, __ T4S, twog2, vs3);
 7102     vs_sshr(vs4, __ T4S, vtmp, 22);
 7103 
 7104     // r0 -= (mask & twoGamma2);
 7105     vs_andr(vtmp, vs4, twog2);
 7106     vs_subv(vs3, __ T4S, vs3, vtmp);
 7107 
 7108     //  quotient += (mask & 1);
 7109     vs_andr(vtmp, vs4, one);
 7110     vs_addv(vs2, __ T4S, vs2, vtmp);
 7111 
 7112     // mask = (twoGamma2 / 2 - r0) >> 31;
 7113     vs_subv(vtmp, __ T4S, g2, vs3);
 7114     vs_sshr(vs4, __ T4S, vtmp, 31);
 7115 
 7116     // r0 -= (mask & twoGamma2);
 7117     vs_andr(vtmp, vs4, twog2);
 7118     vs_subv(vs3, __ T4S, vs3, vtmp);
 7119 
 7120     // quotient += (mask & 1);
 7121     vs_andr(vtmp, vs4, one);
 7122     vs_addv(vs2, __ T4S, vs2, vtmp);
 7123 
 7124     // r1 --> vs5
 7125     // int r1 = rplus - r0 - (dilithium_q - 1);
 7126     vs_subv(vtmp, __ T4S, vs1, vs3);
 7127     vs_subv(vs5, __ T4S, vtmp, qminus1);
 7128 
 7129     // r1 --> vs1 (overwriting rplus)
 7130     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 7131     vs_negr(vtmp, __ T4S, vs5);
 7132     vs_orr(vtmp, vs5, vtmp);
 7133     vs_sshr(vs1, __ T4S, vtmp, 31);
 7134 
 7135     // r0 += ~r1;
 7136     vs_notr(vtmp, vs1);
 7137     vs_addv(vs3, __ T4S, vs3, vtmp);
 7138 
 7139     // r1 = r1 & quotient;
 7140     vs_andr(vs1, vs2, vs1);
 7141 
 7142     // store results interleaved
 7143     // lowPart[m] = r0;
 7144     // highPart[m] = r1;
 7145     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 7146     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 7147 
 7148     __ sub(len, len, 64);
 7149     __ cmp(len, (u1)64);
 7150     __ br(Assembler::GE, L_loop);
 7151 
 7152     // restore callee-saved vector registers
 7153     __ ldpd(v14, v15, Address(sp, 48));
 7154     __ ldpd(v12, v13, Address(sp, 32));
 7155     __ ldpd(v10, v11, Address(sp, 16));
 7156     __ ldpd(v8, v9, __ post(sp, 64));
 7157 
 7158     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7159     __ mov(r0, zr); // return 0
 7160     __ ret(lr);
 7161 
 7162     return start;
 7163   }
 7164 
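        // Keccak chi step for one 5-lane row, computed with general-purpose
        // registers: for each lane i (indices taken mod 5) this computes
        //   a_i ^= ~a_{i+1} & a_{i+2}
        // The bic/eor ordering below lets the lanes be updated in place while
        // the original values are still available where later lanes need them.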
 7165   void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
 7166              Register tmp0, Register tmp1, Register tmp2) {
 7167     __ bic(tmp0, a2, a1); // for a0
 7168     __ bic(tmp1, a3, a2); // for a1
 7169     __ bic(tmp2, a4, a3); // for a2
 7170     __ eor(a2, a2, tmp2);
 7171     __ bic(tmp2, a0, a4); // for a3
 7172     __ eor(a3, a3, tmp2);
 7173     __ bic(tmp2, a1, a0); // for a4
 7174     __ eor(a0, a0, tmp0);
 7175     __ eor(a1, a1, tmp1);
 7176     __ eor(a4, a4, tmp2);
 7177   }
 7178 
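        // One Keccak-f[1600] round over 25 64-bit lanes held entirely in
        // general-purpose registers: theta (column parities c0..c4 and the
        // derived d0..d4), rho and pi (the chain of rol instructions below),
        // chi (the five bcax5 calls) and iota (xor of the next round constant
        // loaded from rc into lane a0). When rfp/r18 cannot be used as
        // scratch, two state lanes are temporarily spilled to the stack
        // around the theta computation.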
 7179   void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
 7180                         Register a0, Register a1, Register a2, Register a3, Register a4,
 7181                         Register a5, Register a6, Register a7, Register a8, Register a9,
 7182                         Register a10, Register a11, Register a12, Register a13, Register a14,
 7183                         Register a15, Register a16, Register a17, Register a18, Register a19,
 7184                         Register a20, Register a21, Register a22, Register a23, Register a24,
 7185                         Register tmp0, Register tmp1, Register tmp2) {
 7186     __ eor3(tmp1, a4, a9, a14);
 7187     __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
 7188     __ eor3(tmp2, a1, a6, a11);
 7189     __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
 7190     __ rax1(tmp2, tmp0, tmp1); // d0
 7191     {
 7192 
 7193       Register tmp3, tmp4;
 7194       if (can_use_fp && can_use_r18) {
 7195         tmp3 = rfp;
 7196         tmp4 = r18_tls;
 7197       } else {
 7198         tmp3 = a4;
 7199         tmp4 = a9;
 7200         __ stp(tmp3, tmp4, __ pre(sp, -16));
 7201       }
 7202 
 7203       __ eor3(tmp3, a0, a5, a10);
 7204       __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
 7205       __ eor(a0, a0, tmp2);
 7206       __ eor(a5, a5, tmp2);
 7207       __ eor(a10, a10, tmp2);
 7208       __ eor(a15, a15, tmp2);
 7209       __ eor(a20, a20, tmp2); // d0(tmp2)
 7210       __ eor3(tmp3, a2, a7, a12);
 7211       __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
 7212       __ rax1(tmp3, tmp4, tmp2); // d1
 7213       __ eor(a1, a1, tmp3);
 7214       __ eor(a6, a6, tmp3);
 7215       __ eor(a11, a11, tmp3);
 7216       __ eor(a16, a16, tmp3);
 7217       __ eor(a21, a21, tmp3); // d1(tmp3)
 7218       __ rax1(tmp3, tmp2, tmp0); // d3
 7219       __ eor3(tmp2, a3, a8, a13);
 7220       __ eor3(tmp0, tmp2, a18, a23);  // tmp0 = a3^a8^a13^a18^a23 = c3
 7221       __ eor(a3, a3, tmp3);
 7222       __ eor(a8, a8, tmp3);
 7223       __ eor(a13, a13, tmp3);
 7224       __ eor(a18, a18, tmp3);
 7225       __ eor(a23, a23, tmp3);
 7226       __ rax1(tmp2, tmp1, tmp0); // d2
 7227       __ eor(a2, a2, tmp2);
 7228       __ eor(a7, a7, tmp2);
 7229       __ eor(a12, a12, tmp2);
 7230       __ rax1(tmp0, tmp0, tmp4); // d4
 7231       if (!can_use_fp || !can_use_r18) {
 7232         __ ldp(tmp3, tmp4, __ post(sp, 16));
 7233       }
 7234       __ eor(a17, a17, tmp2);
 7235       __ eor(a22, a22, tmp2);
 7236       __ eor(a4, a4, tmp0);
 7237       __ eor(a9, a9, tmp0);
 7238       __ eor(a14, a14, tmp0);
 7239       __ eor(a19, a19, tmp0);
 7240       __ eor(a24, a24, tmp0);
 7241     }
 7242 
 7243     __ rol(tmp0, a10, 3);
 7244     __ rol(a10, a1, 1);
 7245     __ rol(a1, a6, 44);
 7246     __ rol(a6, a9, 20);
 7247     __ rol(a9, a22, 61);
 7248     __ rol(a22, a14, 39);
 7249     __ rol(a14, a20, 18);
 7250     __ rol(a20, a2, 62);
 7251     __ rol(a2, a12, 43);
 7252     __ rol(a12, a13, 25);
 7253     __ rol(a13, a19, 8);
 7254     __ rol(a19, a23, 56);
 7255     __ rol(a23, a15, 41);
 7256     __ rol(a15, a4, 27);
 7257     __ rol(a4, a24, 14);
 7258     __ rol(a24, a21, 2);
 7259     __ rol(a21, a8, 55);
 7260     __ rol(a8, a16, 45);
 7261     __ rol(a16, a5, 36);
 7262     __ rol(a5, a3, 28);
 7263     __ rol(a3, a18, 21);
 7264     __ rol(a18, a17, 15);
 7265     __ rol(a17, a11, 10);
 7266     __ rol(a11, a7, 6);
 7267     __ mov(a7, tmp0);
 7268 
 7269     bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
 7270     bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
 7271     bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
 7272     bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
 7273     bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
 7274 
 7275     __ ldr(tmp1, __ post(rc, 8));
 7276     __ eor(a0, a0, tmp1);
 7277 
 7278   }
 7279 
 7280   // Arguments:
 7281   //
 7282   // Inputs:
 7283   //   c_rarg0   - byte[]  source+offset
 7284   //   c_rarg1   - byte[]  SHA.state
 7285   //   c_rarg2   - int     block_size
 7286   //   c_rarg3   - int     offset
 7287   //   c_rarg4   - int     limit
 7288   //
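        // Conceptually (illustrative sketch; bytesToLong is a hypothetical
        // helper standing in for the little-endian byte-to-lane conversion)
        // one call absorbs block_size bytes into the 25-lane state and then
        // applies 24 Keccak-f[1600] rounds:
        //   for (int i = 0; i < block_size / 8; i++)
        //     state[i] ^= bytesToLong(buf, offset + 8 * i);
        //   for (int r = 0; r < 24; r++)
        //     keccakRound(state, round_consts[r]);
        // The multi-block variant repeats this while the advanced offset is
        // still <= limit and returns the final offset.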
 7289   address generate_sha3_implCompress_gpr(StubId stub_id) {
 7290     bool multi_block;
 7291     switch (stub_id) {
 7292     case StubId::stubgen_sha3_implCompress_id:
 7293       multi_block = false;
 7294       break;
 7295     case StubId::stubgen_sha3_implCompressMB_id:
 7296       multi_block = true;
 7297       break;
 7298     default:
 7299       ShouldNotReachHere();
 7300     }
 7301 
 7302     static const uint64_t round_consts[24] = {
 7303       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 7304       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 7305       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 7306       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 7307       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 7308       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 7309       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 7310       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 7311     };
 7312 
 7313     __ align(CodeEntryAlignment);
 7314     StubCodeMark mark(this, stub_id);
 7315     address start = __ pc();
 7316 
 7317     Register buf           = c_rarg0;
 7318     Register state         = c_rarg1;
 7319     Register block_size    = c_rarg2;
 7320     Register ofs           = c_rarg3;
 7321     Register limit         = c_rarg4;
 7322 
 7323     // use r3..r17, r19..r28 to keep a0..a24.
 7324     // a0..a24 are respective locals from SHA3.java
 7325     Register a0 = r25,
 7326              a1 = r26,
 7327              a2 = r27,
 7328              a3 = r3,
 7329              a4 = r4,
 7330              a5 = r5,
 7331              a6 = r6,
 7332              a7 = r7,
 7333              a8 = rscratch1, // r8
 7334              a9 = rscratch2, // r9
 7335              a10 = r10,
 7336              a11 = r11,
 7337              a12 = r12,
 7338              a13 = r13,
 7339              a14 = r14,
 7340              a15 = r15,
 7341              a16 = r16,
 7342              a17 = r17,
 7343              a18 = r28,
 7344              a19 = r19,
 7345              a20 = r20,
 7346              a21 = r21,
 7347              a22 = r22,
 7348              a23 = r23,
 7349              a24 = r24;
 7350 
 7351     Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
 7352 
 7353     Label sha3_loop, rounds24_preloop, loop_body;
 7354     Label sha3_512_or_sha3_384, shake128;
 7355 
 7356     bool can_use_r18 = false;
 7357 #ifndef R18_RESERVED
 7358     can_use_r18 = true;
 7359 #endif
 7360     bool can_use_fp = !PreserveFramePointer;
 7361 
 7362     __ enter();
 7363 
 7364     // save almost all yet unsaved gpr registers on stack
 7365     __ str(block_size, __ pre(sp, -128));
 7366     if (multi_block) {
 7367       __ stpw(ofs, limit, Address(sp, 8));
 7368     }
 7369     // 8 bytes at sp+16 will be used to keep buf
 7370     __ stp(r19, r20, Address(sp, 32));
 7371     __ stp(r21, r22, Address(sp, 48));
 7372     __ stp(r23, r24, Address(sp, 64));
 7373     __ stp(r25, r26, Address(sp, 80));
 7374     __ stp(r27, r28, Address(sp, 96));
 7375     if (can_use_r18 && can_use_fp) {
 7376       __ stp(r18_tls, state, Address(sp, 112));
 7377     } else {
 7378       __ str(state, Address(sp, 112));
 7379     }
 7380 
 7381     // begin sha3 calculations: load a0..a24 from the state array
 7382     __ ldp(a0, a1, state);
 7383     __ ldp(a2, a3, Address(state, 16));
 7384     __ ldp(a4, a5, Address(state, 32));
 7385     __ ldp(a6, a7, Address(state, 48));
 7386     __ ldp(a8, a9, Address(state, 64));
 7387     __ ldp(a10, a11, Address(state, 80));
 7388     __ ldp(a12, a13, Address(state, 96));
 7389     __ ldp(a14, a15, Address(state, 112));
 7390     __ ldp(a16, a17, Address(state, 128));
 7391     __ ldp(a18, a19, Address(state, 144));
 7392     __ ldp(a20, a21, Address(state, 160));
 7393     __ ldp(a22, a23, Address(state, 176));
 7394     __ ldr(a24, Address(state, 192));
 7395 
 7396     __ BIND(sha3_loop);
 7397 
 7398     // load input
 7399     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7400     __ eor(a0, a0, tmp3);
 7401     __ eor(a1, a1, tmp2);
 7402     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7403     __ eor(a2, a2, tmp3);
 7404     __ eor(a3, a3, tmp2);
 7405     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7406     __ eor(a4, a4, tmp3);
 7407     __ eor(a5, a5, tmp2);
 7408     __ ldr(tmp3, __ post(buf, 8));
 7409     __ eor(a6, a6, tmp3);
 7410 
 7411     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 7412     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 7413 
 7414     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7415     __ eor(a7, a7, tmp3);
 7416     __ eor(a8, a8, tmp2);
 7417     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7418     __ eor(a9, a9, tmp3);
 7419     __ eor(a10, a10, tmp2);
 7420     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7421     __ eor(a11, a11, tmp3);
 7422     __ eor(a12, a12, tmp2);
 7423     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7424     __ eor(a13, a13, tmp3);
 7425     __ eor(a14, a14, tmp2);
 7426     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7427     __ eor(a15, a15, tmp3);
 7428     __ eor(a16, a16, tmp2);
 7429 
 7430     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 7431     __ andw(tmp2, block_size, 48);
 7432     __ cbzw(tmp2, rounds24_preloop);
 7433     __ tbnz(block_size, 5, shake128);
 7434     // block_size == 144, bit5 == 0, SHA3-224
 7435     __ ldr(tmp3, __ post(buf, 8));
 7436     __ eor(a17, a17, tmp3);
 7437     __ b(rounds24_preloop);
 7438 
 7439     __ BIND(shake128);
 7440     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7441     __ eor(a17, a17, tmp3);
 7442     __ eor(a18, a18, tmp2);
 7443     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7444     __ eor(a19, a19, tmp3);
 7445     __ eor(a20, a20, tmp2);
 7446     __ b(rounds24_preloop); // block_size == 168, SHAKE128
 7447 
 7448     __ BIND(sha3_512_or_sha3_384);
 7449     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7450     __ eor(a7, a7, tmp3);
 7451     __ eor(a8, a8, tmp2);
 7452     __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
 7453 
 7454     // SHA3-384
 7455     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7456     __ eor(a9, a9, tmp3);
 7457     __ eor(a10, a10, tmp2);
 7458     __ ldp(tmp3, tmp2, __ post(buf, 16));
 7459     __ eor(a11, a11, tmp3);
 7460     __ eor(a12, a12, tmp2);
 7461 
 7462     __ BIND(rounds24_preloop);
 7463     __ fmovs(v0, 24.0); // float loop counter,
 7464     __ fmovs(v1, 1.0);  // exact representation
 7465 
 7466     __ str(buf, Address(sp, 16));
 7467     __ lea(tmp3, ExternalAddress((address) round_consts));
 7468 
 7469     __ BIND(loop_body);
 7470     keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
 7471                      a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
 7472                      a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
 7473                      tmp0, tmp1, tmp2);
 7474     __ fsubs(v0, v0, v1);
 7475     __ fcmps(v0, 0.0);
 7476     __ br(__ NE, loop_body);
 7477 
 7478     if (multi_block) {
 7479       __ ldrw(block_size, sp); // block_size
 7480       __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
 7481       __ addw(tmp2, tmp2, block_size);
 7482       __ cmpw(tmp2, tmp1);
 7483       __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
 7484       __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
 7485       __ br(Assembler::LE, sha3_loop);
 7486       __ movw(c_rarg0, tmp2); // return offset
 7487     }
 7488     if (can_use_fp && can_use_r18) {
 7489       __ ldp(r18_tls, state, Address(sp, 112));
 7490     } else {
 7491       __ ldr(state, Address(sp, 112));
 7492     }
 7493     // save calculated sha3 state
 7494     __ stp(a0, a1, Address(state));
 7495     __ stp(a2, a3, Address(state, 16));
 7496     __ stp(a4, a5, Address(state, 32));
 7497     __ stp(a6, a7, Address(state, 48));
 7498     __ stp(a8, a9, Address(state, 64));
 7499     __ stp(a10, a11, Address(state, 80));
 7500     __ stp(a12, a13, Address(state, 96));
 7501     __ stp(a14, a15, Address(state, 112));
 7502     __ stp(a16, a17, Address(state, 128));
 7503     __ stp(a18, a19, Address(state, 144));
 7504     __ stp(a20, a21, Address(state, 160));
 7505     __ stp(a22, a23, Address(state, 176));
 7506     __ str(a24, Address(state, 192));
 7507 
 7508     // restore required registers from stack
 7509     __ ldp(r19, r20, Address(sp, 32));
 7510     __ ldp(r21, r22, Address(sp, 48));
 7511     __ ldp(r23, r24, Address(sp, 64));
 7512     __ ldp(r25, r26, Address(sp, 80));
 7513     __ ldp(r27, r28, Address(sp, 96));
 7514     if (can_use_fp && can_use_r18) {
 7515       __ add(rfp, sp, 128); // leave() will copy rfp to sp below
 7516     } // else no need to recalculate rfp, since it wasn't changed
 7517 
 7518     __ leave();
 7519 
 7520     __ ret(lr);
 7521 
 7522     return start;
 7523   }
 7524 
 7525   /**
 7526    *  Arguments:
 7527    *
 7528    * Inputs:
 7529    *   c_rarg0   - int crc
 7530    *   c_rarg1   - byte* buf
 7531    *   c_rarg2   - int length
 7532    *
 7533    * Output:
 7534    *       r0   - int crc result
 7535    */
 7536   address generate_updateBytesCRC32() {
 7537     assert(UseCRC32Intrinsics, "what are we doing here?");
 7538 
 7539     __ align(CodeEntryAlignment);
 7540     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
 7541     StubCodeMark mark(this, stub_id);
 7542 
 7543     address start = __ pc();
 7544 
 7545     const Register crc   = c_rarg0;  // crc
 7546     const Register buf   = c_rarg1;  // source java byte array address
 7547     const Register len   = c_rarg2;  // length
 7548     const Register table0 = c_rarg3; // crc_table address
 7549     const Register table1 = c_rarg4;
 7550     const Register table2 = c_rarg5;
 7551     const Register table3 = c_rarg6;
 7552     const Register tmp3 = c_rarg7;
 7553 
 7554     BLOCK_COMMENT("Entry:");
 7555     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7556 
 7557     __ kernel_crc32(crc, buf, len,
 7558               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7559 
 7560     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7561     __ ret(lr);
 7562 
 7563     return start;
 7564   }
 7565 
 7566   /**
 7567    *  Arguments:
 7568    *
 7569    * Inputs:
 7570    *   c_rarg0   - int crc
 7571    *   c_rarg1   - byte* buf
 7572    *   c_rarg2   - int length
 7573    *   c_rarg3   - int* table
 7574    *
 7575    * Output:
 7576    *       r0   - int crc result
 7577    */
 7578   address generate_updateBytesCRC32C() {
 7579     assert(UseCRC32CIntrinsics, "what are we doing here?");
 7580 
 7581     __ align(CodeEntryAlignment);
 7582     StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
 7583     StubCodeMark mark(this, stub_id);
 7584 
 7585     address start = __ pc();
 7586 
 7587     const Register crc   = c_rarg0;  // crc
 7588     const Register buf   = c_rarg1;  // source java byte array address
 7589     const Register len   = c_rarg2;  // length
 7590     const Register table0 = c_rarg3; // crc_table address
 7591     const Register table1 = c_rarg4;
 7592     const Register table2 = c_rarg5;
 7593     const Register table3 = c_rarg6;
 7594     const Register tmp3 = c_rarg7;
 7595 
 7596     BLOCK_COMMENT("Entry:");
 7597     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7598 
 7599     __ kernel_crc32c(crc, buf, len,
 7600               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 7601 
 7602     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7603     __ ret(lr);
 7604 
 7605     return start;
 7606   }
 7607 
 7608   /***
 7609    *  Arguments:
 7610    *
 7611    *  Inputs:
 7612    *   c_rarg0   - int   adler
 7613    *   c_rarg1   - byte* buff
 7614    *   c_rarg2   - int   len
 7615    *
 7616    * Output:
 7617    *   c_rarg0   - int adler result
 7618    */
 7619   address generate_updateBytesAdler32() {
 7620     __ align(CodeEntryAlignment);
 7621     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
 7622     StubCodeMark mark(this, stub_id);
 7623     address start = __ pc();
 7624 
 7625     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 7626 
 7627     // Aliases
 7628     Register adler  = c_rarg0;
 7629     Register s1     = c_rarg0;
 7630     Register s2     = c_rarg3;
 7631     Register buff   = c_rarg1;
 7632     Register len    = c_rarg2;
 7633     Register nmax  = r4;
 7634     Register base  = r5;
 7635     Register count = r6;
 7636     Register temp0 = rscratch1;
 7637     Register temp1 = rscratch2;
 7638     FloatRegister vbytes = v0;
 7639     FloatRegister vs1acc = v1;
 7640     FloatRegister vs2acc = v2;
 7641     FloatRegister vtable = v3;
 7642 
 7643     // Max number of bytes we can process before having to take the mod
 7644     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 7645     uint64_t BASE = 0xfff1;
 7646     uint64_t NMAX = 0x15B0;
 7647 
 7648     __ mov(base, BASE);
 7649     __ mov(nmax, NMAX);
 7650 
 7651     // Load accumulation coefficients for the upper 16 bits
 7652     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 7653     __ ld1(vtable, __ T16B, Address(temp0));
 7654 
 7655     // s1 is initialized to the lower 16 bits of adler
 7656     // s2 is initialized to the upper 16 bits of adler
 7657     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 7658     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 7659 
 7660     // The pipelined loop needs at least 16 elements for 1 iteration
 7661     // It does check this, but it is more effective to skip to the cleanup loop
 7662     __ cmp(len, (u1)16);
 7663     __ br(Assembler::HS, L_nmax);
 7664     __ cbz(len, L_combine);
 7665 
 7666     __ bind(L_simple_by1_loop);
 7667     __ ldrb(temp0, Address(__ post(buff, 1)));
 7668     __ add(s1, s1, temp0);
 7669     __ add(s2, s2, s1);
 7670     __ subs(len, len, 1);
 7671     __ br(Assembler::HI, L_simple_by1_loop);
 7672 
 7673     // s1 = s1 % BASE
 7674     __ subs(temp0, s1, base);
 7675     __ csel(s1, temp0, s1, Assembler::HS);
 7676 
 7677     // s2 = s2 % BASE
 7678     __ lsr(temp0, s2, 16);
 7679     __ lsl(temp1, temp0, 4);
 7680     __ sub(temp1, temp1, temp0);
 7681     __ add(s2, temp1, s2, ext::uxth);
 7682 
 7683     __ subs(temp0, s2, base);
 7684     __ csel(s2, temp0, s2, Assembler::HS);
 7685 
 7686     __ b(L_combine);
 7687 
 7688     __ bind(L_nmax);
 7689     __ subs(len, len, nmax);
 7690     __ sub(count, nmax, 16);
 7691     __ br(Assembler::LO, L_by16);
 7692 
 7693     __ bind(L_nmax_loop);
 7694 
 7695     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7696                                       vbytes, vs1acc, vs2acc, vtable);
 7697 
 7698     __ subs(count, count, 16);
 7699     __ br(Assembler::HS, L_nmax_loop);
 7700 
 7701     // s1 = s1 % BASE
 7702     __ lsr(temp0, s1, 16);
 7703     __ lsl(temp1, temp0, 4);
 7704     __ sub(temp1, temp1, temp0);
 7705     __ add(temp1, temp1, s1, ext::uxth);
 7706 
 7707     __ lsr(temp0, temp1, 16);
 7708     __ lsl(s1, temp0, 4);
 7709     __ sub(s1, s1, temp0);
 7710     __ add(s1, s1, temp1, ext::uxth);
 7711 
 7712     __ subs(temp0, s1, base);
 7713     __ csel(s1, temp0, s1, Assembler::HS);
 7714 
 7715     // s2 = s2 % BASE
 7716     __ lsr(temp0, s2, 16);
 7717     __ lsl(temp1, temp0, 4);
 7718     __ sub(temp1, temp1, temp0);
 7719     __ add(temp1, temp1, s2, ext::uxth);
 7720 
 7721     __ lsr(temp0, temp1, 16);
 7722     __ lsl(s2, temp0, 4);
 7723     __ sub(s2, s2, temp0);
 7724     __ add(s2, s2, temp1, ext::uxth);
 7725 
 7726     __ subs(temp0, s2, base);
 7727     __ csel(s2, temp0, s2, Assembler::HS);
 7728 
 7729     __ subs(len, len, nmax);
 7730     __ sub(count, nmax, 16);
 7731     __ br(Assembler::HS, L_nmax_loop);
 7732 
 7733     __ bind(L_by16);
 7734     __ adds(len, len, count);
 7735     __ br(Assembler::LO, L_by1);
 7736 
 7737     __ bind(L_by16_loop);
 7738 
 7739     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 7740                                       vbytes, vs1acc, vs2acc, vtable);
 7741 
 7742     __ subs(len, len, 16);
 7743     __ br(Assembler::HS, L_by16_loop);
 7744 
 7745     __ bind(L_by1);
 7746     __ adds(len, len, 15);
 7747     __ br(Assembler::LO, L_do_mod);
 7748 
 7749     __ bind(L_by1_loop);
 7750     __ ldrb(temp0, Address(__ post(buff, 1)));
 7751     __ add(s1, temp0, s1);
 7752     __ add(s2, s2, s1);
 7753     __ subs(len, len, 1);
 7754     __ br(Assembler::HS, L_by1_loop);
 7755 
 7756     __ bind(L_do_mod);
 7757     // s1 = s1 % BASE
 7758     __ lsr(temp0, s1, 16);
 7759     __ lsl(temp1, temp0, 4);
 7760     __ sub(temp1, temp1, temp0);
 7761     __ add(temp1, temp1, s1, ext::uxth);
 7762 
 7763     __ lsr(temp0, temp1, 16);
 7764     __ lsl(s1, temp0, 4);
 7765     __ sub(s1, s1, temp0);
 7766     __ add(s1, s1, temp1, ext::uxth);
 7767 
 7768     __ subs(temp0, s1, base);
 7769     __ csel(s1, temp0, s1, Assembler::HS);
 7770 
 7771     // s2 = s2 % BASE
 7772     __ lsr(temp0, s2, 16);
 7773     __ lsl(temp1, temp0, 4);
 7774     __ sub(temp1, temp1, temp0);
 7775     __ add(temp1, temp1, s2, ext::uxth);
 7776 
 7777     __ lsr(temp0, temp1, 16);
 7778     __ lsl(s2, temp0, 4);
 7779     __ sub(s2, s2, temp0);
 7780     __ add(s2, s2, temp1, ext::uxth);
 7781 
 7782     __ subs(temp0, s2, base);
 7783     __ csel(s2, temp0, s2, Assembler::HS);
 7784 
 7785     // Combine lower bits and higher bits
 7786     __ bind(L_combine);
 7787     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 7788 
 7789     __ ret(lr);
 7790 
 7791     return start;
 7792   }
 7793 
 7794   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 7795           Register temp0, Register temp1, FloatRegister vbytes,
 7796           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 7797     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 7798     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 7799     // In non-vectorized code, we update s1 and s2 as:
 7800     //   s1 <- s1 + b1
 7801     //   s2 <- s2 + s1
 7802     //   s1 <- s1 + b2
 7803     //   s2 <- s2 + s1
 7804     //   ...
 7805     //   s1 <- s1 + b16
 7806     //   s2 <- s2 + s1
 7807     // Putting above assignments together, we have:
 7808     //   s1_new = s1 + b1 + b2 + ... + b16
 7809     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 7810     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 7811     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
 7812     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 7813 
 7814     // s2 = s2 + s1 * 16
 7815     __ add(s2, s2, s1, Assembler::LSL, 4);
 7816 
 7817     // vs1acc = b1 + b2 + b3 + ... + b16
 7818     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 7819     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 7820     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 7821     __ uaddlv(vs1acc, __ T16B, vbytes);
 7822     __ uaddlv(vs2acc, __ T8H, vs2acc);
 7823 
 7824     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 7825     __ fmovd(temp0, vs1acc);
 7826     __ fmovd(temp1, vs2acc);
 7827     __ add(s1, s1, temp0);
 7828     __ add(s2, s2, temp1);
 7829   }
 7830 
 7831   /**
 7832    *  Arguments:
 7833    *
 7834    *  Input:
 7835    *    c_rarg0   - x address
 7836    *    c_rarg1   - x length
 7837    *    c_rarg2   - y address
 7838    *    c_rarg3   - y length
 7839    *    c_rarg4   - z address
 7840    */
 7841   address generate_multiplyToLen() {
 7842     __ align(CodeEntryAlignment);
 7843     StubId stub_id = StubId::stubgen_multiplyToLen_id;
 7844     StubCodeMark mark(this, stub_id);
 7845 
 7846     address start = __ pc();
 7847     const Register x     = r0;
 7848     const Register xlen  = r1;
 7849     const Register y     = r2;
 7850     const Register ylen  = r3;
 7851     const Register z     = r4;
 7852 
 7853     const Register tmp0  = r5;
 7854     const Register tmp1  = r10;
 7855     const Register tmp2  = r11;
 7856     const Register tmp3  = r12;
 7857     const Register tmp4  = r13;
 7858     const Register tmp5  = r14;
 7859     const Register tmp6  = r15;
 7860     const Register tmp7  = r16;
 7861 
 7862     BLOCK_COMMENT("Entry:");
 7863     __ enter(); // required for proper stackwalking of RuntimeStub frame
 7864     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7865     __ leave(); // required for proper stackwalking of RuntimeStub frame
 7866     __ ret(lr);
 7867 
 7868     return start;
 7869   }
 7870 
 7871   address generate_squareToLen() {
 7872     // squareToLen algorithm for sizes 1..127 described in java code works
 7873     // faster than multiply_to_len on some CPUs and slower on others, but
 7874     // multiply_to_len shows a bit better overall results
 7875     __ align(CodeEntryAlignment);
 7876     StubId stub_id = StubId::stubgen_squareToLen_id;
 7877     StubCodeMark mark(this, stub_id);
 7878     address start = __ pc();
 7879 
 7880     const Register x     = r0;
 7881     const Register xlen  = r1;
 7882     const Register z     = r2;
 7883     const Register y     = r4; // == x
 7884     const Register ylen  = r5; // == xlen
 7885 
 7886     const Register tmp0  = r3;
 7887     const Register tmp1  = r10;
 7888     const Register tmp2  = r11;
 7889     const Register tmp3  = r12;
 7890     const Register tmp4  = r13;
 7891     const Register tmp5  = r14;
 7892     const Register tmp6  = r15;
 7893     const Register tmp7  = r16;
 7894 
 7895     RegSet spilled_regs = RegSet::of(y, ylen);
 7896     BLOCK_COMMENT("Entry:");
 7897     __ enter();
 7898     __ push(spilled_regs, sp);
 7899     __ mov(y, x);
 7900     __ mov(ylen, xlen);
 7901     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 7902     __ pop(spilled_regs, sp);
 7903     __ leave();
 7904     __ ret(lr);
 7905     return start;
 7906   }
 7907 
 7908   address generate_mulAdd() {
 7909     __ align(CodeEntryAlignment);
 7910     StubId stub_id = StubId::stubgen_mulAdd_id;
 7911     StubCodeMark mark(this, stub_id);
 7912 
 7913     address start = __ pc();
 7914 
 7915     const Register out     = r0;
 7916     const Register in      = r1;
 7917     const Register offset  = r2;
 7918     const Register len     = r3;
 7919     const Register k       = r4;
 7920 
 7921     BLOCK_COMMENT("Entry:");
 7922     __ enter();
 7923     __ mul_add(out, in, offset, len, k);
 7924     __ leave();
 7925     __ ret(lr);
 7926 
 7927     return start;
 7928   }
 7929 
 7930   // Arguments:
 7931   //
 7932   // Input:
 7933   //   c_rarg0   - newArr address
 7934   //   c_rarg1   - oldArr address
 7935   //   c_rarg2   - newIdx
 7936   //   c_rarg3   - shiftCount
 7937   //   c_rarg4   - numIter
 7938   //
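        // Scalar view of what the stub computes (illustrative; indices are
        // 32-bit words relative to oldArr and to newArr + newIdx):
        //   for (int i = 0; i < numIter; i++)
        //     newArr[i] = (oldArr[i + 1] >>> shiftCount)
        //               | (oldArr[i]     <<  (32 - shiftCount));
        // The SIMD loop below handles four such words per iteration, with
        // scalar code for the remaining 1..3 words.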
 7939   address generate_bigIntegerRightShift() {
 7940     __ align(CodeEntryAlignment);
 7941     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
 7942     StubCodeMark mark(this, stub_id);
 7943     address start = __ pc();
 7944 
 7945     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 7946 
 7947     Register newArr        = c_rarg0;
 7948     Register oldArr        = c_rarg1;
 7949     Register newIdx        = c_rarg2;
 7950     Register shiftCount    = c_rarg3;
 7951     Register numIter       = c_rarg4;
 7952     Register idx           = numIter;
 7953 
 7954     Register newArrCur     = rscratch1;
 7955     Register shiftRevCount = rscratch2;
 7956     Register oldArrCur     = r13;
 7957     Register oldArrNext    = r14;
 7958 
 7959     FloatRegister oldElem0        = v0;
 7960     FloatRegister oldElem1        = v1;
 7961     FloatRegister newElem         = v2;
 7962     FloatRegister shiftVCount     = v3;
 7963     FloatRegister shiftVRevCount  = v4;
 7964 
 7965     __ cbz(idx, Exit);
 7966 
 7967     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 7968 
 7969     // left shift count
 7970     __ movw(shiftRevCount, 32);
 7971     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 7972 
 7973     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 7974     __ cmp(numIter, (u1)4);
 7975     __ br(Assembler::LT, ShiftThree);
 7976 
 7977     __ dup(shiftVCount,    __ T4S, shiftCount);
 7978     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 7979     __ negr(shiftVCount,   __ T4S, shiftVCount);
 7980 
 7981     __ BIND(ShiftSIMDLoop);
 7982 
 7983     // Calculate the load addresses
 7984     __ sub(idx, idx, 4);
 7985     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 7986     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 7987     __ add(oldArrCur,  oldArrNext, 4);
 7988 
 7989     // Load 4 words and process
 7990     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 7991     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 7992     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 7993     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 7994     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 7995     __ st1(newElem,   __ T4S,  Address(newArrCur));
 7996 
 7997     __ cmp(idx, (u1)4);
 7998     __ br(Assembler::LT, ShiftTwoLoop);
 7999     __ b(ShiftSIMDLoop);
 8000 
 8001     __ BIND(ShiftTwoLoop);
 8002     __ cbz(idx, Exit);
 8003     __ cmp(idx, (u1)1);
 8004     __ br(Assembler::EQ, ShiftOne);
 8005 
 8006     // Calculate the load addresses
 8007     __ sub(idx, idx, 2);
 8008     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 8009     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 8010     __ add(oldArrCur,  oldArrNext, 4);
 8011 
 8012     // Load 2 words and process
 8013     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 8014     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 8015     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 8016     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 8017     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 8018     __ st1(newElem,   __ T2S, Address(newArrCur));
 8019     __ b(ShiftTwoLoop);
 8020 
 8021     __ BIND(ShiftThree);
 8022     __ tbz(idx, 1, ShiftOne);
 8023     __ tbz(idx, 0, ShiftTwo);
 8024     __ ldrw(r10,  Address(oldArr, 12));
 8025     __ ldrw(r11,  Address(oldArr, 8));
 8026     __ lsrvw(r10, r10, shiftCount);
 8027     __ lslvw(r11, r11, shiftRevCount);
 8028     __ orrw(r12,  r10, r11);
 8029     __ strw(r12,  Address(newArr, 8));
 8030 
 8031     __ BIND(ShiftTwo);
 8032     __ ldrw(r10,  Address(oldArr, 8));
 8033     __ ldrw(r11,  Address(oldArr, 4));
 8034     __ lsrvw(r10, r10, shiftCount);
 8035     __ lslvw(r11, r11, shiftRevCount);
 8036     __ orrw(r12,  r10, r11);
 8037     __ strw(r12,  Address(newArr, 4));
 8038 
 8039     __ BIND(ShiftOne);
 8040     __ ldrw(r10,  Address(oldArr, 4));
 8041     __ ldrw(r11,  Address(oldArr));
 8042     __ lsrvw(r10, r10, shiftCount);
 8043     __ lslvw(r11, r11, shiftRevCount);
 8044     __ orrw(r12,  r10, r11);
 8045     __ strw(r12,  Address(newArr));
 8046 
 8047     __ BIND(Exit);
 8048     __ ret(lr);
 8049 
 8050     return start;
 8051   }
 8052 
 8053   // Arguments:
 8054   //
 8055   // Input:
 8056   //   c_rarg0   - newArr address
 8057   //   c_rarg1   - oldArr address
 8058   //   c_rarg2   - newIdx
 8059   //   c_rarg3   - shiftCount
 8060   //   c_rarg4   - numIter
 8061   //
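        // Scalar view of what the stub computes (illustrative; indices are
        // 32-bit words relative to oldArr and to newArr + newIdx):
        //   for (int i = 0; i < numIter; i++)
        //     newArr[i] = (oldArr[i]     <<  shiftCount)
        //               | (oldArr[i + 1] >>> (32 - shiftCount));
        // The SIMD loop below handles four such words per iteration, with
        // scalar code for the remaining 1..3 words.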
 8062   address generate_bigIntegerLeftShift() {
 8063     __ align(CodeEntryAlignment);
 8064     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
 8065     StubCodeMark mark(this, stub_id);
 8066     address start = __ pc();
 8067 
 8068     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 8069 
 8070     Register newArr        = c_rarg0;
 8071     Register oldArr        = c_rarg1;
 8072     Register newIdx        = c_rarg2;
 8073     Register shiftCount    = c_rarg3;
 8074     Register numIter       = c_rarg4;
 8075 
 8076     Register shiftRevCount = rscratch1;
 8077     Register oldArrNext    = rscratch2;
 8078 
 8079     FloatRegister oldElem0        = v0;
 8080     FloatRegister oldElem1        = v1;
 8081     FloatRegister newElem         = v2;
 8082     FloatRegister shiftVCount     = v3;
 8083     FloatRegister shiftVRevCount  = v4;
 8084 
 8085     __ cbz(numIter, Exit);
 8086 
 8087     __ add(oldArrNext, oldArr, 4);
 8088     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 8089 
 8090     // right shift count
 8091     __ movw(shiftRevCount, 32);
 8092     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 8093 
 8094     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar tail
 8095     __ cmp(numIter, (u1)4);
 8096     __ br(Assembler::LT, ShiftThree);
 8097 
 8098     __ dup(shiftVCount,     __ T4S, shiftCount);
 8099     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 8100     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 8101 
 8102     __ BIND(ShiftSIMDLoop);
 8103 
 8104     // load 4 words and process
 8105     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 8106     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 8107     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 8108     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 8109     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 8110     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 8111     __ sub(numIter,   numIter, 4);
 8112 
 8113     __ cmp(numIter, (u1)4);
 8114     __ br(Assembler::LT, ShiftTwoLoop);
 8115     __ b(ShiftSIMDLoop);
 8116 
 8117     __ BIND(ShiftTwoLoop);
 8118     __ cbz(numIter, Exit);
 8119     __ cmp(numIter, (u1)1);
 8120     __ br(Assembler::EQ, ShiftOne);
 8121 
 8122     // load 2 words and process
 8123     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 8124     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 8125     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 8126     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 8127     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 8128     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 8129     __ sub(numIter,   numIter, 2);
 8130     __ b(ShiftTwoLoop);
 8131 
 8132     __ BIND(ShiftThree);
 8133     __ ldrw(r10,  __ post(oldArr, 4));
 8134     __ ldrw(r11,  __ post(oldArrNext, 4));
 8135     __ lslvw(r10, r10, shiftCount);
 8136     __ lsrvw(r11, r11, shiftRevCount);
 8137     __ orrw(r12,  r10, r11);
 8138     __ strw(r12,  __ post(newArr, 4));
 8139     __ tbz(numIter, 1, Exit);
 8140     __ tbz(numIter, 0, ShiftOne);
 8141 
 8142     __ BIND(ShiftTwo);
 8143     __ ldrw(r10,  __ post(oldArr, 4));
 8144     __ ldrw(r11,  __ post(oldArrNext, 4));
 8145     __ lslvw(r10, r10, shiftCount);
 8146     __ lsrvw(r11, r11, shiftRevCount);
 8147     __ orrw(r12,  r10, r11);
 8148     __ strw(r12,  __ post(newArr, 4));
 8149 
 8150     __ BIND(ShiftOne);
 8151     __ ldrw(r10,  Address(oldArr));
 8152     __ ldrw(r11,  Address(oldArrNext));
 8153     __ lslvw(r10, r10, shiftCount);
 8154     __ lsrvw(r11, r11, shiftRevCount);
 8155     __ orrw(r12,  r10, r11);
 8156     __ strw(r12,  Address(newArr));
 8157 
 8158     __ BIND(Exit);
 8159     __ ret(lr);
 8160 
 8161     return start;
 8162   }
 8163 
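         // ary1   = r1 - byte array address
         // len    = r2 - number of bytes to scan
         // result = r0 - already holds a copy of len on entry (see below); on return
         //          it is len when no byte with the sign bit set was found, otherwise
         //          a conservative count of the leading bytes guaranteed to be
         //          positive (see the RET_ADJUST paths at the end of the stub)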
 8164   address generate_count_positives(address &count_positives_long) {
 8165     const u1 large_loop_size = 64;
 8166     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 8167     int dcache_line = VM_Version::dcache_line_size();
 8168 
 8169     Register ary1 = r1, len = r2, result = r0;
 8170 
 8171     __ align(CodeEntryAlignment);
 8172 
 8173     StubId stub_id = StubId::stubgen_count_positives_id;
 8174     StubCodeMark mark(this, stub_id);
 8175 
 8176     address entry = __ pc();
 8177 
 8178     __ enter();
 8179     // precondition: a copy of len is already in result
 8180     // __ mov(result, len);
 8181 
 8182   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 8183         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 8184 
 8185   __ cmp(len, (u1)15);
 8186   __ br(Assembler::GT, LEN_OVER_15);
  8187   // The only case when execution falls into this code is when the pointer is near
  8188   // the end of a memory page and we have to avoid reading the next page
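         // For example, with len == 5 the 8-byte load below starts 3 bytes before the
         // array; the lsrv by (8 - len) * 8 == 24 bits then discards exactly those
         // lower-addressed (little-endian, low-order) bytes before the sign-bit test.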
 8189   __ add(ary1, ary1, len);
 8190   __ subs(len, len, 8);
 8191   __ br(Assembler::GT, LEN_OVER_8);
 8192   __ ldr(rscratch2, Address(ary1, -8));
 8193   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 8194   __ lsrv(rscratch2, rscratch2, rscratch1);
 8195   __ tst(rscratch2, UPPER_BIT_MASK);
 8196   __ csel(result, zr, result, Assembler::NE);
 8197   __ leave();
 8198   __ ret(lr);
 8199   __ bind(LEN_OVER_8);
 8200   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  8201   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
 8202   __ tst(rscratch2, UPPER_BIT_MASK);
 8203   __ br(Assembler::NE, RET_NO_POP);
 8204   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 8205   __ lsrv(rscratch1, rscratch1, rscratch2);
 8206   __ tst(rscratch1, UPPER_BIT_MASK);
 8207   __ bind(RET_NO_POP);
 8208   __ csel(result, zr, result, Assembler::NE);
 8209   __ leave();
 8210   __ ret(lr);
 8211 
 8212   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 8213   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 8214 
 8215   count_positives_long = __ pc(); // 2nd entry point
 8216 
 8217   __ enter();
 8218 
 8219   __ bind(LEN_OVER_15);
 8220     __ push(spilled_regs, sp);
 8221     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 8222     __ cbz(rscratch2, ALIGNED);
 8223     __ ldp(tmp6, tmp1, Address(ary1));
 8224     __ mov(tmp5, 16);
  8225     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
 8226     __ add(ary1, ary1, rscratch1);
 8227     __ orr(tmp6, tmp6, tmp1);
 8228     __ tst(tmp6, UPPER_BIT_MASK);
 8229     __ br(Assembler::NE, RET_ADJUST);
 8230     __ sub(len, len, rscratch1);
 8231 
 8232   __ bind(ALIGNED);
 8233     __ cmp(len, large_loop_size);
 8234     __ br(Assembler::LT, CHECK_16);
  8235     // Perform a 16-byte load as an early return in the pre-loop to handle the
  8236     // situation when an initially aligned large array has negative values at its
  8237     // starting bytes, so LARGE_LOOP would do 4 reads instead of 1 (in the worst
  8238     // case), which is slower. Cases with negative bytes further ahead won't be
  8239     // affected much. In fact, it'll be faster due to the early loads, fewer
  8240     // instructions and fewer branches in LARGE_LOOP.
 8241     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 8242     __ sub(len, len, 16);
 8243     __ orr(tmp6, tmp6, tmp1);
 8244     __ tst(tmp6, UPPER_BIT_MASK);
 8245     __ br(Assembler::NE, RET_ADJUST_16);
 8246     __ cmp(len, large_loop_size);
 8247     __ br(Assembler::LT, CHECK_16);
 8248 
 8249     if (SoftwarePrefetchHintDistance >= 0
 8250         && SoftwarePrefetchHintDistance >= dcache_line) {
 8251       // initial prefetch
 8252       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 8253     }
 8254   __ bind(LARGE_LOOP);
 8255     if (SoftwarePrefetchHintDistance >= 0) {
 8256       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 8257     }
  8258     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
  8259     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
  8260     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
  8261     // 3 instructions per iteration and has fewer branches. However, this approach
  8262     // disables the early return, so all 64 bytes are loaded and checked every time.
 8263     __ ldp(tmp2, tmp3, Address(ary1));
 8264     __ ldp(tmp4, tmp5, Address(ary1, 16));
 8265     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 8266     __ ldp(tmp6, tmp1, Address(ary1, 48));
 8267     __ add(ary1, ary1, large_loop_size);
 8268     __ sub(len, len, large_loop_size);
 8269     __ orr(tmp2, tmp2, tmp3);
 8270     __ orr(tmp4, tmp4, tmp5);
 8271     __ orr(rscratch1, rscratch1, rscratch2);
 8272     __ orr(tmp6, tmp6, tmp1);
 8273     __ orr(tmp2, tmp2, tmp4);
 8274     __ orr(rscratch1, rscratch1, tmp6);
 8275     __ orr(tmp2, tmp2, rscratch1);
 8276     __ tst(tmp2, UPPER_BIT_MASK);
 8277     __ br(Assembler::NE, RET_ADJUST_LONG);
 8278     __ cmp(len, large_loop_size);
 8279     __ br(Assembler::GE, LARGE_LOOP);
 8280 
 8281   __ bind(CHECK_16); // small 16-byte load pre-loop
 8282     __ cmp(len, (u1)16);
 8283     __ br(Assembler::LT, POST_LOOP16);
 8284 
 8285   __ bind(LOOP16); // small 16-byte load loop
 8286     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 8287     __ sub(len, len, 16);
 8288     __ orr(tmp2, tmp2, tmp3);
 8289     __ tst(tmp2, UPPER_BIT_MASK);
 8290     __ br(Assembler::NE, RET_ADJUST_16);
 8291     __ cmp(len, (u1)16);
 8292     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 8293 
 8294   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 8295     __ cmp(len, (u1)8);
 8296     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 8297     __ ldr(tmp3, Address(__ post(ary1, 8)));
 8298     __ tst(tmp3, UPPER_BIT_MASK);
 8299     __ br(Assembler::NE, RET_ADJUST);
 8300     __ sub(len, len, 8);
 8301 
 8302   __ bind(POST_LOOP16_LOAD_TAIL);
 8303     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 8304     __ ldr(tmp1, Address(ary1));
 8305     __ mov(tmp2, 64);
 8306     __ sub(tmp4, tmp2, len, __ LSL, 3);
 8307     __ lslv(tmp1, tmp1, tmp4);
 8308     __ tst(tmp1, UPPER_BIT_MASK);
 8309     __ br(Assembler::NE, RET_ADJUST);
 8310     // Fallthrough
 8311 
 8312   __ bind(RET_LEN);
 8313     __ pop(spilled_regs, sp);
 8314     __ leave();
 8315     __ ret(lr);
 8316 
  8317     // The difference result - len is the count of bytes that are
  8318     // guaranteed to be positive
 8319 
 8320   __ bind(RET_ADJUST_LONG);
 8321     __ add(len, len, (u1)(large_loop_size - 16));
 8322   __ bind(RET_ADJUST_16);
 8323     __ add(len, len, 16);
 8324   __ bind(RET_ADJUST);
 8325     __ pop(spilled_regs, sp);
 8326     __ leave();
 8327     __ sub(result, result, len);
 8328     __ ret(lr);
 8329 
 8330     return entry;
 8331   }
 8332 
 8333   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 8334         bool usePrefetch, Label &NOT_EQUAL) {
 8335     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8336         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8337         tmp7 = r12, tmp8 = r13;
 8338     Label LOOP;
 8339 
 8340     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8341     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8342     __ bind(LOOP);
 8343     if (usePrefetch) {
 8344       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8345       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8346     }
 8347     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8348     __ eor(tmp1, tmp1, tmp2);
 8349     __ eor(tmp3, tmp3, tmp4);
 8350     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8351     __ orr(tmp1, tmp1, tmp3);
 8352     __ cbnz(tmp1, NOT_EQUAL);
 8353     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8354     __ eor(tmp5, tmp5, tmp6);
 8355     __ eor(tmp7, tmp7, tmp8);
 8356     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 8357     __ orr(tmp5, tmp5, tmp7);
 8358     __ cbnz(tmp5, NOT_EQUAL);
 8359     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 8360     __ eor(tmp1, tmp1, tmp2);
 8361     __ eor(tmp3, tmp3, tmp4);
 8362     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 8363     __ orr(tmp1, tmp1, tmp3);
 8364     __ cbnz(tmp1, NOT_EQUAL);
 8365     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 8366     __ eor(tmp5, tmp5, tmp6);
 8367     __ sub(cnt1, cnt1, 8 * wordSize);
 8368     __ eor(tmp7, tmp7, tmp8);
 8369     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
  8370     // tmp6 is not used. MacroAssembler::subs is used here (rather than
  8371     // cmp) because subs allows an unlimited range of immediate operands.
 8372     __ subs(tmp6, cnt1, loopThreshold);
 8373     __ orr(tmp5, tmp5, tmp7);
 8374     __ cbnz(tmp5, NOT_EQUAL);
 8375     __ br(__ GE, LOOP);
 8376     // post-loop
 8377     __ eor(tmp1, tmp1, tmp2);
 8378     __ eor(tmp3, tmp3, tmp4);
 8379     __ orr(tmp1, tmp1, tmp3);
 8380     __ sub(cnt1, cnt1, 2 * wordSize);
 8381     __ cbnz(tmp1, NOT_EQUAL);
 8382   }
 8383 
 8384   void generate_large_array_equals_loop_simd(int loopThreshold,
 8385         bool usePrefetch, Label &NOT_EQUAL) {
 8386     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8387         tmp2 = rscratch2;
 8388     Label LOOP;
 8389 
 8390     __ bind(LOOP);
 8391     if (usePrefetch) {
 8392       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 8393       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 8394     }
 8395     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 8396     __ sub(cnt1, cnt1, 8 * wordSize);
 8397     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 8398     __ subs(tmp1, cnt1, loopThreshold);
 8399     __ eor(v0, __ T16B, v0, v4);
 8400     __ eor(v1, __ T16B, v1, v5);
 8401     __ eor(v2, __ T16B, v2, v6);
 8402     __ eor(v3, __ T16B, v3, v7);
 8403     __ orr(v0, __ T16B, v0, v1);
 8404     __ orr(v1, __ T16B, v2, v3);
 8405     __ orr(v0, __ T16B, v0, v1);
 8406     __ umov(tmp1, v0, __ D, 0);
 8407     __ umov(tmp2, v0, __ D, 1);
 8408     __ orr(tmp1, tmp1, tmp2);
 8409     __ cbnz(tmp1, NOT_EQUAL);
 8410     __ br(__ GE, LOOP);
 8411   }
 8412 
 8413   // a1 = r1 - array1 address
 8414   // a2 = r2 - array2 address
 8415   // result = r0 - return value. Already contains "false"
 8416   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 8417   // r3-r5 are reserved temporary registers
 8418   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 8419   address generate_large_array_equals() {
 8420     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 8421         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 8422         tmp7 = r12, tmp8 = r13;
 8423     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 8424         SMALL_LOOP, POST_LOOP;
 8425     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
  8426     // loop threshold that ensures at least 32 of the prefetched bytes are actually used
 8427     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 8428     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 8429     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 8430     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 8431         tmp5, tmp6, tmp7, tmp8);
 8432 
 8433     __ align(CodeEntryAlignment);
 8434 
 8435     StubId stub_id = StubId::stubgen_large_array_equals_id;
 8436     StubCodeMark mark(this, stub_id);
 8437 
 8438     address entry = __ pc();
 8439     __ enter();
 8440     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 8441     // also advance pointers to use post-increment instead of pre-increment
 8442     __ add(a1, a1, wordSize);
 8443     __ add(a2, a2, wordSize);
 8444     if (AvoidUnalignedAccesses) {
  8445       // Both implementations (SIMD/non-SIMD) use relatively large load
  8446       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
  8447       // on some CPUs when the address is not at least 16-byte aligned.
  8448       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte load,
  8449       // if needed, for at least the 1st address to make it 16-byte aligned.
 8450       Label ALIGNED16;
 8451       __ tbz(a1, 3, ALIGNED16);
 8452       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8453       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8454       __ sub(cnt1, cnt1, wordSize);
 8455       __ eor(tmp1, tmp1, tmp2);
 8456       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 8457       __ bind(ALIGNED16);
 8458     }
 8459     if (UseSIMDForArrayEquals) {
 8460       if (SoftwarePrefetchHintDistance >= 0) {
 8461         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8462         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8463         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 8464             /* prfm = */ true, NOT_EQUAL);
 8465         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8466         __ br(__ LT, TAIL);
 8467       }
 8468       __ bind(NO_PREFETCH_LARGE_LOOP);
 8469       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 8470           /* prfm = */ false, NOT_EQUAL);
 8471     } else {
 8472       __ push(spilled_regs, sp);
 8473       if (SoftwarePrefetchHintDistance >= 0) {
 8474         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 8475         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 8476         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 8477             /* prfm = */ true, NOT_EQUAL);
 8478         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 8479         __ br(__ LT, TAIL);
 8480       }
 8481       __ bind(NO_PREFETCH_LARGE_LOOP);
 8482       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 8483           /* prfm = */ false, NOT_EQUAL);
 8484     }
 8485     __ bind(TAIL);
 8486       __ cbz(cnt1, EQUAL);
 8487       __ subs(cnt1, cnt1, wordSize);
 8488       __ br(__ LE, POST_LOOP);
 8489     __ bind(SMALL_LOOP);
 8490       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 8491       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 8492       __ subs(cnt1, cnt1, wordSize);
 8493       __ eor(tmp1, tmp1, tmp2);
 8494       __ cbnz(tmp1, NOT_EQUAL);
 8495       __ br(__ GT, SMALL_LOOP);
 8496     __ bind(POST_LOOP);
 8497       __ ldr(tmp1, Address(a1, cnt1));
 8498       __ ldr(tmp2, Address(a2, cnt1));
 8499       __ eor(tmp1, tmp1, tmp2);
 8500       __ cbnz(tmp1, NOT_EQUAL);
 8501     __ bind(EQUAL);
 8502       __ mov(result, true);
 8503     __ bind(NOT_EQUAL);
 8504       if (!UseSIMDForArrayEquals) {
 8505         __ pop(spilled_regs, sp);
 8506       }
 8507     __ bind(NOT_EQUAL_NO_POP);
 8508     __ leave();
 8509     __ ret(lr);
 8510     return entry;
 8511   }
 8512 
 8513   // result = r0 - return value. Contains initial hashcode value on entry.
 8514   // ary = r1 - array address
 8515   // cnt = r2 - elements count
 8516   // Clobbers: v0-v13, rscratch1, rscratch2
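         // In effect this computes the standard Java polynomial hash (a sketch of the
         // intended result, with all arithmetic modulo 2^32):
         //   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1] * 31^0
         // vectorized with vf lanes per vector and a 4x unrolled (uf == 4) main loop.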
 8517   address generate_large_arrays_hashcode(BasicType eltype) {
 8518     const Register result = r0, ary = r1, cnt = r2;
 8519     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 8520     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 8521     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 8522     const FloatRegister vpowm = v13;
 8523 
 8524     ARRAYS_HASHCODE_REGISTERS;
 8525 
 8526     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 8527 
 8528     unsigned int vf; // vectorization factor
 8529     bool multiply_by_halves;
 8530     Assembler::SIMD_Arrangement load_arrangement;
 8531     switch (eltype) {
 8532     case T_BOOLEAN:
 8533     case T_BYTE:
 8534       load_arrangement = Assembler::T8B;
 8535       multiply_by_halves = true;
 8536       vf = 8;
 8537       break;
 8538     case T_CHAR:
 8539     case T_SHORT:
 8540       load_arrangement = Assembler::T8H;
 8541       multiply_by_halves = true;
 8542       vf = 8;
 8543       break;
 8544     case T_INT:
 8545       load_arrangement = Assembler::T4S;
 8546       multiply_by_halves = false;
 8547       vf = 4;
 8548       break;
 8549     default:
 8550       ShouldNotReachHere();
 8551     }
 8552 
 8553     // Unroll factor
 8554     const unsigned uf = 4;
 8555 
 8556     // Effective vectorization factor
 8557     const unsigned evf = vf * uf;
 8558 
 8559     __ align(CodeEntryAlignment);
 8560 
 8561     StubId stub_id;
 8562     switch (eltype) {
 8563     case T_BOOLEAN:
 8564       stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
 8565       break;
 8566     case T_BYTE:
 8567       stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
 8568       break;
 8569     case T_CHAR:
 8570       stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
 8571       break;
 8572     case T_SHORT:
 8573       stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
 8574       break;
 8575     case T_INT:
 8576       stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
 8577       break;
 8578     default:
 8579       stub_id = StubId::NO_STUBID;
 8580       ShouldNotReachHere();
 8581     };
 8582 
 8583     StubCodeMark mark(this, stub_id);
 8584 
 8585     address entry = __ pc();
 8586     __ enter();
 8587 
  8588     // Put the 0th-3rd powers of 31 together into a single SIMD register. The register will be used in
  8589     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
  8590     // value must not change throughout both loops.
 8591     __ movw(rscratch1, intpow(31U, 3));
 8592     __ mov(vpow, Assembler::S, 0, rscratch1);
 8593     __ movw(rscratch1, intpow(31U, 2));
 8594     __ mov(vpow, Assembler::S, 1, rscratch1);
 8595     __ movw(rscratch1, intpow(31U, 1));
 8596     __ mov(vpow, Assembler::S, 2, rscratch1);
 8597     __ movw(rscratch1, intpow(31U, 0));
 8598     __ mov(vpow, Assembler::S, 3, rscratch1);
 8599 
 8600     __ mov(vmul0, Assembler::T16B, 0);
 8601     __ mov(vmul0, Assembler::S, 3, result);
 8602 
 8603     __ andr(rscratch2, cnt, (uf - 1) * vf);
 8604     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 8605 
 8606     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 8607     __ mov(vpowm, Assembler::S, 0, rscratch1);
 8608 
 8609     // SMALL LOOP
 8610     __ bind(SMALL_LOOP);
 8611 
 8612     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 8613     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8614     __ subsw(rscratch2, rscratch2, vf);
 8615 
 8616     if (load_arrangement == Assembler::T8B) {
 8617       // Extend 8B to 8H to be able to use vector multiply
 8618       // instructions
 8619       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8620       if (is_signed_subword_type(eltype)) {
 8621         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8622       } else {
 8623         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8624       }
 8625     }
 8626 
 8627     switch (load_arrangement) {
 8628     case Assembler::T4S:
 8629       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8630       break;
 8631     case Assembler::T8B:
 8632     case Assembler::T8H:
 8633       assert(is_subword_type(eltype), "subword type expected");
 8634       if (is_signed_subword_type(eltype)) {
 8635         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8636       } else {
 8637         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8638       }
 8639       break;
 8640     default:
 8641       __ should_not_reach_here();
 8642     }
 8643 
 8644     // Process the upper half of a vector
 8645     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8646       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8647       if (is_signed_subword_type(eltype)) {
 8648         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8649       } else {
 8650         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8651       }
 8652     }
 8653 
 8654     __ br(Assembler::HI, SMALL_LOOP);
 8655 
  8656     // SMALL LOOP'S EPILOGUE
 8657     __ lsr(rscratch2, cnt, exact_log2(evf));
 8658     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 8659 
 8660     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8661     __ addv(vmul0, Assembler::T4S, vmul0);
 8662     __ umov(result, vmul0, Assembler::S, 0);
 8663 
 8664     // TAIL
 8665     __ bind(TAIL);
 8666 
  8667     // The andr computes cnt % vf. The subtract, shifted by 3, places the branch target past
  8668     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
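           // For example, with vf == 8 and cnt % vf == 3 the computed branch below lands
           // 3 pairs before BR_BASE, so exactly the 3 remaining tail elements are
           // accumulated into result.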
 8669     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 8670     __ andr(rscratch2, cnt, vf - 1);
 8671     __ bind(TAIL_SHORTCUT);
 8672     __ adr(rscratch1, BR_BASE);
  8673     // For Cortex-A53 the shift amount is 4 because 2 nops are generated (4 instructions per pair).
 8674     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
 8675     __ movw(rscratch2, 0x1f);
 8676     __ br(rscratch1);
 8677 
 8678     for (size_t i = 0; i < vf - 1; ++i) {
 8679       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 8680                                    eltype);
 8681       __ maddw(result, result, rscratch2, rscratch1);
 8682       // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
 8683       // Generate 2nd nop to have 4 instructions per iteration.
 8684       if (VM_Version::supports_a53mac()) {
 8685         __ nop();
 8686       }
 8687     }
 8688     __ bind(BR_BASE);
 8689 
 8690     __ leave();
 8691     __ ret(lr);
 8692 
 8693     // LARGE LOOP
 8694     __ bind(LARGE_LOOP_PREHEADER);
 8695 
 8696     __ lsr(rscratch2, cnt, exact_log2(evf));
 8697 
 8698     if (multiply_by_halves) {
 8699       // 31^4 - multiplier between lower and upper parts of a register
 8700       __ movw(rscratch1, intpow(31U, vf / 2));
 8701       __ mov(vpowm, Assembler::S, 1, rscratch1);
  8702       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 8703       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 8704       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8705     } else {
 8706       // 31^16
 8707       __ movw(rscratch1, intpow(31U, evf));
 8708       __ mov(vpowm, Assembler::S, 0, rscratch1);
 8709     }
 8710 
 8711     __ mov(vmul3, Assembler::T16B, 0);
 8712     __ mov(vmul2, Assembler::T16B, 0);
 8713     __ mov(vmul1, Assembler::T16B, 0);
 8714 
 8715     __ bind(LARGE_LOOP);
 8716 
 8717     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 8718     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 8719     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 8720     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 8721 
 8722     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 8723            Address(__ post(ary, evf * type2aelembytes(eltype))));
 8724 
 8725     if (load_arrangement == Assembler::T8B) {
 8726       // Extend 8B to 8H to be able to use vector multiply
 8727       // instructions
 8728       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 8729       if (is_signed_subword_type(eltype)) {
 8730         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8731         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8732         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8733         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8734       } else {
 8735         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 8736         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 8737         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 8738         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 8739       }
 8740     }
 8741 
 8742     switch (load_arrangement) {
 8743     case Assembler::T4S:
 8744       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 8745       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 8746       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 8747       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 8748       break;
 8749     case Assembler::T8B:
 8750     case Assembler::T8H:
 8751       assert(is_subword_type(eltype), "subword type expected");
 8752       if (is_signed_subword_type(eltype)) {
 8753         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8754         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8755         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8756         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8757       } else {
 8758         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 8759         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 8760         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 8761         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 8762       }
 8763       break;
 8764     default:
 8765       __ should_not_reach_here();
 8766     }
 8767 
 8768     // Process the upper half of a vector
 8769     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 8770       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 8771       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 8772       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 8773       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 8774       if (is_signed_subword_type(eltype)) {
 8775         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8776         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8777         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8778         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8779       } else {
 8780         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 8781         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 8782         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 8783         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 8784       }
 8785     }
 8786 
 8787     __ subsw(rscratch2, rscratch2, 1);
 8788     __ br(Assembler::HI, LARGE_LOOP);
 8789 
 8790     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 8791     __ addv(vmul3, Assembler::T4S, vmul3);
 8792     __ umov(result, vmul3, Assembler::S, 0);
 8793 
 8794     __ mov(rscratch2, intpow(31U, vf));
 8795 
 8796     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 8797     __ addv(vmul2, Assembler::T4S, vmul2);
 8798     __ umov(rscratch1, vmul2, Assembler::S, 0);
 8799     __ maddw(result, result, rscratch2, rscratch1);
 8800 
 8801     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 8802     __ addv(vmul1, Assembler::T4S, vmul1);
 8803     __ umov(rscratch1, vmul1, Assembler::S, 0);
 8804     __ maddw(result, result, rscratch2, rscratch1);
 8805 
 8806     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 8807     __ addv(vmul0, Assembler::T4S, vmul0);
 8808     __ umov(rscratch1, vmul0, Assembler::S, 0);
 8809     __ maddw(result, result, rscratch2, rscratch1);
 8810 
 8811     __ andr(rscratch2, cnt, vf - 1);
 8812     __ cbnz(rscratch2, TAIL_SHORTCUT);
 8813 
 8814     __ leave();
 8815     __ ret(lr);
 8816 
 8817     return entry;
 8818   }
 8819 
 8820   address generate_dsin_dcos(bool isCos) {
 8821     __ align(CodeEntryAlignment);
 8822     StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
 8823     StubCodeMark mark(this, stub_id);
 8824     address start = __ pc();
 8825     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 8826         (address)StubRoutines::aarch64::_two_over_pi,
 8827         (address)StubRoutines::aarch64::_pio2,
 8828         (address)StubRoutines::aarch64::_dsin_coef,
 8829         (address)StubRoutines::aarch64::_dcos_coef);
 8830     return start;
 8831   }
 8832 
  8833   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
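         // tmp2 holds the Latin1 pointer and cnt1 the UTF-16 pointer (both post-incremented);
         // the 16 Latin1 bytes are widened to UTF-16 by zip1/zip2 against a zero register and
         // compared against the UTF-16 data 8 bytes (4 characters) at a time.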
 8834   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 8835       Label &DIFF2) {
 8836     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 8837     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 8838 
 8839     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 8840     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8841     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
 8842     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
 8843 
 8844     __ fmovd(tmpL, vtmp3);
 8845     __ eor(rscratch2, tmp3, tmpL);
 8846     __ cbnz(rscratch2, DIFF2);
 8847 
 8848     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8849     __ umov(tmpL, vtmp3, __ D, 1);
 8850     __ eor(rscratch2, tmpU, tmpL);
 8851     __ cbnz(rscratch2, DIFF1);
 8852 
 8853     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 8854     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 8855     __ fmovd(tmpL, vtmp);
 8856     __ eor(rscratch2, tmp3, tmpL);
 8857     __ cbnz(rscratch2, DIFF2);
 8858 
 8859     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8860     __ umov(tmpL, vtmp, __ D, 1);
 8861     __ eor(rscratch2, tmpU, tmpL);
 8862     __ cbnz(rscratch2, DIFF1);
 8863   }
 8864 
 8865   // r0  = result
 8866   // r1  = str1
 8867   // r2  = cnt1
 8868   // r3  = str2
 8869   // r4  = cnt2
 8870   // r10 = tmp1
 8871   // r11 = tmp2
 8872   address generate_compare_long_string_different_encoding(bool isLU) {
 8873     __ align(CodeEntryAlignment);
 8874     StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
 8875     StubCodeMark mark(this, stub_id);
 8876     address entry = __ pc();
 8877     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 8878         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 8879         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 8880     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 8881         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 8882     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 8883     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 8884 
 8885     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 8886 
 8887     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
  8888     // cnt2 == number of characters left to compare
  8889     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
 8890     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8891     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 8892     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 8893     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 8894     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 8895     __ eor(rscratch2, tmp1, tmp2);
 8896     __ mov(rscratch1, tmp2);
 8897     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 8898     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 8899              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 8900     __ push(spilled_regs, sp);
 8901     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 8902     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 8903 
 8904     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 8905 
 8906     if (SoftwarePrefetchHintDistance >= 0) {
 8907       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8908       __ br(__ LT, NO_PREFETCH);
 8909       __ bind(LARGE_LOOP_PREFETCH);
 8910         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 8911         __ mov(tmp4, 2);
 8912         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8913         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 8914           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8915           __ subs(tmp4, tmp4, 1);
 8916           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 8917           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 8918           __ mov(tmp4, 2);
 8919         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 8920           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8921           __ subs(tmp4, tmp4, 1);
 8922           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 8923           __ sub(cnt2, cnt2, 64);
 8924           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 8925           __ br(__ GE, LARGE_LOOP_PREFETCH);
 8926     }
 8927     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 8928     __ bind(NO_PREFETCH);
 8929     __ subs(cnt2, cnt2, 16);
 8930     __ br(__ LT, TAIL);
 8931     __ align(OptoLoopAlignment);
 8932     __ bind(SMALL_LOOP); // smaller loop
 8933       __ subs(cnt2, cnt2, 16);
 8934       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 8935       __ br(__ GE, SMALL_LOOP);
 8936       __ cmn(cnt2, (u1)16);
 8937       __ br(__ EQ, LOAD_LAST);
 8938     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 8939       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 8940       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 8941       __ ldr(tmp3, Address(cnt1, -8));
 8942       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 8943       __ b(LOAD_LAST);
 8944     __ bind(DIFF2);
 8945       __ mov(tmpU, tmp3);
 8946     __ bind(DIFF1);
 8947       __ pop(spilled_regs, sp);
 8948       __ b(CALCULATE_DIFFERENCE);
 8949     __ bind(LOAD_LAST);
  8950       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
  8951       // No need to load them again
 8952       __ mov(tmpU, tmp3);
 8953       __ pop(spilled_regs, sp);
 8954 
 8955       // tmp2 points to the address of the last 4 Latin1 characters right now
 8956       __ ldrs(vtmp, Address(tmp2));
 8957       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 8958       __ fmovd(tmpL, vtmp);
 8959 
 8960       __ eor(rscratch2, tmpU, tmpL);
 8961       __ cbz(rscratch2, DONE);
 8962 
 8963     // Find the first different characters in the longwords and
 8964     // compute their difference.
 8965     __ bind(CALCULATE_DIFFERENCE);
 8966       __ rev(rscratch2, rscratch2);
 8967       __ clz(rscratch2, rscratch2);
 8968       __ andr(rscratch2, rscratch2, -16);
 8969       __ lsrv(tmp1, tmp1, rscratch2);
 8970       __ uxthw(tmp1, tmp1);
 8971       __ lsrv(rscratch1, rscratch1, rscratch2);
 8972       __ uxthw(rscratch1, rscratch1);
 8973       __ subw(result, tmp1, rscratch1);
 8974     __ bind(DONE);
 8975       __ ret(lr);
 8976     return entry;
 8977   }
 8978 
 8979   // r0 = input (float16)
 8980   // v0 = result (float)
 8981   // v1 = temporary float register
 8982   address generate_float16ToFloat() {
 8983     __ align(CodeEntryAlignment);
 8984     StubId stub_id = StubId::stubgen_hf2f_id;
 8985     StubCodeMark mark(this, stub_id);
 8986     address entry = __ pc();
 8987     BLOCK_COMMENT("Entry:");
 8988     __ flt16_to_flt(v0, r0, v1);
 8989     __ ret(lr);
 8990     return entry;
 8991   }
 8992 
 8993   // v0 = input (float)
 8994   // r0 = result (float16)
 8995   // v1 = temporary float register
 8996   address generate_floatToFloat16() {
 8997     __ align(CodeEntryAlignment);
 8998     StubId stub_id = StubId::stubgen_f2hf_id;
 8999     StubCodeMark mark(this, stub_id);
 9000     address entry = __ pc();
 9001     BLOCK_COMMENT("Entry:");
 9002     __ flt_to_flt16(r0, v0, v1);
 9003     __ ret(lr);
 9004     return entry;
 9005   }
 9006 
 9007   address generate_method_entry_barrier() {
 9008     __ align(CodeEntryAlignment);
 9009     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
 9010     StubCodeMark mark(this, stub_id);
 9011 
 9012     Label deoptimize_label;
 9013 
 9014     address start = __ pc();
 9015 
 9016     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 9017 
 9018     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 9019       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 9020       // We can get here despite the nmethod being good, if we have not
 9021       // yet applied our cross modification fence (or data fence).
 9022       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 9023       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 9024       __ ldrw(rscratch2, rscratch2);
 9025       __ strw(rscratch2, thread_epoch_addr);
 9026       __ isb();
 9027       __ membar(__ LoadLoad);
 9028     }
 9029 
 9030     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 9031 
 9032     __ enter();
 9033     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 9034 
 9035     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 9036 
 9037     __ push_call_clobbered_registers();
 9038 
 9039     __ mov(c_rarg0, rscratch2);
 9040     __ call_VM_leaf
 9041          (CAST_FROM_FN_PTR
 9042           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 9043 
 9044     __ reset_last_Java_frame(true);
 9045 
 9046     __ mov(rscratch1, r0);
 9047 
 9048     __ pop_call_clobbered_registers();
 9049 
 9050     __ cbnz(rscratch1, deoptimize_label);
 9051 
 9052     __ leave();
 9053     __ ret(lr);
 9054 
 9055     __ BIND(deoptimize_label);
 9056 
 9057     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 9058     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 9059 
 9060     __ mov(sp, rscratch1);
 9061     __ br(rscratch2);
 9062 
 9063     return start;
 9064   }
 9065 
 9066   // r0  = result
 9067   // r1  = str1
 9068   // r2  = cnt1
 9069   // r3  = str2
 9070   // r4  = cnt2
 9071   // r10 = tmp1
 9072   // r11 = tmp2
 9073   address generate_compare_long_string_same_encoding(bool isLL) {
 9074     __ align(CodeEntryAlignment);
 9075     StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
 9076     StubCodeMark mark(this, stub_id);
 9077     address entry = __ pc();
 9078     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9079         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 9080 
 9081     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 9082 
  9083     // exit from the large loop when fewer than 64 bytes are left to read or we're
  9084     // about to prefetch memory beyond the array boundary
 9085     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 9086 
  9087     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
 9088     __ eor(rscratch2, tmp1, tmp2);
 9089     __ cbnz(rscratch2, CAL_DIFFERENCE);
 9090 
 9091     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 9092     // update pointers, because of previous read
 9093     __ add(str1, str1, wordSize);
 9094     __ add(str2, str2, wordSize);
 9095     if (SoftwarePrefetchHintDistance >= 0) {
 9096       __ align(OptoLoopAlignment);
 9097       __ bind(LARGE_LOOP_PREFETCH);
 9098         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 9099         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 9100 
 9101         for (int i = 0; i < 4; i++) {
 9102           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 9103           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 9104           __ cmp(tmp1, tmp2);
 9105           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9106           __ br(Assembler::NE, DIFF);
 9107         }
 9108         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 9109         __ add(str1, str1, 64);
 9110         __ add(str2, str2, 64);
 9111         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 9112         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 9113         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 9114     }
 9115 
 9116     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 9117     __ br(Assembler::LE, LESS16);
 9118     __ align(OptoLoopAlignment);
 9119     __ bind(LOOP_COMPARE16);
 9120       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9121       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9122       __ cmp(tmp1, tmp2);
 9123       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9124       __ br(Assembler::NE, DIFF);
 9125       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9126       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9127       __ br(Assembler::LT, LESS16);
 9128 
 9129       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 9130       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 9131       __ cmp(tmp1, tmp2);
 9132       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 9133       __ br(Assembler::NE, DIFF);
 9134       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 9135       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 9136       __ br(Assembler::GE, LOOP_COMPARE16);
 9137       __ cbz(cnt2, LENGTH_DIFF);
 9138 
 9139     __ bind(LESS16);
  9140       // compare 8 bytes at a time
 9141       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 9142       __ br(Assembler::LE, LESS8);
 9143       __ ldr(tmp1, Address(__ post(str1, 8)));
 9144       __ ldr(tmp2, Address(__ post(str2, 8)));
 9145       __ eor(rscratch2, tmp1, tmp2);
 9146       __ cbnz(rscratch2, CAL_DIFFERENCE);
 9147       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 9148 
 9149     __ bind(LESS8); // directly load last 8 bytes
 9150       if (!isLL) {
 9151         __ add(cnt2, cnt2, cnt2);
 9152       }
 9153       __ ldr(tmp1, Address(str1, cnt2));
 9154       __ ldr(tmp2, Address(str2, cnt2));
 9155       __ eor(rscratch2, tmp1, tmp2);
 9156       __ cbz(rscratch2, LENGTH_DIFF);
 9157       __ b(CAL_DIFFERENCE);
 9158 
 9159     __ bind(DIFF);
 9160       __ cmp(tmp1, tmp2);
 9161       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 9162       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 9163       // reuse rscratch2 register for the result of eor instruction
 9164       __ eor(rscratch2, tmp1, tmp2);
 9165 
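           // Find the first different characters in the longwords and compute their
           // difference: rev + clz yield the bit offset of the lowest differing byte
           // (the strings are little-endian in the registers), and the andr below
           // rounds it down to a character boundary.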
 9166     __ bind(CAL_DIFFERENCE);
 9167       __ rev(rscratch2, rscratch2);
 9168       __ clz(rscratch2, rscratch2);
 9169       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 9170       __ lsrv(tmp1, tmp1, rscratch2);
 9171       __ lsrv(tmp2, tmp2, rscratch2);
 9172       if (isLL) {
 9173         __ uxtbw(tmp1, tmp1);
 9174         __ uxtbw(tmp2, tmp2);
 9175       } else {
 9176         __ uxthw(tmp1, tmp1);
 9177         __ uxthw(tmp2, tmp2);
 9178       }
 9179       __ subw(result, tmp1, tmp2);
 9180 
 9181     __ bind(LENGTH_DIFF);
 9182       __ ret(lr);
 9183     return entry;
 9184   }
 9185 
 9186   enum string_compare_mode {
 9187     LL,
 9188     LU,
 9189     UL,
 9190     UU,
 9191   };
 9192 
 9193   // The following registers are declared in aarch64.ad
 9194   // r0  = result
 9195   // r1  = str1
 9196   // r2  = cnt1
 9197   // r3  = str2
 9198   // r4  = cnt2
 9199   // r10 = tmp1
 9200   // r11 = tmp2
 9201   // z0  = ztmp1
 9202   // z1  = ztmp2
 9203   // p0  = pgtmp1
 9204   // p1  = pgtmp2
 9205   address generate_compare_long_string_sve(string_compare_mode mode) {
 9206     StubId stub_id;
 9207     switch (mode) {
 9208       case LL: stub_id = StubId::stubgen_compare_long_string_LL_id;  break;
 9209       case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
 9210       case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
 9211       case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
 9212       default: ShouldNotReachHere();
 9213     }
 9214 
 9215     __ align(CodeEntryAlignment);
 9216     address entry = __ pc();
 9217     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 9218              tmp1 = r10, tmp2 = r11;
 9219 
 9220     Label LOOP, DONE, MISMATCH;
 9221     Register vec_len = tmp1;
 9222     Register idx = tmp2;
 9223     // The minimum of the string lengths has been stored in cnt2.
 9224     Register cnt = cnt2;
 9225     FloatRegister ztmp1 = z0, ztmp2 = z1;
 9226     PRegister pgtmp1 = p0, pgtmp2 = p1;
 9227 
 9228 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 9229     switch (mode) {                                                            \
 9230       case LL:                                                                 \
 9231         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 9232         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 9233         break;                                                                 \
 9234       case LU:                                                                 \
 9235         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 9236         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9237         break;                                                                 \
 9238       case UL:                                                                 \
 9239         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9240         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 9241         break;                                                                 \
 9242       case UU:                                                                 \
 9243         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 9244         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 9245         break;                                                                 \
 9246       default:                                                                 \
 9247         ShouldNotReachHere();                                                  \
 9248     }
 9249 
 9250     StubCodeMark mark(this, stub_id);
 9251 
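           // The loop is fully predicated: sve_whilelt sets pgtmp1 to cover
           // min(vector length, cnt - idx) elements, so the same LOAD_PAIR + sve_cmp
           // body serves both the full-width iterations and the final partial tail.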
 9252     __ mov(idx, 0);
 9253     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9254 
 9255     if (mode == LL) {
 9256       __ sve_cntb(vec_len);
 9257     } else {
 9258       __ sve_cnth(vec_len);
 9259     }
 9260 
 9261     __ sub(rscratch1, cnt, vec_len);
 9262 
 9263     __ bind(LOOP);
 9264 
 9265       // main loop
 9266       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9267       __ add(idx, idx, vec_len);
 9268       // Compare strings.
 9269       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9270       __ br(__ NE, MISMATCH);
 9271       __ cmp(idx, rscratch1);
 9272       __ br(__ LT, LOOP);
 9273 
 9274     // post loop, last iteration
 9275     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 9276 
 9277     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 9278     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 9279     __ br(__ EQ, DONE);
 9280 
 9281     __ bind(MISMATCH);
 9282 
  9283     // Crop the predicate so that only the lanes before the first mismatch stay active.
 9284     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 9285     // Extract the first different characters of each string.
 9286     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 9287     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 9288 
 9289     // Compute the difference of the first different characters.
 9290     __ sub(result, rscratch1, rscratch2);
 9291 
 9292     __ bind(DONE);
 9293     __ ret(lr);
 9294 #undef LOAD_PAIR
 9295     return entry;
 9296   }
 9297 
 9298   void generate_compare_long_strings() {
 9299     if (UseSVE == 0) {
 9300       StubRoutines::aarch64::_compare_long_string_LL
 9301           = generate_compare_long_string_same_encoding(true);
 9302       StubRoutines::aarch64::_compare_long_string_UU
 9303           = generate_compare_long_string_same_encoding(false);
 9304       StubRoutines::aarch64::_compare_long_string_LU
 9305           = generate_compare_long_string_different_encoding(true);
 9306       StubRoutines::aarch64::_compare_long_string_UL
 9307           = generate_compare_long_string_different_encoding(false);
 9308     } else {
 9309       StubRoutines::aarch64::_compare_long_string_LL
 9310           = generate_compare_long_string_sve(LL);
 9311       StubRoutines::aarch64::_compare_long_string_UU
 9312           = generate_compare_long_string_sve(UU);
 9313       StubRoutines::aarch64::_compare_long_string_LU
 9314           = generate_compare_long_string_sve(LU);
 9315       StubRoutines::aarch64::_compare_long_string_UL
 9316           = generate_compare_long_string_sve(UL);
 9317     }
 9318   }
 9319 
 9320   // R0 = result
 9321   // R1 = str2
 9322   // R2 = cnt1
 9323   // R3 = str1
 9324   // R4 = cnt2
 9325   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 9326   //
  9327   // This generic linear code uses a few additional ideas which make it faster:
  9328   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  9329   // in order to skip the initial loading (helps on systems with 1 load pipeline)
  9330   // 2) we can use a "fast" algorithm for finding a single character, to search for the
  9331   // first symbol with fewer branches (1 branch per loaded register instead of a
  9332   // branch per symbol), which is where constants like
  9333   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  9334   // 3) after loading and analyzing the 1st register of the source string, it can be
  9335   // used to search for every occurrence of the 1st character, saving a few loads
  9336   // compared with a "simpler-but-slower" implementation
  9337   // 4) in order to avoid lots of push/pop operations, the code below heavily
  9338   // re-uses/re-initializes/compresses register values, which makes the code
  9339   // larger and a bit less readable; however, most of the extra operations are
  9340   // issued during loads or branches, so the penalty is minimal
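         // The constants in 2) implement the usual SWAR zero-byte test: after eor'ing each
         // loaded word with the replicated first pattern character, a zero byte/char marks a
         // candidate match, and
         //   (x - 0x0101...01) & ~x & 0x8080...80
         // (computed below via sub, orr with 0x7f7f...7f and bic(s)) is non-zero iff x
         // contains a zero byte; the 16-bit variant uses the 0x0001...0001 / 0x7fff...7fff masks.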
 9341   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 9342     StubId stub_id;
 9343     if (str1_isL) {
 9344       if (str2_isL) {
 9345         stub_id = StubId::stubgen_string_indexof_linear_ll_id;
 9346       } else {
 9347         stub_id = StubId::stubgen_string_indexof_linear_ul_id;
 9348       }
 9349     } else {
 9350       if (str2_isL) {
 9351         ShouldNotReachHere();
 9352       } else {
 9353         stub_id = StubId::stubgen_string_indexof_linear_uu_id;
 9354       }
 9355     }
 9356     __ align(CodeEntryAlignment);
 9357     StubCodeMark mark(this, stub_id);
 9358     address entry = __ pc();
 9359 
 9360     int str1_chr_size = str1_isL ? 1 : 2;
 9361     int str2_chr_size = str2_isL ? 1 : 2;
 9362     int str1_chr_shift = str1_isL ? 0 : 1;
 9363     int str2_chr_shift = str2_isL ? 0 : 1;
 9364     bool isL = str1_isL && str2_isL;
  9365     // parameters
 9366     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 9367     // temporary registers
 9368     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 9369     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 9370     // redefinitions
 9371     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 9372 
 9373     __ push(spilled_regs, sp);
 9374     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 9375         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 9376         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 9377         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 9378         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 9379         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
  9380     // Read a whole register from str1. It is safe because length >= 8 here
  9381     __ ldr(ch1, Address(str1));
  9382     // Read a whole register from str2. It is safe because length >= 8 here
 9383     __ ldr(ch2, Address(str2));
 9384     __ sub(cnt2, cnt2, cnt1);
 9385     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 9386     if (str1_isL != str2_isL) {
 9387       __ eor(v0, __ T16B, v0, v0);
 9388     }
 9389     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 9390     __ mul(first, first, tmp1);
  9391     // check if we have less than one register's worth of characters left to check
 9392     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 9393     if (str1_isL != str2_isL) {
 9394       __ fmovd(v1, ch1);
 9395     }
 9396     __ br(__ LE, L_SMALL);
 9397     __ eor(ch2, first, ch2);
 9398     if (str1_isL != str2_isL) {
 9399       __ zip1(v1, __ T16B, v1, v0);
 9400     }
 9401     __ sub(tmp2, ch2, tmp1);
 9402     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9403     __ bics(tmp2, tmp2, ch2);
 9404     if (str1_isL != str2_isL) {
 9405       __ fmovd(ch1, v1);
 9406     }
 9407     __ br(__ NE, L_HAS_ZERO);
 9408     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9409     __ add(result, result, wordSize/str2_chr_size);
 9410     __ add(str2, str2, wordSize);
 9411     __ br(__ LT, L_POST_LOOP);
 9412     __ BIND(L_LOOP);
 9413       __ ldr(ch2, Address(str2));
 9414       __ eor(ch2, first, ch2);
 9415       __ sub(tmp2, ch2, tmp1);
 9416       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9417       __ bics(tmp2, tmp2, ch2);
 9418       __ br(__ NE, L_HAS_ZERO);
 9419     __ BIND(L_LOOP_PROCEED);
 9420       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 9421       __ add(str2, str2, wordSize);
 9422       __ add(result, result, wordSize/str2_chr_size);
 9423       __ br(__ GE, L_LOOP);
 9424     __ BIND(L_POST_LOOP);
 9425       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 9426       __ br(__ LE, NOMATCH);
 9427       __ ldr(ch2, Address(str2));
 9428       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9429       __ eor(ch2, first, ch2);
 9430       __ sub(tmp2, ch2, tmp1);
 9431       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9432       __ mov(tmp4, -1); // all bits set
 9433       __ b(L_SMALL_PROCEED);
 9434     __ align(OptoLoopAlignment);
 9435     __ BIND(L_SMALL);
 9436       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 9437       __ eor(ch2, first, ch2);
 9438       if (str1_isL != str2_isL) {
 9439         __ zip1(v1, __ T16B, v1, v0);
 9440       }
 9441       __ sub(tmp2, ch2, tmp1);
 9442       __ mov(tmp4, -1); // all bits set
 9443       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 9444       if (str1_isL != str2_isL) {
 9445         __ fmovd(ch1, v1); // move converted 4 symbols
 9446       }
 9447     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
 9449       __ bic(tmp2, tmp2, ch2);
 9450       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 9451       __ rbit(tmp2, tmp2);
 9452       __ br(__ EQ, NOMATCH);
 9453     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
 9455       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 9456       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 9457       if (str2_isL) { // LL
 9458         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9459         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9460         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9461         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9462         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9463       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to a char boundary
 9465         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9466         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9467         __ lslv(tmp2, tmp2, tmp4);
 9468         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9469         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9470         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9471         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9472       }
 9473       __ cmp(ch1, ch2);
 9474       __ mov(tmp4, wordSize/str2_chr_size);
 9475       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9476     __ BIND(L_SMALL_CMP_LOOP);
 9477       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9478                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9479       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9480                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9481       __ add(tmp4, tmp4, 1);
 9482       __ cmp(tmp4, cnt1);
 9483       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 9484       __ cmp(first, ch2);
 9485       __ br(__ EQ, L_SMALL_CMP_LOOP);
 9486     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 9487       __ cbz(tmp2, NOMATCH); // no more matches. exit
 9488       __ clz(tmp4, tmp2);
 9489       __ add(result, result, 1); // advance index
 9490       __ add(str2, str2, str2_chr_size); // advance pointer
 9491       __ b(L_SMALL_HAS_ZERO_LOOP);
 9492     __ align(OptoLoopAlignment);
 9493     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 9494       __ cmp(first, ch2);
 9495       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9496       __ b(DONE);
 9497     __ align(OptoLoopAlignment);
 9498     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 9499       if (str2_isL) { // LL
 9500         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 9501         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 9502         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 9503         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 9504         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9505       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to a char boundary
 9507         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9508         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9509         __ lslv(tmp2, tmp2, tmp4);
 9510         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9511         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9512         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 9513         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9514       }
 9515       __ cmp(ch1, ch2);
 9516       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 9517       __ b(DONE);
 9518     __ align(OptoLoopAlignment);
 9519     __ BIND(L_HAS_ZERO);
 9520       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now pack both counters (cnt2 and cnt1) into one register. This is fine
      // because both counters are 32-bit and are not changed in this loop;
      // they are restored on exit, so cnt1 can be reused inside the loop.
 9525       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 9526       __ sub(result, result, 1);
 9527     __ BIND(L_HAS_ZERO_LOOP);
 9528       __ mov(cnt1, wordSize/str2_chr_size);
 9529       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9530       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 9531       if (str2_isL) {
 9532         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9533         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9534         __ lslv(tmp2, tmp2, tmp4);
 9535         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9536         __ add(tmp4, tmp4, 1);
 9537         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9538         __ lsl(tmp2, tmp2, 1);
 9539         __ mov(tmp4, wordSize/str2_chr_size);
 9540       } else {
 9541         __ mov(ch2, 0xE);
 9542         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9543         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9544         __ lslv(tmp2, tmp2, tmp4);
 9545         __ add(tmp4, tmp4, 1);
 9546         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9547         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9548         __ lsl(tmp2, tmp2, 1);
 9549         __ mov(tmp4, wordSize/str2_chr_size);
 9550         __ sub(str2, str2, str2_chr_size);
 9551       }
 9552       __ cmp(ch1, ch2);
 9553       __ mov(tmp4, wordSize/str2_chr_size);
 9554       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9555     __ BIND(L_CMP_LOOP);
 9556       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 9557                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 9558       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 9559                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 9560       __ add(tmp4, tmp4, 1);
 9561       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 9562       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 9563       __ cmp(cnt1, ch2);
 9564       __ br(__ EQ, L_CMP_LOOP);
 9565     __ BIND(L_CMP_LOOP_NOMATCH);
 9566       // here we're not matched
 9567       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 9568       __ clz(tmp4, tmp2);
 9569       __ add(str2, str2, str2_chr_size); // advance pointer
 9570       __ b(L_HAS_ZERO_LOOP);
 9571     __ align(OptoLoopAlignment);
 9572     __ BIND(L_CMP_LOOP_LAST_CMP);
 9573       __ cmp(cnt1, ch2);
 9574       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9575       __ b(DONE);
 9576     __ align(OptoLoopAlignment);
 9577     __ BIND(L_CMP_LOOP_LAST_CMP2);
 9578       if (str2_isL) {
 9579         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 9580         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9581         __ lslv(tmp2, tmp2, tmp4);
 9582         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9583         __ add(tmp4, tmp4, 1);
 9584         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9585         __ lsl(tmp2, tmp2, 1);
 9586       } else {
 9587         __ mov(ch2, 0xE);
 9588         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 9589         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 9590         __ lslv(tmp2, tmp2, tmp4);
 9591         __ add(tmp4, tmp4, 1);
 9592         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 9593         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 9594         __ lsl(tmp2, tmp2, 1);
 9595         __ sub(str2, str2, str2_chr_size);
 9596       }
 9597       __ cmp(ch1, ch2);
 9598       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 9599       __ b(DONE);
 9600     __ align(OptoLoopAlignment);
 9601     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the corresponding high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 points at the corresponding start address; advance it
      // to the next octet.
 9612       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 9613       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 9614       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 9615       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 9616       __ movw(cnt2, cnt2);
 9617       __ b(L_LOOP_PROCEED);
 9618     __ align(OptoLoopAlignment);
 9619     __ BIND(NOMATCH);
 9620       __ mov(result, -1);
 9621     __ BIND(DONE);
 9622       __ pop(spilled_regs, sp);
 9623       __ ret(lr);
 9624     return entry;
 9625   }
 9626 
 9627   void generate_string_indexof_stubs() {
 9628     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 9629     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 9630     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 9631   }
 9632 
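  // Helper for generate_large_byte_array_inflate below: widens 32 Latin-1
  // bytes (16 in src1 and 16 in src2) to 32 UTF-16 chars by interleaving each
  // byte with a zero byte from v0 (zip1/zip2), optionally issues a software
  // prefetch, and stores the resulting 64 bytes to r1 with a single st1.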
 9633   void inflate_and_store_2_fp_registers(bool generatePrfm,
 9634       FloatRegister src1, FloatRegister src2) {
 9635     Register dst = r1;
 9636     __ zip1(v1, __ T16B, src1, v0);
 9637     __ zip2(v2, __ T16B, src1, v0);
 9638     if (generatePrfm) {
 9639       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 9640     }
 9641     __ zip1(v3, __ T16B, src2, v0);
 9642     __ zip2(v4, __ T16B, src2, v0);
 9643     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 9644   }
 9645 
  // r0 = src
  // r1 = dst
  // r2 = len
  // r3 = len >> 3
  // v0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 9653   address generate_large_byte_array_inflate() {
 9654     __ align(CodeEntryAlignment);
 9655     StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
 9656     StubCodeMark mark(this, stub_id);
 9657     address entry = __ pc();
 9658     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 9659     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 9660     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 9661 
    // Do one extra 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can be used.
 9664     __ ldrd(v2, __ post(src, 8));
 9665     __ sub(octetCounter, octetCounter, 2);
 9666     __ zip1(v1, __ T16B, v1, v0);
 9667     __ zip1(v2, __ T16B, v2, v0);
 9668     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 9669     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9670     __ subs(rscratch1, octetCounter, large_loop_threshold);
 9671     __ br(__ LE, LOOP_START);
 9672     __ b(LOOP_PRFM_START);
 9673     __ bind(LOOP_PRFM);
 9674       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9675     __ bind(LOOP_PRFM_START);
 9676       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 9677       __ sub(octetCounter, octetCounter, 8);
 9678       __ subs(rscratch1, octetCounter, large_loop_threshold);
 9679       inflate_and_store_2_fp_registers(true, v3, v4);
 9680       inflate_and_store_2_fp_registers(true, v5, v6);
 9681       __ br(__ GT, LOOP_PRFM);
 9682       __ cmp(octetCounter, (u1)8);
 9683       __ br(__ LT, DONE);
 9684     __ bind(LOOP);
 9685       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 9686       __ bind(LOOP_START);
 9687       __ sub(octetCounter, octetCounter, 8);
 9688       __ cmp(octetCounter, (u1)8);
 9689       inflate_and_store_2_fp_registers(false, v3, v4);
 9690       inflate_and_store_2_fp_registers(false, v5, v6);
 9691       __ br(__ GE, LOOP);
 9692     __ bind(DONE);
 9693       __ ret(lr);
 9694     return entry;
 9695   }
 9696 
 9697   /**
 9698    *  Arguments:
 9699    *
 9700    *  Input:
 9701    *  c_rarg0   - current state address
 9702    *  c_rarg1   - H key address
 9703    *  c_rarg2   - data address
 9704    *  c_rarg3   - number of blocks
 9705    *
 9706    *  Output:
 9707    *  Updated state at c_rarg0
 9708    */
 9709   address generate_ghash_processBlocks() {
 9710     // Bafflingly, GCM uses little-endian for the byte order, but
 9711     // big-endian for the bit order.  For example, the polynomial 1 is
 9712     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 9713     //
 9714     // So, we must either reverse the bytes in each word and do
 9715     // everything big-endian or reverse the bits in each byte and do
 9716     // it little-endian.  On AArch64 it's more idiomatic to reverse
 9717     // the bits in each byte (we have an instruction, RBIT, to do
 9718     // that) and keep the data in little-endian bit order through the
 9719     // calculation, bit-reversing the inputs and outputs.
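    //
    // GHASH works in GF(2^128) with the reduction polynomial
    // x^128 + x^7 + x^2 + x + 1. Only the low-order terms matter for the
    // reduction step; they are encoded as the constant 0x87
    // (binary 10000111 = x^7 + x^2 + x + 1) emitted twice at the
    // `polynomial` label below and loaded into v24.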
 9720 
 9721     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
 9722     StubCodeMark mark(this, stub_id);
 9723     Label polynomial; // local data generated at end of stub
 9724     __ align(CodeEntryAlignment);
 9725     address start = __ pc();
 9726 
 9727     Register state   = c_rarg0;
 9728     Register subkeyH = c_rarg1;
 9729     Register data    = c_rarg2;
 9730     Register blocks  = c_rarg3;
 9731 
 9732     FloatRegister vzr = v30;
 9733     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 9734 
 9735     __ adr(rscratch1, polynomial);
 9736     __ ldrq(v24, rscratch1);    // The field polynomial
 9737 
 9738     __ ldrq(v0, Address(state));
 9739     __ ldrq(v1, Address(subkeyH));
 9740 
 9741     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 9742     __ rbit(v0, __ T16B, v0);
 9743     __ rev64(v1, __ T16B, v1);
 9744     __ rbit(v1, __ T16B, v1);
 9745 
 9746     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 9747     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 9748 
 9749     {
 9750       Label L_ghash_loop;
 9751       __ bind(L_ghash_loop);
 9752 
 9753       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 9754                                                  // reversing each byte
 9755       __ rbit(v2, __ T16B, v2);
 9756       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 9757 
 9758       // Multiply state in v2 by subkey in v1
 9759       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 9760                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 9761                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 9762       // Reduce v7:v5 by the field polynomial
 9763       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 9764 
 9765       __ sub(blocks, blocks, 1);
 9766       __ cbnz(blocks, L_ghash_loop);
 9767     }
 9768 
 9769     // The bit-reversed result is at this point in v0
 9770     __ rev64(v0, __ T16B, v0);
 9771     __ rbit(v0, __ T16B, v0);
 9772 
 9773     __ st1(v0, __ T16B, state);
 9774     __ ret(lr);
 9775 
 9776     // bind label and generate local polynomial data
 9777     __ align(wordSize * 2);
 9778     __ bind(polynomial);
 9779     __ emit_int64(0x87);  // The low-order bits of the field
 9780                           // polynomial (i.e. p = z^7+z^2+z+1)
 9781                           // repeated in the low and high parts of a
 9782                           // 128-bit vector
 9783     __ emit_int64(0x87);
 9784 
 9785     return start;
 9786   }
 9787 
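  // Wide variant of the GHASH stub: processes `unroll` (4) blocks per
  // iteration. Inputs with fewer than 2 * unroll blocks, as well as any
  // remaining tail blocks after the wide loop, are handed off to the
  // single-block stub generated above (the two branches to `small`).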
 9788   address generate_ghash_processBlocks_wide() {
 9789     address small = generate_ghash_processBlocks();
 9790 
 9791     StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
 9792     StubCodeMark mark(this, stub_id);
 9793     Label polynomial;           // local data generated after stub
 9794     __ align(CodeEntryAlignment);
 9795     address start = __ pc();
 9796 
 9797     Register state   = c_rarg0;
 9798     Register subkeyH = c_rarg1;
 9799     Register data    = c_rarg2;
 9800     Register blocks  = c_rarg3;
 9801 
 9802     const int unroll = 4;
 9803 
 9804     __ cmp(blocks, (unsigned char)(unroll * 2));
 9805     __ br(__ LT, small);
 9806 
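    // Note: the unrolled kernel apparently uses v8..v15, whose low 64 bits
    // are callee-saved under the AAPCS64, hence the save/restore below.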
 9807     if (unroll > 1) {
      // Save state before entering routine
 9809       __ sub(sp, sp, 4 * 16);
 9810       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 9811       __ sub(sp, sp, 4 * 16);
 9812       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 9813     }
 9814 
 9815     __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
 9816 
 9817     if (unroll > 1) {
 9818       // And restore state
 9819       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 9820       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 9821     }
 9822 
 9823     __ cmp(blocks, (unsigned char)0);
 9824     __ br(__ GT, small);
 9825 
 9826     __ ret(lr);
 9827 
 9828     // bind label and generate polynomial data
 9829     __ align(wordSize * 2);
 9830     __ bind(polynomial);
 9831     __ emit_int64(0x87);  // The low-order bits of the field
 9832                           // polynomial (i.e. p = z^7+z^2+z+1)
 9833                           // repeated in the low and high parts of a
 9834                           // 128-bit vector
 9835     __ emit_int64(0x87);
 9836 
 9837     return start;
 9838 
 9839   }
 9840 
 9841   void generate_base64_encode_simdround(Register src, Register dst,
 9842         FloatRegister codec, u8 size) {
 9843 
 9844     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 9845     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 9846     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 9847 
 9848     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9849 
 9850     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 9851 
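    // ld3 has de-interleaved the input, so for every 3-byte group (b0, b1, b2)
    // lane i of in0/in1/in2 holds b0/b1/b2 respectively. The shifts below
    // compute the four 6-bit codec indices:
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0xF) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3F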
 9852     __ ushr(ind0, arrangement, in0,  2);
 9853 
 9854     __ ushr(ind1, arrangement, in1,  2);
 9855     __ shl(in0,   arrangement, in0,  6);
 9856     __ orr(ind1,  arrangement, ind1, in0);
 9857     __ ushr(ind1, arrangement, ind1, 2);
 9858 
 9859     __ ushr(ind2, arrangement, in2,  4);
 9860     __ shl(in1,   arrangement, in1,  4);
 9861     __ orr(ind2,  arrangement, in1,  ind2);
 9862     __ ushr(ind2, arrangement, ind2, 2);
 9863 
 9864     __ shl(ind3,  arrangement, in2,  2);
 9865     __ ushr(ind3, arrangement, ind3, 2);
 9866 
 9867     __ tbl(out0,  arrangement, codec,  4, ind0);
 9868     __ tbl(out1,  arrangement, codec,  4, ind1);
 9869     __ tbl(out2,  arrangement, codec,  4, ind2);
 9870     __ tbl(out3,  arrangement, codec,  4, ind3);
 9871 
 9872     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 9873   }
 9874 
  /**
 9876    *  Arguments:
 9877    *
 9878    *  Input:
 9879    *  c_rarg0   - src_start
 9880    *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive end offset into the source)
 9882    *  c_rarg3   - dest_start
 9883    *  c_rarg4   - dest_offset
 9884    *  c_rarg5   - isURL
 9885    *
 9886    */
 9887   address generate_base64_encodeBlock() {
 9888 
 9889     static const char toBase64[64] = {
 9890       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9891       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9892       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9893       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9894       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 9895     };
 9896 
 9897     static const char toBase64URL[64] = {
 9898       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 9899       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 9900       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 9901       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 9902       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 9903     };
 9904 
 9905     __ align(CodeEntryAlignment);
 9906     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
 9907     StubCodeMark mark(this, stub_id);
 9908     address start = __ pc();
 9909 
 9910     Register src   = c_rarg0;  // source array
 9911     Register soff  = c_rarg1;  // source start offset
 9912     Register send  = c_rarg2;  // source end offset
 9913     Register dst   = c_rarg3;  // dest array
 9914     Register doff  = c_rarg4;  // position for writing to dest array
 9915     Register isURL = c_rarg5;  // Base64 or URL character set
 9916 
 9917     // c_rarg6 and c_rarg7 are free to use as temps
 9918     Register codec  = c_rarg6;
 9919     Register length = c_rarg7;
 9920 
 9921     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 9922 
 9923     __ add(src, src, soff);
 9924     __ add(dst, dst, doff);
 9925     __ sub(length, send, soff);
 9926 
 9927     // load the codec base address
 9928     __ lea(codec, ExternalAddress((address) toBase64));
 9929     __ cbz(isURL, ProcessData);
 9930     __ lea(codec, ExternalAddress((address) toBase64URL));
 9931 
 9932     __ BIND(ProcessData);
 9933 
    // too short to form a SIMD loop; fall back to byte-by-byte processing
 9935     __ cmp(length, (u1)24);
 9936     __ br(Assembler::LT, Process3B);
 9937 
 9938     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 9939 
 9940     __ BIND(Process48B);
 9941     __ cmp(length, (u1)48);
 9942     __ br(Assembler::LT, Process24B);
 9943     generate_base64_encode_simdround(src, dst, v0, 16);
 9944     __ sub(length, length, 48);
 9945     __ b(Process48B);
 9946 
 9947     __ BIND(Process24B);
 9948     __ cmp(length, (u1)24);
 9949     __ br(Assembler::LT, SIMDExit);
 9950     generate_base64_encode_simdround(src, dst, v0, 8);
 9951     __ sub(length, length, 24);
 9952 
 9953     __ BIND(SIMDExit);
 9954     __ cbz(length, Exit);
 9955 
 9956     __ BIND(Process3B);
 9957     //  3 src bytes, 24 bits
 9958     __ ldrb(r10, __ post(src, 1));
 9959     __ ldrb(r11, __ post(src, 1));
 9960     __ ldrb(r12, __ post(src, 1));
 9961     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 9962     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 9963     // codec index
 9964     __ ubfmw(r15, r12, 18, 23);
 9965     __ ubfmw(r14, r12, 12, 17);
 9966     __ ubfmw(r13, r12, 6,  11);
 9967     __ andw(r12,  r12, 63);
 9968     // get the code based on the codec
 9969     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 9970     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 9971     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 9972     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 9973     __ strb(r15, __ post(dst, 1));
 9974     __ strb(r14, __ post(dst, 1));
 9975     __ strb(r13, __ post(dst, 1));
 9976     __ strb(r12, __ post(dst, 1));
 9977     __ sub(length, length, 3);
 9978     __ cbnz(length, Process3B);
 9979 
 9980     __ BIND(Exit);
 9981     __ ret(lr);
 9982 
 9983     return start;
 9984   }
 9985 
 9986   void generate_base64_decode_simdround(Register src, Register dst,
 9987         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 9988 
 9989     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 9990     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 9991 
 9992     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 9993     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 9994 
 9995     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 9996 
 9997     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 9998 
 9999     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10000 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] yields index 0 in the higher-half lookup.
10003     __ uqsubv(decH0, __ T16B, in0, v27);
10004     __ uqsubv(decH1, __ T16B, in1, v27);
10005     __ uqsubv(decH2, __ T16B, in2, v27);
10006     __ uqsubv(decH3, __ T16B, in3, v27);
10007 
10008     // lower half lookup
10009     __ tbl(decL0, arrangement, codecL, 4, in0);
10010     __ tbl(decL1, arrangement, codecL, 4, in1);
10011     __ tbl(decL2, arrangement, codecL, 4, in2);
10012     __ tbl(decL3, arrangement, codecL, 4, in3);
10013 
10014     // higher half lookup
10015     __ tbx(decH0, arrangement, codecH, 4, decH0);
10016     __ tbx(decH1, arrangement, codecH, 4, decH1);
10017     __ tbx(decH2, arrangement, codecH, 4, decH2);
10018     __ tbx(decH3, arrangement, codecH, 4, decH3);
10019 
10020     // combine lower and higher
10021     __ orr(decL0, arrangement, decL0, decH0);
10022     __ orr(decL1, arrangement, decL1, decH1);
10023     __ orr(decL2, arrangement, decL2, decH2);
10024     __ orr(decL3, arrangement, decL3, decH3);
10025 
    // check for illegal inputs: any decoded value larger than 63 (the 6-bit maximum)
10027     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10028     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10029     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10030     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
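    // OR the per-lane error flags together and reduce them to a single scalar
    // in rscratch2; it is non-zero iff any input byte was illegal.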
10031     __ orr(in0, arrangement, decH0, decH1);
10032     __ orr(in1, arrangement, decH2, decH3);
10033     __ orr(in2, arrangement, in0,   in1);
10034     __ umaxv(in3, arrangement, in2);
10035     __ umov(rscratch2, in3, __ B, 0);
10036 
10037     // get the data to output
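    // With d0..d3 = decL0..decL3, the three output bytes per group are:
    //   out0 = (d0 << 2) | (d1 >> 4)
    //   out1 = (d1 << 4) | (d2 >> 2)
    //   out2 = (d2 << 6) | d3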
10038     __ shl(out0,  arrangement, decL0, 2);
10039     __ ushr(out1, arrangement, decL1, 4);
10040     __ orr(out0,  arrangement, out0,  out1);
10041     __ shl(out1,  arrangement, decL1, 4);
10042     __ ushr(out2, arrangement, decL2, 2);
10043     __ orr(out1,  arrangement, out1,  out2);
10044     __ shl(out2,  arrangement, decL2, 6);
10045     __ orr(out2,  arrangement, out2,  decL3);
10046 
10047     __ cbz(rscratch2, NoIllegalData);
10048 
10049     // handle illegal input
10050     __ umov(r10, in2, __ D, 0);
10051     if (size == 16) {
10052       __ cbnz(r10, ErrorInLowerHalf);
10053 
10054       // illegal input is in higher half, store the lower half now.
10055       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10056 
10057       __ umov(r10, in2,  __ D, 1);
10058       __ umov(r11, out0, __ D, 1);
10059       __ umov(r12, out1, __ D, 1);
10060       __ umov(r13, out2, __ D, 1);
10061       __ b(StoreLegalData);
10062 
10063       __ BIND(ErrorInLowerHalf);
10064     }
10065     __ umov(r11, out0, __ D, 0);
10066     __ umov(r12, out1, __ D, 0);
10067     __ umov(r13, out2, __ D, 0);
10068 
10069     __ BIND(StoreLegalData);
10070     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10071     __ strb(r11, __ post(dst, 1));
10072     __ strb(r12, __ post(dst, 1));
10073     __ strb(r13, __ post(dst, 1));
10074     __ lsr(r10, r10, 8);
10075     __ lsr(r11, r11, 8);
10076     __ lsr(r12, r12, 8);
10077     __ lsr(r13, r13, 8);
10078     __ b(StoreLegalData);
10079 
10080     __ BIND(NoIllegalData);
10081     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10082   }
10083 
  /**
10086    *  Arguments:
10087    *
10088    *  Input:
10089    *  c_rarg0   - src_start
10090    *  c_rarg1   - src_offset
   *  c_rarg2   - src_end (exclusive end offset into the source)
10092    *  c_rarg3   - dest_start
10093    *  c_rarg4   - dest_offset
10094    *  c_rarg5   - isURL
10095    *  c_rarg6   - isMIME
10096    *
10097    */
10098   address generate_base64_decodeBlock() {
10099 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2,
    // whereas fromBase(URL)64ForNoSIMD['='] == 255 here.
10107     static const uint8_t fromBase64ForNoSIMD[256] = {
10108       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10109       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10110       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10111        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10112       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10113        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
10114       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10115        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10123       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10124     };
10125 
10126     static const uint8_t fromBase64URLForNoSIMD[256] = {
10127       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10128       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10129       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10130        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10131       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
10132        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
10133       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
10134        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
10135       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10137       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10138       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10139       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10140       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10141       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143     };
10144 
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups, tbl and tbx, and combine their results to obtain the decoded
    // data. The first lookup uses tbl: out-of-range indices produce 0 in the
    // destination. The second lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Inputs in [64, 126] are mapped to entries
    // [65, 127] of the table in the second lookup. The entry at index 64 is 0,
    // so lanes already decoded by the first lookup (inputs <= 63, which map to
    // index 0 of the second table) pick up 0 and are unchanged by the OR.
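    // For example, 'b' (98): the first lookup returns 0 (98 >= 64);
    // uqsub(98, 63) = 35, and entry 64 + 35 = 99 of the table below is 27,
    // the decoded value of 'b'. For 'A' (65): uqsub(65, 63) = 2 and entry 66
    // is 0, so 'A' decodes to 0.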
10152     static const uint8_t fromBase64ForSIMD[128] = {
10153       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10154       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10155       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
10156        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10157         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10158        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10159       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10160        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10161     };
10162 
10163     static const uint8_t fromBase64URLForSIMD[128] = {
10164       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10165       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10166       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
10167        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
10168         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
10169        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
10170        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
10171        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
10172     };
10173 
10174     __ align(CodeEntryAlignment);
10175     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10176     StubCodeMark mark(this, stub_id);
10177     address start = __ pc();
10178 
10179     Register src    = c_rarg0;  // source array
10180     Register soff   = c_rarg1;  // source start offset
10181     Register send   = c_rarg2;  // source end offset
10182     Register dst    = c_rarg3;  // dest array
10183     Register doff   = c_rarg4;  // position for writing to dest array
10184     Register isURL  = c_rarg5;  // Base64 or URL character set
10185     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
10186 
10187     Register length = send;    // reuse send as length of source data to process
10188 
10189     Register simd_codec   = c_rarg6;
10190     Register nosimd_codec = c_rarg7;
10191 
10192     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10193 
10194     __ enter();
10195 
10196     __ add(src, src, soff);
10197     __ add(dst, dst, doff);
10198 
10199     __ mov(doff, dst);
10200 
10201     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the low 2 bits: round length down to a multiple of 4
10203 
10204     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10205     __ cbz(isURL, ProcessData);
10206     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10207 
10208     __ BIND(ProcessData);
10209     __ mov(rscratch1, length);
10210     __ cmp(length, (u1)144); // 144 = 80 + 64
10211     __ br(Assembler::LT, Process4B);
10212 
10213     // In the MIME case, the line length cannot be more than 76
10214     // bytes (see RFC 2045). This is too short a block for SIMD
10215     // to be worthwhile, so we use non-SIMD here.
10216     __ movw(rscratch1, 79);
10217 
10218     __ BIND(Process4B);
10219     __ ldrw(r14, __ post(src, 4));
10220     __ ubfxw(r10, r14, 0,  8);
10221     __ ubfxw(r11, r14, 8,  8);
10222     __ ubfxw(r12, r14, 16, 8);
10223     __ ubfxw(r13, r14, 24, 8);
10224     // get the de-code
10225     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10226     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10227     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10228     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10229     // error detection, 255u indicates an illegal input
10230     __ orrw(r14, r10, r11);
10231     __ orrw(r15, r12, r13);
10232     __ orrw(r14, r14, r15);
10233     __ tbnz(r14, 7, Exit);
10234     // recover the data
10235     __ lslw(r14, r10, 10);
10236     __ bfiw(r14, r11, 4, 6);
10237     __ bfmw(r14, r12, 2, 5);
10238     __ rev16w(r14, r14);
10239     __ bfiw(r13, r12, 6, 2);
10240     __ strh(r14, __ post(dst, 2));
10241     __ strb(r13, __ post(dst, 1));
10242     // non-simd loop
10243     __ subsw(rscratch1, rscratch1, 4);
10244     __ br(Assembler::GT, Process4B);
10245 
    // If we exited the 80-byte pre-processing loop (entered with rscratch1 == 79),
    // rscratch1 == -1; otherwise rscratch1 == 0.
10248     __ cbzw(rscratch1, Exit);
10249     __ sub(length, length, 80);
10250 
10251     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10252     __ cbz(isURL, SIMDEnter);
10253     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10254 
10255     __ BIND(SIMDEnter);
10256     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10257     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10258     __ mov(rscratch1, 63);
10259     __ dup(v27, __ T16B, rscratch1);
10260 
10261     __ BIND(Process64B);
10262     __ cmp(length, (u1)64);
10263     __ br(Assembler::LT, Process32B);
10264     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10265     __ sub(length, length, 64);
10266     __ b(Process64B);
10267 
10268     __ BIND(Process32B);
10269     __ cmp(length, (u1)32);
10270     __ br(Assembler::LT, SIMDExit);
10271     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10272     __ sub(length, length, 32);
10273     __ b(Process32B);
10274 
10275     __ BIND(SIMDExit);
10276     __ cbz(length, Exit);
10277     __ movw(rscratch1, length);
10278     __ b(Process4B);
10279 
10280     __ BIND(Exit);
10281     __ sub(c_rarg0, dst, doff);
10282 
10283     __ leave();
10284     __ ret(lr);
10285 
10286     return start;
10287   }
10288 
10289   // Support for spin waits.
10290   address generate_spin_wait() {
10291     __ align(CodeEntryAlignment);
10292     StubId stub_id = StubId::stubgen_spin_wait_id;
10293     StubCodeMark mark(this, stub_id);
10294     address start = __ pc();
10295 
10296     __ spin_wait();
10297     __ ret(lr);
10298 
10299     return start;
10300   }
10301 
10302   void generate_lookup_secondary_supers_table_stub() {
10303     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10304     StubCodeMark mark(this, stub_id);
10305 
10306     const Register
10307       r_super_klass  = r0,
10308       r_array_base   = r1,
10309       r_array_length = r2,
10310       r_array_index  = r3,
10311       r_sub_klass    = r4,
10312       r_bitmap       = rscratch2,
10313       result         = r5;
10314     const FloatRegister
10315       vtemp          = v0;
10316 
10317     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10318       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10319       Label L_success;
10320       __ enter();
10321       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10322                                              r_array_base, r_array_length, r_array_index,
10323                                              vtemp, result, slot,
10324                                              /*stub_is_near*/true);
10325       __ leave();
10326       __ ret(lr);
10327     }
10328   }
10329 
10330   // Slow path implementation for UseSecondarySupersTable.
10331   address generate_lookup_secondary_supers_table_slow_path_stub() {
10332     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10333     StubCodeMark mark(this, stub_id);
10334 
10335     address start = __ pc();
10336     const Register
10337       r_super_klass  = r0,        // argument
10338       r_array_base   = r1,        // argument
10339       temp1          = r2,        // temp
10340       r_array_index  = r3,        // argument
10341       r_bitmap       = rscratch2, // argument
10342       result         = r5;        // argument
10343 
10344     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10345     __ ret(lr);
10346 
10347     return start;
10348   }
10349 
10350 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10351 
10352   // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10353   //
10354   // If LSE is in use, generate LSE versions of all the stubs. The
10355   // non-LSE versions are in atomic_aarch64.S.
10356 
10357   // class AtomicStubMark records the entry point of a stub and the
10358   // stub pointer which will point to it. The stub pointer is set to
10359   // the entry point when ~AtomicStubMark() is called, which must be
10360   // after ICache::invalidate_range. This ensures safe publication of
10361   // the generated code.
10362   class AtomicStubMark {
10363     address _entry_point;
10364     aarch64_atomic_stub_t *_stub;
10365     MacroAssembler *_masm;
10366   public:
10367     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10368       _masm = masm;
10369       __ align(32);
10370       _entry_point = __ pc();
10371       _stub = stub;
10372     }
10373     ~AtomicStubMark() {
10374       *_stub = (aarch64_atomic_stub_t)_entry_point;
10375     }
10376   };
10377 
10378   // NB: For memory_order_conservative we need a trailing membar after
10379   // LSE atomic operations but not a leading membar.
10380   //
10381   // We don't need a leading membar because a clause in the Arm ARM
10382   // says:
10383   //
10384   //   Barrier-ordered-before
10385   //
10386   //   Barrier instructions order prior Memory effects before subsequent
10387   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10391   //   instruction with both Acquire and Release semantics.
10392   //
10393   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10394   // and Release semantics, therefore we don't need a leading
10395   // barrier. However, there is no corresponding Barrier-ordered-after
10396   // relationship, therefore we need a trailing membar to prevent a
10397   // later store or load from being reordered with the store in an
10398   // atomic instruction.
10399   //
10400   // This was checked by using the herd7 consistency model simulator
10401   // (http://diy.inria.fr/) with this test case:
10402   //
10403   // AArch64 LseCas
10404   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10405   // P0 | P1;
10406   // LDR W4, [X2] | MOV W3, #0;
10407   // DMB LD       | MOV W4, #1;
10408   // LDR W3, [X1] | CASAL W3, W4, [X1];
10409   //              | DMB ISH;
10410   //              | STR W4, [X2];
10411   // exists
10412   // (0:X3=0 /\ 0:X4=1)
10413   //
10414   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10415   // with the store to x in P1. Without the DMB in P1 this may happen.
10416   //
10417   // At the time of writing we don't know of any AArch64 hardware that
10418   // reorders stores in this way, but the Reference Manual permits it.
10419 
10420   void gen_cas_entry(Assembler::operand_size size,
10421                      atomic_memory_order order) {
10422     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10423       exchange_val = c_rarg2;
10424     bool acquire, release;
10425     switch (order) {
10426       case memory_order_relaxed:
10427         acquire = false;
10428         release = false;
10429         break;
10430       case memory_order_release:
10431         acquire = false;
10432         release = true;
10433         break;
10434       default:
10435         acquire = true;
10436         release = true;
10437         break;
10438     }
10439     __ mov(prev, compare_val);
10440     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10441     if (order == memory_order_conservative) {
10442       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10443     }
10444     if (size == Assembler::xword) {
10445       __ mov(r0, prev);
10446     } else {
10447       __ movw(r0, prev);
10448     }
10449     __ ret(lr);
10450   }
10451 
10452   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10453     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10454     // If not relaxed, then default to conservative.  Relaxed is the only
10455     // case we use enough to be worth specializing.
10456     if (order == memory_order_relaxed) {
10457       __ ldadd(size, incr, prev, addr);
10458     } else {
10459       __ ldaddal(size, incr, prev, addr);
10460       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10461     }
10462     if (size == Assembler::xword) {
10463       __ mov(r0, prev);
10464     } else {
10465       __ movw(r0, prev);
10466     }
10467     __ ret(lr);
10468   }
10469 
10470   void gen_swpal_entry(Assembler::operand_size size) {
10471     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10472     __ swpal(size, incr, prev, addr);
10473     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10474     if (size == Assembler::xword) {
10475       __ mov(r0, prev);
10476     } else {
10477       __ movw(r0, prev);
10478     }
10479     __ ret(lr);
10480   }
10481 
10482   void generate_atomic_entry_points() {
10483     if (! UseLSE) {
10484       return;
10485     }
10486     __ align(CodeEntryAlignment);
10487     StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10488     StubCodeMark mark(this, stub_id);
10489     address first_entry = __ pc();
10490 
10491     // ADD, memory_order_conservative
10492     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10493     gen_ldadd_entry(Assembler::word, memory_order_conservative);
10494     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10495     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10496 
10497     // ADD, memory_order_relaxed
10498     AtomicStubMark mark_fetch_add_4_relaxed
10499       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10500     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10501     AtomicStubMark mark_fetch_add_8_relaxed
10502       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10503     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10504 
10505     // XCHG, memory_order_conservative
10506     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10507     gen_swpal_entry(Assembler::word);
10508     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10509     gen_swpal_entry(Assembler::xword);
10510 
10511     // CAS, memory_order_conservative
10512     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10513     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10514     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10515     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10516     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10517     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10518 
10519     // CAS, memory_order_relaxed
10520     AtomicStubMark mark_cmpxchg_1_relaxed
10521       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10522     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10523     AtomicStubMark mark_cmpxchg_4_relaxed
10524       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10525     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10526     AtomicStubMark mark_cmpxchg_8_relaxed
10527       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10528     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10529 
10530     AtomicStubMark mark_cmpxchg_4_release
10531       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10532     gen_cas_entry(MacroAssembler::word, memory_order_release);
10533     AtomicStubMark mark_cmpxchg_8_release
10534       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10535     gen_cas_entry(MacroAssembler::xword, memory_order_release);
10536 
10537     AtomicStubMark mark_cmpxchg_4_seq_cst
10538       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10539     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10540     AtomicStubMark mark_cmpxchg_8_seq_cst
10541       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10542     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10543 
10544     ICache::invalidate_range(first_entry, __ pc() - first_entry);
10545   }
10546 #endif // LINUX
10547 
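  // Preserve a potential method return value across the runtime calls made
  // while thawing a continuation. With InlineTypeReturnedAsFields an inline
  // type may be returned in r0..r7 and v0..v7, so all of those are saved;
  // otherwise only r0 and v0 can hold a return value.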
10548   static void save_return_registers(MacroAssembler* masm) {
10549     if (InlineTypeReturnedAsFields) {
10550       masm->push(RegSet::range(r0, r7), sp);
10551       masm->sub(sp, sp, 4 * wordSize);
10552       masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10553       masm->sub(sp, sp, 4 * wordSize);
10554       masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10555     } else {
10556       masm->fmovd(rscratch1, v0);
10557       masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10558     }
10559   }
10560 
10561   static void restore_return_registers(MacroAssembler* masm) {
10562     if (InlineTypeReturnedAsFields) {
10563       masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10564       masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10565       masm->pop(RegSet::range(r0, r7), sp);
10566     } else {
10567       masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10568       masm->fmovd(v0, rscratch1);
10569     }
10570   }
10571 
10572   address generate_cont_thaw(Continuation::thaw_kind kind) {
10573     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10574     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10575 
10576     address start = __ pc();
10577 
10578     if (return_barrier) {
10579       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10580       __ mov(sp, rscratch1);
10581     }
10582     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10583 
10584     if (return_barrier) {
10585       // preserve possible return value from a method returning to the return barrier
10586       save_return_registers(_masm);
10587     }
10588 
10589     __ movw(c_rarg1, (return_barrier ? 1 : 0));
10590     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10591     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10592 
10593     if (return_barrier) {
10594       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10595       restore_return_registers(_masm);
10596     }
10597     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10598 
10599 
10600     Label thaw_success;
10601     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10602     __ cbnz(rscratch2, thaw_success);
10603     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10604     __ br(rscratch1);
10605     __ bind(thaw_success);
10606 
10607     // make room for the thawed frames
10608     __ sub(rscratch1, sp, rscratch2);
10609     __ andr(rscratch1, rscratch1, -16); // align
10610     __ mov(sp, rscratch1);
10611 
10612     if (return_barrier) {
10613       // save original return value -- again
10614       save_return_registers(_masm);
10615     }
10616 
10617     // If we want, we can templatize thaw by kind, and have three different entries
10618     __ movw(c_rarg1, (uint32_t)kind);
10619 
10620     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10621     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10622 
10623     if (return_barrier) {
10624       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10625       restore_return_registers(_masm);
10626     } else {
10627       __ mov(r0, zr); // return 0 (success) from doYield
10628     }
10629 
    // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10631     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10632     __ mov(rfp, sp);
10633 
10634     if (return_barrier_exception) {
10635       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10636       __ authenticate_return_address(c_rarg1);
10637       __ verify_oop(r0);
10638       // save return value containing the exception oop in callee-saved R19
10639       __ mov(r19, r0);
10640 
10641       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10642 
10643       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10644       // __ reinitialize_ptrue();
10645 
10646       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10647 
10648       __ mov(r1, r0); // the exception handler
10649       __ mov(r0, r19); // restore return value containing the exception oop
10650       __ verify_oop(r0);
10651 
10652       __ leave();
10653       __ mov(r3, lr);
10654       __ br(r1); // the exception handler
10655     } else {
10656       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10657       __ leave();
10658       __ ret(lr);
10659     }
10660 
10661     return start;
10662   }
10663 
10664   address generate_cont_thaw() {
10665     if (!Continuations::enabled()) return nullptr;
10666 
10667     StubId stub_id = StubId::stubgen_cont_thaw_id;
10668     StubCodeMark mark(this, stub_id);
10669     address start = __ pc();
10670     generate_cont_thaw(Continuation::thaw_top);
10671     return start;
10672   }
10673 
10674   address generate_cont_returnBarrier() {
10675     if (!Continuations::enabled()) return nullptr;
10676 
10677     // TODO: will probably need multiple return barriers depending on return type
10678     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10679     StubCodeMark mark(this, stub_id);
10680     address start = __ pc();
10681 
10682     generate_cont_thaw(Continuation::thaw_return_barrier);
10683 
10684     return start;
10685   }
10686 
10687   address generate_cont_returnBarrier_exception() {
10688     if (!Continuations::enabled()) return nullptr;
10689 
10690     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10691     StubCodeMark mark(this, stub_id);
10692     address start = __ pc();
10693 
10694     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10695 
10696     return start;
10697   }
10698 
10699   address generate_cont_preempt_stub() {
10700     if (!Continuations::enabled()) return nullptr;
10701     StubId stub_id = StubId::stubgen_cont_preempt_id;
10702     StubCodeMark mark(this, stub_id);
10703     address start = __ pc();
10704 
10705     __ reset_last_Java_frame(true);
10706 
10707     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10708     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10709     __ mov(sp, rscratch2);
10710 
10711     Label preemption_cancelled;
10712     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10713     __ cbnz(rscratch1, preemption_cancelled);
10714 
10715     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10716     SharedRuntime::continuation_enter_cleanup(_masm);
10717     __ leave();
10718     __ ret(lr);
10719 
10720     // We acquired the monitor after freezing the frames so call thaw to continue execution.
10721     __ bind(preemption_cancelled);
10722     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10723     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10724     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10725     __ ldr(rscratch1, Address(rscratch1));
10726     __ br(rscratch1);
10727 
10728     return start;
10729   }
10730 
10731   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10732   // are represented as long[5], with BITS_PER_LIMB = 26.
10733   // Pack five 26-bit limbs into three 64-bit registers.
10734   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10735     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
10736     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
10737     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10738     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
10739 
10740     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
10741     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
10742     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10743     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
10744 
10745     if (dest2->is_valid()) {
10746       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10747     } else {
10748 #ifdef ASSERT
10749       Label OK;
10750       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
10751       __ br(__ EQ, OK);
10752       __ stop("high bits of Poly1305 integer should be zero");
10753       __ should_not_reach_here();
10754       __ bind(OK);
10755 #endif
10756     }
10757   }
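
        // In C, approximately (illustrative sketch of the packing above):
        //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);
        //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
        //   dest2 = src[4] >> 24;   // at most 2 bits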
10758 
10759   // As above, but return only a 128-bit integer, packed into two
10760   // 64-bit registers.
10761   void pack_26(Register dest0, Register dest1, Register src) {
10762     pack_26(dest0, dest1, noreg, src);
10763   }
10764 
10765   // Multiply and multiply-accumulate unsigned 64-bit registers.
10766   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10767     __ mul(prod_lo, n, m);
10768     __ umulh(prod_hi, n, m);
10769   }
10770   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10771     wide_mul(rscratch1, rscratch2, n, m);
10772     __ adds(sum_lo, sum_lo, rscratch1);
10773     __ adc(sum_hi, sum_hi, rscratch2);
10774   }
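
        // In C, approximately (illustrative): wide_mul computes the full
        // 128-bit product
        //   unsigned __int128 p = (unsigned __int128)n * m;
        //   prod_lo = (julong)p;  prod_hi = (julong)(p >> 64);
        // and wide_madd adds such a product into the 128-bit value sum_hi:sum_lo.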
10775 
10776   // Poly1305, RFC 7539
10777 
10778   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10779   // description of the tricks used to simplify and accelerate this
10780   // computation.
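
        // In outline (illustrative summary, not generated code): for each
        // 16-byte block the accumulator is updated as
        //   U = ((U + block + 2^128) * R) mod (2^130 - 5)
        // where R is the clamped key. The loop below keeps U only partially
        // reduced between blocks; the final reduction and the addition of
        // the key's s-part are assumed to happen in the Java caller.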
10781 
10782   address generate_poly1305_processBlocks() {
10783     __ align(CodeEntryAlignment);
10784     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10785     StubCodeMark mark(this, stub_id);
10786     address start = __ pc();
10787     Label here;
10788     __ enter();
10789     RegSet callee_saved = RegSet::range(r19, r28);
10790     __ push(callee_saved, sp);
10791 
10792     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10793 
10794     // Arguments
10795     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10796 
10797     // R_n is the 128-bit randomly-generated key, packed into two
10798     // registers.  The caller passes this key to us as long[5], with
10799     // BITS_PER_LIMB = 26.
10800     const Register R_0 = *++regs, R_1 = *++regs;
10801     pack_26(R_0, R_1, r_start);
10802 
10803     // RR_n is (R_n >> 2) * 5
10804     const Register RR_0 = *++regs, RR_1 = *++regs;
10805     __ lsr(RR_0, R_0, 2);
10806     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10807     __ lsr(RR_1, R_1, 2);
10808     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10809 
10810     // U_n is the current checksum
10811     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10812     pack_26(U_0, U_1, U_2, acc_start);
10813 
10814     static constexpr int BLOCK_LENGTH = 16;
10815     Label DONE, LOOP;
10816 
10817     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10818     __ br(Assembler::LT, DONE); {
10819       __ bind(LOOP);
10820 
10821       // S_n is to be the sum of U_n and the next block of data
10822       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10823       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10824       __ adds(S_0, U_0, S_0);
10825       __ adcs(S_1, U_1, S_1);
10826       __ adc(S_2, U_2, zr);
10827       __ add(S_2, S_2, 1);
10828 
10829       const Register U_0HI = *++regs, U_1HI = *++regs;
10830 
10831       // NB: this logic depends on some of the special properties of
10832       // Poly1305 keys. In particular, because we know that the top
10833       // four bits of R_0 and R_1 are zero, we can add together
10834       // partial products without any risk of needing to propagate a
10835       // carry out.
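            //
            // In outline (illustrative): with S = S_2:S_1:S_0 and R = R_1:R_0,
            //   U_0HI:U_0 = S_0*R_0 + S_1*RR_1 + S_2*RR_0
            //   U_1HI:U_1 = S_0*R_1 + S_1*R_0  + S_2*RR_1
            //   U_2       = S_2 * (R_0 & 3)
            // where the RR_n terms fold limbs that would overflow 2^130 back
            // into range, using 2^130 == 5 (mod 2^130 - 5).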
10836       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10837       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
10838       __ andr(U_2, R_0, 3);
10839       __ mul(U_2, S_2, U_2);
10840 
10841       // Recycle registers S_0, S_1, S_2
10842       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10843 
10844       // Partial reduction mod 2**130 - 5
10845       __ adds(U_1, U_0HI, U_1);
10846       __ adc(U_2, U_1HI, U_2);
10847       // Sum now in U_2:U_1:U_0.
10848       // Dead: U_0HI, U_1HI.
10849       regs = (regs.remaining() + U_0HI + U_1HI).begin();
10850 
10851       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10852 
10853       // First, U_2:U_1:U_0 += (U_2 >> 2)
10854       __ lsr(rscratch1, U_2, 2);
10855       __ andr(U_2, U_2, (u8)3);
10856       __ adds(U_0, U_0, rscratch1);
10857       __ adcs(U_1, U_1, zr);
10858       __ adc(U_2, U_2, zr);
10859       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10860       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10861       __ adcs(U_1, U_1, zr);
10862       __ adc(U_2, U_2, zr);
10863 
10864       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10865       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10866       __ br(~ Assembler::LT, LOOP);
10867     }
10868 
10869     // Further reduce modulo 2^130 - 5
10870     __ lsr(rscratch1, U_2, 2);
10871     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10872     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10873     __ adcs(U_1, U_1, zr);
10874     __ andr(U_2, U_2, (u1)3);
10875     __ adc(U_2, U_2, zr);
10876 
10877     // Unpack the sum into five 26-bit limbs and write to memory.
10878     __ ubfiz(rscratch1, U_0, 0, 26);
10879     __ ubfx(rscratch2, U_0, 26, 26);
10880     __ stp(rscratch1, rscratch2, Address(acc_start));
10881     __ ubfx(rscratch1, U_0, 52, 12);
10882     __ bfi(rscratch1, U_1, 12, 14);
10883     __ ubfx(rscratch2, U_1, 14, 26);
10884     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10885     __ ubfx(rscratch1, U_1, 40, 24);
10886     __ bfi(rscratch1, U_2, 24, 3);
10887     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10888 
10889     __ bind(DONE);
10890     __ pop(callee_saved, sp);
10891     __ leave();
10892     __ ret(lr);
10893 
10894     return start;
10895   }
10896 
10897   // exception handler for upcall stubs
10898   address generate_upcall_stub_exception_handler() {
10899     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10900     StubCodeMark mark(this, stub_id);
10901     address start = __ pc();
10902 
10903     // Native caller has no idea how to handle exceptions,
10904     // so we just crash here. Up to callee to catch exceptions.
10905     __ verify_oop(r0);
10906     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10907     __ blr(rscratch1);
10908     __ should_not_reach_here();
10909 
10910     return start;
10911   }
10912 
10913   // load Method* target of MethodHandle
10914   // j_rarg0 = jobject receiver
10915   // rmethod = result
10916   address generate_upcall_stub_load_target() {
10917     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10918     StubCodeMark mark(this, stub_id);
10919     address start = __ pc();
10920 
10921     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10922     // Load target method from receiver
10923     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10924     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10925     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10926     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10927                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10928                       noreg, noreg);
10929     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10930 
10931     __ ret(lr);
10932 
10933     return start;
10934   }
10935 
10936 #undef __
10937 #define __ masm->
10938 
10939   class MontgomeryMultiplyGenerator : public MacroAssembler {
10940 
10941     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10942       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10943 
10944     RegSet _toSave;
10945     bool _squaring;
10946 
10947   public:
10948     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10949       : MacroAssembler(as->code()), _squaring(squaring) {
10950 
10951       // Register allocation
10952 
10953       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10954       Pa_base = *regs;       // Argument registers
10955       if (squaring)
10956         Pb_base = Pa_base;
10957       else
10958         Pb_base = *++regs;
10959       Pn_base = *++regs;
10960       Rlen = *++regs;
10961       inv = *++regs;
10962       Pm_base = *++regs;
10963 
10964                           // Working registers:
10965       Ra =  *++regs;        // The current digit of a, b, n, and m.
10966       Rb =  *++regs;
10967       Rm =  *++regs;
10968       Rn =  *++regs;
10969 
10970       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
10971       Pb =  *++regs;
10972       Pm =  *++regs;
10973       Pn =  *++regs;
10974 
10975       t0 =  *++regs;        // Three registers which form a
10976       t1 =  *++regs;        // triple-precision accumulator.
10977       t2 =  *++regs;
10978 
10979       Ri =  *++regs;        // Inner and outer loop indexes.
10980       Rj =  *++regs;
10981 
10982       Rhi_ab = *++regs;     // Product registers: low and high parts
10983       Rlo_ab = *++regs;     // of a*b and m*n.
10984       Rhi_mn = *++regs;
10985       Rlo_mn = *++regs;
10986 
10987       // r19 and up are callee-saved.
10988       _toSave = RegSet::range(r19, *regs) + Pm_base;
10989     }
10990 
10991   private:
10992     void save_regs() {
10993       push(_toSave, sp);
10994     }
10995 
10996     void restore_regs() {
10997       pop(_toSave, sp);
10998     }
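
          // Both unroll_2 overloads below emit exactly `count` invocations of
          // `block`, two calls per loop iteration; an initial odd call
          // handles odd counts.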
10999 
11000     template <typename T>
11001     void unroll_2(Register count, T block) {
11002       Label loop, end, odd;
11003       tbnz(count, 0, odd);
11004       cbz(count, end);
11005       align(16);
11006       bind(loop);
11007       (this->*block)();
11008       bind(odd);
11009       (this->*block)();
11010       subs(count, count, 2);
11011       br(Assembler::GT, loop);
11012       bind(end);
11013     }
11014 
11015     template <typename T>
11016     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11017       Label loop, end, odd;
11018       tbnz(count, 0, odd);
11019       cbz(count, end);
11020       align(16);
11021       bind(loop);
11022       (this->*block)(d, s, tmp);
11023       bind(odd);
11024       (this->*block)(d, s, tmp);
11025       subs(count, count, 2);
11026       br(Assembler::GT, loop);
11027       bind(end);
11028     }
11029 
11030     void pre1(RegisterOrConstant i) {
11031       block_comment("pre1");
11032       // Pa = Pa_base;
11033       // Pb = Pb_base + i;
11034       // Pm = Pm_base;
11035       // Pn = Pn_base + i;
11036       // Ra = *Pa;
11037       // Rb = *Pb;
11038       // Rm = *Pm;
11039       // Rn = *Pn;
11040       ldr(Ra, Address(Pa_base));
11041       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11042       ldr(Rm, Address(Pm_base));
11043       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11044       lea(Pa, Address(Pa_base));
11045       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11046       lea(Pm, Address(Pm_base));
11047       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11048 
11049       // Zero the m*n result.
11050       mov(Rhi_mn, zr);
11051       mov(Rlo_mn, zr);
11052     }
11053 
11054     // The core multiply-accumulate step of a Montgomery
11055     // multiplication.  The idea is to schedule operations as a
11056     // pipeline so that instructions with long latencies (loads and
11057     // multiplies) have time to complete before their results are
11058     // used.  This most benefits in-order implementations of the
11059     // architecture but out-of-order ones also benefit.
11060     void step() {
11061       block_comment("step");
11062       // MACC(Ra, Rb, t0, t1, t2);
11063       // Ra = *++Pa;
11064       // Rb = *--Pb;
11065       umulh(Rhi_ab, Ra, Rb);
11066       mul(Rlo_ab, Ra, Rb);
11067       ldr(Ra, pre(Pa, wordSize));
11068       ldr(Rb, pre(Pb, -wordSize));
11069       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11070                                        // previous iteration.
11071       // MACC(Rm, Rn, t0, t1, t2);
11072       // Rm = *++Pm;
11073       // Rn = *--Pn;
11074       umulh(Rhi_mn, Rm, Rn);
11075       mul(Rlo_mn, Rm, Rn);
11076       ldr(Rm, pre(Pm, wordSize));
11077       ldr(Rn, pre(Pn, -wordSize));
11078       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11079     }
11080 
11081     void post1() {
11082       block_comment("post1");
11083 
11084       // MACC(Ra, Rb, t0, t1, t2);
11085       // Ra = *++Pa;
11086       // Rb = *--Pb;
11087       umulh(Rhi_ab, Ra, Rb);
11088       mul(Rlo_ab, Ra, Rb);
11089       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11090       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11091 
11092       // *Pm = Rm = t0 * inv;
11093       mul(Rm, t0, inv);
11094       str(Rm, Address(Pm));
11095 
11096       // MACC(Rm, Rn, t0, t1, t2);
11097       // t0 = t1; t1 = t2; t2 = 0;
11098       umulh(Rhi_mn, Rm, Rn);
11099 
11100 #ifndef PRODUCT
11101       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11102       {
11103         mul(Rlo_mn, Rm, Rn);
11104         add(Rlo_mn, t0, Rlo_mn);
11105         Label ok;
11106         cbz(Rlo_mn, ok); {
11107           stop("broken Montgomery multiply");
11108         } bind(ok);
11109       }
11110 #endif
11111       // We have very carefully set things up so that
11112       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11113       // the lower half of Rm * Rn because we know the result already:
11114       // it must be -t0.  t0 + (-t0) must generate a carry iff
11115       // t0 != 0.  So, rather than do a mul and an adds we just set
11116       // the carry flag iff t0 is nonzero.
11117       //
11118       // mul(Rlo_mn, Rm, Rn);
11119       // adds(zr, t0, Rlo_mn);
11120       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11121       adcs(t0, t1, Rhi_mn);
11122       adc(t1, t2, zr);
11123       mov(t2, zr);
11124     }
11125 
11126     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11127       block_comment("pre2");
11128       // Pa = Pa_base + i-len;
11129       // Pb = Pb_base + len;
11130       // Pm = Pm_base + i-len;
11131       // Pn = Pn_base + len;
11132 
11133       if (i.is_register()) {
11134         sub(Rj, i.as_register(), len);
11135       } else {
11136         mov(Rj, i.as_constant());
11137         sub(Rj, Rj, len);
11138       }
11139       // Rj == i-len
11140 
11141       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11142       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11143       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11144       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11145 
11146       // Ra = *++Pa;
11147       // Rb = *--Pb;
11148       // Rm = *++Pm;
11149       // Rn = *--Pn;
11150       ldr(Ra, pre(Pa, wordSize));
11151       ldr(Rb, pre(Pb, -wordSize));
11152       ldr(Rm, pre(Pm, wordSize));
11153       ldr(Rn, pre(Pn, -wordSize));
11154 
11155       mov(Rhi_mn, zr);
11156       mov(Rlo_mn, zr);
11157     }
11158 
11159     void post2(RegisterOrConstant i, RegisterOrConstant len) {
11160       block_comment("post2");
11161       if (i.is_constant()) {
11162         mov(Rj, i.as_constant()-len.as_constant());
11163       } else {
11164         sub(Rj, i.as_register(), len);
11165       }
11166 
11167       adds(t0, t0, Rlo_mn); // The pending m*n, low part
11168 
11169       // As soon as we know the least significant digit of our result,
11170       // store it.
11171       // Pm_base[i-len] = t0;
11172       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11173 
11174       // t0 = t1; t1 = t2; t2 = 0;
11175       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11176       adc(t1, t2, zr);
11177       mov(t2, zr);
11178     }
11179 
11180     // A carry in t0 after Montgomery multiplication means that we
11181     // should subtract multiples of n from our result in m.  We'll
11182     // keep doing that until there is no carry.
11183     void normalize(RegisterOrConstant len) {
11184       block_comment("normalize");
11185       // while (t0)
11186       //   t0 = sub(Pm_base, Pn_base, t0, len);
11187       Label loop, post, again;
11188       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11189       cbz(t0, post); {
11190         bind(again); {
11191           mov(i, zr);
11192           mov(cnt, len);
11193           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11194           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11195           subs(zr, zr, zr); // set carry flag, i.e. no borrow
11196           align(16);
11197           bind(loop); {
11198             sbcs(Rm, Rm, Rn);
11199             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11200             add(i, i, 1);
11201             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11202             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11203             sub(cnt, cnt, 1);
11204           } cbnz(cnt, loop);
11205           sbc(t0, t0, zr);
11206         } cbnz(t0, again);
11207       } bind(post);
11208     }
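
          // In C, approximately (illustrative sketch of the sub() referred
          // to above; not generated code):
          //   julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
          //     julong borrow = 0;
          //     for (int i = 0; i < len; i++) {
          //       unsigned __int128 d = (unsigned __int128)Pm_base[i] - Pn_base[i] - borrow;
          //       Pm_base[i] = (julong)d;
          //       borrow = (julong)(d >> 64) & 1;  // 1 iff the subtraction borrowed
          //     }
          //     return t0 - borrow;
          //   }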
11209 
11210     // Move memory at s to d, reversing words.
11211     //    Increments d to end of copied memory
11212     //    Destroys tmp1, tmp2
11213     //    Preserves len
11214     //    Leaves s pointing to the address which was in d at start
11215     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11216       assert(tmp1->encoding() < r19->encoding(), "register corruption");
11217       assert(tmp2->encoding() < r19->encoding(), "register corruption");
11218 
11219       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11220       mov(tmp1, len);
11221       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11222       sub(s, d, len, ext::uxtw, LogBytesPerWord);
11223     }
11224     // where
11225     void reverse1(Register d, Register s, Register tmp) {
11226       ldr(tmp, pre(s, -wordSize));
11227       ror(tmp, tmp, 32);
11228       str(tmp, post(d, wordSize));
11229     }
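
          // In C, approximately (illustrative; rotate_right_64 is just
          // notation for swapping the 32-bit halves): with s0 the original s,
          //   for (int i = 0; i < len; i++)
          //     d[i] = rotate_right_64(s0[len - 1 - i], 32);
          // afterwards d points just past the copied words and s points at
          // the original d.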
11230 
11231     void step_squaring() {
11232       // An extra ACC
11233       step();
11234       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11235     }
11236 
11237     void last_squaring(RegisterOrConstant i) {
11238       Label dont;
11239       // if ((i & 1) == 0) {
11240       tbnz(i.as_register(), 0, dont); {
11241         // MACC(Ra, Rb, t0, t1, t2);
11242         // Ra = *++Pa;
11243         // Rb = *--Pb;
11244         umulh(Rhi_ab, Ra, Rb);
11245         mul(Rlo_ab, Ra, Rb);
11246         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11247       } bind(dont);
11248     }
11249 
11250     void extra_step_squaring() {
11251       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11252 
11253       // MACC(Rm, Rn, t0, t1, t2);
11254       // Rm = *++Pm;
11255       // Rn = *--Pn;
11256       umulh(Rhi_mn, Rm, Rn);
11257       mul(Rlo_mn, Rm, Rn);
11258       ldr(Rm, pre(Pm, wordSize));
11259       ldr(Rn, pre(Pn, -wordSize));
11260     }
11261 
11262     void post1_squaring() {
11263       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
11264 
11265       // *Pm = Rm = t0 * inv;
11266       mul(Rm, t0, inv);
11267       str(Rm, Address(Pm));
11268 
11269       // MACC(Rm, Rn, t0, t1, t2);
11270       // t0 = t1; t1 = t2; t2 = 0;
11271       umulh(Rhi_mn, Rm, Rn);
11272 
11273 #ifndef PRODUCT
11274       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11275       {
11276         mul(Rlo_mn, Rm, Rn);
11277         add(Rlo_mn, t0, Rlo_mn);
11278         Label ok;
11279         cbz(Rlo_mn, ok); {
11280           stop("broken Montgomery multiply");
11281         } bind(ok);
11282       }
11283 #endif
11284       // We have very carefully set things up so that
11285       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11286       // the lower half of Rm * Rn because we know the result already:
11287       // it must be -t0.  t0 + (-t0) must generate a carry iff
11288       // t0 != 0.  So, rather than do a mul and an adds we just set
11289       // the carry flag iff t0 is nonzero.
11290       //
11291       // mul(Rlo_mn, Rm, Rn);
11292       // adds(zr, t0, Rlo_mn);
11293       subs(zr, t0, 1); // Set carry iff t0 is nonzero
11294       adcs(t0, t1, Rhi_mn);
11295       adc(t1, t2, zr);
11296       mov(t2, zr);
11297     }
11298 
11299     void acc(Register Rhi, Register Rlo,
11300              Register t0, Register t1, Register t2) {
11301       adds(t0, t0, Rlo);
11302       adcs(t1, t1, Rhi);
11303       adc(t2, t2, zr);
11304     }
11305 
11306   public:
11307     /**
11308      * Fast Montgomery multiplication.  The derivation of the
11309      * algorithm is in A Cryptographic Library for the Motorola
11310      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11311      *
11312      * Arguments:
11313      *
11314      * Inputs for multiplication:
11315      *   c_rarg0   - int array elements a
11316      *   c_rarg1   - int array elements b
11317      *   c_rarg2   - int array elements n (the modulus)
11318      *   c_rarg3   - int length
11319      *   c_rarg4   - int inv
11320      *   c_rarg5   - int array elements m (the result)
11321      *
11322      * Inputs for squaring:
11323      *   c_rarg0   - int array elements a
11324      *   c_rarg1   - int array elements n (the modulus)
11325      *   c_rarg2   - int length
11326      *   c_rarg3   - int inv
11327      *   c_rarg4   - int array elements m (the result)
11328      *
11329      */
11330     address generate_multiply() {
11331       Label argh, nothing;
11332       bind(argh);
11333       stop("MontgomeryMultiply total_allocation must be <= 8192");
11334 
11335       align(CodeEntryAlignment);
11336       address entry = pc();
11337 
11338       cbzw(Rlen, nothing);
11339 
11340       enter();
11341 
11342       // Make room.
11343       cmpw(Rlen, 512);
11344       br(Assembler::HI, argh);
11345       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11346       andr(sp, Ra, -2 * wordSize);
11347 
11348       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11349 
11350       {
11351         // Copy input args, reversing as we go.  We use Ra as a
11352         // temporary variable.
11353         reverse(Ra, Pa_base, Rlen, t0, t1);
11354         if (!_squaring)
11355           reverse(Ra, Pb_base, Rlen, t0, t1);
11356         reverse(Ra, Pn_base, Rlen, t0, t1);
11357       }
11358 
11359       // Push all call-saved registers and also Pm_base which we'll need
11360       // at the end.
11361       save_regs();
11362 
11363 #ifndef PRODUCT
11364       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11365       {
11366         ldr(Rn, Address(Pn_base, 0));
11367         mul(Rlo_mn, Rn, inv);
11368         subs(zr, Rlo_mn, -1);
11369         Label ok;
11370         br(EQ, ok); {
11371           stop("broken inverse in Montgomery multiply");
11372         } bind(ok);
11373       }
11374 #endif
11375 
11376       mov(Pm_base, Ra);
11377 
11378       mov(t0, zr);
11379       mov(t1, zr);
11380       mov(t2, zr);
11381 
11382       block_comment("for (int i = 0; i < len; i++) {");
11383       mov(Ri, zr); {
11384         Label loop, end;
11385         cmpw(Ri, Rlen);
11386         br(Assembler::GE, end);
11387 
11388         bind(loop);
11389         pre1(Ri);
11390 
11391         block_comment("  for (j = i; j; j--) {"); {
11392           movw(Rj, Ri);
11393           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11394         } block_comment("  } // j");
11395 
11396         post1();
11397         addw(Ri, Ri, 1);
11398         cmpw(Ri, Rlen);
11399         br(Assembler::LT, loop);
11400         bind(end);
11401         block_comment("} // i");
11402       }
11403 
11404       block_comment("for (int i = len; i < 2*len; i++) {");
11405       mov(Ri, Rlen); {
11406         Label loop, end;
11407         cmpw(Ri, Rlen, Assembler::LSL, 1);
11408         br(Assembler::GE, end);
11409 
11410         bind(loop);
11411         pre2(Ri, Rlen);
11412 
11413         block_comment("  for (j = len*2-i-1; j; j--) {"); {
11414           lslw(Rj, Rlen, 1);
11415           subw(Rj, Rj, Ri);
11416           subw(Rj, Rj, 1);
11417           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11418         } block_comment("  } // j");
11419 
11420         post2(Ri, Rlen);
11421         addw(Ri, Ri, 1);
11422         cmpw(Ri, Rlen, Assembler::LSL, 1);
11423         br(Assembler::LT, loop);
11424         bind(end);
11425       }
11426       block_comment("} // i");
11427 
11428       normalize(Rlen);
11429 
11430       mov(Ra, Pm_base);  // Save Pm_base in Ra
11431       restore_regs();  // Restore caller's Pm_base
11432 
11433       // Copy our result into caller's Pm_base
11434       reverse(Pm_base, Ra, Rlen, t0, t1);
11435 
11436       leave();
11437       bind(nothing);
11438       ret(lr);
11439 
11440       return entry;
11441     }
11442     // In C, approximately:
11443 
11444     // void
11445     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11446     //                     julong Pn_base[], julong Pm_base[],
11447     //                     julong inv, int len) {
11448     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11449     //   julong *Pa, *Pb, *Pn, *Pm;
11450     //   julong Ra, Rb, Rn, Rm;
11451 
11452     //   int i;
11453 
11454     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11455 
11456     //   for (i = 0; i < len; i++) {
11457     //     int j;
11458 
11459     //     Pa = Pa_base;
11460     //     Pb = Pb_base + i;
11461     //     Pm = Pm_base;
11462     //     Pn = Pn_base + i;
11463 
11464     //     Ra = *Pa;
11465     //     Rb = *Pb;
11466     //     Rm = *Pm;
11467     //     Rn = *Pn;
11468 
11469     //     int iters = i;
11470     //     for (j = 0; iters--; j++) {
11471     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11472     //       MACC(Ra, Rb, t0, t1, t2);
11473     //       Ra = *++Pa;
11474     //       Rb = *--Pb;
11475     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11476     //       MACC(Rm, Rn, t0, t1, t2);
11477     //       Rm = *++Pm;
11478     //       Rn = *--Pn;
11479     //     }
11480 
11481     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11482     //     MACC(Ra, Rb, t0, t1, t2);
11483     //     *Pm = Rm = t0 * inv;
11484     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11485     //     MACC(Rm, Rn, t0, t1, t2);
11486 
11487     //     assert(t0 == 0, "broken Montgomery multiply");
11488 
11489     //     t0 = t1; t1 = t2; t2 = 0;
11490     //   }
11491 
11492     //   for (i = len; i < 2*len; i++) {
11493     //     int j;
11494 
11495     //     Pa = Pa_base + i-len;
11496     //     Pb = Pb_base + len;
11497     //     Pm = Pm_base + i-len;
11498     //     Pn = Pn_base + len;
11499 
11500     //     Ra = *++Pa;
11501     //     Rb = *--Pb;
11502     //     Rm = *++Pm;
11503     //     Rn = *--Pn;
11504 
11505     //     int iters = len*2-i-1;
11506     //     for (j = i-len+1; iters--; j++) {
11507     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11508     //       MACC(Ra, Rb, t0, t1, t2);
11509     //       Ra = *++Pa;
11510     //       Rb = *--Pb;
11511     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11512     //       MACC(Rm, Rn, t0, t1, t2);
11513     //       Rm = *++Pm;
11514     //       Rn = *--Pn;
11515     //     }
11516 
11517     //     Pm_base[i-len] = t0;
11518     //     t0 = t1; t1 = t2; t2 = 0;
11519     //   }
11520 
11521     //   while (t0)
11522     //     t0 = sub(Pm_base, Pn_base, t0, len);
11523     // }
11524 
11525     /**
11526      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
11527      * multiplies than Montgomery multiplication so it should be up to
11528      * 25% faster.  However, its loop control is more complex and it
11529      * may actually run slower on some machines.
11530      *
11531      * Arguments:
11532      *
11533      * Inputs:
11534      *   c_rarg0   - int array elements a
11535      *   c_rarg1   - int array elements n (the modulus)
11536      *   c_rarg2   - int length
11537      *   c_rarg3   - int inv
11538      *   c_rarg4   - int array elements m (the result)
11539      *
11540      */
11541     address generate_square() {
11542       Label argh;
11543       bind(argh);
11544       stop("MontgomeryMultiply total_allocation must be <= 8192");
11545 
11546       align(CodeEntryAlignment);
11547       address entry = pc();
11548 
11549       enter();
11550 
11551       // Make room.
11552       cmpw(Rlen, 512);
11553       br(Assembler::HI, argh);
11554       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11555       andr(sp, Ra, -2 * wordSize);
11556 
11557       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
11558 
11559       {
11560         // Copy input args, reversing as we go.  We use Ra as a
11561         // temporary variable.
11562         reverse(Ra, Pa_base, Rlen, t0, t1);
11563         reverse(Ra, Pn_base, Rlen, t0, t1);
11564       }
11565 
11566       // Push all call-saved registers and also Pm_base which we'll need
11567       // at the end.
11568       save_regs();
11569 
11570       mov(Pm_base, Ra);
11571 
11572       mov(t0, zr);
11573       mov(t1, zr);
11574       mov(t2, zr);
11575 
11576       block_comment("for (int i = 0; i < len; i++) {");
11577       mov(Ri, zr); {
11578         Label loop, end;
11579         bind(loop);
11580         cmp(Ri, Rlen);
11581         br(Assembler::GE, end);
11582 
11583         pre1(Ri);
11584 
11585         block_comment("for (j = (i+1)/2; j; j--) {"); {
11586           add(Rj, Ri, 1);
11587           lsr(Rj, Rj, 1);
11588           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11589         } block_comment("  } // j");
11590 
11591         last_squaring(Ri);
11592 
11593         block_comment("  for (j = i/2; j; j--) {"); {
11594           lsr(Rj, Ri, 1);
11595           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11596         } block_comment("  } // j");
11597 
11598         post1_squaring();
11599         add(Ri, Ri, 1);
11600         cmp(Ri, Rlen);
11601         br(Assembler::LT, loop);
11602 
11603         bind(end);
11604         block_comment("} // i");
11605       }
11606 
11607       block_comment("for (int i = len; i < 2*len; i++) {");
11608       mov(Ri, Rlen); {
11609         Label loop, end;
11610         bind(loop);
11611         cmp(Ri, Rlen, Assembler::LSL, 1);
11612         br(Assembler::GE, end);
11613 
11614         pre2(Ri, Rlen);
11615 
11616         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
11617           lsl(Rj, Rlen, 1);
11618           sub(Rj, Rj, Ri);
11619           sub(Rj, Rj, 1);
11620           lsr(Rj, Rj, 1);
11621           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11622         } block_comment("  } // j");
11623 
11624         last_squaring(Ri);
11625 
11626         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
11627           lsl(Rj, Rlen, 1);
11628           sub(Rj, Rj, Ri);
11629           lsr(Rj, Rj, 1);
11630           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11631         } block_comment("  } // j");
11632 
11633         post2(Ri, Rlen);
11634         add(Ri, Ri, 1);
11635         cmp(Ri, Rlen, Assembler::LSL, 1);
11636 
11637         br(Assembler::LT, loop);
11638         bind(end);
11639         block_comment("} // i");
11640       }
11641 
11642       normalize(Rlen);
11643 
11644       mov(Ra, Pm_base);  // Save Pm_base in Ra
11645       restore_regs();  // Restore caller's Pm_base
11646 
11647       // Copy our result into caller's Pm_base
11648       reverse(Pm_base, Ra, Rlen, t0, t1);
11649 
11650       leave();
11651       ret(lr);
11652 
11653       return entry;
11654     }
11655     // In C, approximately:
11656 
11657     // void
11658     // montgomery_square(julong Pa_base[], julong Pn_base[],
11659     //                   julong Pm_base[], julong inv, int len) {
11660     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11661     //   julong *Pa, *Pb, *Pn, *Pm;
11662     //   julong Ra, Rb, Rn, Rm;
11663 
11664     //   int i;
11665 
11666     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11667 
11668     //   for (i = 0; i < len; i++) {
11669     //     int j;
11670 
11671     //     Pa = Pa_base;
11672     //     Pb = Pa_base + i;
11673     //     Pm = Pm_base;
11674     //     Pn = Pn_base + i;
11675 
11676     //     Ra = *Pa;
11677     //     Rb = *Pb;
11678     //     Rm = *Pm;
11679     //     Rn = *Pn;
11680 
11681     //     int iters = (i+1)/2;
11682     //     for (j = 0; iters--; j++) {
11683     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11684     //       MACC2(Ra, Rb, t0, t1, t2);
11685     //       Ra = *++Pa;
11686     //       Rb = *--Pb;
11687     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11688     //       MACC(Rm, Rn, t0, t1, t2);
11689     //       Rm = *++Pm;
11690     //       Rn = *--Pn;
11691     //     }
11692     //     if ((i & 1) == 0) {
11693     //       assert(Ra == Pa_base[j], "must be");
11694     //       MACC(Ra, Ra, t0, t1, t2);
11695     //     }
11696     //     iters = i/2;
11697     //     assert(iters == i-j, "must be");
11698     //     for (; iters--; j++) {
11699     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11700     //       MACC(Rm, Rn, t0, t1, t2);
11701     //       Rm = *++Pm;
11702     //       Rn = *--Pn;
11703     //     }
11704 
11705     //     *Pm = Rm = t0 * inv;
11706     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11707     //     MACC(Rm, Rn, t0, t1, t2);
11708 
11709     //     assert(t0 == 0, "broken Montgomery multiply");
11710 
11711     //     t0 = t1; t1 = t2; t2 = 0;
11712     //   }
11713 
11714     //   for (i = len; i < 2*len; i++) {
11715     //     int start = i-len+1;
11716     //     int end = start + (len - start)/2;
11717     //     int j;
11718 
11719     //     Pa = Pa_base + i-len;
11720     //     Pb = Pa_base + len;
11721     //     Pm = Pm_base + i-len;
11722     //     Pn = Pn_base + len;
11723 
11724     //     Ra = *++Pa;
11725     //     Rb = *--Pb;
11726     //     Rm = *++Pm;
11727     //     Rn = *--Pn;
11728 
11729     //     int iters = (2*len-i-1)/2;
11730     //     assert(iters == end-start, "must be");
11731     //     for (j = start; iters--; j++) {
11732     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11733     //       MACC2(Ra, Rb, t0, t1, t2);
11734     //       Ra = *++Pa;
11735     //       Rb = *--Pb;
11736     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11737     //       MACC(Rm, Rn, t0, t1, t2);
11738     //       Rm = *++Pm;
11739     //       Rn = *--Pn;
11740     //     }
11741     //     if ((i & 1) == 0) {
11742     //       assert(Ra == Pa_base[j], "must be");
11743     //       MACC(Ra, Ra, t0, t1, t2);
11744     //     }
11745     //     iters =  (2*len-i)/2;
11746     //     assert(iters == len-j, "must be");
11747     //     for (; iters--; j++) {
11748     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11749     //       MACC(Rm, Rn, t0, t1, t2);
11750     //       Rm = *++Pm;
11751     //       Rn = *--Pn;
11752     //     }
11753     //     Pm_base[i-len] = t0;
11754     //     t0 = t1; t1 = t2; t2 = 0;
11755     //   }
11756 
11757     //   while (t0)
11758     //     t0 = sub(Pm_base, Pn_base, t0, len);
11759     // }
11760   };
11761 
11762   // Called from the interpreter or compiled code either to load the
11763   // multiple returned values of the inline type instance being
11764   // returned into registers, or to store the returned values into a
11765   // newly allocated inline type instance.
11766   address generate_return_value_stub(address destination, const char* name, bool has_res) {
11767     // We need to save all registers the calling convention may use so
11768     // that the runtime call can read or update those registers. This
11769     // needs to be in sync with SharedRuntime::java_return_convention().
11770     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
11771     enum layout {
11772       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
11773       j_rarg6_off, j_rarg6_2,
11774       j_rarg5_off, j_rarg5_2,
11775       j_rarg4_off, j_rarg4_2,
11776       j_rarg3_off, j_rarg3_2,
11777       j_rarg2_off, j_rarg2_2,
11778       j_rarg1_off, j_rarg1_2,
11779       j_rarg0_off, j_rarg0_2,
11780 
11781       j_farg7_off, j_farg7_2,
11782       j_farg6_off, j_farg6_2,
11783       j_farg5_off, j_farg5_2,
11784       j_farg4_off, j_farg4_2,
11785       j_farg3_off, j_farg3_2,
11786       j_farg2_off, j_farg2_2,
11787       j_farg1_off, j_farg1_2,
11788       j_farg0_off, j_farg0_2,
11789 
11790       rfp_off, rfp_off2,
11791       return_off, return_off2,
11792 
11793       framesize // inclusive of return address
11794     };
11795 
11796     CodeBuffer code(name, 512, 64);
11797     MacroAssembler* masm = new MacroAssembler(&code);
11798 
11799     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11800     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11801     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11802     int frame_size_in_words = frame_size_in_bytes / wordSize;
11803 
11804     OopMapSet* oop_maps = new OopMapSet();
11805     OopMap* map = new OopMap(frame_size_in_slots, 0);
11806 
11807     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11808     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11809     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11810     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11811     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11812     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11813     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11814     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11815 
11816     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11817     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11818     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11819     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11820     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11821     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11822     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11823     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11824 
11825     address start = __ pc();
11826 
11827     __ enter(); // Save FP and LR before call
11828 
11829     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11830     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11831     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11832     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11833 
11834     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11835     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11836     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11837     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11838 
11839     int frame_complete = __ offset();
11840 
11841     // Set up last_Java_sp and last_Java_fp
11842     address the_pc = __ pc();
11843     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11844 
11845     // Call runtime
11846     __ mov(c_rarg1, r0);
11847     __ mov(c_rarg0, rthread);
11848 
11849     __ mov(rscratch1, destination);
11850     __ blr(rscratch1);
11851 
11852     oop_maps->add_gc_map(the_pc - start, map);
11853 
11854     __ reset_last_Java_frame(false);
11855 
11856     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11857     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11858     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11859     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11860 
11861     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11862     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11863     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11864     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11865 
11866     __ leave();
11867 
11868     // check for pending exceptions
11869     Label pending;
11870     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11871     __ cbnz(rscratch1, pending);
11872 
11873     if (has_res) {
11874       __ get_vm_result_oop(r0, rthread);
11875     }
11876 
11877     __ ret(lr);
11878 
11879     __ bind(pending);
11880     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11881 
11882     // -------------
11883     // make sure all code is generated
11884     masm->flush();
11885 
11886     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11887     return stub->entry_point();
11888   }
11889 
11890   // Initialization
11891   void generate_preuniverse_stubs() {
11892     // preuniverse stubs are not needed for aarch64
11893   }
11894 
11895   void generate_initial_stubs() {
11896     // Generate initial stubs and initialize the entry points
11897 
11898     // Entry points that exist on all platforms. Note: this is code
11899     // that could be shared among different platforms - however the
11900     // benefit seems to be smaller than the disadvantage of having a
11901     // much more complicated generator structure. See also the comment
11902     // in stubRoutines.hpp.
11903 
11904     StubRoutines::_forward_exception_entry = generate_forward_exception();
11905 
11906     StubRoutines::_call_stub_entry =
11907       generate_call_stub(StubRoutines::_call_stub_return_address);
11908 
11909     // is referenced by megamorphic call
11910     StubRoutines::_catch_exception_entry = generate_catch_exception();
11911 
11912     // Initialize table for copy memory (arraycopy) check.
11913     if (UnsafeMemoryAccess::_table == nullptr) {
11914       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11915     }
11916 
11917     if (UseCRC32Intrinsics) {
11918       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11919     }
11920 
11921     if (UseCRC32CIntrinsics) {
11922       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11923     }
11924 
11925     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11926       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11927     }
11928 
11929     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11930       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11931     }
11932 
11933     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11934         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11935       StubRoutines::_hf2f = generate_float16ToFloat();
11936       StubRoutines::_f2hf = generate_floatToFloat16();
11937     }
11938 
11939     if (InlineTypeReturnedAsFields) {
11940       StubRoutines::_load_inline_type_fields_in_regs =
11941          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11942       StubRoutines::_store_inline_type_fields_to_buf =
11943          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11944     }
11945 
11946   }
11947 
11948   void generate_continuation_stubs() {
11949     // Continuation stubs:
11950     StubRoutines::_cont_thaw          = generate_cont_thaw();
11951     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11952     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11953     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11954   }
11955 
11956   void generate_final_stubs() {
11957     // support for verify_oop (must happen after universe_init)
11958     if (VerifyOops) {
11959       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
11960     }
11961 
11962     // arraycopy stubs used by compilers
11963     generate_arraycopy_stubs();
11964 
11965     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11966 
11967     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11968 
11969     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11970     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11971 
11972 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11973 
11974     generate_atomic_entry_points();
11975 
11976 #endif // LINUX
11977 
11978 #ifdef COMPILER2
11979     if (UseSecondarySupersTable) {
11980       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11981       if (! InlineSecondarySupersTest) {
11982         generate_lookup_secondary_supers_table_stub();
11983       }
11984     }
11985 #endif
11986 
11987     StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11988 
11989     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11990   }
11991 
11992   void generate_compiler_stubs() {
11993 #if COMPILER2_OR_JVMCI
11994 
11995     if (UseSVE == 0) {
11996       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11997     }
11998 
11999     // array equals stub for large arrays.
12000     if (!UseSimpleArrayEquals) {
12001       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12002     }
12003 
12004     // arrays_hascode stub for large arrays.
12005     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12006     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12007     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12008     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12009     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12010 
12011     // byte_array_inflate stub for large arrays.
12012     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12013 
12014     // countPositives stub for large arrays.
12015     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12016 
12017     generate_compare_long_strings();
12018 
12019     generate_string_indexof_stubs();
12020 
12021 #ifdef COMPILER2
12022     if (UseMultiplyToLenIntrinsic) {
12023       StubRoutines::_multiplyToLen = generate_multiplyToLen();
12024     }
12025 
12026     if (UseSquareToLenIntrinsic) {
12027       StubRoutines::_squareToLen = generate_squareToLen();
12028     }
12029 
12030     if (UseMulAddIntrinsic) {
12031       StubRoutines::_mulAdd = generate_mulAdd();
12032     }
12033 
12034     if (UseSIMDForBigIntegerShiftIntrinsics) {
12035       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12036       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
12037     }
12038 
12039     if (UseMontgomeryMultiplyIntrinsic) {
12040       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12041       StubCodeMark mark(this, stub_id);
12042       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12043       StubRoutines::_montgomeryMultiply = g.generate_multiply();
12044     }
12045 
12046     if (UseMontgomerySquareIntrinsic) {
12047       StubId stub_id = StubId::stubgen_montgomerySquare_id;
12048       StubCodeMark mark(this, stub_id);
12049       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12050       // We use generate_multiply() rather than generate_square()
12051       // because it's faster for the sizes of modulus we care about.
12052       StubRoutines::_montgomerySquare = g.generate_multiply();
12053     }
12054 
12055 #endif // COMPILER2
12056 
12057     if (UseChaCha20Intrinsics) {
12058       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12059     }
12060 
12061     if (UseKyberIntrinsics) {
12062       StubRoutines::_kyberNtt = generate_kyberNtt();
12063       StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12064       StubRoutines::_kyberNttMult = generate_kyberNttMult();
12065       StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12066       StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12067       StubRoutines::_kyber12To16 = generate_kyber12To16();
12068       StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12069     }
12070 
12071     if (UseDilithiumIntrinsics) {
12072       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12073       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12074       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12075       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12076       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12077     }
12078 
12079     if (UseBASE64Intrinsics) {
12080         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12081         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12082     }
12083 
12084     // data cache line writeback
12085     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12086     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12087 
12088     if (UseAESIntrinsics) {
12089       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12090       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12091       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12092       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12093       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12094     }
12095     if (UseGHASHIntrinsics) {
12096       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12097       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12098     }
12099     if (UseAESIntrinsics && UseGHASHIntrinsics) {
12100       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12101     }
12102 
12103     if (UseMD5Intrinsics) {
12104       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12105       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12106     }
12107     if (UseSHA1Intrinsics) {
12108       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12109       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12110     }
12111     if (UseSHA256Intrinsics) {
12112       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12113       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12114     }
12115     if (UseSHA512Intrinsics) {
12116       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12117       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12118     }
12119     if (UseSHA3Intrinsics) {
12120 
12121       StubRoutines::_double_keccak         = generate_double_keccak();
12122       if (UseSIMDForSHA3Intrinsic) {
12123          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12124          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12125       } else {
12126          StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12127          StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12128       }
12129     }
12130 
12131     if (UsePoly1305Intrinsics) {
12132       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12133     }
12134 
12135     // generate Adler32 intrinsics code
12136     if (UseAdler32Intrinsics) {
12137       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12138     }
12139 
12140 #endif // COMPILER2_OR_JVMCI
12141   }
12142 
12143  public:
12144   StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12145     switch(blob_id) {
12146     case BlobId::stubgen_preuniverse_id:
12147       generate_preuniverse_stubs();
12148       break;
12149     case BlobId::stubgen_initial_id:
12150       generate_initial_stubs();
12151       break;
12152     case BlobId::stubgen_continuation_id:
12153       generate_continuation_stubs();
12154       break;
12155     case BlobId::stubgen_compiler_id:
12156       generate_compiler_stubs();
12157       break;
12158     case BlobId::stubgen_final_id:
12159       generate_final_stubs();
12160       break;
12161     default:
12162       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12163       break;
12164     };
12165   }
12166 }; // end class declaration
12167 
12168 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12169   StubGenerator g(code, blob_id);
12170 }
12171 
12172 
12173 #if defined (LINUX)
12174 
12175 // Define pointers to atomic stubs and initialize them to point to the
12176 // code in atomic_aarch64.S.
12177 
12178 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
12179   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12180     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
12181   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12182     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
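
      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, approximately, to:
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;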
12183 
12184 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12185 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12186 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12187 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12188 DEFAULT_ATOMIC_OP(xchg, 4, )
12189 DEFAULT_ATOMIC_OP(xchg, 8, )
12190 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12191 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12192 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12193 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12194 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12195 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12196 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12197 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12198 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12199 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12200 
12201 #undef DEFAULT_ATOMIC_OP
12202 
12203 #endif // LINUX